Example #1
  def populate_db_with_random_data(self,
      db_name,
      db_connectors,
      min_number_of_tables,
      max_number_of_tables,
      min_number_of_cols,
      max_number_of_cols,
      min_number_of_rows,
      max_number_of_rows,
      allowed_storage_formats,
      create_files):
    '''Create tables with a random number of cols.

       The given db_name must have already been created.
    '''
    connections = list()
    hive_connection = None
    for connector in db_connectors:
      connection = connector.create_connection(db_name=db_name)
      connections.append(connection)
      if connector.db_type == IMPALA:
        # The Impala table creator needs help from Hive for some storage formats.
        # Eventually Impala should be able to write in all formats and this can be
        # removed.
        hive_connection = DbConnector(HIVE).create_connection(db_name=db_name)
        connection.hive_connection = hive_connection
    for table_idx in xrange(randint(min_number_of_tables, max_number_of_tables)):
      table = self.create_random_table(
          'table_%s' % (table_idx + 1),
          min_number_of_cols,
          max_number_of_cols,
          allowed_storage_formats)

      for connection in connections:
        connection.bulk_load_data_file = open(
            "/tmp/%s_%s.data" % (table.name, connection.db_type.lower()), "w")
        connection.begin_bulk_load_table(table)

      row_count = randint(min_number_of_rows, max_number_of_rows)
      LOG.info('Inserting %s rows into %s', row_count, table.name)
      while row_count:
        batch_size = min(1000, row_count)
        rows = self.generate_table_data(table, number_of_rows=batch_size)
        row_count -= batch_size
        for connection in connections:
          connection.handle_bulk_load_table_data(rows)

      for connection in connections:
        connection.end_bulk_load_table()

    self.index_tables_in_database(connections)

    for connection in connections:
      connection.close()
    if hive_connection:
      hive_connection.close()
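
For orientation, a hypothetical caller of this method might look like the sketch below. The `populator` object, the Postgres credentials, and the `'text'` storage format are stand-ins for illustration; only the parameter list comes from the example above.

# Hypothetical usage sketch -- populator, the credentials, and the storage
# format are assumptions; only the parameter list is taken from the example.
connectors = [
    DbConnector(IMPALA),
    DbConnector(POSTGRESQL, user_name='postgres', password=None,
                host_name='localhost', port=5432)]
populator.populate_db_with_random_data(
    'random_db',            # must already have been created
    connectors,
    min_number_of_tables=2, max_number_of_tables=5,
    min_number_of_cols=1, max_number_of_cols=10,
    min_number_of_rows=100, max_number_of_rows=10000,
    allowed_storage_formats=['text'],
    create_files=False)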
Example #2
    def populate_db_with_random_data(self, db_name, db_connectors,
                                     min_number_of_tables,
                                     max_number_of_tables, min_number_of_cols,
                                     max_number_of_cols, min_number_of_rows,
                                     max_number_of_rows,
                                     allowed_storage_formats, create_files):
        '''Create tables with a random number of cols.

       The given db_name must have already been created.
    '''
        connections = list()
        hive_connection = None
        for connector in db_connectors:
            connection = connector.create_connection(db_name=db_name)
            connections.append(connection)
            if connector.db_type == IMPALA:
                # The Impala table creator needs help from Hive for some storage formats.
                # Eventually Impala should be able to write in all formats and this can be
                # removed.
                hive_connection = DbConnector(HIVE).create_connection(
                    db_name=db_name)
                connection.hive_connection = hive_connection
        for table_idx in xrange(
                randint(min_number_of_tables, max_number_of_tables)):
            table = self.create_random_table('table_%s' % (table_idx + 1),
                                             min_number_of_cols,
                                             max_number_of_cols,
                                             allowed_storage_formats)

            for connection in connections:
                connection.bulk_load_data_file = open(
                    "/tmp/%s_%s.data" %
                    (table.name, connection.db_type.lower()), "w")
                connection.begin_bulk_load_table(table)

            row_count = randint(min_number_of_rows, max_number_of_rows)
            LOG.info('Inserting %s rows into %s', row_count, table.name)
            while row_count:
                batch_size = min(1000, row_count)
                rows = self.generate_table_data(table,
                                                number_of_rows=batch_size)
                row_count -= batch_size
                for connection in connections:
                    connection.handle_bulk_load_table_data(rows)

            for connection in connections:
                connection.end_bulk_load_table()

        self.index_tables_in_database(connections)

        for connection in connections:
            connection.close()
        if hive_connection:
            hive_connection.close()
Example #3
    def start_impala(self):
        '''Starts impala and creates a connection to it.
    '''
        self.impala_env.start_impala()
        self.test_connection = DbConnector(
            IMPALA,
            user_name=None,
            password=None,
            host_name=self.impala_env.host,
            port=self.impala_env.impala_port).create_connection(DATABASE_NAME)

        self.test_connection.reconnect()
        self.query_result_comparator = QueryResultComparator(
            self.ref_connection, self.test_connection)
        LOG.info('Created query result comparator')
        LOG.info(str(self.query_result_comparator.__dict__))
Example #4
  def populate_db_with_random_data(self,
      db_name,
      db_connectors,
      min_number_of_tables,
      max_number_of_tables,
      min_number_of_cols,
      max_number_of_cols,
      min_number_of_rows,
      max_number_of_rows,
      allowed_storage_formats,
      create_files):
    '''Create tables with a random number of cols.

       The given db_name must have already been created.
    '''
    connections = list()
    hive_connection = None
    for connector in db_connectors:
      connection = connector.create_connection(db_name=db_name)
      connections.append(connection)
      if connector.db_type == IMPALA:
        # The Impala table creator needs help from Hive for some storage formats.
        # Eventually Impala should be able to write in all formats and this can be
        # removed.
        hive_connection = DbConnector(HIVE).create_connection(db_name=db_name)
        connection.hive_connection = hive_connection
    tables = list()
    for table_idx in xrange(randint(min_number_of_tables, max_number_of_tables)):
      table = self.create_random_table(
          'table_%s' % (table_idx + 1),
          min_number_of_cols,
          max_number_of_cols,
          allowed_storage_formats)
      tables.append(table)

    self.populate_tables_with_random_data(
        tables,
        connections,
        min_number_of_rows,
        max_number_of_rows,
        create_tables=True)

    for connection in connections:
      connection.close()
    if hive_connection:
      hive_connection.close()
Example #5
    def populate_db_with_random_data(self, db_name, db_connectors,
                                     min_number_of_tables,
                                     max_number_of_tables, min_number_of_cols,
                                     max_number_of_cols, min_number_of_rows,
                                     max_number_of_rows,
                                     allowed_storage_formats, create_files):
        '''Create tables with a random number of cols.

       The given db_name must have already been created.
    '''
        connections = list()
        hive_connection = None
        for connector in db_connectors:
            connection = connector.create_connection(db_name=db_name)
            connections.append(connection)
            if connector.db_type == IMPALA:
                # The Impala table creator needs help from Hive for some storage formats.
                # Eventually Impala should be able to write in all formats and this can be
                # removed.
                hive_connection = DbConnector(HIVE).create_connection(
                    db_name=db_name)
                connection.hive_connection = hive_connection
        tables = list()
        for table_idx in xrange(
                randint(min_number_of_tables, max_number_of_tables)):
            table = self.create_random_table('table_%s' % (table_idx + 1),
                                             min_number_of_cols,
                                             max_number_of_cols,
                                             allowed_storage_formats)
            tables.append(table)

        self.populate_tables_with_random_data(tables,
                                              connections,
                                              min_number_of_rows,
                                              max_number_of_rows,
                                              create_tables=True)

        for connection in connections:
            connection.close()
        if hive_connection:
            hive_connection.close()
Example #6
  def start_impala(self):
    '''Starts impala and creates a connection to it.
    '''
    self.impala_env.start_impala()
    self.test_connection = DbConnector(IMPALA,
        user_name=None,
        password=None,
        host_name=self.impala_env.host,
        port=self.impala_env.impala_port).create_connection(DATABASE_NAME)

    self.test_connection.reconnect()
    self.query_result_comparator = QueryResultComparator(
        self.ref_connection, self.test_connection)
    LOG.info('Created query result comparator')
    LOG.info(str(self.query_result_comparator.__dict__))
Example #7
    def prepare(self):
        '''Prepares the environment and connects to Postgres and Impala running inside the
    Docker container.
    '''
        LOG.info('Starting Job Preparation')
        self.impala_env.prepare()
        LOG.info('Job Preparation Complete')

        self.ref_connection = DbConnector(
            POSTGRESQL,
            user_name=POSTGRES_USER_NAME,
            password=None,
            host_name=self.impala_env.host,
            port=self.impala_env.postgres_port).create_connection(
                DATABASE_NAME)
        LOG.info('Created Ref Connection')

        self.start_impala()

        self.git_hash = self.impala_env.get_git_hash()
Example #8
    )
    group.add_option(
        '--postgresql-password',
        help='The password to use when connecting to the Postgresql database.')
    parser.add_option_group(group)

    for group in parser.option_groups + [parser]:
        for option in group.option_list:
            if option.default != NO_DEFAULT:
                option.help += " [default: %default]"

    options, args = parser.parse_args()

    basicConfig(level=options.log_level)

    impala_connection = DbConnector(IMPALA).create_connection(options.db_name)
    db_connector_param_key = options.reference_db_type.lower()
    reference_connection = DbConnector(options.reference_db_type,
        user_name=getattr(options, db_connector_param_key + '_user'),
        password=getattr(options, db_connector_param_key + '_password'),
        host_name=getattr(options, db_connector_param_key + '_host'),
        port=getattr(options, db_connector_param_key + '_port')) \
        .create_connection(options.db_name)
    if options.exclude_types:
        exclude_types = set(type_name.lower()
                            for type_name in options.exclude_types.split(','))
        filter_col_types = [
            type_ for type_ in TYPES if type_.__name__.lower() in exclude_types
        ]
    else:
        filter_col_types = []
Example #9
  def connect(self):
    self.impalad_conn = DbConnector(
        IMPALA, host_name=self.impalad.host_name, port=self.impalad.port
        ).create_connection()
Example #10
class QueryRunner(object):
  """Encapsulates functionality to run a query and provide a runtime report."""

  SPILLED_PATTERN = re.compile("ExecOption:.*Spilled")
  BATCH_SIZE = 1024

  def __init__(self):
    self.impalad = None
    self.impalad_conn = None

  def connect(self):
    self.impalad_conn = DbConnector(
        IMPALA, host_name=self.impalad.host_name, port=self.impalad.port
        ).create_connection()

  def disconnect(self):
    if self.impalad_conn:
      self.impalad_conn.close()
      self.impalad_conn = None

  def run_query(self, query, timeout_secs, mem_limit_mb):
    """Run a query and return an execution report."""
    if not self.impalad_conn:
      raise Exception("connect() must first be called")

    timeout_unix_time = time() + timeout_secs
    report = QueryReport()
    try:
      with self.impalad_conn.open_cursor() as cursor:
        start_time = time()
        LOG.debug("Setting mem limit to %s MB", mem_limit_mb)
        cursor.execute("SET MEM_LIMIT=%sM" % mem_limit_mb)
        LOG.debug("Using %s database", query.db_name)
        cursor.execute("USE %s" % query.db_name)
        LOG.debug("Running query with %s MB mem limit at %s with timeout secs %s:\n%s",
            mem_limit_mb, self.impalad.host_name, timeout_secs, query.sql)
        error = None
        try:
          cursor.execute_async("/* Mem: %s MB. Coordinator: %s. */\n"
              % (mem_limit_mb, self.impalad.host_name) + query.sql)
          LOG.debug("Query id is %s", cursor._last_operation_handle)
          while cursor.is_executing():
            if time() > timeout_unix_time:
              self._cancel(cursor, report)
              return report
            sleep(0.1)
          try:
            report.result_hash = self._hash_result(cursor, timeout_unix_time)
          except QueryTimeout:
            self._cancel(cursor, report)
            return report
        except Exception as error:
          LOG.debug("Error running query with id %s: %s", cursor._last_operation_handle,
              error)
          self._check_for_mem_limit_exceeded(report, cursor, error)
        if report.non_mem_limit_error or report.mem_limit_exceeded:
          return report
        report.runtime_secs = time() - start_time
        report.profile = cursor.get_profile()
        report.mem_was_spilled = \
            QueryRunner.SPILLED_PATTERN.search(report.profile) is not None
    except Exception as error:
      # A mem limit error would have been caught above, no need to check for that here.
      report.non_mem_limit_error = error
    return report


  def _cancel(self, cursor, report):
    report.timed_out = True
    if cursor._last_operation_handle:
      LOG.debug("Attempting cancellation of query with id %s",
          cursor._last_operation_handle)
      cursor.cancel_operation()

  def _check_for_mem_limit_exceeded(self, report, cursor, caught_exception):
    """To be called after a query failure to check for signs of failed due to a
       mem limit. The report will be updated accordingly.
    """
    if cursor._last_operation_handle:
      try:
        report.profile = cursor.get_profile()
      except Exception as e:
        LOG.debug("Error getting profile for query with id %s: %s",
            cursor._last_operation_handle, e)
    if "memory limit exceeded" in str(caught_exception).lower():
      report.mem_limit_exceeded = True
      return
    LOG.error("Non-mem limit error for query with id %s: %s",
        cursor._last_operation_handle, caught_exception, exc_info=True)
    report.non_mem_limit_error = caught_exception

  def _hash_result(self, cursor, timeout_unix_time):
    """Returns a hash that is independent of row order."""
    # A value of 1 indicates that the hash thread should continue to work.
    should_continue = Value("i", 1)
    def hash_result_impl():
      try:
        current_thread().result = 1
        while should_continue.value:
          LOG.debug("Fetching result for query with id %s"
              % cursor._last_operation_handle)
          rows = cursor.fetchmany(self.BATCH_SIZE)
          if not rows:
            return
          for row in rows:
            for idx, val in enumerate(row):
              # Floats returned by Impala may not be deterministic; the trailing
              # insignificant digits may differ. Round to roughly six significant
              # digits before hashing.
              if isinstance(val, float):
                sval = "%f" % val
                dot_idx = sval.find(".")
                val = round(val, 6 - dot_idx)
              current_thread().result += (idx + 1) * hash(val)
              # Modulo the result to keep it "small", otherwise the math ops can be
              # slow since python does infinite precision math.
              current_thread().result %= maxint
      except Exception as e:
        current_thread().error = e
    hash_thread = create_and_start_daemon_thread(hash_result_impl)
    hash_thread.join(max(timeout_unix_time - time(), 0))
    if hash_thread.is_alive():
      should_continue.value = 0
      raise QueryTimeout()
    if hash_thread.error:
      raise hash_thread.error
    return hash_thread.result
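
As a rough usage sketch of the class above: QueryRunner expects an impalad object exposing host_name and port, and run_query expects a query object carrying db_name and sql. The stand-in namedtuples below are assumptions; only connect, run_query, disconnect, and the report fields come from the code itself.

# Sketch only: Impalad and Query are stand-in namedtuples, not the real classes.
from collections import namedtuple

Impalad = namedtuple('Impalad', ['host_name', 'port'])
Query = namedtuple('Query', ['db_name', 'sql'])

runner = QueryRunner()
runner.impalad = Impalad('localhost', 21050)
runner.connect()
try:
  report = runner.run_query(Query('tpch', 'SELECT 1'),
      timeout_secs=60, mem_limit_mb=256)
  if report.timed_out:
    print 'query timed out'
  elif report.mem_limit_exceeded:
    print 'mem limit exceeded'
  elif report.non_mem_limit_error:
    print 'error: %s' % report.non_mem_limit_error
  else:
    print 'ran in %s secs' % report.runtime_secs
finally:
  runner.disconnect()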
Example #11
    command = args[0] if args else 'populate'
    if len(args) > 1 or command not in ['populate', 'migrate']:
        raise Exception(
            'Command must either be "populate" or "migrate" but was "%s"' %
            ' '.join(args))
    if command == 'migrate' and not any(
        (options.use_mysql, options.use_postgresql)):
        raise Exception(
            'At least one destination database must be chosen with '
            '--use-<database type>')

    basicConfig(level=options.log_level)

    seed(options.randomization_seed)

    impala_connector = DbConnector(IMPALA)
    db_connectors = []
    if options.use_postgresql:
        db_connectors.append(
            DbConnector(POSTGRESQL,
                        user_name=options.postgresql_user,
                        password=options.postgresql_password,
                        host_name=options.postgresql_host,
                        port=options.postgresql_port))
    if options.use_mysql:
        db_connectors.append(
            DbConnector(MYSQL,
                        user_name=options.mysql_user,
                        password=options.mysql_password,
                        host_name=options.mysql_host,
                        port=options.mysql_port))
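
These connectors are the db_connectors list that populate_db_with_random_data in the earlier examples consumes. A hypothetical handoff might look like the following; the populator object and the option attribute names for the bounds are assumptions, not verified API.

# Assumed continuation: feed the connectors to the populator from Example #1.
# The option attribute names below are illustrative only.
populator.populate_db_with_random_data(
    options.db_name, db_connectors,
    options.min_table_count, options.max_table_count,
    options.min_column_count, options.max_column_count,
    options.min_row_count, options.max_row_count,
    options.storage_file_formats, create_files=False)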
Example #12
class Job(object):
  '''Represents a Query Generator Job. One ImpalaDockerEnv is associated with it. Able to
  execute queries by either generating them based on a provided query profile or by
  extracting queries from an existing report. A report is generated when it finishes
  running.
  '''

  def __init__(self,
      query_profile,
      job_id,
      run_name = 'default',
      time_limit_sec = 24 * 3600,
      git_command = None,
      parent_job = None):
    self.git_hash = ''
    self.impala_env = ImpalaDockerEnv(git_command)
    self.job_id = job_id
    self.job_name = run_name
    self.parent_job = parent_job
    self.query_profile = query_profile
    self.ref_connection = None
    self.result_list = []
    self.start_time = time()
    self.stop_time = None
    self.target_stop_time = time() + time_limit_sec
    self.test_connection = None
    self.num_queries_executed = 0

  def __getstate__(self):
    '''For pickling'''
    result = {}
    result['job_id'] = self.job_id
    result['job_name'] = self.job_name
    result['parent_job'] = self.parent_job
    result['result_list'] = self.result_list
    result['git_hash'] = self.git_hash
    result['start_time'] = self.start_time
    result['stop_time'] = self.stop_time
    result['num_queries_executed'] = self.num_queries_executed
    return result

  def prepare(self):
    '''Prepares the environment and connects to Postgres and Impala running inside the
    Docker container.
    '''
    LOG.info('Starting Job Preparation')
    self.impala_env.prepare()
    LOG.info('Job Preparation Complete')

    self.ref_connection = DbConnector(POSTGRESQL,
        user_name=POSTGRES_USER_NAME,
        password=None,
        host_name=self.impala_env.host,
        port=self.impala_env.postgres_port).create_connection(DATABASE_NAME)
    LOG.info('Created Ref Connection')

    self.start_impala()

    self.git_hash = self.impala_env.get_git_hash()

  def get_stack(self):
    stack_trace = self.impala_env.get_stack()
    LOG.info('Stack Trace: {0}'.format(stack_trace))
    return stack_trace

  def start_impala(self):
    '''Starts impala and creates a connection to it.
    '''
    self.impala_env.start_impala()
    self.test_connection = DbConnector(IMPALA,
        user_name=None,
        password=None,
        host_name=self.impala_env.host,
        port=self.impala_env.impala_port).create_connection(DATABASE_NAME)

    self.test_connection.reconnect()
    self.query_result_comparator = QueryResultComparator(
        self.ref_connection, self.test_connection)
    LOG.info('Created query result comparator')
    LOG.info(str(self.query_result_comparator.__dict__))

  def is_impala_running(self):
    return self.impala_env.is_impala_running()

  def save_pickle(self):
    '''Saves self as pickle. This is normally done when the job finishes running.
    '''
    with open(join_path(PATH_TO_FINISHED_JOBS, self.job_id), 'w') as f:
      pickle.dump(self, f)
    LOG.info('Saved Completed Job Pickle')

  def queries_to_be_executed(self):
    '''Generator that outputs query models. They are either generated based on the query
    profile, or they are extracted from an existing report.
    '''
    if self.parent_job:
      # If parent job is specified, get the queries from the parent job report
      with open(join_path(PATH_TO_REPORTS, self.parent_job), 'r') as f:
        parent_report = pickle.load(f)
      #for error_type in ['stack', 'row_counts', 'mismatch']:
      for error_type in ['stack']:
        for query in parent_report.grouped_results[error_type]:
          yield query['model']
    else:
      # If parent job is not specified, generate queries with QueryGenerator
      num_unexpected_errors = 0
      while num_unexpected_errors < NUM_UNEXPECTED_ERRORS_THRESHOLD:
        query = None
        try:
          query = self.query_generator.create_query(self.common_tables)
        except IndexError as e:
          # This is a query generator bug that happens extremely rarely
          LOG.info('Query Generator Choice Problem, {0}'.format(e))
          continue
        except Exception as e:
          LOG.info('Unexpected error in queries_to_be_executed, {0}'.format(e))
          self.query_generator = QueryGenerator(self.query_profile)
          num_unexpected_errors += 1
          if num_unexpected_errors > NUM_UNEXPECTED_ERRORS_THRESHOLD:
            LOG.error('Num Unexpected Errors above threshold')
            raise
          else:
            continue
        query.execution = 'RAW'
        yield query

  def generate_report(self):
    '''Generate report and save it into the reports directory'''
    from report import Report
    rep = Report(self.job_id)
    rep.save_pickle()

  def start(self):
    try:
      self.prepare()
      self.query_generator = QueryGenerator(self.query_profile)
      self.common_tables = DbConnection.describe_common_tables(
          [self.ref_connection, self.test_connection])

      for query_model in self.queries_to_be_executed():
        LOG.info('About to execute query.')
        result_dict = self.run_query(query_model)
        LOG.info('Query Executed successfully.')
        if result_dict:
          self.num_queries_executed += 1
          self.result_list.append(result_dict)
        LOG.info('Time Left: {0}'.format(self.target_stop_time - time()))
        if time() > self.target_stop_time:
          break
      self.stop_time = time()
      self.save_pickle()
      self.generate_report()
      LOG.info('Generated Report')
    except:
      LOG.exception('Unexpected Exception in start')
      raise
    finally:
      self.impala_env.stop_docker()
      LOG.info('Docker Stopped')
      try:
        os.remove(join_path(PATH_TO_SCHEDULE, self.job_id))
        LOG.info('Schedule file removed')
      except OSError:
        LOG.info('Unable to remove schedule file.')

  def reproduce_crash(self, query_model):
    '''Check if the given query_model causes a crash. Returns the number of times the
    query had to be run to cause a crash.
    '''
    NUM_TRIES = 5
    self.start_impala()
    for try_num in range(1, NUM_TRIES + 1):
      self.query_result_comparator.compare_query_results(query_model)
      if not self.is_impala_running():
        return try_num

  def run_query(self, query_model):
    '''Runs a single query.'''

    if not self.is_impala_running():
      LOG.info('Impala is not running, starting Impala.')
      self.start_impala()

    def run_query_internal():
      self.comparison_result = self.query_result_comparator.compare_query_results(
          query_model)

    # 10 minute timeout to avoid the cursor close problem?
    self.comparison_result = None
    internal_thread = Thread(
      target=run_query_internal,
      name='run_query_internal_{0}'.format(self.job_id))
    internal_thread.daemon = True
    internal_thread.start()
    internal_thread.join(timeout=600)
    if internal_thread.is_alive():
      LOG.info('run_query_internal is alive, restarting Impala Environment')
      self.impala_env.stop_docker()
      self.prepare()
      return None
    else:
      LOG.info('run_query_internal is dead as expected')

    comparison_result = self.comparison_result

    if comparison_result.query_timed_out:
      LOG.info('Query Timeout Exception')
      restart_impala = True
    else:
      restart_impala = False

    result_dict = {}

    if self.is_impala_running():
      if comparison_result.error:
        result_dict = self.comparison_result_analysis(comparison_result)
        result_dict['model'] = query_model
    else:
      LOG.info('CRASH OCCURRED')
      result_dict = self.comparison_result_analysis(comparison_result)
      result_dict['model'] = query_model
      result_dict['stack'] = self.get_stack()
      result_dict['num_tries_to_reproduce'] = self.reproduce_crash(query_model)

    if restart_impala:
      self.start_impala()

    return result_dict

  def comparison_result_analysis(self, comparison_result):
    '''Get useful information from the comparison_result.'''
    result_dict = {}
    result_dict['error'] = comparison_result.error
    result_dict['mismatch_col'] = comparison_result.mismatch_at_col_number
    result_dict['mismatch_ref_row'] = comparison_result.ref_row
    result_dict['mismatch_test_row'] = comparison_result.test_row
    result_dict['ref_row_count'] = comparison_result.ref_row_count
    result_dict['ref_sql'] = comparison_result.ref_sql
    result_dict['test_row_count'] = comparison_result.test_row_count
    result_dict['test_sql'] = comparison_result.test_sql
    return result_dict
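
Pulling the class together, a minimal lifecycle sketch might be the following. DefaultProfile is the profile class mentioned in the next example; the uuid-style job id is an assumption.

# Sketch: construct and run a Job. DefaultProfile and the id format are assumed.
from uuid import uuid4

job = Job(query_profile=DefaultProfile(),
    job_id=str(uuid4()),
    run_name='nightly',
    time_limit_sec=4 * 3600)
job.start()  # prepares the Docker env, runs queries, then saves pickle + report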
Example #13
        '--profile',
        default='default',
        choices=(sorted(profiles.keys())),
        help=
        'Determines the mix of SQL features to use during query generation.')
    # TODO: Seed the random query generator for repeatable queries?

    cli_options.add_default_values_to_help(parser)

    options, args = parser.parse_args()
    cli_options.configure_logging(options.log_level)

    db_connector_param_key = options.ref_db_type.lower()
    ref_connection = DbConnector(options.ref_db_type,
        user_name=getattr(options, db_connector_param_key + '_user'),
        password=getattr(options, db_connector_param_key + '_password'),
        host_name=getattr(options, db_connector_param_key + '_host'),
        port=getattr(options, db_connector_param_key + '_port')) \
        .create_connection(options.db_name)
    db_connector_param_key = options.test_db_type.lower()
    test_connection = DbConnector(options.test_db_type,
        user_name=getattr(options, db_connector_param_key + '_user', None),
        password=getattr(options, db_connector_param_key + '_password', None),
        host_name=getattr(options, db_connector_param_key + '_host', None),
        port=getattr(options, db_connector_param_key + '_port', None)) \
        .create_connection(options.db_name)
    # Create an instance of profile class (e.g. DefaultProfile)
    query_profile = profiles[options.profile]()
    diff_searcher = QueryResultDiffSearcher(query_profile, ref_connection,
                                            test_connection)
    query_timeout_seconds = options.timeout
    search_results = diff_searcher.search(options.query_count,
Example #14
class Job(object):
    '''Represents a Query Generator Job. One ImpalaDockerEnv is associated with it. Able to
  execute queries by either generating them based on a provided query profile or by
  extracting queries from an existing report. A report is generated when it finishes
  running.
  '''
    def __init__(self,
                 query_profile,
                 job_id,
                 run_name='default',
                 time_limit_sec=24 * 3600,
                 git_command=None,
                 parent_job=None):
        self.git_hash = ''
        self.impala_env = ImpalaDockerEnv(git_command)
        self.job_id = job_id
        self.job_name = run_name
        self.parent_job = parent_job
        self.query_profile = query_profile
        self.ref_connection = None
        self.result_list = []
        self.start_time = time()
        self.stop_time = None
        self.target_stop_time = time() + time_limit_sec
        self.test_connection = None
        self.num_queries_executed = 0

    def __getstate__(self):
        '''For pickling'''
        result = {}
        result['job_id'] = self.job_id
        result['job_name'] = self.job_name
        result['parent_job'] = self.parent_job
        result['result_list'] = self.result_list
        result['git_hash'] = self.git_hash
        result['start_time'] = self.start_time
        result['stop_time'] = self.stop_time
        result['num_queries_executed'] = self.num_queries_executed
        return result

    def prepare(self):
        '''Prepares the environment and connects to Postgres and Impala running inside the
    Docker container.
    '''
        LOG.info('Starting Job Preparation')
        self.impala_env.prepare()
        LOG.info('Job Preparation Complete')

        self.ref_connection = DbConnector(
            POSTGRESQL,
            user_name=POSTGRES_USER_NAME,
            password=None,
            host_name=self.impala_env.host,
            port=self.impala_env.postgres_port).create_connection(
                DATABASE_NAME)
        LOG.info('Created Ref Connection')

        self.start_impala()

        self.git_hash = self.impala_env.get_git_hash()

    def get_stack(self):
        stack_trace = self.impala_env.get_stack()
        LOG.info('Stack Trace: {0}'.format(stack_trace))
        return stack_trace

    def start_impala(self):
        '''Starts impala and creates a connection to it.
    '''
        self.impala_env.start_impala()
        self.test_connection = DbConnector(
            IMPALA,
            user_name=None,
            password=None,
            host_name=self.impala_env.host,
            port=self.impala_env.impala_port).create_connection(DATABASE_NAME)

        self.test_connection.reconnect()
        self.query_result_comparator = QueryResultComparator(
            self.ref_connection, self.test_connection)
        LOG.info('Created query result comparator')
        LOG.info(str(self.query_result_comparator.__dict__))

    def is_impala_running(self):
        return self.impala_env.is_impala_running()

    def save_pickle(self):
        '''Saves self as pickle. This is normally done when the job finishes running.
    '''
        with open(join_path(PATH_TO_FINISHED_JOBS, self.job_id), 'w') as f:
            pickle.dump(self, f)
        LOG.info('Saved Completed Job Pickle')

    def queries_to_be_executed(self):
        '''Generator that outputs query models. They are either generated based on the query
    profile, or they are extracted from an existing report.
    '''
        if self.parent_job:
            # If parent job is specified, get the queries from the parent job report
            with open(join_path(PATH_TO_REPORTS, self.parent_job), 'r') as f:
                parent_report = pickle.load(f)
            #for error_type in ['stack', 'row_counts', 'mismatch']:
            for error_type in ['stack']:
                for query in parent_report.grouped_results[error_type]:
                    yield query['model']
        else:
            # If parent job is not specified, generate queries with QueryGenerator
            num_unexpected_errors = 0
            while num_unexpected_errors < NUM_UNEXPECTED_ERRORS_THRESHOLD:
                query = None
                try:
                    query = self.query_generator.create_query(
                        self.common_tables)
                except IndexError as e:
                    # This is a query generator bug that happens extremely rarely
                    LOG.info('Query Generator Choice Problem, {0}'.format(e))
                    continue
                except Exception as e:
                    LOG.info('Unexpected error in queries_to_be_executed, {0}'.
                             format(e))
                    self.query_generator = QueryGenerator(self.query_profile)
                    num_unexpected_errors += 1
                    if num_unexpected_errors > NUM_UNEXPECTED_ERRORS_THRESHOLD:
                        LOG.error('Num Unexpected Errors above threshold')
                        raise
                    else:
                        continue
                query.execution = 'RAW'
                yield query

    def generate_report(self):
        '''Generate report and save it into the reports directory'''
        from report import Report
        rep = Report(self.job_id)
        rep.save_pickle()

    def start(self):
        try:
            self.prepare()
            self.query_generator = QueryGenerator(self.query_profile)
            self.common_tables = DbConnection.describe_common_tables(
                [self.ref_connection, self.test_connection])

            for query_model in self.queries_to_be_executed():
                LOG.info('About to execute query.')
                result_dict = self.run_query(query_model)
                LOG.info('Query Executed successfully.')
                if result_dict:
                    self.num_queries_executed += 1
                    self.result_list.append(result_dict)
                LOG.info('Time Left: {0}'.format(self.target_stop_time -
                                                 time()))
                if time() > self.target_stop_time:
                    break
            self.stop_time = time()
            self.save_pickle()
            self.generate_report()
            LOG.info('Generated Report')
        except:
            LOG.exception('Unexpected Exception in start')
            raise
        finally:
            self.impala_env.stop_docker()
            LOG.info('Docker Stopped')
            try:
                os.remove(join_path(PATH_TO_SCHEDULE, self.job_id))
                LOG.info('Schedule file removed')
            except OSError:
                LOG.info('Unable to remove schedule file.')

    def reproduce_crash(self, query_model):
        '''Check if the given query_model causes a crash. Returns the number of times the
    query had to be run to cause a crash.
    '''
        NUM_TRIES = 5
        self.start_impala()
        for try_num in range(1, NUM_TRIES + 1):
            self.query_result_comparator.compare_query_results(query_model)
            if not self.is_impala_running():
                return try_num

    def run_query(self, query_model):
        '''Runs a single query.'''

        if not self.is_impala_running():
            LOG.info('Impala is not running, starting Impala.')
            self.start_impala()

        def run_query_internal():
            self.comparison_result = self.query_result_comparator.compare_query_results(
                query_model)

        # 10 minute timeout to avoid the cursor close problem?
        self.comparison_result = None
        internal_thread = Thread(target=run_query_internal,
                                 name='run_query_internal_{0}'.format(
                                     self.job_id))
        internal_thread.daemon = True
        internal_thread.start()
        internal_thread.join(timeout=600)
        if internal_thread.is_alive():
            LOG.info(
                'run_query_internal is alive, restarting Impala Environment')
            self.impala_env.stop_docker()
            self.prepare()
            return None
        else:
            LOG.info('run_query_internal is dead as expected')

        comparison_result = self.comparison_result

        if comparison_result.query_timed_out:
            LOG.info('Query Timeout Exception')
            restart_impala = True
        else:
            restart_impala = False

        result_dict = {}

        if self.is_impala_running():
            if comparison_result.error:
                result_dict = self.comparison_result_analysis(
                    comparison_result)
                result_dict['model'] = query_model
        else:
            LOG.info('CRASH OCCURRED')
            result_dict = self.comparison_result_analysis(comparison_result)
            result_dict['model'] = query_model
            result_dict['stack'] = self.get_stack()
            result_dict['num_tries_to_reproduce'] = self.reproduce_crash(
                query_model)

        if restart_impala:
            self.start_impala()

        return result_dict

    def comparison_result_analysis(self, comparison_result):
        '''Get useful information from the comparison_result.'''
        result_dict = {}
        result_dict['error'] = comparison_result.error
        result_dict['mismatch_col'] = comparison_result.mismatch_at_col_number
        result_dict['mismatch_ref_row'] = comparison_result.ref_row
        result_dict['mismatch_test_row'] = comparison_result.test_row
        result_dict['ref_row_count'] = comparison_result.ref_row_count
        result_dict['ref_sql'] = comparison_result.ref_sql
        result_dict['test_row_count'] = comparison_result.test_row_count
        result_dict['test_sql'] = comparison_result.test_sql
        return result_dict
Example #15
            'populate', 'migrate', 'populate_existing'
    ]:
        raise Exception(
            'Command must either be "populate", "populate_existing" or "migrate" but was "%s"'
            % ' '.join(args))
    if command == 'migrate' and \
        not any((options.use_mysql, options.use_postgresql, options.use_oracle)):
        raise Exception(
            'At least one destination database must be chosen with '
            '--use-<database type>')

    basicConfig(level=getattr(logging, options.log_level))

    seed(options.randomization_seed)

    impala_connector = DbConnector(IMPALA)
    db_connectors = []
    if options.use_postgresql:
        db_connectors.append(
            DbConnector(POSTGRESQL,
                        user_name=options.postgresql_user,
                        password=options.postgresql_password,
                        host_name=options.postgresql_host,
                        port=options.postgresql_port))
    if options.use_oracle:
        db_connectors.append(
            DbConnector(ORACLE,
                        user_name=options.oracle_user,
                        password=options.oracle_password,
                        host_name=options.oracle_host,
                        port=options.oracle_port))