def run_fuzz_test(self, vector, unique_database, table, num_copies=1):
    """ Do some basic fuzz testing: create a copy of an existing table with randomly
    corrupted files and make sure that we don't crash or behave in an unexpected way.
    'unique_database' is used for the table, so it will be cleaned up automatically.
    If 'num_copies' is set, create that many corrupted copies of each input file.
    SCANNER_FUZZ_SEED can be set in the environment to reproduce the result (assuming that
    input files are the same).
    SCANNER_FUZZ_KEEP_FILES can be set in the environment to keep the generated files.
    """
    # Create and seed a new random number generator for reproducibility. Convert the
    # seed to an integer up front: SCANNER_FUZZ_SEED arrives from the environment as a
    # string, which would break the '%d' log format below and leave the seed unlogged,
    # defeating reproducibility. rng.seed() previously applied the same long()
    # conversion, so the seeding behavior is unchanged.
    rng = random.Random()
    random_seed = long(os.environ.get("SCANNER_FUZZ_SEED") or time.time())
    LOG.info("Using random seed %d", random_seed)
    rng.seed(random_seed)

    table_format = vector.get_value('table_format')
    self.change_database(self.client, table_format)

    # Local scratch directory (under the Impala testdata tree) where corrupted copies
    # of the table's files are generated before being uploaded back to HDFS.
    tmp_table_dir = tempfile.mkdtemp(prefix="tmp-scanner-fuzz-%s" % table,
        dir=os.path.join(os.environ['IMPALA_HOME'], "testdata"))

    self.execute_query("create table %s.%s like %s" % (unique_database, table, table))
    fuzz_table_location = get_fs_path("/test-warehouse/{0}.db/{1}".format(
        unique_database, table))

    LOG.info("Generating corrupted version of %s in %s. Local working directory is %s",
        table, unique_database, tmp_table_dir)

    # Find the location of the existing table and get the full table directory structure.
    table_loc = self._get_table_location(table, vector)
    check_call(['hdfs', 'dfs', '-copyToLocal', table_loc + "/*", tmp_table_dir])

    # Corrupt the local copies in place. Any partition directories discovered during
    # the walk must be registered on the new table so its files are scanned.
    partitions = self.walk_and_corrupt_table_data(tmp_table_dir, num_copies, rng)
    for partition in partitions:
      self.execute_query('alter table {0}.{1} add partition ({2})'.format(
          unique_database, table, ','.join(partition)))

    # Copy all of the local files and directories to hdfs.
    to_copy = ["%s/%s" % (tmp_table_dir, file_or_dir)
               for file_or_dir in os.listdir(tmp_table_dir)]
    check_call(['hdfs', 'dfs', '-copyFromLocal'] + to_copy + [fuzz_table_location])

    # Keep the corrupted local files only when explicitly requested via the env var.
    if "SCANNER_FUZZ_KEEP_FILES" not in os.environ:
      shutil.rmtree(tmp_table_dir)

    # Querying the corrupted files should not DCHECK or crash.
    self.execute_query("refresh %s.%s" % (unique_database, table))
    # Execute a query that tries to read all the columns and rows in the file.
    # Also execute a count(*) that materializes no columns, since different code
    # paths are exercised.
    queries = [
        'select count(*) from (select distinct * from {0}.{1}) q'.format(
            unique_database, table),
        'select count(*) from {0}.{1} q'.format(unique_database, table)]

    # Run every query under every (batch size, codegen) combination, since different
    # settings exercise different scanner code paths.
    for query, batch_size, disable_codegen in \
        itertools.product(queries, self.BATCH_SIZES, self.DISABLE_CODEGEN_VALUES):
      query_options = copy(vector.get_value('exec_option'))
      query_options['batch_size'] = batch_size
      query_options['disable_codegen'] = disable_codegen
      try:
        result = self.execute_query(query, query_options=query_options)
        LOG.info('\n'.join(result.log))
      except Exception as e:
        if 'memory limit exceeded' in str(e).lower():
          # Memory limit error should fail query.
          continue
        msg = "Should not throw error when abort_on_error=0: '{0}'".format(e)
        LOG.error(msg)
        # Parquet and compressed text can fail the query for some parse errors.
        # E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file
        # (IMPALA-4013).
        if table_format.file_format != 'parquet' \
            and not (table_format.file_format == 'text' and
            table_format.compression_codec != 'none'):
          raise
# Example 2
    def run_fuzz_test(self,
                      vector,
                      src_db,
                      src_table,
                      fuzz_db,
                      fuzz_table,
                      num_copies=1,
                      custom_queries=None):
        """ Do some basic fuzz testing: create a corrupted copy of 'src_db'.'src_table'
    as 'fuzz_db'.'fuzz_table' and make sure that we don't crash or behave in an
    unexpected way when scanning it.
    If 'num_copies' is set, create that many corrupted copies of each input file.
    If 'custom_queries' is set, each entry is an extra query template run against the
    fuzzed table; it is formatted with (fuzz_db, fuzz_table).
    SCANNER_FUZZ_SEED can be set in the environment to reproduce the result (assuming that
    input files are the same).
    SCANNER_FUZZ_KEEP_FILES can be set in the environment to keep the generated files.
    """
        # Create and seed a new random number generator for reproducibility. Convert
        # the seed to an integer up front: SCANNER_FUZZ_SEED arrives from the
        # environment as a string, which would break the '%d' log format below and
        # leave the seed unlogged, defeating reproducibility. rng.seed() previously
        # applied the same long() conversion, so the seeding behavior is unchanged.
        rng = random.Random()
        random_seed = long(os.environ.get("SCANNER_FUZZ_SEED") or time.time())
        LOG.info("Using random seed %d", random_seed)
        rng.seed(random_seed)

        # Local scratch directory (under the Impala testdata tree) where corrupted
        # copies of the table's files are generated before upload.
        tmp_table_dir = tempfile.mkdtemp(
            prefix="tmp-scanner-fuzz-%s" % fuzz_table,
            dir=os.path.join(os.environ['IMPALA_HOME'], "testdata"))

        self.execute_query("create table %s.%s like %s.%s" %
                           (fuzz_db, fuzz_table, src_db, src_table))
        fuzz_table_location = get_fs_path("/test-warehouse/{0}.db/{1}".format(
            fuzz_db, fuzz_table))

        LOG.info(
            "Generating corrupted version of %s in %s. Local working directory is %s",
            fuzz_table, fuzz_db, tmp_table_dir)

        # Find the location of the existing table and get the full table directory structure.
        fq_table_name = src_db + "." + src_table
        table_loc = self._get_table_location(fq_table_name, vector)
        check_call(
            ['hdfs', 'dfs', '-copyToLocal', table_loc + "/*", tmp_table_dir])

        # Corrupt the local copies in place. Any partition directories discovered
        # during the walk must be registered on the new table so its files are scanned.
        partitions = self.walk_and_corrupt_table_data(tmp_table_dir,
                                                      num_copies, rng)
        for partition in partitions:
            self.execute_query(
                'alter table {0}.{1} add partition ({2})'.format(
                    fuzz_db, fuzz_table, ','.join(partition)))

        # Copy all of the local files and directories to hdfs.
        to_copy = [
            "%s/%s" % (tmp_table_dir, file_or_dir)
            for file_or_dir in os.listdir(tmp_table_dir)
        ]
        self.filesystem_client.copy_from_local(to_copy, fuzz_table_location)

        # Keep the corrupted local files only when explicitly requested.
        if "SCANNER_FUZZ_KEEP_FILES" not in os.environ:
            shutil.rmtree(tmp_table_dir)

        # Querying the corrupted files should not DCHECK or crash.
        self.execute_query("refresh %s.%s" % (fuzz_db, fuzz_table))
        # Execute a query that tries to read all the columns and rows in the file.
        # Also execute a count(*) that materializes no columns, since different code
        # paths are exercised.
        queries = [
            'select count(*) from (select distinct * from {0}.{1}) q'.format(
                fuzz_db, fuzz_table),
            'select count(*) from {0}.{1} q'.format(fuzz_db, fuzz_table)
        ]
        if custom_queries is not None:
            queries = queries + [
                s.format(fuzz_db, fuzz_table) for s in custom_queries
            ]

        # Run every query under every (batch size, codegen) combination, since
        # different settings exercise different scanner code paths.
        for query, batch_size, disable_codegen in \
            itertools.product(queries, self.BATCH_SIZES, self.DISABLE_CODEGEN_VALUES):
            query_options = copy(vector.get_value('exec_option'))
            query_options['batch_size'] = batch_size
            query_options['disable_codegen'] = disable_codegen
            # Force codegen on/off regardless of row-count heuristics.
            query_options['disable_codegen_rows_threshold'] = 0
            try:
                result = self.execute_query(query, query_options=query_options)
                LOG.info('\n'.join(result.log))
            except Exception as e:
                if 'memory limit exceeded' in str(e).lower():
                    # Memory limit error should fail query.
                    continue
                msg = "Should not throw error when abort_on_error=0: '{0}'".format(
                    e)
                LOG.error(msg)
                # Parquet and compressed text can fail the query for some parse errors.
                # E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file
                # (IMPALA-4013).
                table_format = vector.get_value('table_format')
                if table_format.file_format not in ['parquet', 'orc', 'rc', 'seq'] \
                    and not (table_format.file_format == 'text' and
                    table_format.compression_codec != 'none'):
                    raise
    def run_fuzz_test(self, vector, unique_database, table, num_copies=1):
        """ Do some basic fuzz testing: create a copy of an existing table with randomly
    corrupted files and make sure that we don't crash or behave in an unexpected way.
    'unique_database' is used for the table, so it will be cleaned up automatically.
    If 'num_copies' is set, create that many corrupted copies of each input file.
    SCANNER_FUZZ_SEED can be set in the environment to reproduce the result (assuming that
    input files are the same).
    SCANNER_FUZZ_KEEP_FILES can be set in the environment to keep the generated files.
    """
        # Create and seed a new random number generator for reproducibility. Convert
        # the seed to an integer up front: SCANNER_FUZZ_SEED arrives from the
        # environment as a string, which would break the '%d' log format below and
        # leave the seed unlogged, defeating reproducibility. rng.seed() previously
        # applied the same long() conversion, so the seeding behavior is unchanged.
        rng = random.Random()
        random_seed = long(os.environ.get("SCANNER_FUZZ_SEED") or time.time())
        LOG.info("Using random seed %d", random_seed)
        rng.seed(random_seed)

        table_format = vector.get_value('table_format')
        self.change_database(self.client, table_format)

        # Local scratch directory (under the Impala testdata tree) where corrupted
        # copies of the table's files are generated before being uploaded to HDFS.
        tmp_table_dir = tempfile.mkdtemp(prefix="tmp-scanner-fuzz-%s" % table,
                                         dir=os.path.join(
                                             os.environ['IMPALA_HOME'],
                                             "testdata"))

        self.execute_query("create table %s.%s like %s" %
                           (unique_database, table, table))
        fuzz_table_location = get_fs_path("/test-warehouse/{0}.db/{1}".format(
            unique_database, table))

        LOG.info(
            "Generating corrupted version of %s in %s. Local working directory is %s",
            table, unique_database, tmp_table_dir)

        # Find the location of the existing table and get the full table directory structure.
        table_loc = self._get_table_location(table, vector)
        check_call(
            ['hdfs', 'dfs', '-copyToLocal', table_loc + "/*", tmp_table_dir])

        # Corrupt the local copies in place. Any partition directories discovered
        # during the walk must be registered on the new table so its files are scanned.
        partitions = self.walk_and_corrupt_table_data(tmp_table_dir,
                                                      num_copies, rng)
        for partition in partitions:
            self.execute_query(
                'alter table {0}.{1} add partition ({2})'.format(
                    unique_database, table, ','.join(partition)))

        # Copy all of the local files and directories to hdfs.
        to_copy = [
            "%s/%s" % (tmp_table_dir, file_or_dir)
            for file_or_dir in os.listdir(tmp_table_dir)
        ]
        check_call(['hdfs', 'dfs', '-copyFromLocal'] + to_copy +
                   [fuzz_table_location])

        # Keep the corrupted local files only when explicitly requested.
        if "SCANNER_FUZZ_KEEP_FILES" not in os.environ:
            shutil.rmtree(tmp_table_dir)

        # Querying the corrupted files should not DCHECK or crash.
        self.execute_query("refresh %s.%s" % (unique_database, table))
        # Execute a query that tries to read all the columns and rows in the file.
        # Also execute a count(*) that materializes no columns, since different code
        # paths are exercised.
        # Use abort_on_error=0 to ensure we scan all the files.
        queries = [
            'select count(*) from (select distinct * from {0}.{1}) q'.format(
                unique_database, table),
            'select count(*) from {0}.{1} q'.format(unique_database, table)
        ]

        # Collect expected failures for parquet/compressed-text and report them as a
        # single xfail at the end instead of aborting on the first one.
        xfail_msgs = []
        for query in queries:
            for batch_size in self.BATCH_SIZES:
                query_options = {
                    'abort_on_error': '0',
                    'batch_size': batch_size
                }
                try:
                    result = self.execute_query(query,
                                                query_options=query_options)
                    LOG.info('\n'.join(result.log))
                except Exception as e:
                    if 'memory limit exceeded' in str(e).lower():
                        # Memory limit error should fail query.
                        continue
                    msg = "Should not throw error when abort_on_error=0: '{0}'".format(
                        e)
                    LOG.error(msg)
                    # Parquet and compressed text can fail the query for some parse errors.
                    # E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file
                    # (IMPALA-4013).
                    if table_format.file_format == 'parquet' or \
                        (table_format.file_format == 'text' and
                        table_format.compression_codec != 'none'):
                        xfail_msgs.append(msg)
                    else:
                        raise
        if len(xfail_msgs) != 0:
            pytest.xfail('\n'.join(xfail_msgs))