def test_info_header_fields(self):
        infos = OrderedDict([
            ('I1', Info('I1', 1, 'String', 'desc', 'src', 'v')),
            ('I2', Info('I2', 2, 'Integer', 'desc', 'src', 'v')),
            ('IA', Info('IA', field_counts['A'], 'Float', 'desc', 'src', 'v')),
            ('IU',
             Info('IU', field_counts['.'], 'Character', 'desc', 'src', 'v')),
            ('IG', Info('IG', field_counts['G'], 'String', 'desc', 'src',
                        'v')),
            ('I0', Info('I0', 0, 'Flag', 'desc', 'src', 'v')),
            ('IA2', Info('IA2', field_counts['A'], 'Float', 'desc', 'src',
                         'v')),
            (
                'END',  # END should not be included in the generated schema.
                Info('END', 1, 'Integer', 'Special END key', 'src', 'v'))
        ])
        header_fields = vcf_header_io.VcfHeader(infos=infos)

        self._validate_schema(
            self._generate_expected_fields(
                alt_fields=['IA', 'IA2'],
                info_fields=['I1', 'I2', 'IU', 'IG', 'I0']),
            schema_converter.generate_schema_from_header_fields(
                header_fields,
                processed_variant.ProcessedVariantFactory(header_fields)))

        # Test with split_alternate_allele_info_fields=False.
        actual_schema = (schema_converter.generate_schema_from_header_fields(
            header_fields,
            processed_variant.ProcessedVariantFactory(
                header_fields, split_alternate_allele_info_fields=False)))
        self._validate_schema(
            self._generate_expected_fields(
                info_fields=['I1', 'I2', 'IA', 'IU', 'IG', 'I0', 'IA2']),
            actual_schema)
        # Verify types and modes.
        expected_type_modes = {
            'I1': (TableFieldConstants.TYPE_STRING,
                   TableFieldConstants.MODE_NULLABLE),
            'I2': (TableFieldConstants.TYPE_INTEGER,
                   TableFieldConstants.MODE_REPEATED),
            'IA': (TableFieldConstants.TYPE_FLOAT,
                   TableFieldConstants.MODE_REPEATED),
            'IU': (TableFieldConstants.TYPE_STRING,
                   TableFieldConstants.MODE_REPEATED),
            'IG': (TableFieldConstants.TYPE_STRING,
                   TableFieldConstants.MODE_REPEATED),
            'I0': (TableFieldConstants.TYPE_BOOLEAN,
                   TableFieldConstants.MODE_NULLABLE),
            'IA2': (TableFieldConstants.TYPE_FLOAT,
                    TableFieldConstants.MODE_REPEATED)
        }
        for field in actual_schema.fields:
            if field.name in expected_type_modes:
                expected_type, expected_mode = expected_type_modes[field.name]
                self.assertEqual(expected_type, field.type)
                self.assertEqual(expected_mode, field.mode)
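
For context, the `Info`/`Format` constructors and the `field_counts` map used
throughout these tests are not shown; a minimal sketch, assuming they are the
PyVCF parser definitions these tests conventionally rely on:

# A minimal sketch, assuming the PyVCF definitions (an assumption; the real
# imports are not shown in these snippets).
from collections import namedtuple

Info = namedtuple('Info', ['id', 'num', 'type', 'desc', 'source', 'version'])
Format = namedtuple('Format', ['id', 'num', 'type', 'desc'])

# Special VCF `Number` values map to sentinel counts so downstream code can
# distinguish per-alternate ('A') and per-genotype ('G') fields from fixed
# counts; '.' means the count is unknown.
field_counts = {'.': None, 'A': -1, 'G': -2, 'R': -3}

# For example, an INFO field with one value per alternate allele:
ia = Info('IA', field_counts['A'], 'Float', 'desc', 'src', 'v')
assert ia.num == -1
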
def test_no_header_fields(self):
    header_fields = vcf_header_io.VcfHeader()
    self._validate_schema(
        self._generate_expected_fields(),
        schema_converter.generate_schema_from_header_fields(
            header_fields,
            processed_variant.ProcessedVariantFactory(header_fields)))

def test_bigquery_field_name_sanitize(self):
    infos = OrderedDict([
        ('_', Info('_', 1, 'String', 'desc', 'src', 'v')),
        ('_A', Info('_A', 1, 'String', 'desc', 'src', 'v')),
        ('0a', Info('0a', 1, 'String', 'desc', 'src', 'v')),
        ('A-B*C', Info('A-B*C', 1, 'String', 'desc', 'src', 'v')),
        ('I-A', Info('I-A', field_counts['A'], 'Float', 'desc', 'src', 'v')),
        ('OK_info_09', Info('OK_info_09', 1, 'String', 'desc', 'src', 'v'))
    ])
    formats = OrderedDict([
        ('a^b', Format('a^b', 1, 'String', 'desc')),
        ('OK_format_09', Format('OK_format_09', 1, 'String', 'desc'))])
    header_fields = vcf_header_io.VcfHeader(infos=infos, formats=formats)
    self._validate_schema(
        self._generate_expected_fields(
            alt_fields=['I_A'],
            call_fields=['a_b', 'OK_format_09'],
            info_fields=['field__', 'field__A', 'field_0a', 'A_B_C',
                         'OK_info_09']),
        schema_converter.generate_schema_from_header_fields(
            header_fields,
            processed_variant.ProcessedVariantFactory(header_fields)))
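
The expected names above imply the sanitizing rule: disallowed characters
become underscores, and names that do not start with a letter get a 'field_'
prefix. A hedged re-implementation of that rule (the helper name below is
hypothetical; the real logic lives inside the converter):

import re

def _sanitize_field_name(name):
    # Replace every character BigQuery disallows in column names with '_'.
    sanitized = re.sub(r'[^a-zA-Z0-9_]', '_', name)
    # BigQuery column names must start with a letter or underscore; the
    # expected fields above ('field__', 'field_0a') suggest names that do
    # not start with a letter are given a 'field_' prefix.
    if not re.match(r'[a-zA-Z]', sanitized[0]):
        sanitized = 'field_' + sanitized
    return sanitized

assert _sanitize_field_name('A-B*C') == 'A_B_C'
assert _sanitize_field_name('0a') == 'field_0a'
assert _sanitize_field_name('_') == 'field__'
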
Example #4

def test_no_header_fields_with_sample_name(self):
  header_fields = vcf_header_io.VcfHeader()
  self._validate_schema(
      self._generate_expected_fields(include_call_name=True),
      schema_converter.generate_schema_from_header_fields(
          header_fields,
          processed_variant.ProcessedVariantFactory(header_fields),
          include_call_name=True))
    def test_schema_to_vcf_header_to_schema(self):
        original_schema = bigquery_schema_util.get_sample_table_schema()
        header = schema_converter.generate_header_fields_from_schema(
            original_schema)
        reconstructed_schema = (
            schema_converter.generate_schema_from_header_fields(
                header, processed_variant.ProcessedVariantFactory(header)))

        self.assertEqual(_get_fields_from_schema(reconstructed_schema),
                         _get_fields_from_schema(original_schema))
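
The `_get_fields_from_schema` helper used above is not shown. A plausible
sketch, under the assumption that it flattens a schema into comparable
(name, type, mode) tuples so the two schemas can be checked for equality:

def _get_fields_from_schema(schema, prefix=''):
    fields = []
    for field in schema.fields:
        fields.append((prefix + field.name, field.type, field.mode))
        if field.type == 'RECORD':
            # Recurse so nested columns (e.g. alternate_bases.*) are
            # compared as well.
            fields.extend(
                _get_fields_from_schema(field, prefix + field.name + '.'))
    return fields
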
Example #6

def test_variant_merger_modify_schema(self):
  infos = OrderedDict([
      ('I1', createInfo('I1', 1, 'String', 'desc', 'src', 'v')),
      ('IA', createInfo('IA', 'A', 'Integer', 'desc', 'src', 'v'))])
  formats = OrderedDict([('F1', createFormat('F1', 1, 'String', 'desc'))])
  header_fields = vcf_header_io.VcfHeader(infos=infos, formats=formats)
  self._validate_schema(
      self._generate_expected_fields(
          alt_fields=['IA'],
          call_fields=['F1'],
          info_fields=['I1', 'ADDED_BY_MERGER']),
      schema_converter.generate_schema_from_header_fields(
          header_fields,
          processed_variant.ProcessedVariantFactory(header_fields),
          variant_merger=_DummyVariantMergeStrategy()))
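
The expected 'ADDED_BY_MERGER' field implies the dummy strategy injects a
column into the schema. A hedged sketch, assuming the
variant_merge_strategy.VariantMergeStrategy interface exposes a
modify_bigquery_schema(schema, info_keys) hook as the test suggests:

from apache_beam.io.gcp.internal.clients import bigquery

class _DummyVariantMergeStrategy(variant_merge_strategy.VariantMergeStrategy):
  def modify_bigquery_schema(self, schema, info_keys):
    # Inject the extra INFO column that the expected fields above include.
    schema.fields.append(bigquery.TableFieldSchema(
        name='ADDED_BY_MERGER',
        type=TableFieldConstants.TYPE_STRING,
        mode=TableFieldConstants.MODE_NULLABLE,
        description='Column added by the dummy merger.'))
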
    def __init__(
            self,
            output_path,  # type: str
            header_fields,  # type: vcf_header_io.VcfHeader
            proc_var_factory,  # type: processed_variant.ProcessedVariantFactory
            variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
            allow_incompatible_records=False,  # type: bool
            omit_empty_sample_calls=False,  # type: bool
            null_numeric_value_replacement=None  # type: int
    ):
        # type: (...) -> None
        """Initializes the transform.

    Args:
      output_path: The path under which output Avro files are generated.
      header_fields: Representative header fields for all variants. This is
        needed for dynamically generating the schema.
      proc_var_factory: The factory class that knows how to convert Variant
        instances to ProcessedVariant. As a side effect it also knows how to
        modify BigQuery schema based on the ProcessedVariants that it generates.
        The latter functionality is what is needed here.
      variant_merger: The strategy used for merging variants (if any). Some
        strategies may change the schema, which is why this may be needed here.
      allow_incompatible_records: If true, field values are cast to the
        BigQuery schema if there is a mismatch.
      omit_empty_sample_calls: If true, samples that don't have a given call
        will be omitted.
      null_numeric_value_replacement: The value to use instead of null for
        numeric (float/int/long) lists. For instance, [0, None, 1] will become
        [0, `null_numeric_value_replacement`, 1]. If not set, the value will be
        set to bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
    """
        self._output_path = output_path
        self._proc_var_factory = proc_var_factory
        table_schema = (schema_converter.generate_schema_from_header_fields(
            header_fields, proc_var_factory, variant_merger))
        self._avro_schema = avro.schema.parse(
            schema_converter.convert_table_schema_to_json_avro_schema(
                table_schema))
        self._bigquery_row_generator = (
            bigquery_row_generator.VariantCallRowGenerator(
                bigquery_schema_descriptor.SchemaDescriptor(table_schema),
                vcf_field_conflict_resolver.FieldConflictResolver(
                    resolve_always=allow_incompatible_records),
                null_numeric_value_replacement))

        self._allow_incompatible_records = allow_incompatible_records
        self._omit_empty_sample_calls = omit_empty_sample_calls
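
A minimal wiring sketch for the transform above, assuming this constructor
belongs to the variant_to_avro.VariantToAvroFiles transform used by the run()
pipeline below; the GCS paths are illustrative placeholders:

header_fields = vcf_header_parser.get_vcf_headers(
    'gs://my-bucket/representative_header.vcf')  # placeholder path
proc_var_factory = processed_variant.ProcessedVariantFactory(header_fields)
to_avro = variant_to_avro.VariantToAvroFiles(
    'gs://my-bucket/temp/avro/',  # placeholder output path
    header_fields,
    proc_var_factory,
    allow_incompatible_records=True)
# Applied in a pipeline: processed_variants | 'VariantToAvro' >> to_avro
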
Example #8

def test_vcf_header_to_schema_to_vcf_header(self):
  infos = OrderedDict([
      ('I1', createInfo('I1', '.', 'String', 'desc', None, None)),
      ('IA', createInfo('IA', '.', 'Integer', 'desc', None, None))])
  formats = OrderedDict([
      ('F1', createFormat('F1', '.', 'String', 'desc')),
      ('F2', createFormat('F2', '.', 'Integer', 'desc')),
      ('FU', createFormat('FU', '.', 'Float', 'desc'))])
  original_header = vcf_header_io.VcfHeader(infos=infos, formats=formats)

  schema = schema_converter.generate_schema_from_header_fields(
      original_header,
      processed_variant.ProcessedVariantFactory(original_header))
  reconstructed_header = schema_converter.generate_header_fields_from_schema(
      schema)

  self.assertEqual(original_header, reconstructed_header)
Example #9

def test_info_and_format_header_fields(self):
  infos = OrderedDict([
      ('I1', createInfo('I1', 1, 'String', 'desc', 'src', 'v')),
      ('IA', createInfo('IA', 'A', 'Integer', 'desc', 'src', 'v'))])
  # GT and PS should not be set as they're already included in the special
  # 'genotype' and 'phaseset' fields.
  formats = OrderedDict([
      ('F1', createFormat('F1', 1, 'String', 'desc')),
      ('F2', createFormat('F2', 2, 'Integer', 'desc')),
      ('FU', createFormat('FU', '.', 'Float', 'desc')),
      ('GT', createFormat('GT', 2, 'Integer', 'Special GT key')),
      ('PS', createFormat('PS', 1, 'Integer', 'Special PS key'))])
  header_fields = vcf_header_io.VcfHeader(infos=infos, formats=formats)
  self._validate_schema(
      self._generate_expected_fields(
          alt_fields=['IA'],
          call_fields=['F1', 'F2', 'FU'],
          info_fields=['I1']),
      schema_converter.generate_schema_from_header_fields(
          header_fields,
          processed_variant.ProcessedVariantFactory(header_fields)))
Example #10
def run(argv=None):
    # type: (List[str]) -> None
    """Runs VCF to BigQuery pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)

    if known_args.auto_flags_experiment:
        _get_input_dimensions(known_args, pipeline_args)

    annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args)

    all_patterns = ([annotated_vcf_pattern]
                    if annotated_vcf_pattern else known_args.all_patterns)

    variant_merger = _get_variant_merge_strategy(known_args)

    pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)

    beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
    avro_root_path = _get_avro_root_path(beam_pipeline_options)
    # Starts a pipeline to merge VCF headers in Beam if the total number of
    # files matching the input pattern exceeds _SMALL_DATA_THRESHOLD.
    _merge_headers(known_args, pipeline_args, pipeline_mode, avro_root_path,
                   annotated_vcf_pattern)

    # Retrieve merged headers prior to launching the pipeline. This is needed
    # since the BigQuery schema cannot yet be dynamically created based on input.
    # See https://issues.apache.org/jira/browse/BEAM-2801.
    header_fields = vcf_header_parser.get_vcf_headers(
        known_args.representative_header_file)
    counter_factory = metrics_util.CounterFactory()
    processed_variant_factory = processed_variant.ProcessedVariantFactory(
        header_fields, known_args.split_alternate_allele_info_fields,
        known_args.allow_malformed_records, known_args.annotation_fields,
        known_args.use_allele_num, known_args.minimal_vep_alt_matching,
        known_args.infer_annotation_types, counter_factory)

    schema = schema_converter.generate_schema_from_header_fields(
        header_fields, processed_variant_factory, variant_merger,
        known_args.use_1_based_coordinate, known_args.include_call_name)

    sharding = variant_sharding.VariantSharding(
        known_args.sharding_config_path)
    if sharding.should_keep_shard(sharding.get_residual_index()):
        num_shards = sharding.get_num_shards()
    else:
        num_shards = sharding.get_num_shards() - 1

    if known_args.update_schema_on_append:
        for i in range(num_shards):
            table_suffix = sharding.get_output_table_suffix(i)
            table_name = bigquery_util.compose_table_name(
                known_args.output_table, table_suffix)
            bigquery_util.update_bigquery_schema_on_append(
                schema.fields, table_name)

    pipeline = beam.Pipeline(options=beam_pipeline_options)
    variants = _read_variants(
        all_patterns,
        pipeline,
        known_args,
        pipeline_mode,
        use_1_based_coordinate=known_args.use_1_based_coordinate)
    if known_args.allow_malformed_records:
        variants |= 'DropMalformedRecords' >> filter_variants.FilterVariants()
    sharded_variants = variants | 'ShardVariants' >> beam.Partition(
        shard_variants.ShardVariants(sharding), sharding.get_num_shards())
    variants = []
    for i in range(num_shards):
        suffix = sharding.get_output_table_suffix(i)
        # beam.Partition returns a tuple of PCollections; copy the shards into
        # a list so each element can be reassigned below.
        variants.append(sharded_variants[i])
        if variant_merger:
            variants[i] |= ('MergeVariants' + suffix >>
                            merge_variants.MergeVariants(variant_merger))
        variants[i] |= (
            'ProcessVariants' + suffix >>
            beam.Map(processed_variant_factory.create_processed_variant)
            .with_output_types(processed_variant.ProcessedVariant))
        _ = (variants[i]
             | 'VariantToAvro' + suffix >> variant_to_avro.VariantToAvroFiles(
                 avro_root_path + suffix,
                 schema,
                 allow_incompatible_records=(
                     known_args.allow_incompatible_records),
                 omit_empty_sample_calls=known_args.omit_empty_sample_calls,
                 null_numeric_value_replacement=(
                     known_args.null_numeric_value_replacement),
                 include_call_name=known_args.include_call_name))
    result = pipeline.run()
    try:
        state = result.wait_until_finish()
        if state != beam.runners.runner.PipelineState.DONE:
            logging.error(
                'Dataflow pipeline terminated in an unexpected state: %s',
                state)
            raise AssertionError(
                'Dataflow pipeline terminated in {} state'.format(state))
    except Exception as e:
        logging.error('Dataflow pipeline failed.')
        raise e
    else:
        logging.info('Dataflow pipeline finished successfully.')
        metrics_util.log_all_counters(result)

    # After pipeline is done, create output tables and load AVRO files into them.
    schema_file = _write_schema_to_temp_file(schema, avro_root_path)
    suffixes = []
    try:
        for i in range(num_shards):
            suffixes.append(sharding.get_output_table_suffix(i))
            partition_range_end = sharding.get_output_table_partition_range_end(
                i)
            if not known_args.append:
                table_name = bigquery_util.compose_table_name(
                    known_args.output_table, suffixes[i])
                partitioning.create_bq_table(
                    table_name, schema_file,
                    bigquery_util.ColumnKeyConstants.START_POSITION,
                    partition_range_end)
                _record_newly_created_table(table_name)
                logging.info('Integer range partitioned table %s was created.',
                             table_name)
        if not known_args.append:
            _record_newly_created_table(
                sample_info_table_schema_generator.create_sample_info_table(
                    known_args.output_table))

        suffixes.append(
            sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
        load_avro = avro_util.LoadAvro(avro_root_path, known_args.output_table,
                                       suffixes, False)
        not_empty_variant_suffixes = load_avro.start_loading()
        logging.info('The following tables were loaded with at least 1 row:')
        for suffix in not_empty_variant_suffixes:
            logging.info(
                bigquery_util.compose_table_name(known_args.output_table,
                                                 suffix))
        # Remove the sample_info table from both lists to avoid duplicating it
        # when the --sample_lookup_optimized_output_table flag is set.
        suffixes.remove(
            sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
        if (sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX in
                not_empty_variant_suffixes):
            not_empty_variant_suffixes.remove(
                sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
    except Exception as e:
        logging.error(
            'Something unexpected happened during the loading of AVRO '
            'files to BigQuery: %s', str(e))
        logging.info(
            'Since the write to BigQuery stage failed, we did not delete '
            'AVRO files in your GCS bucket. You can manually import them '
            'to BigQuery. To avoid extra storage charges, delete them if '
            'you do not need them. The AVRO files are located at: %s',
            avro_root_path)
        raise e
    else:
        logging.warning('All AVRO files were successfully loaded to BigQuery.')
        if known_args.keep_intermediate_avro_files:
            logging.info(
                'Since the "--keep_intermediate_avro_files" flag is set, the '
                'AVRO files are kept and stored at: %s', avro_root_path)
        else:
            if bigquery_util.delete_gcs_files(avro_root_path) != 0:
                logging.error(
                    'Deletion of intermediate AVRO files located at "%s" has '
                    'failed.', avro_root_path)

    if known_args.sample_lookup_optimized_output_table:
        flatten_call_column = partitioning.FlattenCallColumn(
            known_args.output_table, not_empty_variant_suffixes,
            known_args.append)
        try:
            flatten_schema_file = tempfile.mkstemp(
                suffix=_BQ_SCHEMA_FILE_SUFFIX)[1]
            if not flatten_call_column.get_flatten_table_schema(
                    flatten_schema_file):
                raise ValueError('Failed to extract schema of flatten table')
            # Create output flatten tables if needed
            if not known_args.append:
                # Create all sample optimized tables including those that will be empty.
                for suffix in suffixes:
                    output_table_id = bigquery_util.compose_table_name(
                        known_args.sample_lookup_optimized_output_table,
                        suffix)
                    partitioning.create_bq_table(
                        output_table_id, flatten_schema_file,
                        bigquery_util.ColumnKeyConstants.CALLS_SAMPLE_ID,
                        partitioning.MAX_RANGE_END)
                    _record_newly_created_table(output_table_id)
                    logging.info(
                        'Sample lookup optimized table %s was created.',
                        output_table_id)
            # Copy to flatten sample lookup tables from the variant lookup tables.
            # Note: uses WRITE_TRUNCATE to overwrite the existing tables (issue #607).
            flatten_call_column.copy_to_flatten_table(
                known_args.sample_lookup_optimized_output_table)
            logging.info(
                'All sample lookup optimized tables are fully loaded.')
        except Exception as e:
            logging.error(
                'Something unexpected happened while loading rows into the '
                'sample optimized tables: %s', str(e))
            raise e
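
A hedged invocation sketch for this entry point; --input_pattern and
--output_table follow the gcp-variant-transforms documentation, and every
value below is an illustrative placeholder:

run([
    '--input_pattern', 'gs://my-bucket/vcfs/*.vcf',
    '--output_table', 'my-project:my_dataset.my_table',
    '--temp_location', 'gs://my-bucket/temp/',
    '--project', 'my-project',
    '--runner', 'DataflowRunner',
])
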
Example #11
    def __init__(
            self,
            output_table,  # type: str
            header_fields,  # type: vcf_header_io.VcfHeader
            variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
            proc_var_factory=None,  # type: processed_variant.ProcessedVariantFactory
            # TODO(bashir2): proc_var_factory is a required argument and if `None` is
            # supplied this will fail in schema generation.
            append=False,  # type: bool
            update_schema_on_append=False,  # type: bool
            allow_incompatible_records=False,  # type: bool
            omit_empty_sample_calls=False,  # type: bool
            num_bigquery_write_shards=1,  # type: int
            null_numeric_value_replacement=None  # type: int
    ):
        # type: (...) -> None
        """Initializes the transform.

    Args:
      output_table: Full path of the output BigQuery table.
      header_fields: Representative header fields for all variants. This is
        needed for dynamically generating the schema.
      variant_merger: The strategy used for merging variants (if any). Some
        strategies may change the schema, which is why this may be needed here.
      proc_var_factory: The factory class that knows how to convert Variant
        instances to ProcessedVariant. As a side effect it also knows how to
        modify BigQuery schema based on the ProcessedVariants that it generates.
        The latter functionality is what is needed here.
      append: If true, existing records in output_table will not be
        overwritten. New records will be appended to those that already exist.
      update_schema_on_append: If true, BigQuery schema will be updated by
        combining the existing schema and the new schema if they are compatible.
      allow_incompatible_records: If true, field values are cast to the
        BigQuery schema if there is a mismatch.
      omit_empty_sample_calls: If true, samples that don't have a given call
        will be omitted.
      num_bigquery_write_shards: If > 1, we will limit number of sources which
        are used for writing to the output BigQuery table.
      null_numeric_value_replacement: The value to use instead of null for
        numeric (float/int/long) lists. For instance, [0, None, 1] will become
        [0, `null_numeric_value_replacement`, 1]. If not set, the value will be
        set to bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
    """
        self._output_table = output_table
        self._header_fields = header_fields
        self._variant_merger = variant_merger
        self._proc_var_factory = proc_var_factory
        self._append = append
        self._schema = (schema_converter.generate_schema_from_header_fields(
            self._header_fields, self._proc_var_factory, self._variant_merger))
        # Resolver makes extra effort to resolve conflict when flag
        # allow_incompatible_records is set.
        self._bigquery_row_generator = (
            bigquery_row_generator.VariantCallRowGenerator(
                bigquery_schema_descriptor.SchemaDescriptor(self._schema),
                vcf_field_conflict_resolver.FieldConflictResolver(
                    resolve_always=allow_incompatible_records),
                null_numeric_value_replacement))

        self._allow_incompatible_records = allow_incompatible_records
        self._omit_empty_sample_calls = omit_empty_sample_calls
        self._num_bigquery_write_shards = num_bigquery_write_shards
        if update_schema_on_append:
            bigquery_util.update_bigquery_schema_on_append(
                self._schema.fields, self._output_table)
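
A hedged usage sketch for the transform above; the class name
VariantToBigQuery is assumed from context (the snippet does not show it), and
the table name is a placeholder:

to_bq = VariantToBigQuery(  # hypothetical class name, assumed from context
    'my-project:my_dataset.my_table',
    header_fields,
    proc_var_factory=processed_variant.ProcessedVariantFactory(header_fields))
# Applied in a pipeline: variants | 'VariantToBigQuery' >> to_bq
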