Esempio n. 1
0
    def __init__(
            self,
            output_path,  # type: str
            schema,  # type: bigquery.TableSchema
            allow_incompatible_records=False,  # type: bool
            omit_empty_sample_calls=False,  # type: bool
            null_numeric_value_replacement=None  # type: int
    ):
        # type: (...) -> None
        """Prepares the Avro schema and the row generator for the transform.

        Args:
          output_path: Location under which the output Avro files are
            generated.
          schema: Schema of the table to be generated.
          allow_incompatible_records: When true, field values are cast to the
            BigQuery schema if there is a mismatch.
          omit_empty_sample_calls: When true, samples that don't have a given
            call are omitted.
          null_numeric_value_replacement: Value used in place of null inside
            numeric (float/int/long) lists; e.g. [0, None, 1] becomes
            [0, `null_numeric_value_replacement`, 1]. When not set,
            bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT is used.
        """
        self._output_path = output_path

        avro_schema_json = (
            schema_converter.convert_table_schema_to_json_avro_schema(schema))
        self._avro_schema = avro.schema.parse(avro_schema_json)

        descriptor = bigquery_schema_descriptor.SchemaDescriptor(schema)
        # `resolve_always` mirrors the `allow_incompatible_records` flag.
        resolver = vcf_field_conflict_resolver.FieldConflictResolver(
            resolve_always=allow_incompatible_records)
        self._bigquery_row_generator = (
            bigquery_row_generator.VariantCallRowGenerator(
                descriptor, resolver, null_numeric_value_replacement))

        self._allow_incompatible_records = allow_incompatible_records
        self._omit_empty_sample_calls = omit_empty_sample_calls
Esempio n. 2
0
    def _infer_annotation_type_info_fields(self, variant, infos,
                                           defined_headers):
        # type: (vcfio.Variant, Dict[str, Info], vcf_header_io.VcfHeader) -> None
        """Adds inferred-type `Info` entries for annotation fields to `infos`.

        For every annotation field listed in `self._annotation_fields_to_infer`
        that is present in `variant.info`, each annotation header is turned
        into an `Info` entry: its ID is derived from the annotation field and
        the header name, and its TYPE is the type inferred from the values.
        A variant may carry several values for one annotation header, so the
        types of those values are merged before the entry is created.

        Args:
          variant: variant object.
          infos: dict of (info_key, `Info`) for any info field in `variant`
            that is not defined in the header or whose definition mismatches
            the field values. Updated in place.
          defined_headers: header fields defined in the header section of the
            VCF files.

        Raises:
          ValueError: If the annotation name list and the per-annotation value
            lists do not all have the same length.
        """
        def _assert_uniform_lengths(names, values):
            # Collect every distinct length; a single entry means they agree.
            distinct_lengths = {len(value) for value in values}
            distinct_lengths.add(len(names))
            if len(distinct_lengths) == 1:
                return
            raise ValueError(
                ('Annotation lists have inconsistent lengths: {}.\nnames={}\n'
                 'values={}').format(distinct_lengths, names, values))

        type_resolver = vcf_field_conflict_resolver.FieldConflictResolver(
            resolve_always=True)
        present_fields = (
            candidate for candidate in self._annotation_fields_to_infer
            if candidate in variant.info)
        for field in present_fields:
            description = defined_headers.infos[field][
                _HeaderKeyConstants.DESC]
            header_names = annotation_parser.extract_annotation_names(
                description)
            # First element (ALT) is ignored, since its type is hard-coded as
            # string.
            per_annotation_values = []
            for annotation in variant.info[field]:
                parsed = annotation_parser.extract_annotation_list_with_alt(
                    annotation)
                per_annotation_values.append(parsed[1:])
            _assert_uniform_lengths(header_names, per_annotation_values)

            # Transpose so that each tuple holds all values of one header.
            per_header_values = zip(*per_annotation_values)
            for name, values in zip(header_names, per_header_values):
                merged_type = None
                for value in values:
                    if value:
                        merged_type = (
                            type_resolver.resolve_attribute_conflict(
                                _HeaderKeyConstants.TYPE, merged_type,
                                self._get_field_type(value)))
                        # Stop as soon as the merged type is string.
                        if merged_type == _HeaderTypeConstants.STRING:
                            break
                key_id = get_inferred_annotation_type_header_key(field, name)
                description_text = (
                    'Inferred type field for annotation {}.'.format(name))
                infos[key_id] = Info(
                    key_id,
                    1,  # field count
                    merged_type,
                    description_text,
                    '',  # UNKNOWN_SOURCE
                    '')  # UNKNOWN_VERSION
Esempio n. 3
0
    def setUp(self):
        """Creates the schema descriptor, conflict resolver and row generator."""
        descriptor = bigquery_schema_descriptor.SchemaDescriptor(
            _get_table_schema())
        self._schema_descriptor = descriptor

        resolver = vcf_field_conflict_resolver.FieldConflictResolver()
        self._conflict_resolver = resolver

        # The generator is wired to the descriptor and resolver stored above.
        self._row_generator = bigquery_vcf_data_converter.BigQueryRowGenerator(
            descriptor, resolver)
Esempio n. 4
0
    def setUp(self):
        """Creates the schema descriptor, conflict resolver and row generator."""
        descriptor = bigquery_schema_descriptor.SchemaDescriptor(
            self._get_table_schema())
        self._schema_descriptor = descriptor

        resolver = vcf_field_conflict_resolver.FieldConflictResolver()
        self._conflict_resolver = resolver

        # The generator is wired to the descriptor and resolver stored above.
        self._row_generator = bigquery_row_generator.VariantCallRowGenerator(
            descriptor, resolver)
  def __init__(
      self,
      output_table,  # type: str
      header_fields,  # type: vcf_header_io.VcfHeader
      variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
      proc_var_factory=None,  # type: processed_variant.ProcessedVariantFactory
      append=False,  # type: bool
      update_schema_on_append=False,  # type: bool
      allow_incompatible_records=False,  # type: bool
      omit_empty_sample_calls=False,  # type: bool
      num_bigquery_write_shards=1  # type: int
      ):
    # type: (...) -> None
    """Generates the table schema and prepares the row generator.

    Args:
      output_table: Full path of the output BigQuery table.
      header_fields: Representative header fields for all variants; needed
        for dynamically generating the schema.
      variant_merger: The strategy used for merging variants (if any). Some
        strategies may change the schema, which is why it may be needed here.
      proc_var_factory: The factory class that knows how to convert Variant
        instances to ProcessedVariant. As a side effect it also knows how to
        modify the BigQuery schema based on the ProcessedVariants it
        generates; that latter functionality is what is needed here.
      append: When true, existing records in output_table are not
        overwritten; new records are appended to those that already exist.
      update_schema_on_append: When true, the BigQuery schema is updated by
        combining the existing schema and the new schema if they are
        compatible.
      allow_incompatible_records: When true, field values are cast to the
        BigQuery schema if there is a mismatch.
      omit_empty_sample_calls: When true, samples that don't have a given
        call are omitted.
      num_bigquery_write_shards: If > 1, the number of sources used for
        writing to the output BigQuery table is limited.
    """
    self._output_table = output_table
    self._header_fields = header_fields
    self._variant_merger = variant_merger
    self._proc_var_factory = proc_var_factory
    self._append = append

    generated_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        header_fields, proc_var_factory, variant_merger)
    self._schema = generated_schema

    # Resolver makes extra effort to resolve conflict when flag
    # allow_incompatible_records is set.
    resolver = vcf_field_conflict_resolver.FieldConflictResolver(
        resolve_always=allow_incompatible_records)
    descriptor = bigquery_schema_descriptor.SchemaDescriptor(generated_schema)
    self._bigquery_row_generator = bigquery_row_generator.BigQueryRowGenerator(
        descriptor, resolver)

    self._allow_incompatible_records = allow_incompatible_records
    self._omit_empty_sample_calls = omit_empty_sample_calls
    self._num_bigquery_write_shards = num_bigquery_write_shards

    # Runs last so that every attribute above is already in place.
    if update_schema_on_append:
      self._update_bigquery_schema_on_append()
Esempio n. 6
0
 def __init__(self,
              schema_descriptor,
              allow_incompatible_records=False,
              omit_empty_sample_calls=False):
   # type: (bigquery_schema_descriptor.SchemaDescriptor, bool, bool) -> None
   """Stores the schema descriptor, a conflict resolver and the two flags.

   Args:
     schema_descriptor: Descriptor of the schema, kept for later use.
     allow_incompatible_records: Flag stored on the instance as-is.
     omit_empty_sample_calls: Flag stored on the instance as-is.
   """
   super(_ConvertToBigQueryTableRow, self).__init__()
   self._schema_descriptor = schema_descriptor
   self._allow_incompatible_records = allow_incompatible_records
   self._omit_empty_sample_calls = omit_empty_sample_calls
   # NOTE(review): the resolver is built with its default arguments and does
   # not receive `allow_incompatible_records` -- confirm this is intended.
   self._conflict_resolver = (
       vcf_field_conflict_resolver.FieldConflictResolver())
Esempio n. 7
0
    def __init__(self, split_alternate_allele_info_fields=True):
        # type: (bool) -> None
        """Builds the header merger used by this :class:`MergeHeaders` object.

        Args:
          split_alternate_allele_info_fields: Whether INFO fields with
            `Number=A` are stored under the alternate_bases record. This is
            relevant because it changes the schema and therefore the header
            compatibility rules.
        """
        super(MergeHeaders, self).__init__()
        # The flag is handed to the resolver positionally.
        resolver = vcf_field_conflict_resolver.FieldConflictResolver(
            split_alternate_allele_info_fields)
        self._header_merger = _HeaderMerger(resolver)
Esempio n. 8
0
    def __init__(
            self,
            output_table,  # type: str
            schema,  # type: bigquery.TableSchema
            append=False,  # type: bool
            allow_incompatible_records=False,  # type: bool
            omit_empty_sample_calls=False,  # type: bool
            num_bigquery_write_shards=1,  # type: int
            null_numeric_value_replacement=None,  # type: int
            include_call_name=False,  # type: bool
            move_hom_ref_calls=False  # type: bool
    ):
        # type: (...) -> None
        """Stores the write settings and prepares the row generator.

        Args:
          output_table: Full path of the output BigQuery table.
          schema: Schema of the table to be generated.
          append: When true, existing records in output_table are not
            overwritten; new records are appended to those that already
            exist.
          allow_incompatible_records: When true, field values are cast to the
            BigQuery schema if there is a mismatch.
          omit_empty_sample_calls: When true, samples that don't have a given
            call are omitted.
          num_bigquery_write_shards: If > 1, the number of sources used for
            writing to the output BigQuery table is limited.
          null_numeric_value_replacement: Value used in place of null inside
            numeric (float/int/long) lists; e.g. [0, None, 1] becomes
            [0, `null_numeric_value_replacement`, 1]. When not set,
            bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT is used.
          include_call_name: When true, the sample name is included in
            addition to the sample ID.
          move_hom_ref_calls: When true, 0 GT data is filtered out of the call
            list and the call name is added to a hom_ref_calls column.
        """
        self._output_table = output_table
        self._append = append
        self._schema = schema

        descriptor = bigquery_schema_descriptor.SchemaDescriptor(schema)
        # Resolver makes extra effort to resolve conflict when flag
        # allow_incompatible_records is set.
        resolver = vcf_field_conflict_resolver.FieldConflictResolver(
            resolve_always=allow_incompatible_records)
        self._bigquery_row_generator = (
            bigquery_row_generator.VariantCallRowGenerator(
                descriptor,
                resolver,
                null_numeric_value_replacement,
                include_call_name,
                move_hom_ref_calls))

        self._allow_incompatible_records = allow_incompatible_records
        self._omit_empty_sample_calls = omit_empty_sample_calls
        self._num_bigquery_write_shards = num_bigquery_write_shards
    def __init__(
            self,
            output_path,  # type: str
            header_fields,  # type: vcf_header_io.VcfHeader
            proc_var_factory,  # type: processed_variant.ProcessedVariantFactory
            variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
            allow_incompatible_records=False,  # type: bool
            omit_empty_sample_calls=False,  # type: bool
            null_numeric_value_replacement=None  # type: int
    ):
        # type: (...) -> None
        """Derives the schema from the headers and prepares the row generator.

        Args:
          output_path: Location under which the output Avro files are
            generated.
          header_fields: Representative header fields for all variants; needed
            for dynamically generating the schema.
          proc_var_factory: The factory class that knows how to convert
            Variant instances to ProcessedVariant. As a side effect it also
            knows how to modify the BigQuery schema based on the
            ProcessedVariants it generates; that latter functionality is what
            is needed here.
          variant_merger: The strategy used for merging variants (if any).
            Some strategies may change the schema, which is why it may be
            needed here.
          allow_incompatible_records: When true, field values are cast to the
            BigQuery schema if there is a mismatch.
          omit_empty_sample_calls: When true, samples that don't have a given
            call are omitted.
          null_numeric_value_replacement: Value used in place of null inside
            numeric (float/int/long) lists; e.g. [0, None, 1] becomes
            [0, `null_numeric_value_replacement`, 1]. When not set,
            bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT is used.
        """
        self._output_path = output_path
        self._proc_var_factory = proc_var_factory

        # One table schema feeds both the Avro schema and the row generator.
        table_schema = schema_converter.generate_schema_from_header_fields(
            header_fields, proc_var_factory, variant_merger)
        avro_schema_json = (
            schema_converter.convert_table_schema_to_json_avro_schema(
                table_schema))
        self._avro_schema = avro.schema.parse(avro_schema_json)

        descriptor = bigquery_schema_descriptor.SchemaDescriptor(table_schema)
        resolver = vcf_field_conflict_resolver.FieldConflictResolver(
            resolve_always=allow_incompatible_records)
        self._bigquery_row_generator = (
            bigquery_row_generator.VariantCallRowGenerator(
                descriptor, resolver, null_numeric_value_replacement))

        self._allow_incompatible_records = allow_incompatible_records
        self._omit_empty_sample_calls = omit_empty_sample_calls
    def test_combine_pipeline(self):
        """Checks that the `MergeHeaders` transform agrees with `HeaderMerger`.

        The expected header is produced by merging both parsed headers
        directly with a `HeaderMerger`; the same two headers are then fed
        through a pipeline that applies `merge_headers.MergeHeaders()`, and the
        pipeline output is compared against the expected header.
        """
        headers_1 = self._get_header_from_lines(FILE_1_LINES)
        headers_2 = self._get_header_from_lines(FILE_2_LINES)

        # TODO(nmousavi): Either use TestPipeline or combiner_fn.* everywhere.
        # After moving out _HeaderMerger to its file, it makes sense to use
        # TestPipeline everywhere.
        header_merger = HeaderMerger(
            vcf_field_conflict_resolver.FieldConflictResolver(
                split_alternate_allele_info_fields=True))
        expected = vcf_header_io.VcfHeader()
        header_merger.merge(expected, headers_1)
        header_merger.merge(expected, headers_2)

        pipeline = TestPipeline()
        merged_headers = (pipeline
                          | Create([headers_1, headers_2])
                          | 'MergeHeaders' >> merge_headers.MergeHeaders())

        assert_that(merged_headers, equal_to([expected]))
        # `assert_that` only attaches a verification step to the pipeline; the
        # comparison is evaluated when the pipeline is executed, so the
        # pipeline must be run for this test to check anything.
        pipeline.run()
Esempio n. 11
0
    def __init__(self,
                 split_alternate_allele_info_fields=True,
                 allow_incompatible_records=False):
        # type: (bool, bool) -> None
        """Builds the header merger used by this :class:`MergeHeaders` object.

        Args:
          split_alternate_allele_info_fields: Whether INFO fields with
            `Number=A` are stored under the alternate_bases record. This is
            relevant because it changes the schema and therefore the header
            compatibility rules.
          allow_incompatible_records: When true, header definitions with a
            type mismatch (e.g., string vs float) are always resolved.
        """
        super(MergeHeaders, self).__init__()
        # Resolver makes extra efforts to resolve conflict in header
        # definitions when flag allow_incompatible_records is set. For
        # example, it resolves type conflict of string and float into string.
        resolver = vcf_field_conflict_resolver.FieldConflictResolver(
            split_alternate_allele_info_fields,
            resolve_always=allow_incompatible_records)
        self._header_merger = _HeaderMerger(resolver)
 def setUp(self):
   """Creates three resolvers: default, allele-splitting, always-resolving."""
   make_resolver = vcf_field_conflict_resolver.FieldConflictResolver
   # Built with default arguments only.
   self._resolver = make_resolver()
   # Built with `split_alternate_allele_info_fields` switched on.
   self._resolver_allele = make_resolver(
       split_alternate_allele_info_fields=True)
   # Built with `resolve_always` switched on.
   self._resolver_always = make_resolver(resolve_always=True)
Esempio n. 13
0
    def test_all_fields_with_hom_ref(self):
        """Checks the generated row when hom-ref calls get their own column.

        The row generator is created with `include_call_name=True` and
        `move_hom_ref_calls=True`; the single expected row lists the variant's
        `hom_ref_calls` under `HOM_REF_CALLS` and its call under `CALLS`.
        """
        schema_descriptor = bigquery_schema_descriptor.SchemaDescriptor(
            _get_table_schema(move_hom_ref_calls=True))
        conflict_resolver = (
            vcf_field_conflict_resolver.FieldConflictResolver())

        # --- Input variant ---
        variant_info = {
            'IFR': [0.1, 0.2],
            'IFR2': [0.2, 0.3],
            'IS': 'some data',
            'ISR': ['data1', 'data2'],
        }
        variant = vcfio.Variant(
            reference_name='chr19',
            start=11,
            end=12,
            reference_bases='C',
            alternate_bases=['A', 'TT'],
            names=['rs1', 'rs2'],
            quality=2,
            filters=['PASS'],
            info=variant_info,
            hom_ref_calls=[(sample, hash_name(sample))
                           for sample in ('Sample2', 'Sample3')],
            calls=[
                vcfio.VariantCall(
                    sample_id=hash_name('Sample1'),
                    name='Sample1',
                    genotype=[0, 1],
                    phaseset='*',
                    info={'GQ': 20, 'FIR': [10, 20]}),
            ])
        header_num_dict = {'IFR': 'A', 'IFR2': 'A', 'IS': '1', 'ISR': '2'}

        # --- Expected row, assembled from its nested parts ---
        expected_alternate_bases = [
            {
                ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A',
                'IFR': 0.1,
                'IFR2': 0.2,
            },
            {
                ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT',
                'IFR': 0.2,
                'IFR2': 0.3,
            },
        ]
        expected_hom_ref_calls = [
            {
                ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name(sample),
                ColumnKeyConstants.CALLS_NAME: sample,
            }
            for sample in ('Sample2', 'Sample3')
        ]
        expected_calls = [
            {
                ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample1'),
                ColumnKeyConstants.CALLS_NAME: 'Sample1',
                ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
                ColumnKeyConstants.CALLS_PHASESET: '*',
                'GQ': 20,
                'FIR': [10, 20],
            },
        ]
        expected_row = {
            ColumnKeyConstants.REFERENCE_NAME: 'chr19',
            ColumnKeyConstants.START_POSITION: 11,
            ColumnKeyConstants.END_POSITION: 12,
            ColumnKeyConstants.REFERENCE_BASES: 'C',
            ColumnKeyConstants.ALTERNATE_BASES: expected_alternate_bases,
            ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
            ColumnKeyConstants.QUALITY: 2,
            ColumnKeyConstants.FILTER: ['PASS'],
            ColumnKeyConstants.HOM_REF_CALLS: expected_hom_ref_calls,
            ColumnKeyConstants.CALLS: expected_calls,
            'IS': 'some data',
            'ISR': ['data1', 'data2'],
        }

        # --- Exercise the generator and compare ---
        proc_variant = _get_processed_variant(variant, header_num_dict)
        row_generator = bigquery_row_generator.VariantCallRowGenerator(
            schema_descriptor,
            conflict_resolver,
            include_call_name=True,
            move_hom_ref_calls=True)
        actual_rows = list(row_generator.get_rows(proc_variant))
        self.assertEqual([expected_row], actual_rows)
Esempio n. 14
0
    def __init__(
            self,
            output_table,  # type: str
            header_fields,  # type: vcf_header_io.VcfHeader
            variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
            proc_var_factory=None,  # type: processed_variant.ProcessedVariantFactory
            # TODO(bashir2): proc_var_factory is a required argument and if `None` is
            # supplied this will fail in schema generation.
            append=False,  # type: bool
            update_schema_on_append=False,  # type: bool
            allow_incompatible_records=False,  # type: bool
            omit_empty_sample_calls=False,  # type: bool
            num_bigquery_write_shards=1,  # type: int
            null_numeric_value_replacement=None  # type: int
    ):
        # type: (...) -> None
        """Generates the table schema and prepares the row generator.

        Args:
          output_table: Full path of the output BigQuery table.
          header_fields: Representative header fields for all variants; needed
            for dynamically generating the schema.
          variant_merger: The strategy used for merging variants (if any).
            Some strategies may change the schema, which is why it may be
            needed here.
          proc_var_factory: The factory class that knows how to convert
            Variant instances to ProcessedVariant. As a side effect it also
            knows how to modify the BigQuery schema based on the
            ProcessedVariants it generates; that latter functionality is what
            is needed here.
          append: When true, existing records in output_table are not
            overwritten; new records are appended to those that already
            exist.
          update_schema_on_append: When true, the BigQuery schema is updated
            by combining the existing schema and the new schema if they are
            compatible.
          allow_incompatible_records: When true, field values are cast to the
            BigQuery schema if there is a mismatch.
          omit_empty_sample_calls: When true, samples that don't have a given
            call are omitted.
          num_bigquery_write_shards: If > 1, the number of sources used for
            writing to the output BigQuery table is limited.
          null_numeric_value_replacement: Value used in place of null inside
            numeric (float/int/long) lists; e.g. [0, None, 1] becomes
            [0, `null_numeric_value_replacement`, 1]. When not set,
            bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT is used.
        """
        self._output_table = output_table
        self._header_fields = header_fields
        self._variant_merger = variant_merger
        self._proc_var_factory = proc_var_factory
        self._append = append

        generated_schema = schema_converter.generate_schema_from_header_fields(
            header_fields, proc_var_factory, variant_merger)
        self._schema = generated_schema

        # Resolver makes extra effort to resolve conflict when flag
        # allow_incompatible_records is set.
        resolver = vcf_field_conflict_resolver.FieldConflictResolver(
            resolve_always=allow_incompatible_records)
        descriptor = bigquery_schema_descriptor.SchemaDescriptor(
            generated_schema)
        self._bigquery_row_generator = (
            bigquery_row_generator.VariantCallRowGenerator(
                descriptor, resolver, null_numeric_value_replacement))

        self._allow_incompatible_records = allow_incompatible_records
        self._omit_empty_sample_calls = omit_empty_sample_calls
        self._num_bigquery_write_shards = num_bigquery_write_shards

        if not update_schema_on_append:
            return
        bigquery_util.update_bigquery_schema_on_append(
            generated_schema.fields, output_table)
 def _get_combiner_fn(self, split_alternate_allele_info_fields=True):
   """Returns a `_MergeHeadersFn` wrapping a freshly built `HeaderMerger`.

   Args:
     split_alternate_allele_info_fields: Passed positionally to the
       `FieldConflictResolver` that backs the merger.
   """
   # Resolver -> merger -> combiner function, built from the inside out.
   merger = HeaderMerger(
       vcf_field_conflict_resolver.FieldConflictResolver(
           split_alternate_allele_info_fields))
   return merge_headers._MergeHeadersFn(merger)
 def setUp(self):
     """Creates the schema descriptor and the default conflict resolver."""
     table_schema = self._get_table_schema()
     self._schema_descriptor = bigquery_schema_descriptor.SchemaDescriptor(
         table_schema)
     # The resolver is built with its default arguments.
     resolver = vcf_field_conflict_resolver.FieldConflictResolver()
     self._conflict_resolver = resolver
Esempio n. 17
0
 def _get_header_merger(self, split_alternate_allele_info_fields=True):
   """Returns a `HeaderMerger` backed by a new `FieldConflictResolver`.

   Args:
     split_alternate_allele_info_fields: Passed positionally to the
       `FieldConflictResolver` constructor.
   """
   return HeaderMerger(
       vcf_field_conflict_resolver.FieldConflictResolver(
           split_alternate_allele_info_fields))