def __init__(
    self,
    output_path,  # type: str
    schema,  # type: bigquery.TableSchema
    allow_incompatible_records=False,  # type: bool
    omit_empty_sample_calls=False,  # type: bool
    null_numeric_value_replacement=None  # type: int
    ):
  # type: (...) -> None
  """Sets up the transform from an already generated table schema.

  Args:
    output_path: The path under which output Avro files are generated.
    schema: Schema of the table to be generated.
    allow_incompatible_records: If true, field values are cast to the
      BigQuery schema if there is a mismatch.
    omit_empty_sample_calls: If true, samples that don't have a given call
      will be omitted.
    null_numeric_value_replacement: The value to use instead of null for
      numeric (float/int/long) lists. For instance, [0, None, 1] will become
      [0, `null_numeric_value_replacement`, 1]. If not set, the value will
      be set to bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
  """
  self._output_path = output_path

  # The Avro schema is derived from the JSON rendering of the table schema.
  avro_json_schema = (
      schema_converter.convert_table_schema_to_json_avro_schema(schema))
  self._avro_schema = avro.schema.parse(avro_json_schema)

  # `resolve_always` on the resolver mirrors `allow_incompatible_records`.
  descriptor = bigquery_schema_descriptor.SchemaDescriptor(schema)
  resolver = vcf_field_conflict_resolver.FieldConflictResolver(
      resolve_always=allow_incompatible_records)
  self._bigquery_row_generator = (
      bigquery_row_generator.VariantCallRowGenerator(
          descriptor, resolver, null_numeric_value_replacement))

  self._allow_incompatible_records = allow_incompatible_records
  self._omit_empty_sample_calls = omit_empty_sample_calls
def _infer_annotation_type_info_fields(self, variant, infos, defined_headers):
  # type: (vcfio.Variant, Dict[str, Info], vcf_header_io.VcfHeader) -> None
  """Adds inferred annotation type info fields to `infos`.

  Every annotation header of every annotation field listed in
  `self._annotation_fields_to_infer` is turned into an Info header line. The
  new ID is derived from the annotation field and the header name, and the
  new TYPE is the type inferred for the original header. A variant may carry
  several values for the same annotation header, so the value types are
  merged per header before the Info entry is created.

  Args:
    variant: variant object
    infos: dict of (info_key, `Info`) for any info field in `variant` that is
      not defined in the header or the definition mismatches the field
      values. Updated in place.
    defined_headers: header fields defined in header section of VCF files.

  Raises:
    ValueError: If the annotation name list and the per-annotation value
      lists do not all have the same length.
  """

  def _check_annotation_lists_lengths(names, values):
    # All value lists must be exactly as long as the list of names.
    lengths = {len(v) for v in values}
    lengths.add(len(names))
    if len(lengths) == 1:
      return
    raise ValueError(
        ('Annotation lists have inconsistent lengths: {}.\nnames={}\n'
         'values={}').format(lengths, names, values))

  resolver = vcf_field_conflict_resolver.FieldConflictResolver(
      resolve_always=True)
  for field in self._annotation_fields_to_infer:
    if field not in variant.info:
      continue
    description = defined_headers.infos[field][_HeaderKeyConstants.DESC]
    annotation_names = annotation_parser.extract_annotation_names(description)
    # First element (ALT) is ignored, since its type is hard-coded as string
    annotation_rows = [
        annotation_parser.extract_annotation_list_with_alt(annotation)[1:]
        for annotation in variant.info[field]
    ]
    _check_annotation_lists_lengths(annotation_names, annotation_rows)
    # Transpose so that each entry holds all the values of one header.
    values_per_header = zip(*annotation_rows)
    for name, header_values in zip(annotation_names, values_per_header):
      merged_type = None
      for value in header_values:
        if value:
          merged_type = resolver.resolve_attribute_conflict(
              _HeaderKeyConstants.TYPE,
              merged_type,
              self._get_field_type(value))
          # Stop scanning this header's values once the type is string.
          if merged_type == _HeaderTypeConstants.STRING:
            break
      key_id = get_inferred_annotation_type_header_key(field, name)
      infos[key_id] = Info(
          key_id,
          1,  # field count
          merged_type,
          ('Inferred type field for annotation {}.'.format(name)),
          '',  # UNKNOWN_SOURCE
          '')  # UNKNOWN_VERSION
def setUp(self):
  """Creates the schema descriptor, resolver and row generator fixtures."""
  descriptor = bigquery_schema_descriptor.SchemaDescriptor(
      _get_table_schema())
  self._schema_descriptor = descriptor
  resolver = vcf_field_conflict_resolver.FieldConflictResolver()
  self._conflict_resolver = resolver
  # The generator under test shares the two fixtures created above.
  self._row_generator = bigquery_vcf_data_converter.BigQueryRowGenerator(
      descriptor, resolver)
def setUp(self):
  """Creates the schema descriptor, resolver and row generator fixtures."""
  descriptor = bigquery_schema_descriptor.SchemaDescriptor(
      self._get_table_schema())
  self._schema_descriptor = descriptor
  resolver = vcf_field_conflict_resolver.FieldConflictResolver()
  self._conflict_resolver = resolver
  # The generator under test shares the two fixtures created above.
  self._row_generator = bigquery_row_generator.VariantCallRowGenerator(
      descriptor, resolver)
def __init__(
    self,
    output_table,  # type: str
    header_fields,  # type: vcf_header_io.VcfHeader
    variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
    proc_var_factory=None,  # type: processed_variant.ProcessedVariantFactory
    append=False,  # type: bool
    update_schema_on_append=False,  # type: bool
    allow_incompatible_records=False,  # type: bool
    omit_empty_sample_calls=False,  # type: bool
    num_bigquery_write_shards=1  # type: int
    ):
  # type: (...) -> None
  """Sets up the transform, generating the schema from the header fields.

  Args:
    output_table: Full path of the output BigQuery table.
    header_fields: Representative header fields for all variants. This is
      needed for dynamically generating the schema.
    variant_merger: The strategy used for merging variants (if any). Some
      strategies may change the schema, which is why this may be needed
      here.
    proc_var_factory: The factory class that knows how to convert Variant
      instances to ProcessedVariant. As a side effect it also knows how to
      modify BigQuery schema based on the ProcessedVariants that it
      generates. The latter functionality is what is needed here.
    append: If true, existing records in output_table will not be
      overwritten. New records will be appended to those that already exist.
    update_schema_on_append: If true, BigQuery schema will be updated by
      combining the existing schema and the new schema if they are
      compatible.
    allow_incompatible_records: If true, field values are cast to the
      BigQuery schema if there is a mismatch.
    omit_empty_sample_calls: If true, samples that don't have a given call
      will be omitted.
    num_bigquery_write_shards: If > 1, we will limit number of sources which
      are used for writing to the output BigQuery table.
  """
  self._output_table = output_table
  self._header_fields = header_fields
  self._variant_merger = variant_merger
  self._proc_var_factory = proc_var_factory
  self._append = append
  self._schema = bigquery_vcf_schema.generate_schema_from_header_fields(
      header_fields, proc_var_factory, variant_merger)

  # Resolver makes extra effort to resolve conflict when flag
  # allow_incompatible_records is set.
  descriptor = bigquery_schema_descriptor.SchemaDescriptor(self._schema)
  resolver = vcf_field_conflict_resolver.FieldConflictResolver(
      resolve_always=allow_incompatible_records)
  self._bigquery_row_generator = bigquery_row_generator.BigQueryRowGenerator(
      descriptor, resolver)

  self._allow_incompatible_records = allow_incompatible_records
  self._omit_empty_sample_calls = omit_empty_sample_calls
  self._num_bigquery_write_shards = num_bigquery_write_shards
  if update_schema_on_append:
    self._update_bigquery_schema_on_append()
def __init__(
    self,
    schema_descriptor,
    allow_incompatible_records=False,
    omit_empty_sample_calls=False):
  # type: (bigquery_schema_descriptor.SchemaDescriptor, bool, bool) -> None
  """Stores the row conversion settings.

  Args:
    schema_descriptor: Descriptor of the schema; stored as given.
    allow_incompatible_records: Stored on the instance as given.
    omit_empty_sample_calls: Stored on the instance as given.
  """
  super(_ConvertToBigQueryTableRow, self).__init__()
  self._schema_descriptor = schema_descriptor
  # The resolver is created with its default arguments; the
  # `allow_incompatible_records` flag is only recorded below.
  self._conflict_resolver = (
      vcf_field_conflict_resolver.FieldConflictResolver())
  self._allow_incompatible_records = allow_incompatible_records
  self._omit_empty_sample_calls = omit_empty_sample_calls
def __init__(self, split_alternate_allele_info_fields=True):
  # type: (bool) -> None
  """Initializes :class:`MergeHeaders` object.

  Args:
    split_alternate_allele_info_fields: Whether INFO fields with `Number=A`
      are stored under the alternate_bases record. This is relevant as it
      changes the header compatibility rules as it changes the schema.
  """
  super(MergeHeaders, self).__init__()
  # The flag is handed to the resolver positionally, as its first argument.
  resolver = vcf_field_conflict_resolver.FieldConflictResolver(
      split_alternate_allele_info_fields)
  self._header_merger = _HeaderMerger(resolver)
def __init__(
    self,
    output_table,  # type: str
    schema,  # type: bigquery.TableSchema
    append=False,  # type: bool
    allow_incompatible_records=False,  # type: bool
    omit_empty_sample_calls=False,  # type: bool
    num_bigquery_write_shards=1,  # type: int
    null_numeric_value_replacement=None,  # type: int
    include_call_name=False,  # type: bool
    move_hom_ref_calls=False  # type: bool
    ):
  # type: (...) -> None
  """Sets up the transform from an already generated table schema.

  Args:
    output_table: Full path of the output BigQuery table.
    schema: Schema of the table to be generated.
    append: If true, existing records in output_table will not be
      overwritten. New records will be appended to those that already exist.
    allow_incompatible_records: If true, field values are cast to the
      BigQuery schema if there is a mismatch.
    omit_empty_sample_calls: If true, samples that don't have a given call
      will be omitted.
    num_bigquery_write_shards: If > 1, we will limit number of sources which
      are used for writing to the output BigQuery table.
    null_numeric_value_replacement: The value to use instead of null for
      numeric (float/int/long) lists. For instance, [0, None, 1] will become
      [0, `null_numeric_value_replacement`, 1]. If not set, the value will
      be set to bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
    include_call_name: If true, sample name will be included in addition to
      sample ID.
    move_hom_ref_calls: If true, filter out 0 GT data out of call list and
      add the call name to a hom_ref_calls column.
  """
  self._output_table = output_table
  self._append = append
  self._schema = schema

  # Resolver makes extra effort to resolve conflict when flag
  # allow_incompatible_records is set.
  descriptor = bigquery_schema_descriptor.SchemaDescriptor(self._schema)
  resolver = vcf_field_conflict_resolver.FieldConflictResolver(
      resolve_always=allow_incompatible_records)
  self._bigquery_row_generator = (
      bigquery_row_generator.VariantCallRowGenerator(
          descriptor,
          resolver,
          null_numeric_value_replacement,
          include_call_name,
          move_hom_ref_calls))

  self._allow_incompatible_records = allow_incompatible_records
  self._omit_empty_sample_calls = omit_empty_sample_calls
  self._num_bigquery_write_shards = num_bigquery_write_shards
def __init__(
    self,
    output_path,  # type: str
    header_fields,  # type: vcf_header_io.VcfHeader
    proc_var_factory,  # type: processed_variant.ProcessedVariantFactory
    variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
    allow_incompatible_records=False,  # type: bool
    omit_empty_sample_calls=False,  # type: bool
    null_numeric_value_replacement=None  # type: int
    ):
  # type: (...) -> None
  """Sets up the transform, generating the schema from the header fields.

  Args:
    output_path: The path under which output Avro files are generated.
    header_fields: Representative header fields for all variants. This is
      needed for dynamically generating the schema.
    proc_var_factory: The factory class that knows how to convert Variant
      instances to ProcessedVariant. As a side effect it also knows how to
      modify BigQuery schema based on the ProcessedVariants that it
      generates. The latter functionality is what is needed here.
    variant_merger: The strategy used for merging variants (if any). Some
      strategies may change the schema, which is why this may be needed
      here.
    allow_incompatible_records: If true, field values are cast to the
      BigQuery schema if there is a mismatch.
    omit_empty_sample_calls: If true, samples that don't have a given call
      will be omitted.
    null_numeric_value_replacement: The value to use instead of null for
      numeric (float/int/long) lists. For instance, [0, None, 1] will become
      [0, `null_numeric_value_replacement`, 1]. If not set, the value will
      be set to bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
  """
  self._output_path = output_path
  self._proc_var_factory = proc_var_factory

  # One table schema feeds both the Avro schema and the row generator.
  table_schema = schema_converter.generate_schema_from_header_fields(
      header_fields, proc_var_factory, variant_merger)
  avro_json_schema = (
      schema_converter.convert_table_schema_to_json_avro_schema(
          table_schema))
  self._avro_schema = avro.schema.parse(avro_json_schema)

  # `resolve_always` on the resolver mirrors `allow_incompatible_records`.
  descriptor = bigquery_schema_descriptor.SchemaDescriptor(table_schema)
  resolver = vcf_field_conflict_resolver.FieldConflictResolver(
      resolve_always=allow_incompatible_records)
  self._bigquery_row_generator = (
      bigquery_row_generator.VariantCallRowGenerator(
          descriptor, resolver, null_numeric_value_replacement))

  self._allow_incompatible_records = allow_incompatible_records
  self._omit_empty_sample_calls = omit_empty_sample_calls
def test_combine_pipeline(self):
  """Checks that `MergeHeaders` in a pipeline matches a manual merge.

  The expected header is built by merging both input headers with a
  `HeaderMerger` directly; the pipeline output must equal it.
  """
  headers_1 = self._get_header_from_lines(FILE_1_LINES)
  headers_2 = self._get_header_from_lines(FILE_2_LINES)

  # TODO(nmousavi): Either use TestPipeline or combiner_fn.* everywhere.
  # After moving out _HeaderMerger to its file, it makes sense to use
  # TestPipeline everywhere.
  header_merger = HeaderMerger(
      vcf_field_conflict_resolver.FieldConflictResolver(
          split_alternate_allele_info_fields=True))
  expected = vcf_header_io.VcfHeader()
  header_merger.merge(expected, headers_1)
  header_merger.merge(expected, headers_2)

  pipeline = TestPipeline()
  merged_headers = (
      pipeline
      | Create([headers_1, headers_2])
      | 'MergeHeaders' >> merge_headers.MergeHeaders())

  assert_that(merged_headers, equal_to([expected]))
  # `assert_that` only attaches a check to the pipeline graph; without
  # running the pipeline the check never executes and the test would pass
  # vacuously.
  pipeline.run()
def __init__(
    self,
    split_alternate_allele_info_fields=True,
    allow_incompatible_records=False):
  # type: (bool, bool) -> None
  """Initializes :class:`MergeHeaders` object.

  Args:
    split_alternate_allele_info_fields: Whether INFO fields with `Number=A`
      are stored under the alternate_bases record. This is relevant as it
      changes the header compatibility rules as it changes the schema.
    allow_incompatible_records: If true, header definitions with a type
      mismatch (e.g., string vs float) are always resolved.
  """
  super(MergeHeaders, self).__init__()
  # Resolver makes extra efforts to resolve conflict in header definitions
  # when flag allow_incompatible_records is set. For example, it resolves
  # type conflict of string and float into string.
  resolver = vcf_field_conflict_resolver.FieldConflictResolver(
      split_alternate_allele_info_fields,
      resolve_always=allow_incompatible_records)
  self._header_merger = _HeaderMerger(resolver)
def setUp(self):
  """Creates one resolver per configuration exercised by the tests."""
  resolver_cls = vcf_field_conflict_resolver.FieldConflictResolver
  # Default configuration.
  self._resolver = resolver_cls()
  # Same, with `split_alternate_allele_info_fields` switched on.
  self._resolver_allele = resolver_cls(
      split_alternate_allele_info_fields=True)
  # Same as the default, with `resolve_always` switched on.
  self._resolver_always = resolver_cls(resolve_always=True)
def test_all_fields_with_hom_ref(self):
  """Checks the generated row when hom-ref calls get their own column.

  The generator is created with `include_call_name` and
  `move_hom_ref_calls` enabled, and the schema is requested with
  `move_hom_ref_calls=True`.
  """
  schema_descriptor = bigquery_schema_descriptor.SchemaDescriptor(
      _get_table_schema(move_hom_ref_calls=True))
  conflict_resolver = vcf_field_conflict_resolver.FieldConflictResolver()

  variant = vcfio.Variant(
      reference_name='chr19',
      start=11,
      end=12,
      reference_bases='C',
      alternate_bases=['A', 'TT'],
      names=['rs1', 'rs2'],
      quality=2,
      filters=['PASS'],
      info={
          'IFR': [0.1, 0.2],
          'IFR2': [0.2, 0.3],
          'IS': 'some data',
          'ISR': ['data1', 'data2'],
      },
      hom_ref_calls=[
          ('Sample2', hash_name('Sample2')),
          ('Sample3', hash_name('Sample3')),
      ],
      calls=[
          vcfio.VariantCall(
              sample_id=hash_name('Sample1'),
              name='Sample1',
              genotype=[0, 1],
              phaseset='*',
              info={'GQ': 20, 'FIR': [10, 20]}),
      ])
  header_num_dict = {'IFR': 'A', 'IFR2': 'A', 'IS': '1', 'ISR': '2'}

  # Fields declared with number 'A' are expected under each alternate base.
  expected_alternate_bases = [
      {
          ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A',
          'IFR': 0.1,
          'IFR2': 0.2,
      },
      {
          ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT',
          'IFR': 0.2,
          'IFR2': 0.3,
      },
  ]
  expected_hom_ref_calls = [
      {
          ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample2'),
          ColumnKeyConstants.CALLS_NAME: 'Sample2',
      },
      {
          ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample3'),
          ColumnKeyConstants.CALLS_NAME: 'Sample3',
      },
  ]
  expected_calls = [
      {
          ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample1'),
          ColumnKeyConstants.CALLS_NAME: 'Sample1',
          ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
          ColumnKeyConstants.CALLS_PHASESET: '*',
          'GQ': 20,
          'FIR': [10, 20],
      },
  ]
  expected_row = {
      ColumnKeyConstants.REFERENCE_NAME: 'chr19',
      ColumnKeyConstants.START_POSITION: 11,
      ColumnKeyConstants.END_POSITION: 12,
      ColumnKeyConstants.REFERENCE_BASES: 'C',
      ColumnKeyConstants.ALTERNATE_BASES: expected_alternate_bases,
      ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
      ColumnKeyConstants.QUALITY: 2,
      ColumnKeyConstants.FILTER: ['PASS'],
      ColumnKeyConstants.HOM_REF_CALLS: expected_hom_ref_calls,
      ColumnKeyConstants.CALLS: expected_calls,
      'IS': 'some data',
      'ISR': ['data1', 'data2'],
  }

  proc_variant = _get_processed_variant(variant, header_num_dict)
  row_generator = bigquery_row_generator.VariantCallRowGenerator(
      schema_descriptor,
      conflict_resolver,
      include_call_name=True,
      move_hom_ref_calls=True)
  actual_rows = list(row_generator.get_rows(proc_variant))
  self.assertEqual([expected_row], actual_rows)
def __init__(
    self,
    output_table,  # type: str
    header_fields,  # type: vcf_header_io.VcfHeader
    variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
    proc_var_factory=None,  # type: processed_variant.ProcessedVariantFactory
    # TODO(bashir2): proc_var_factory is a required argument and if `None` is
    # supplied this will fail in schema generation.
    append=False,  # type: bool
    update_schema_on_append=False,  # type: bool
    allow_incompatible_records=False,  # type: bool
    omit_empty_sample_calls=False,  # type: bool
    num_bigquery_write_shards=1,  # type: int
    null_numeric_value_replacement=None  # type: int
    ):
  # type: (...) -> None
  """Sets up the transform, generating the schema from the header fields.

  Args:
    output_table: Full path of the output BigQuery table.
    header_fields: Representative header fields for all variants. This is
      needed for dynamically generating the schema.
    variant_merger: The strategy used for merging variants (if any). Some
      strategies may change the schema, which is why this may be needed
      here.
    proc_var_factory: The factory class that knows how to convert Variant
      instances to ProcessedVariant. As a side effect it also knows how to
      modify BigQuery schema based on the ProcessedVariants that it
      generates. The latter functionality is what is needed here.
    append: If true, existing records in output_table will not be
      overwritten. New records will be appended to those that already exist.
    update_schema_on_append: If true, BigQuery schema will be updated by
      combining the existing schema and the new schema if they are
      compatible.
    allow_incompatible_records: If true, field values are cast to the
      BigQuery schema if there is a mismatch.
    omit_empty_sample_calls: If true, samples that don't have a given call
      will be omitted.
    num_bigquery_write_shards: If > 1, we will limit number of sources which
      are used for writing to the output BigQuery table.
    null_numeric_value_replacement: The value to use instead of null for
      numeric (float/int/long) lists. For instance, [0, None, 1] will become
      [0, `null_numeric_value_replacement`, 1]. If not set, the value will
      be set to bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
  """
  self._output_table = output_table
  self._header_fields = header_fields
  self._variant_merger = variant_merger
  self._proc_var_factory = proc_var_factory
  self._append = append
  self._schema = schema_converter.generate_schema_from_header_fields(
      header_fields, proc_var_factory, variant_merger)

  # Resolver makes extra effort to resolve conflict when flag
  # allow_incompatible_records is set.
  descriptor = bigquery_schema_descriptor.SchemaDescriptor(self._schema)
  resolver = vcf_field_conflict_resolver.FieldConflictResolver(
      resolve_always=allow_incompatible_records)
  self._bigquery_row_generator = (
      bigquery_row_generator.VariantCallRowGenerator(
          descriptor, resolver, null_numeric_value_replacement))

  self._allow_incompatible_records = allow_incompatible_records
  self._omit_empty_sample_calls = omit_empty_sample_calls
  self._num_bigquery_write_shards = num_bigquery_write_shards
  if update_schema_on_append:
    bigquery_util.update_bigquery_schema_on_append(
        self._schema.fields, self._output_table)
def _get_combiner_fn(self, split_alternate_allele_info_fields=True):
  """Returns a `_MergeHeadersFn` wrapping a freshly built `HeaderMerger`.

  Args:
    split_alternate_allele_info_fields: Passed positionally, as the first
      argument, to the `FieldConflictResolver` used by the merger.
  """
  return merge_headers._MergeHeadersFn(
      HeaderMerger(
          vcf_field_conflict_resolver.FieldConflictResolver(
              split_alternate_allele_info_fields)))
def setUp(self):
  """Creates the schema descriptor and conflict resolver fixtures."""
  table_schema = self._get_table_schema()
  self._schema_descriptor = bigquery_schema_descriptor.SchemaDescriptor(
      table_schema)
  # The resolver is created with its default arguments.
  self._conflict_resolver = (
      vcf_field_conflict_resolver.FieldConflictResolver())
def _get_header_merger(self, split_alternate_allele_info_fields=True):
  """Returns a `HeaderMerger` backed by a new `FieldConflictResolver`.

  Args:
    split_alternate_allele_info_fields: Passed positionally, as the first
      argument, to the `FieldConflictResolver` used by the merger.
  """
  return HeaderMerger(
      vcf_field_conflict_resolver.FieldConflictResolver(
          split_alternate_allele_info_fields))