def __init__(
    self,
    output_path,  # type: str
    schema,  # type: bigquery.TableSchema
    allow_incompatible_records=False,  # type: bool
    omit_empty_sample_calls=False,  # type: bool
    null_numeric_value_replacement=None  # type: int
    ):
  # type: (...) -> None
  """Sets up the transform that writes variants as Avro files.

  Args:
    output_path: The path under which output Avro files are generated.
    schema: Schema of the table to be generated.
    allow_incompatible_records: If true, field values are cast to the
      BigQuery schema if there is a mismatch.
    omit_empty_sample_calls: If true, samples that don't have a given call
      will be omitted.
    null_numeric_value_replacement: The value to use instead of null for
      numeric (float/int/long) lists. For instance, [0, None, 1] will become
      [0, `null_numeric_value_replacement`, 1]. If not set, the value will be
      set to bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
  """
  # Plain configuration values; none of them depend on the schema.
  self._output_path = output_path
  self._allow_incompatible_records = allow_incompatible_records
  self._omit_empty_sample_calls = omit_empty_sample_calls

  # The Avro schema is derived from the JSON rendering of the table schema.
  json_avro_schema = (
      schema_converter.convert_table_schema_to_json_avro_schema(schema))
  self._avro_schema = avro.schema.parse(json_avro_schema)

  # The resolver is told to always resolve conflicts when incompatible
  # records are allowed.
  descriptor = bigquery_schema_descriptor.SchemaDescriptor(schema)
  resolver = vcf_field_conflict_resolver.FieldConflictResolver(
      resolve_always=allow_incompatible_records)
  self._bigquery_row_generator = (
      bigquery_row_generator.VariantCallRowGenerator(
          descriptor, resolver, null_numeric_value_replacement))
def setUp(self):
  """Creates the schema descriptor, conflict resolver and row generator.

  The row generator is built from the other two fixtures; all three are
  stored on the test instance.
  """
  descriptor = bigquery_schema_descriptor.SchemaDescriptor(
      self._get_table_schema())
  resolver = vcf_field_conflict_resolver.FieldConflictResolver()

  self._schema_descriptor = descriptor
  self._conflict_resolver = resolver
  self._row_generator = bigquery_row_generator.VariantCallRowGenerator(
      descriptor, resolver)
def __init__(
    self,
    output_table,  # type: str
    schema,  # type: bigquery.TableSchema
    append=False,  # type: bool
    allow_incompatible_records=False,  # type: bool
    omit_empty_sample_calls=False,  # type: bool
    num_bigquery_write_shards=1,  # type: int
    null_numeric_value_replacement=None,  # type: int
    include_call_name=False,  # type: bool
    move_hom_ref_calls=False  # type: bool
    ):
  # type: (...) -> None
  """Sets up the transform that writes variants to a BigQuery table.

  Args:
    output_table: Full path of the output BigQuery table.
    schema: Schema of the table to be generated.
    append: If true, existing records in output_table will not be
      overwritten. New records will be appended to those that already exist.
    allow_incompatible_records: If true, field values are cast to the
      BigQuery schema if there is a mismatch.
    omit_empty_sample_calls: If true, samples that don't have a given call
      will be omitted.
    num_bigquery_write_shards: If > 1, we will limit the number of sources
      which are used for writing to the output BigQuery table.
    null_numeric_value_replacement: The value to use instead of null for
      numeric (float/int/long) lists. For instance, [0, None, 1] will become
      [0, `null_numeric_value_replacement`, 1]. If not set, the value will be
      set to bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
    include_call_name: If true, sample name will be included in addition to
      sample ID.
    move_hom_ref_calls: If true, filter out 0 GT data out of call list and
      add the call name to a hom_ref_calls column.
  """
  # Plain configuration values.
  self._output_table = output_table
  self._append = append
  self._schema = schema
  self._allow_incompatible_records = allow_incompatible_records
  self._omit_empty_sample_calls = omit_empty_sample_calls
  self._num_bigquery_write_shards = num_bigquery_write_shards

  # Resolver makes extra effort to resolve conflict when flag
  # allow_incompatible_records is set.
  resolver = vcf_field_conflict_resolver.FieldConflictResolver(
      resolve_always=allow_incompatible_records)
  descriptor = bigquery_schema_descriptor.SchemaDescriptor(self._schema)
  self._bigquery_row_generator = (
      bigquery_row_generator.VariantCallRowGenerator(
          descriptor,
          resolver,
          null_numeric_value_replacement,
          include_call_name,
          move_hom_ref_calls))
def __init__(
    self,
    output_path,  # type: str
    header_fields,  # type: vcf_header_io.VcfHeader
    proc_var_factory,  # type: processed_variant.ProcessedVariantFactory
    variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
    allow_incompatible_records=False,  # type: bool
    omit_empty_sample_calls=False,  # type: bool
    null_numeric_value_replacement=None  # type: int
    ):
  # type: (...) -> None
  """Sets up the Avro-writing transform, deriving the schema from headers.

  Args:
    output_path: The path under which output Avro files are generated.
    header_fields: Representative header fields for all variants. This is
      needed for dynamically generating the schema.
    proc_var_factory: The factory class that knows how to convert Variant
      instances to ProcessedVariant. As a side effect it also knows how to
      modify BigQuery schema based on the ProcessedVariants that it
      generates. The latter functionality is what is needed here.
    variant_merger: The strategy used for merging variants (if any). Some
      strategies may change the schema, which is why this may be needed here.
    allow_incompatible_records: If true, field values are cast to the
      BigQuery schema if there is a mismatch.
    omit_empty_sample_calls: If true, samples that don't have a given call
      will be omitted.
    null_numeric_value_replacement: The value to use instead of null for
      numeric (float/int/long) lists. For instance, [0, None, 1] will become
      [0, `null_numeric_value_replacement`, 1]. If not set, the value will be
      set to bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
  """
  # Plain configuration values.
  self._output_path = output_path
  self._proc_var_factory = proc_var_factory
  self._allow_incompatible_records = allow_incompatible_records
  self._omit_empty_sample_calls = omit_empty_sample_calls

  # One table schema feeds both the Avro schema and the row generator.
  table_schema = schema_converter.generate_schema_from_header_fields(
      header_fields, proc_var_factory, variant_merger)
  json_avro_schema = (
      schema_converter.convert_table_schema_to_json_avro_schema(
          table_schema))
  self._avro_schema = avro.schema.parse(json_avro_schema)

  descriptor = bigquery_schema_descriptor.SchemaDescriptor(table_schema)
  resolver = vcf_field_conflict_resolver.FieldConflictResolver(
      resolve_always=allow_incompatible_records)
  self._bigquery_row_generator = (
      bigquery_row_generator.VariantCallRowGenerator(
          descriptor, resolver, null_numeric_value_replacement))
def test_convert_variant_to_bigquery_row_allow_incompatible_recoreds(self):
  """Converts a variant with incompatible records and checks the row.

  The conversion DoFn is created with `allow_incompatible_records=True` and
  the single resulting row is compared against the expected one.
  """
  sample = self._get_sample_variant_with_incompatible_records()
  variant, row, header_num_dict = sample

  factory = processed_variant.ProcessedVariantFactory(
      vcf_header_util.make_header(header_num_dict))
  proc_var = factory.create_processed_variant(variant)

  pipeline = TestPipeline(blocking=True)
  variants = pipeline | Create([proc_var])
  to_row_fn = ConvertVariantToRow(
      self._row_generator, allow_incompatible_records=True)
  bigquery_rows = variants | 'ConvertToRow' >> beam.ParDo(to_row_fn)
  assert_that(bigquery_rows, equal_to([row]))
  pipeline.run()

  # NOTE(review): this resets the row generator to a plain one after the
  # pipeline has run. If setUp already rebuilds it before every test, the
  # reset is presumably redundant -- confirm before removing.
  self._row_generator = bigquery_row_generator.VariantCallRowGenerator(
      self._schema_descriptor, self._conflict_resolver)
def test_convert_variant_with_sample_name_to_bigquery_row(self):
  """Converts a variant to a row with call names included.

  The row generator is replaced by one built with `include_call_name=True`,
  and the conversion DoFn is created with `omit_empty_sample_calls=True`.
  """
  # Swap in a generator that also emits the call name.
  self._row_generator = bigquery_row_generator.VariantCallRowGenerator(
      self._schema_descriptor,
      self._conflict_resolver,
      include_call_name=True)

  sample = self._get_sample_variant_with_sample_name()
  variant, row, header_num_dict = sample

  factory = processed_variant.ProcessedVariantFactory(
      vcf_header_util.make_header(header_num_dict))
  proc_var = factory.create_processed_variant(variant)

  pipeline = TestPipeline(blocking=True)
  variants = pipeline | Create([proc_var])
  to_row_fn = ConvertVariantToRow(
      self._row_generator, omit_empty_sample_calls=True)
  bigquery_rows = variants | 'ConvertToRow' >> beam.ParDo(to_row_fn)
  assert_that(bigquery_rows, equal_to([row]))
  pipeline.run()
def __init__(
    self,
    output_table,  # type: str
    header_fields,  # type: vcf_header_io.VcfHeader
    variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
    proc_var_factory=None,  # type: processed_variant.ProcessedVariantFactory
    # TODO(bashir2): proc_var_factory is a required argument and if `None` is
    # supplied this will fail in schema generation.
    append=False,  # type: bool
    update_schema_on_append=False,  # type: bool
    allow_incompatible_records=False,  # type: bool
    omit_empty_sample_calls=False,  # type: bool
    num_bigquery_write_shards=1,  # type: int
    null_numeric_value_replacement=None  # type: int
    ):
  # type: (...) -> None
  """Sets up the BigQuery-writing transform, deriving the schema from headers.

  Args:
    output_table: Full path of the output BigQuery table.
    header_fields: Representative header fields for all variants. This is
      needed for dynamically generating the schema.
    variant_merger: The strategy used for merging variants (if any). Some
      strategies may change the schema, which is why this may be needed here.
    proc_var_factory: The factory class that knows how to convert Variant
      instances to ProcessedVariant. As a side effect it also knows how to
      modify BigQuery schema based on the ProcessedVariants that it
      generates. The latter functionality is what is needed here.
    append: If true, existing records in output_table will not be
      overwritten. New records will be appended to those that already exist.
    update_schema_on_append: If true, BigQuery schema will be updated by
      combining the existing schema and the new schema if they are
      compatible.
    allow_incompatible_records: If true, field values are cast to the
      BigQuery schema if there is a mismatch.
    omit_empty_sample_calls: If true, samples that don't have a given call
      will be omitted.
    num_bigquery_write_shards: If > 1, we will limit the number of sources
      which are used for writing to the output BigQuery table.
    null_numeric_value_replacement: The value to use instead of null for
      numeric (float/int/long) lists. For instance, [0, None, 1] will become
      [0, `null_numeric_value_replacement`, 1]. If not set, the value will be
      set to bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
  """
  # Plain configuration values.
  self._output_table = output_table
  self._header_fields = header_fields
  self._variant_merger = variant_merger
  self._proc_var_factory = proc_var_factory
  self._append = append
  self._allow_incompatible_records = allow_incompatible_records
  self._omit_empty_sample_calls = omit_empty_sample_calls
  self._num_bigquery_write_shards = num_bigquery_write_shards

  self._schema = schema_converter.generate_schema_from_header_fields(
      self._header_fields, self._proc_var_factory, self._variant_merger)

  # Resolver makes extra effort to resolve conflict when flag
  # allow_incompatible_records is set.
  resolver = vcf_field_conflict_resolver.FieldConflictResolver(
      resolve_always=allow_incompatible_records)
  descriptor = bigquery_schema_descriptor.SchemaDescriptor(self._schema)
  self._bigquery_row_generator = (
      bigquery_row_generator.VariantCallRowGenerator(
          descriptor, resolver, null_numeric_value_replacement))

  # Kept as the last step, after all local state has been built.
  if update_schema_on_append:
    bigquery_util.update_bigquery_schema_on_append(
        self._schema.fields, self._output_table)
def test_all_fields_with_hom_ref(self):
  """Checks the generated row when hom-ref calls have their own column.

  The variant carries two entries in `hom_ref_calls` (Sample2, Sample3) and
  one regular call (Sample1, genotype [0, 1]). The row generator is built
  with `include_call_name=True` and `move_hom_ref_calls=True`, and exactly
  one row equal to the expected dictionary must be produced.
  """
  descriptor = bigquery_schema_descriptor.SchemaDescriptor(
      _get_table_schema(move_hom_ref_calls=True))
  resolver = vcf_field_conflict_resolver.FieldConflictResolver()

  # Input variant, assembled from its parts.
  variant_info = {
      'IFR': [0.1, 0.2],
      'IFR2': [0.2, 0.3],
      'IS': 'some data',
      'ISR': ['data1', 'data2']
  }
  hom_ref_samples = [
      ('Sample2', hash_name('Sample2')),
      ('Sample3', hash_name('Sample3')),
  ]
  sample1_call = vcfio.VariantCall(
      sample_id=hash_name('Sample1'),
      name='Sample1',
      genotype=[0, 1],
      phaseset='*',
      info={'GQ': 20, 'FIR': [10, 20]})
  variant = vcfio.Variant(
      reference_name='chr19',
      start=11,
      end=12,
      reference_bases='C',
      alternate_bases=['A', 'TT'],
      names=['rs1', 'rs2'],
      quality=2,
      filters=['PASS'],
      info=variant_info,
      hom_ref_calls=hom_ref_samples,
      calls=[sample1_call])
  header_num_dict = {'IFR': 'A', 'IFR2': 'A', 'IS': '1', 'ISR': '2'}

  # Expected row, assembled from its repeated sub-records.
  expected_alternates = [
      {
          ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A',
          'IFR': 0.1,
          'IFR2': 0.2
      },
      {
          ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT',
          'IFR': 0.2,
          'IFR2': 0.3
      },
  ]
  expected_hom_refs = [
      {
          ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample2'),
          ColumnKeyConstants.CALLS_NAME: 'Sample2'
      },
      {
          ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample3'),
          ColumnKeyConstants.CALLS_NAME: 'Sample3'
      },
  ]
  expected_calls = [
      {
          ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample1'),
          ColumnKeyConstants.CALLS_NAME: 'Sample1',
          ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
          ColumnKeyConstants.CALLS_PHASESET: '*',
          'GQ': 20,
          'FIR': [10, 20]
      },
  ]
  expected_row = {
      ColumnKeyConstants.REFERENCE_NAME: 'chr19',
      ColumnKeyConstants.START_POSITION: 11,
      ColumnKeyConstants.END_POSITION: 12,
      ColumnKeyConstants.REFERENCE_BASES: 'C',
      ColumnKeyConstants.ALTERNATE_BASES: expected_alternates,
      ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
      ColumnKeyConstants.QUALITY: 2,
      ColumnKeyConstants.FILTER: ['PASS'],
      ColumnKeyConstants.HOM_REF_CALLS: expected_hom_refs,
      ColumnKeyConstants.CALLS: expected_calls,
      'IS': 'some data',
      'ISR': ['data1', 'data2']
  }

  proc_variant = _get_processed_variant(variant, header_num_dict)
  generator = bigquery_row_generator.VariantCallRowGenerator(
      descriptor,
      resolver,
      include_call_name=True,
      move_hom_ref_calls=True)
  actual_rows = list(generator.get_rows(proc_variant))
  self.assertEqual([expected_row], actual_rows)