def __init__(
    self,
    info_keys_to_move_to_calls_regex,  # type: str
    copy_quality_to_calls,  # type: bool
    copy_filter_to_calls,  # type: bool
    window_size=DEFAULT_WINDOW_SIZE  # type: int
    ):
  # type: (...) -> None
  """Creates the strategy together with the move-to-calls helper it wraps.

  Args:
    info_keys_to_move_to_calls_regex: Regular expression selecting the info
      fields that should be moved to calls.
    copy_quality_to_calls: Whether the quality field is copied to the calls
      associated with each record.
    copy_filter_to_calls: Whether the filter field is copied to the calls
      associated with each record.
    window_size: Size of the windows variants are grouped in, based on the
      start position of the variant.
  """
  self._window_size = window_size
  # The three move/copy options are forwarded untouched to the wrapped
  # MoveToCallsStrategy.
  move_to_calls_options = dict(
      info_keys_to_move_to_calls_regex=info_keys_to_move_to_calls_regex,
      copy_quality_to_calls=copy_quality_to_calls,
      copy_filter_to_calls=copy_filter_to_calls)
  self._move_to_calls = move_to_calls_strategy.MoveToCallsStrategy(
      **move_to_calls_options)
def test_get_merge_keys(self):
  """Checks the merge key after each successive variant field is populated."""
  strategy = move_to_calls_strategy.MoveToCallsStrategy(None, None, None)

  def get_expected_key(reference_name, start, end, reference_bases,
                       alternate_bases):
    # Falsy fields become empty strings; reference and alternate bases are
    # passed through the strategy's own hash helper.
    joined_alternates = ','.join(alternate_bases or [])
    key_parts = (
        reference_name or '',
        str(start or ''),
        str(end or ''),
        strategy._get_hash(reference_bases or ''),
        strategy._get_hash(joined_alternates),
    )
    return '%s:%s:%s:%s:%s' % key_parts

  variant = vcfio.Variant()

  def assert_merge_key(*expected_fields):
    # Compares against the first key yielded for the current `variant` state.
    expected_key = get_expected_key(*expected_fields)
    self.assertEqual(expected_key, next(strategy.get_merge_keys(variant)))

  # Empty variant.
  assert_merge_key(None, None, None, None, None)

  # Reference name only.
  variant.reference_name = '19'
  assert_merge_key(19, None, None, None, None)

  # Position and reference bases added.
  variant.start = 123
  variant.end = 125
  variant.reference_bases = 'AT'
  assert_merge_key(19, 123, 125, 'AT', None)

  # Alternate bases added.
  variant.alternate_bases = ['A', 'C']
  assert_merge_key(19, 123, 125, 'AT', ['A', 'C'])
def test_get_merged_variants_no_custom_options(self):
  """Merges with no info keys moved and no quality/filter copied to calls."""
  strategy = move_to_calls_strategy.MoveToCallsStrategy(
      info_keys_to_move_to_calls_regex=None,
      copy_quality_to_calls=False,
      copy_filter_to_calls=False)
  variants = self._get_sample_variants()

  # Test single variant merge.
  self.assertEqual([variants[0]], strategy.get_merged_variants([variants[0]]))

  # Test multiple variant merge.
  merged_variant = strategy.get_merged_variants(variants)[0]
  self._assert_common_expected_merged_fields(merged_variant)
  expected_calls = [
      vcfio.VariantCall(
          name='Sample1',
          genotype=[0, 1],
          info={'GQ': 20, 'HQ': [10, 20]}),
      vcfio.VariantCall(
          name='Sample2',
          genotype=[1, 0],
          info={'GQ': 10, 'FLAG1': True}),
      vcfio.VariantCall(name='Sample3', genotype=[1, 1]),
      vcfio.VariantCall(name='Sample4', genotype=[1, 0], info={'GQ': 20}),
  ]
  self.assertEqual(expected_calls, merged_variant.calls)

  self.assertItemsEqual(['A1', 'A2', 'A3'], merged_variant.info.keys())
  # Either of the two A1 values is accepted. assertIn (rather than
  # assertTrue(x in ...)) reports the actual value when the check fails.
  self.assertIn(merged_variant.info['A1'], ('some data', 'some data2'))
  self.assertEqual(['data1', 'data2'], merged_variant.info['A2'])
  self.assertEqual(['data3', 'data4'], merged_variant.info['A3'])
def test_get_merged_variants_move_info_to_calls(self):
  """Checks merging when the info key matching '^A1$' is moved to calls."""
  strategy = move_to_calls_strategy.MoveToCallsStrategy(
      info_keys_to_move_to_calls_regex='^A1$',
      copy_quality_to_calls=False,
      copy_filter_to_calls=False)
  variants = self._get_sample_variants()

  # Test single variant merge.
  single_merged_variant = strategy.get_merged_variants([variants[0]])[0]
  expected_single_calls = [
      vcfio.VariantCall(
          sample_id=hash_name('Sample1'),
          genotype=[0, 1],
          info={'GQ': 20, 'HQ': [10, 20], 'A1': 'some data'}),
      vcfio.VariantCall(
          sample_id=hash_name('Sample2'),
          genotype=[1, 0],
          info={'GQ': 10, 'FLAG1': True, 'A1': 'some data'}),
  ]
  self.assertEqual(expected_single_calls, single_merged_variant.calls)

  # Test multiple variant merge.
  merged_variant = strategy.get_merged_variants(variants)[0]
  self._assert_common_expected_merged_fields(merged_variant)
  expected_merged_calls = [
      vcfio.VariantCall(
          sample_id=hash_name('Sample1'),
          genotype=[0, 1],
          info={'GQ': 20, 'HQ': [10, 20], 'A1': 'some data'}),
      vcfio.VariantCall(
          sample_id=hash_name('Sample2'),
          genotype=[1, 0],
          info={'GQ': 10, 'FLAG1': True, 'A1': 'some data'}),
      vcfio.VariantCall(
          sample_id=hash_name('Sample3'),
          genotype=[1, 1],
          info={'A1': 'some data2'}),
      vcfio.VariantCall(
          sample_id=hash_name('Sample4'),
          genotype=[1, 0],
          info={'GQ': 20, 'A1': 'some data2'}),
  ]
  self.assertEqual(expected_merged_calls, merged_variant.calls)

  # A1 is expected to be gone from the variant-level info.
  self.assertItemsEqual(['A2', 'A3'], merged_variant.info.keys())
  self.assertEqual(['data1', 'data2'], merged_variant.info['A2'])
  self.assertEqual(['data3', 'data4'], merged_variant.info['A3'])
def test_modify_bigquery_schema_duplicate_keys(self):
  """Expects ValueError when a moved info key is named like the sample id."""
  strategy = move_to_calls_strategy.MoveToCallsStrategy(
      info_keys_to_move_to_calls_regex='.*',
      copy_quality_to_calls=True,
      copy_filter_to_calls=True)
  # The only info key shares its name with the calls sample-id column, and the
  # '.*' regex selects it for moving.
  info_keys = [ColumnKeyConstants.CALLS_SAMPLE_ID]
  base_schema = self._get_base_schema(info_keys)

  # Duplicate keys should throw error. assertRaises fails the test on its own
  # if no ValueError is raised, replacing the manual try/fail/except.
  with self.assertRaises(ValueError):
    strategy.modify_bigquery_schema(base_schema, info_keys)
def test_modify_bigquery_schema_move_info_to_calls(self):
  """Checks the schema fields after moving keys matching 'INFO.*1' to calls."""
  strategy = move_to_calls_strategy.MoveToCallsStrategy(
      info_keys_to_move_to_calls_regex='INFO.*1',
      copy_quality_to_calls=False,
      copy_filter_to_calls=False)
  info_keys = ['INFO_KEY1', 'INFO_KEY2']
  base_schema = self._get_base_schema(info_keys)

  strategy.modify_bigquery_schema(base_schema, info_keys)

  calls = ColumnKeyConstants.CALLS
  # INFO_KEY1 is expected under the calls record; INFO_KEY2 is expected to
  # remain a top-level field.
  expected_fields = [
      ColumnKeyConstants.REFERENCE_NAME,
      ColumnKeyConstants.QUALITY,
      ColumnKeyConstants.FILTER,
      calls,
      '.'.join([calls, ColumnKeyConstants.CALLS_SAMPLE_ID]),
      '.'.join([calls, 'INFO_KEY1']),
      'INFO_KEY2',
  ]
  self.assertEqual(expected_fields, self._get_fields_from_schema(base_schema))
def test_merge_variants(self):
  """Runs MergeVariants with a MoveToCallsStrategy and checks its output."""
  variant_merger = move_to_calls_strategy.MoveToCallsStrategy(
      '^A1$', False, False)
  variant_list, merged_variant = self._get_sample_merged_variants()
  unmerged_variant_list = self._get_sample_unmerged_variants()

  pipeline = TestPipeline()
  all_input_variants = variant_list + unmerged_variant_list
  source = pipeline | Create(all_input_variants, reshuffle=False)
  merged_variants = (
      source
      | 'MergeVariants' >> merge_variants.MergeVariants(variant_merger))

  # Expected output: the single merged variant followed by the unmerged
  # variants; ordering is ignored by the matcher.
  expected_output = [merged_variant] + unmerged_variant_list
  assert_that(merged_variants,
              asserts.variants_equal_to_ignore_order(expected_output))
  pipeline.run()
def _get_variant_merge_strategy(known_args  # type: argparse.Namespace
                               ):
  # type: (...) -> Optional[variant_merge_strategy.VariantMergeStrategy]
  """Builds the merge strategy selected by `known_args.variant_merge_strategy`.

  Args:
    known_args: Parsed arguments. Reads `variant_merge_strategy` and, when a
      strategy is built, `info_keys_to_move_to_calls_regex`,
      `copy_quality_to_calls` and `copy_filter_to_calls`.

  Returns:
    None when no strategy is set or it equals `MergeOptions.NONE`; otherwise a
    `MoveToCallsStrategy` or `MergeWithNonVariantsStrategy` instance.

  Raises:
    ValueError: If the selected strategy is not one of the supported options.
  """
  merge_options = variant_transform_options.MergeOptions
  selected = known_args.variant_merge_strategy
  if not selected or selected == merge_options.NONE:
    return None

  if selected == merge_options.MOVE_TO_CALLS:
    strategy_class = move_to_calls_strategy.MoveToCallsStrategy
  elif selected == merge_options.MERGE_WITH_NON_VARIANTS:
    strategy_class = (
        merge_with_non_variants_strategy.MergeWithNonVariantsStrategy)
  else:
    raise ValueError('Merge strategy is not supported.')

  # Both strategies are constructed from the same three positional options.
  return strategy_class(
      known_args.info_keys_to_move_to_calls_regex,
      known_args.copy_quality_to_calls,
      known_args.copy_filter_to_calls)
def test_get_merged_variants_move_everything_to_calls(self):
  """Checks merging when all info keys, quality and filter go to calls."""
  strategy = move_to_calls_strategy.MoveToCallsStrategy(
      info_keys_to_move_to_calls_regex='.*',
      copy_quality_to_calls=True,
      copy_filter_to_calls=True)
  variants = self._get_sample_variants()

  # Test single variant merge.
  single_merged_variant = strategy.get_merged_variants([variants[0]])[0]
  expected_single_calls = [
      vcfio.VariantCall(
          sample_id=hash_name('Sample1'),
          genotype=[0, 1],
          info={
              'GQ': 20,
              'HQ': [10, 20],
              'A1': 'some data',
              'A2': ['data1', 'data2'],
              ColumnKeyConstants.QUALITY: 2,
              ColumnKeyConstants.FILTER: ['PASS']
          }),
      vcfio.VariantCall(
          sample_id=hash_name('Sample2'),
          genotype=[1, 0],
          info={
              'GQ': 10,
              'FLAG1': True,
              'A1': 'some data',
              'A2': ['data1', 'data2'],
              ColumnKeyConstants.QUALITY: 2,
              ColumnKeyConstants.FILTER: ['PASS']
          }),
  ]
  self.assertEqual(expected_single_calls, single_merged_variant.calls)

  # Test multiple variant merge.
  merged_variant = strategy.get_merged_variants(variants)[0]
  self._assert_common_expected_merged_fields(merged_variant)
  expected_merged_calls = [
      vcfio.VariantCall(
          sample_id=hash_name('Sample1'),
          genotype=[0, 1],
          info={
              'GQ': 20,
              'HQ': [10, 20],
              'A1': 'some data',
              'A2': ['data1', 'data2'],
              ColumnKeyConstants.QUALITY: 2,
              ColumnKeyConstants.FILTER: ['PASS']
          }),
      vcfio.VariantCall(
          sample_id=hash_name('Sample2'),
          genotype=[1, 0],
          info={
              'GQ': 10,
              'FLAG1': True,
              'A1': 'some data',
              'A2': ['data1', 'data2'],
              ColumnKeyConstants.QUALITY: 2,
              ColumnKeyConstants.FILTER: ['PASS']
          }),
      vcfio.VariantCall(
          sample_id=hash_name('Sample3'),
          genotype=[1, 1],
          info={
              'A1': 'some data2',
              'A3': ['data3', 'data4'],
              ColumnKeyConstants.QUALITY: 20,
              ColumnKeyConstants.FILTER: ['q10']
          }),
      vcfio.VariantCall(
          sample_id=hash_name('Sample4'),
          genotype=[1, 0],
          info={
              'GQ': 20,
              'A1': 'some data2',
              'A3': ['data3', 'data4'],
              ColumnKeyConstants.QUALITY: 20,
              ColumnKeyConstants.FILTER: ['q10']
          }),
  ]
  self.assertEqual(expected_merged_calls, merged_variant.calls)

  # No variant-level info keys should remain. keys() is wrapped in list()
  # because on Python 3 it returns a view object, which never compares equal
  # to a list, so the bare comparison would fail even for an empty mapping.
  self.assertEqual([], list(merged_variant.info.keys()))
def test_get_merged_variants_move_quality_and_filter_to_calls(self):
  """Checks merging when only quality and filter are copied to calls."""
  strategy = move_to_calls_strategy.MoveToCallsStrategy(
      info_keys_to_move_to_calls_regex='',
      copy_quality_to_calls=True,
      copy_filter_to_calls=True)
  variants = self._get_sample_variants()

  # Test single variant merge.
  single_merged_variant = strategy.get_merged_variants([variants[0]])[0]
  expected_single_calls = [
      vcfio.VariantCall(
          name='Sample1',
          genotype=[0, 1],
          info={
              'GQ': 20,
              'HQ': [10, 20],
              ColumnKeyConstants.QUALITY: 2,
              ColumnKeyConstants.FILTER: ['PASS']
          }),
      vcfio.VariantCall(
          name='Sample2',
          genotype=[1, 0],
          info={
              'GQ': 10,
              'FLAG1': True,
              ColumnKeyConstants.QUALITY: 2,
              ColumnKeyConstants.FILTER: ['PASS']
          }),
  ]
  self.assertEqual(expected_single_calls, single_merged_variant.calls)

  # Test multiple variant merge.
  merged_variant = strategy.get_merged_variants(variants)[0]
  self._assert_common_expected_merged_fields(merged_variant)
  expected_merged_calls = [
      vcfio.VariantCall(
          name='Sample1',
          genotype=[0, 1],
          info={
              'GQ': 20,
              'HQ': [10, 20],
              ColumnKeyConstants.QUALITY: 2,
              ColumnKeyConstants.FILTER: ['PASS']
          }),
      vcfio.VariantCall(
          name='Sample2',
          genotype=[1, 0],
          info={
              'GQ': 10,
              'FLAG1': True,
              ColumnKeyConstants.QUALITY: 2,
              ColumnKeyConstants.FILTER: ['PASS']
          }),
      vcfio.VariantCall(
          name='Sample3',
          genotype=[1, 1],
          info={
              ColumnKeyConstants.QUALITY: 20,
              ColumnKeyConstants.FILTER: ['q10']
          }),
      vcfio.VariantCall(
          name='Sample4',
          genotype=[1, 0],
          info={
              'GQ': 20,
              ColumnKeyConstants.QUALITY: 20,
              ColumnKeyConstants.FILTER: ['q10']
          }),
  ]
  self.assertEqual(expected_merged_calls, merged_variant.calls)

  self.assertItemsEqual(['A1', 'A2', 'A3'], merged_variant.info.keys())
  # Either of the two A1 values is accepted. assertIn (rather than
  # assertTrue(x in ...)) reports the actual value when the check fails.
  self.assertIn(merged_variant.info['A1'].data, ('some data', 'some data2'))
  self.assertEqual(vcfio.VariantInfo(['data1', 'data2'], '2'),
                   merged_variant.info['A2'])
  self.assertEqual(vcfio.VariantInfo(['data3', 'data4'], '2'),
                   merged_variant.info['A3'])