def test_create_processed_variant_annotation_alt_allele_num(self):
  csq_info = parser._Info(
      id=None, num='.', type=None,
      desc='some desc Allele|Consequence|IMPACT|ALLELE_NUM',
      source=None, version=None)
  header_fields = vcf_header_io.VcfHeader(infos={'CSQ': csq_info})
  variant = vcfio.Variant(
      reference_name='19', start=11, end=12, reference_bases='C',
      # The following represent a SNV and an insertion, resp.
      alternate_bases=['T', 'CT'],
      names=['rs1'], quality=2, filters=['PASS'],
      # Note that in the minimal mode of VEP, 'T' is an ambiguous annotation
      # ALT because it can map to either the 'T' SNV or the 'CT' insertion.
      # But because there is ALLELE_NUM there should be no ambiguity.
      # The last four annotations have incorrect ALLELE_NUMs.
      info={'CSQ': vcfio.VariantInfo(
          data=['T|C1|I1|1', 'T|C2|I2|2', 'T|C3|I3|0', 'T|C4|I4|3',
                'T|C5|I5|TEST', 'T|C6|I6|'],
          field_count='.')})
  counter_factory = _CounterSpyFactory()
  factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      split_alternate_allele_info_fields=True,
      annotation_fields=['CSQ'],
      use_allele_num=True,
      minimal_match=True,  # This should be ignored by the factory method.
      counter_factory=counter_factory)
  proc_var = factory.create_processed_variant(variant)
  alt1 = processed_variant.AlternateBaseData('T')
  alt1._info = {
      'CSQ': [{
          processed_variant._ANNOTATION_ALT: 'T',
          'Consequence': 'C1', 'IMPACT': 'I1', 'ALLELE_NUM': '1'}]}
  alt2 = processed_variant.AlternateBaseData('CT')
  alt2._info = {
      'CSQ': [{
          processed_variant._ANNOTATION_ALT: 'T',
          'Consequence': 'C2', 'IMPACT': 'I2', 'ALLELE_NUM': '2'}]}
  self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
  self.assertNotIn('CSQ', proc_var.non_alt_info)
  self.assertEqual(
      counter_factory.counter_map[CEnum.VARIANT.value].get_value(), 1)
  self.assertEqual(
      counter_factory.counter_map[
          CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 2)
  self.assertEqual(
      counter_factory.counter_map[
          CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
  self.assertEqual(
      counter_factory.counter_map[
          CEnum.ANNOTATION_ALT_MINIMAL_MATCH.value].get_value(), 0)
  self.assertEqual(
      counter_factory.counter_map[
          CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value].get_value(), 0)
  self.assertEqual(
      counter_factory.counter_map[
          CEnum.ALLELE_NUM_INCORRECT.value].get_value(), 4)
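# A minimal sketch (not the library's implementation) of the ALLELE_NUM
# matching behavior the test above asserts: ALLELE_NUM is a 1-based index
# into alternate_bases, and any value that does not parse or is out of range
# counts as incorrect. The helper name `_match_by_allele_num` is
# hypothetical.
def _match_by_allele_num(alternate_bases, allele_num_str):
  """Returns the matched ALT, or None if ALLELE_NUM is incorrect."""
  try:
    index = int(allele_num_str)  # 'TEST' and '' raise ValueError.
  except ValueError:
    return None
  if not 1 <= index <= len(alternate_bases):  # Rejects '0' and '3' here.
    return None
  return alternate_bases[index - 1]

# For the variant above only '1' and '2' match ('T' and 'CT' respectively);
# '0', '3', 'TEST' and '' are the four ALLELE_NUM_INCORRECT cases.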
def test_create_processed_variant_annotation_alt_long_prefix(self):
  # The returned variant is ignored as we create a custom one next.
  _, header_fields = self._get_sample_variant_and_header_with_csq()
  variant = vcfio.Variant(
      reference_name='19', start=11, end=12, reference_bases='CC',
      alternate_bases=['CCT', 'CCC', 'CCCC'],
      names=['rs1'], quality=2, filters=['PASS'],
      info={'CSQ': vcfio.VariantInfo(
          data=['CT|C1|I1|S1|G1', 'CC|C2|I2|S2|G2', 'CCC|C3|I3|S3|G3'],
          field_count='.')})
  counter_factory = _CounterSpyFactory()
  factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      split_alternate_allele_info_fields=True,
      annotation_fields=['CSQ'],
      counter_factory=counter_factory)
  proc_var = factory.create_processed_variant(variant)
  alt1 = processed_variant.AlternateBaseData('CCT')
  alt1._info = {
      'CSQ': [{
          processed_variant._ANNOTATION_ALT: 'CT',
          'Consequence': 'C1', 'IMPACT': 'I1',
          'SYMBOL': 'S1', 'Gene': 'G1'}]}
  alt2 = processed_variant.AlternateBaseData('CCC')
  alt2._info = {
      'CSQ': [{
          processed_variant._ANNOTATION_ALT: 'CC',
          'Consequence': 'C2', 'IMPACT': 'I2',
          'SYMBOL': 'S2', 'Gene': 'G2'}]}
  alt3 = processed_variant.AlternateBaseData('CCCC')
  alt3._info = {
      'CSQ': [{
          processed_variant._ANNOTATION_ALT: 'CCC',
          'Consequence': 'C3', 'IMPACT': 'I3',
          'SYMBOL': 'S3', 'Gene': 'G3'}]}
  self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3])
  self.assertNotIn('CSQ', proc_var.non_alt_info)
  self.assertEqual(
      counter_factory.counter_map[CEnum.VARIANT.value].get_value(), 1)
  self.assertEqual(
      counter_factory.counter_map[
          CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 3)
  self.assertEqual(
      counter_factory.counter_map[
          CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
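# A hedged sketch of the non-minimal VEP allele convention this test relies
# on, inferred from the test data: when REF and every ALT share the same
# first base, exactly that one leading base is dropped from each ALT in the
# annotation's Allele field (an empty result would be written as '-'). The
# longest common prefix is NOT trimmed. `_vep_alleles` is a hypothetical
# name, not the project's API.
def _vep_alleles(ref, alts):
  """Returns each ALT as it is expected to appear in the Allele field."""
  if all(a[:1] == ref[:1] for a in alts):
    return [a[1:] or '-' for a in alts]
  return list(alts)

# Matches the expectations above: REF 'CC' with ALTs CCT/CCC/CCCC yields
# annotation ALTs CT/CC/CCC, even though the ALTs share the longer prefix
# 'CC' with REF.
assert _vep_alleles('CC', ['CCT', 'CCC', 'CCCC']) == ['CT', 'CC', 'CCC']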
def test_create_processed_variant_annotation_alt_minimal(self):
  # The returned variant is ignored as we create a custom one next.
  _, header_fields = self._get_sample_variant_and_header_with_csq()
  variant = vcfio.Variant(
      reference_name='19', start=11, end=12, reference_bases='CC',
      # The following represent a SNV, an insertion, and a deletion, resp.
      alternate_bases=['CT', 'CCT', 'C'],
      names=['rs1'], quality=2, filters=['PASS'],
      # Note that in the minimal mode, 'T' is an ambiguous annotation ALT
      # because it can map to either the 'CT' SNV or the 'CCT' insertion.
      # It is not ambiguous in the non-minimal mode (it only maps to `CT`).
      info={'CSQ': vcfio.VariantInfo(
          data=['T|C1|I1|S1|G1', '-|C2|I2|S2|G2'],
          field_count='.')})
  counter_factory = _CounterSpyFactory()
  factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      split_alternate_allele_info_fields=True,
      annotation_fields=['CSQ'],
      minimal_match=True,
      counter_factory=counter_factory)
  proc_var = factory.create_processed_variant(variant)
  alt1 = processed_variant.AlternateBaseData('CT')
  alt1._info = {}
  alt2 = processed_variant.AlternateBaseData('CCT')
  alt2._info = {
      'CSQ': [{
          processed_variant._ANNOTATION_ALT: 'T',
          processed_variant._ANNOTATION_ALT_AMBIGUOUS: True,
          'Consequence': 'C1', 'IMPACT': 'I1',
          'SYMBOL': 'S1', 'Gene': 'G1'}]}
  alt3 = processed_variant.AlternateBaseData('C')
  alt3._info = {
      'CSQ': [{
          processed_variant._ANNOTATION_ALT: '-',
          processed_variant._ANNOTATION_ALT_AMBIGUOUS: False,
          'Consequence': 'C2', 'IMPACT': 'I2',
          'SYMBOL': 'S2', 'Gene': 'G2'}]}
  self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3])
  self.assertNotIn('CSQ', proc_var.non_alt_info)
  self.assertEqual(
      counter_factory.counter_map[CEnum.VARIANT.value].get_value(), 1)
  self.assertEqual(
      counter_factory.counter_map[
          CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 0)
  self.assertEqual(
      counter_factory.counter_map[
          CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
  self.assertEqual(
      counter_factory.counter_map[
          CEnum.ANNOTATION_ALT_MINIMAL_MATCH.value].get_value(), 2)
  self.assertEqual(
      counter_factory.counter_map[
          CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value].get_value(), 1)
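# A hedged sketch of the VEP "minimal mode" reduction the test above relies
# on: shared leading/trailing bases between REF and ALT are trimmed, and an
# empty remainder is written as '-'. The helper name `_minimal_alt` is
# hypothetical, not the project's API.
def _minimal_alt(ref, alt):
  """Returns the minimal-mode representation of `alt` against `ref`."""
  # Trim the common suffix first, then the common prefix.
  while ref and alt and ref[-1] == alt[-1]:
    ref, alt = ref[:-1], alt[:-1]
  while ref and alt and ref[0] == alt[0]:
    ref, alt = ref[1:], alt[1:]
  return alt or '-'

# With REF 'CC', both 'CT' and 'CCT' reduce to 'T', which is why the
# annotation ALT 'T' above is ambiguous, while 'C' reduces to '-' uniquely.
assert _minimal_alt('CC', 'CT') == 'T'
assert _minimal_alt('CC', 'CCT') == 'T'
assert _minimal_alt('CC', 'C') == '-'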
def run(argv=None):
  # type: (List[str]) -> None
  """Runs VCF to BigQuery pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)

  if known_args.auto_flags_experiment:
    _get_input_dimensions(known_args, pipeline_args)

  annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args)

  all_patterns = ([annotated_vcf_pattern] if annotated_vcf_pattern
                  else known_args.all_patterns)

  variant_merger = _get_variant_merge_strategy(known_args)
  pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)

  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  avro_root_path = _get_avro_root_path(beam_pipeline_options)

  # Starts a pipeline to merge VCF headers in beam if the total files that
  # match the input pattern exceeds _SMALL_DATA_THRESHOLD.
  _merge_headers(known_args, pipeline_args, pipeline_mode, avro_root_path,
                 annotated_vcf_pattern)

  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on
  # input. See https://issues.apache.org/jira/browse/BEAM-2801.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      known_args.split_alternate_allele_info_fields,
      known_args.allow_malformed_records,
      known_args.annotation_fields,
      known_args.use_allele_num,
      known_args.minimal_vep_alt_matching,
      known_args.infer_annotation_types,
      counter_factory)

  schema = schema_converter.generate_schema_from_header_fields(
      header_fields, processed_variant_factory, variant_merger,
      known_args.use_1_based_coordinate, known_args.include_call_name)

  sharding = variant_sharding.VariantSharding(known_args.sharding_config_path)
  if sharding.should_keep_shard(sharding.get_residual_index()):
    num_shards = sharding.get_num_shards()
  else:
    num_shards = sharding.get_num_shards() - 1

  if known_args.update_schema_on_append:
    for i in range(num_shards):
      table_suffix = sharding.get_output_table_suffix(i)
      table_name = bigquery_util.compose_table_name(
          known_args.output_table, table_suffix)
      bigquery_util.update_bigquery_schema_on_append(
          schema.fields, table_name)

  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(
      all_patterns, pipeline, known_args, pipeline_mode,
      use_1_based_coordinate=known_args.use_1_based_coordinate)
  if known_args.allow_malformed_records:
    variants |= 'DropMalformedRecords' >> filter_variants.FilterVariants()
  sharded_variants = variants | 'ShardVariants' >> beam.Partition(
      shard_variants.ShardVariants(sharding), sharding.get_num_shards())

  variants = []
  for i in range(num_shards):
    suffix = sharding.get_output_table_suffix(i)
    # Convert tuples to list.
    variants.append(sharded_variants[i])
    if variant_merger:
      variants[i] |= ('MergeVariants' + suffix >>
                      merge_variants.MergeVariants(variant_merger))
    variants[i] |= (
        'ProcessVariants' + suffix >>
        beam.Map(processed_variant_factory.create_processed_variant).
        with_output_types(processed_variant.ProcessedVariant))
    _ = (variants[i]
         | 'VariantToAvro' + suffix >>
         variant_to_avro.VariantToAvroFiles(
             avro_root_path + suffix,
             schema,
             allow_incompatible_records=known_args.allow_incompatible_records,
             omit_empty_sample_calls=known_args.omit_empty_sample_calls,
             null_numeric_value_replacement=(
                 known_args.null_numeric_value_replacement),
             include_call_name=known_args.include_call_name))

  result = pipeline.run()
  try:
    state = result.wait_until_finish()
    if state != beam.runners.runner.PipelineState.DONE:
      logging.error(
          'Dataflow pipeline terminated in an unexpected state: %s', state)
      raise AssertionError(
          'Dataflow pipeline terminated in {} state'.format(state))
  except Exception as e:
    logging.error('Dataflow pipeline failed.')
    raise e
  else:
    logging.info('Dataflow pipeline finished successfully.')
    metrics_util.log_all_counters(result)

  # After the pipeline is done, create the output tables and load the AVRO
  # files into them.
  schema_file = _write_schema_to_temp_file(schema, avro_root_path)
  suffixes = []
  try:
    for i in range(num_shards):
      suffixes.append(sharding.get_output_table_suffix(i))
      partition_range_end = sharding.get_output_table_partition_range_end(i)
      if not known_args.append:
        table_name = bigquery_util.compose_table_name(
            known_args.output_table, suffixes[i])
        partitioning.create_bq_table(
            table_name, schema_file,
            bigquery_util.ColumnKeyConstants.START_POSITION,
            partition_range_end)
        _record_newly_created_table(table_name)
        logging.info('Integer range partitioned table %s was created.',
                     table_name)
    if not known_args.append:
      _record_newly_created_table(
          sample_info_table_schema_generator.create_sample_info_table(
              known_args.output_table))
    suffixes.append(
        sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
    load_avro = avro_util.LoadAvro(
        avro_root_path, known_args.output_table, suffixes, False)
    not_empty_variant_suffixes = load_avro.start_loading()
    logging.info('The following tables were loaded with at least 1 row:')
    for suffix in not_empty_variant_suffixes:
      logging.info(
          bigquery_util.compose_table_name(known_args.output_table, suffix))
    # Remove the sample_info table from both lists to avoid duplicating it
    # when the --sample_lookup_optimized_output_table flag is set.
    suffixes.remove(
        sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
    if (sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX in
        not_empty_variant_suffixes):
      not_empty_variant_suffixes.remove(
          sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
  except Exception as e:
    logging.error('Something unexpected happened during the loading of AVRO '
                  'files to BigQuery: %s', str(e))
    logging.info('Since the write to BigQuery stage failed, we did not delete '
                 'AVRO files in your GCS bucket. You can manually import them '
                 'to BigQuery. To avoid extra storage charges, delete them if '
                 'you do not need them; AVRO files are located at: %s',
                 avro_root_path)
    raise e
  else:
    logging.warning('All AVRO files were successfully loaded to BigQuery.')
    if known_args.keep_intermediate_avro_files:
      logging.info('Since the "--keep_intermediate_avro_files" flag is set, '
                   'the AVRO files are kept and stored at: %s', avro_root_path)
    else:
      if bigquery_util.delete_gcs_files(avro_root_path) != 0:
        logging.error('Deletion of intermediate AVRO files located at "%s" '
                      'has failed.', avro_root_path)

  if known_args.sample_lookup_optimized_output_table:
    flatten_call_column = partitioning.FlattenCallColumn(
        known_args.output_table, not_empty_variant_suffixes, known_args.append)
    try:
      flatten_schema_file = tempfile.mkstemp(suffix=_BQ_SCHEMA_FILE_SUFFIX)[1]
      if not flatten_call_column.get_flatten_table_schema(flatten_schema_file):
        raise ValueError('Failed to extract the schema of the flatten table.')
      # Create the output flatten tables if needed.
      if not known_args.append:
        # Create all sample optimized tables, including those that will be
        # empty.
        for suffix in suffixes:
          output_table_id = bigquery_util.compose_table_name(
              known_args.sample_lookup_optimized_output_table, suffix)
          partitioning.create_bq_table(
              output_table_id, flatten_schema_file,
              bigquery_util.ColumnKeyConstants.CALLS_SAMPLE_ID,
              partitioning.MAX_RANGE_END)
          _record_newly_created_table(output_table_id)
          logging.info('Sample lookup optimized table %s was created.',
                       output_table_id)
      # Copy from the variant lookup tables to the flattened sample lookup
      # tables. Note: this uses WRITE_TRUNCATE to overwrite the existing
      # tables (issue #607).
      flatten_call_column.copy_to_flatten_table(
          known_args.sample_lookup_optimized_output_table)
      logging.info('All sample lookup optimized tables are fully loaded.')
    except Exception as e:
      logging.error('Something unexpected happened while loading rows into '
                    'the sample optimized tables: %s', str(e))
      raise e
def run(argv=None):
  # type: (List[str]) -> None
  """Runs VCF to BigQuery pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args)

  input_pattern = annotated_vcf_pattern or known_args.input_pattern
  variant_merger = _get_variant_merge_strategy(known_args)
  pipeline_mode = pipeline_common.get_pipeline_mode(
      input_pattern, known_args.optimize_for_large_inputs)

  # Starts a pipeline to merge VCF headers in beam if the total files that
  # match the input pattern exceeds _SMALL_DATA_THRESHOLD.
  _merge_headers(known_args.input_pattern, known_args, pipeline_args,
                 pipeline_mode, annotated_vcf_pattern)

  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on
  # input. See https://issues.apache.org/jira/browse/BEAM-2801.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      known_args.split_alternate_allele_info_fields,
      known_args.allow_malformed_records,
      known_args.annotation_fields,
      known_args.use_allele_num,
      known_args.minimal_vep_alt_matching,
      known_args.infer_annotation_types,
      counter_factory)

  partitioner = None
  if ((known_args.optimize_for_large_inputs and variant_merger) or
      known_args.partition_config_path):
    partitioner = variant_partition.VariantPartition(
        known_args.partition_config_path)

  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(input_pattern, pipeline, known_args, pipeline_mode)
  variants |= 'FilterVariants' >> filter_variants.FilterVariants(
      reference_names=known_args.reference_names)
  if partitioner:
    num_partitions = partitioner.get_num_partitions()
    partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
        partition_variants.PartitionVariants(partitioner), num_partitions)
    variants = []
    for i in range(num_partitions):
      if partitioner.should_keep_partition(i):
        variants.append(partitioned_variants[i])
      else:
        num_partitions -= 1
  else:
    # By default we don't partition the data, so we have only 1 partition.
    num_partitions = 1
    variants = [variants]

  for i in range(num_partitions):
    if variant_merger:
      variants[i] |= ('MergeVariants' + str(i) >>
                      merge_variants.MergeVariants(variant_merger))
    variants[i] |= (
        'ProcessVariants' + str(i) >>
        beam.Map(processed_variant_factory.create_processed_variant).
        with_output_types(processed_variant.ProcessedVariant))
  if partitioner and partitioner.should_flatten():
    variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
    num_partitions = 1

  if known_args.output_table:
    for i in range(num_partitions):
      table_suffix = ''
      if partitioner and partitioner.get_partition_name(i):
        table_suffix = '_' + partitioner.get_partition_name(i)
      table_name = known_args.output_table + table_suffix
      _ = (variants[i]
           | 'VariantToBigQuery' + table_suffix >>
           variant_to_bigquery.VariantToBigQuery(
               table_name,
               header_fields,
               variant_merger,
               processed_variant_factory,
               append=known_args.append,
               update_schema_on_append=known_args.update_schema_on_append,
               allow_incompatible_records=(
                   known_args.allow_incompatible_records),
               omit_empty_sample_calls=known_args.omit_empty_sample_calls,
               num_bigquery_write_shards=(
                   known_args.num_bigquery_write_shards),
               null_numeric_value_replacement=(
                   known_args.null_numeric_value_replacement)))

  if known_args.output_avro_path:
    # TODO(bashir2): Add an integration test that outputs to Avro files and
    # also imports to BigQuery. Then import those Avro outputs using the bq
    # tool and verify that the two tables are identical.
    _ = (variants
         | 'FlattenToOnePCollection' >> beam.Flatten()
         | 'VariantToAvro' >> variant_to_avro.VariantToAvroFiles(
             known_args.output_avro_path,
             header_fields,
             processed_variant_factory,
             variant_merger=variant_merger,
             allow_incompatible_records=(
                 known_args.allow_incompatible_records),
             omit_empty_sample_calls=known_args.omit_empty_sample_calls,
             null_numeric_value_replacement=(
                 known_args.null_numeric_value_replacement)))

  result = pipeline.run()
  result.wait_until_finish()
  metrics_util.log_all_counters(result)
def _get_processed_variant(variant, header_num_dict=None):
  header_fields = vcf_header_util.make_header(header_num_dict or {})
  return processed_variant.ProcessedVariantFactory(
      header_fields).create_processed_variant(variant)
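# A hypothetical usage of the helper above, assuming `make_header` maps INFO
# field IDs to their VCF Number strings (e.g. 'A' for per-ALT fields):
#
#   variant = vcfio.Variant(reference_name='19', start=11, end=12,
#                           reference_bases='C', alternate_bases=['T'],
#                           info={'AF': [0.5]})
#   proc_var = _get_processed_variant(variant, header_num_dict={'AF': 'A'})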
def run(argv=None):
  # type: (List[str]) -> None
  """Runs VCF to BigQuery pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = vcf_to_bq_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  # Note VepRunner creates new input files, so it should be run before any
  # other access to known_args.input_pattern.
  if known_args.run_annotation_pipeline:
    runner = vep_runner.create_runner_and_update_args(known_args,
                                                      pipeline_args)
    runner.run_on_all_files()
    runner.wait_until_done()
    logging.info('Using VEP processed files: %s', known_args.input_pattern)

  variant_merger = _get_variant_merge_strategy(known_args)
  pipeline_mode = vcf_to_bq_common.get_pipeline_mode(
      known_args.input_pattern, known_args.optimize_for_large_inputs)

  # Starts a pipeline to merge VCF headers in beam if the total files that
  # match the input pattern exceeds _SMALL_DATA_THRESHOLD.
  _merge_headers(known_args, pipeline_args, pipeline_mode)

  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on
  # input. See https://issues.apache.org/jira/browse/BEAM-2801.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      known_args.split_alternate_allele_info_fields,
      known_args.annotation_fields,
      known_args.use_allele_num,
      known_args.minimal_vep_alt_matching,
      counter_factory)

  partitioner = None
  if known_args.optimize_for_large_inputs or known_args.partition_config_path:
    partitioner = variant_partition.VariantPartition(
        known_args.partition_config_path)

  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(pipeline, known_args)
  variants |= 'FilterVariants' >> filter_variants.FilterVariants(
      reference_names=known_args.reference_names)
  if partitioner:
    num_partitions = partitioner.get_num_partitions()
    partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
        partition_variants.PartitionVariants(partitioner), num_partitions)
    variants = []
    for i in range(num_partitions):
      if partitioner.should_keep_partition(i):
        variants.append(partitioned_variants[i])
      else:
        num_partitions -= 1
  else:
    # By default we don't partition the data, so we have only 1 partition.
    num_partitions = 1
    variants = [variants]

  for i in range(num_partitions):
    if variant_merger:
      variants[i] |= ('MergeVariants' + str(i) >>
                      merge_variants.MergeVariants(variant_merger))
    variants[i] |= (
        'ProcessVariants' + str(i) >>
        beam.Map(processed_variant_factory.create_processed_variant).
        with_output_types(processed_variant.ProcessedVariant))
  if partitioner and partitioner.should_flatten():
    variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
    num_partitions = 1

  for i in range(num_partitions):
    table_suffix = ''
    if partitioner and partitioner.get_partition_name(i):
      table_suffix = '_' + partitioner.get_partition_name(i)
    table_name = known_args.output_table + table_suffix
    _ = (variants[i]
         | 'VariantToBigQuery' + table_suffix >>
         variant_to_bigquery.VariantToBigQuery(
             table_name,
             header_fields,
             variant_merger,
             processed_variant_factory,
             append=known_args.append,
             update_schema_on_append=known_args.update_schema_on_append,
             allow_incompatible_records=known_args.allow_incompatible_records,
             omit_empty_sample_calls=known_args.omit_empty_sample_calls,
             num_bigquery_write_shards=known_args.num_bigquery_write_shards))

  result = pipeline.run()
  result.wait_until_finish()
  metrics_util.log_all_counters(result)
def test_create_processed_variant_symbolic_and_breakend_annotation_alt(
    self):
  # The returned variant is ignored as we create a custom one next.
  _, header_fields = self._get_sample_variant_and_header_with_csq()
  variant = vcfio.Variant(
      reference_name='19', start=11, end=12, reference_bases='C',
      alternate_bases=['<SYMBOLIC>', '[13:123457[.', 'C[10:10357[.'],
      names=['rs1'], quality=2, filters=['PASS'],
      info={'CSQ': [
          'SYMBOLIC|C1|I1|S1|G1',
          '[13|C2|I2|S2|G2',
          'C[10|C3|I3|S3|G3',
          'C[1|C3|I3|S3|G3']})  # The last one does not match any alts.
  counter_factory = _CounterSpyFactory()
  factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      split_alternate_allele_info_fields=True,
      annotation_fields=['CSQ'],
      counter_factory=counter_factory)
  proc_var = factory.create_processed_variant(variant)
  alt1 = processed_variant.AlternateBaseData('<SYMBOLIC>')
  alt1._info = {
      'CSQ': [{
          annotation_parser.ANNOTATION_ALT: 'SYMBOLIC',
          'Consequence': 'C1', 'IMPACT': 'I1',
          'SYMBOL': 'S1', 'Gene': 'G1'}]}
  alt2 = processed_variant.AlternateBaseData('[13:123457[.')
  alt2._info = {
      'CSQ': [{
          annotation_parser.ANNOTATION_ALT: '[13',
          'Consequence': 'C2', 'IMPACT': 'I2',
          'SYMBOL': 'S2', 'Gene': 'G2'}]}
  alt3 = processed_variant.AlternateBaseData('C[10:10357[.')
  alt3._info = {
      'CSQ': [{
          annotation_parser.ANNOTATION_ALT: 'C[10',
          'Consequence': 'C3', 'IMPACT': 'I3',
          'SYMBOL': 'S3', 'Gene': 'G3'}]}
  self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3])
  self.assertNotIn('CSQ', proc_var.non_alt_info)
  self.assertEqual(
      counter_factory.counter_map[CEnum.VARIANT.value].get_value(), 1)
  self.assertEqual(
      counter_factory.counter_map[
          CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 3)
  self.assertEqual(
      counter_factory.counter_map[
          CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 1)
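# A hedged sketch of the matching convention the test above exercises: for a
# symbolic ALT like '<SYMBOLIC>' the annotation ALT is the ID without the
# angle brackets, and for a breakend ALT like 'C[10:10357[.' it is the
# portion before the first ':'. `_annotation_alt_key` is a hypothetical name.
def _annotation_alt_key(alt):
  """Returns the form of `alt` expected in the annotation's Allele field."""
  if alt.startswith('<') and alt.endswith('>'):
    return alt[1:-1]  # '<SYMBOLIC>' -> 'SYMBOLIC'
  return alt.split(':', 1)[0]  # '[13:123457[.' -> '[13'

assert _annotation_alt_key('<SYMBOLIC>') == 'SYMBOLIC'
assert _annotation_alt_key('[13:123457[.') == '[13'
assert _annotation_alt_key('C[10:10357[.') == 'C[10'
# 'C[1' equals none of these keys, hence the one ANNOTATION_ALT_MISMATCH.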