def test_eq(self):
    """Check that ``Record.__eq__`` is sensitive to each constructor field.

    A reference record is compared against an identically-built record
    (expected equal) and against records that differ from it in exactly one
    positional constructor argument (expected unequal).

    The comparisons deliberately use ``==`` inside ``assertTrue`` /
    ``assertFalse`` so that ``__eq__`` itself is exercised.
    """
    def make_record(**overrides):
        # Build fresh argument objects on every call so that no mutable
        # state is shared between the records being compared.
        fields = {
            'variant': Variant("1", 20, "A", "G"),
            'ids': set(),
            'quality': 0.0,
            'filters': set(),
            'info': InfoData(None, {}),
            'samples': SampleData([], []),
            'from_multi_alt': False,
        }
        fields.update(overrides)
        return Record(
            None,
            fields['variant'],
            fields['ids'],
            fields['quality'],
            fields['filters'],
            fields['info'],
            fields['samples'],
            fields['from_multi_alt'],
        )

    reference = make_record()
    self.assertTrue(reference == make_record())

    # NOTE: set literals are used for the id / filter cases.  ``set("rs0")``
    # iterates the string and yields {'r', 's', '0'}, which is not the
    # single-identifier set this case is meant to describe.
    differing_records = [
        ("variant", make_record(variant=Variant("2", 20, "A", "G"))),
        ("ids", make_record(ids={"rs0"})),
        ("quality", make_record(quality=5.0)),
        ("filters", make_record(filters={"CV"})),
        ("info", make_record(info=InfoData(None, {'AF': []}))),
        ("samples", make_record(samples=SampleData([], ['NA12787']))),
        ("from_multi_alt", make_record(from_multi_alt=True)),
    ]
    for changed_field, other in differing_records:
        self.assertFalse(
            reference == other,
            "records differing in {} compared equal".format(changed_field))
def test_should_format_multiple_values(self):
    """Several INFO keys of mixed types serialise into one ';'-joined string.

    The input mapping lists the keys as K3, K2, K, K4; the expected output
    lists them as K, K2, K3, K4, with the flag ``K3`` rendered without a
    value.
    """
    schema = Schema()
    declarations = (
        ('K', 'A', 'Float'),
        ('K2', 'A', 'String'),
        ('K3', '0', 'Flag'),
        ('K4', 'A', 'String'),
    )
    for key, number, value_type in declarations:
        schema.set_info_data(key, number, value_type, 'K')

    # Keep this insertion order: it intentionally differs from the order
    # expected in the formatted output.
    raw_values = {
        'K3': None,
        'K2': ['S2'],
        'K': [1.0, 2.66, 3.0],
        'K4': ['S4'],
    }
    formatted = InfoData(schema, raw_values).to_vcf()

    self.assertEqual('K=1.0,2.66,3.0;K2=S2;K3;K4=S4', formatted)
def test_doesnt_give_a_flying_damn_about_spurious_info(self):
    """Genotyping still yields a 1/1 call when the input VCF has junk INFO.

    A genotyping VCF is written containing a single variant whose INFO
    field holds a key/value pair made of punctuation noise.  The driver is
    then run with that file as its genotype alleles and the output VCF is
    expected to contain the variant with genotype ``1/1`` for the sample.
    """
    chrom = "22"
    variant = Variant(chrom, 11, "A", "C")

    # Genotyping input: one record carrying a nonsensical INFO entry.
    genotype_vcf = VCFBuilder(join(self.work_dir, "genotype.vcf"))
    junk_info = InfoData(None, {"#f$@$e%$%^&k**()7!": ["#o$@$f%$%f^&**()7!"]})
    genotype_vcf.with_record_from_variant(variant, info=junk_info)
    genotype_vcf.build().index()

    driver = SVCDriver(self)
    sample_name = "bobs_your_uncle"

    # Each ``with_*`` call is applied to the value returned by the previous
    # one, exactly as in a fluent chain.
    configured = driver.with_ref_sequence(
        "ACGCCCCCTGCAAAAAAAAAA", chrom=chrom, pos_from=0)
    configured = configured.with_read(
        "...........C.........",
        n_fwd=5,
        n_rev=5,
        chrom=chrom,
        sample_name=sample_name)
    configured.with_genotype_alleles(genotype_vcf.compressed_filename)

    expect = driver.call(expected_success=True)

    vcf_expect = expect.with_output_vcf()
    record_expect = vcf_expect.has_record_for_variant(variant)
    record_expect.with_sample(sample_name).has_genotype("1/1")
def generate_record_from_variant(self, variant, **kwargs):
    """Create a ``Record`` for *variant* using this object's schema.

    Default values are supplied for ``variant_id``, ``quality``,
    ``filters``, ``info``, ``sample_info`` and ``from_multi_alt``; any
    keyword argument given by the caller replaces the default of the same
    name, and unknown keywords are forwarded to ``Record`` unchanged.

    :param variant: the variant the record describes.
    :param kwargs: overrides / extra keyword arguments for ``Record``.
    :return: the newly constructed ``Record``.
    """
    schema = self.schema
    record_fields = dict(
        variant_id=set(),
        quality=None,
        filters=set(),
        info=InfoData(schema, {}),
        sample_info=SampleData(
            [name for name, _unused in schema.iter_sample_data()],
            schema.samples,
        ),
        from_multi_alt=False,
    )
    # Caller-supplied values win over the defaults above.
    record_fields.update(kwargs)
    return Record(schema=schema, variant=variant, **record_fields)
def calls_variants(self, ref, sequence_list, candidate_ascii_haplotypes, prior, expected_ascii_haplotypes):
    """Assert that calling with a candidate-variants file yields the expected variants.

    :param ref: reference passed to ``SampleBank``.
    :param sequence_list: read sequences added for the single sample "TEST".
    :param candidate_ascii_haplotypes: ASCII haplotypes turned into the
        candidate variants written to the candidate VCF.
    :param prior: value stored under the ``AF`` INFO key of every candidate.
    :param expected_ascii_haplotypes: ASCII haplotypes turned into the
        variants the call set must equal.
    """
    bank = SampleBank(ref)
    bank.add_sample_with_seqs_and_quals("TEST", sequence_list, 1, 0)

    ascii_generator = AsciiVariantGenerator(bank.reference)
    candidates = ascii_generator.get_variants(candidate_ascii_haplotypes)
    expected = ascii_generator.get_variants(expected_ascii_haplotypes)

    # Write every candidate to a VCF, each annotated with the given prior.
    candidates_vcf = VCFBuilder(path.join(self.work_dir, "candiate_variants.vcf"))
    candidates_vcf.schema.set_info_data('AF', 'A', 'Float', 'Allele Frequency')
    for candidate in candidates:
        candidate_info = InfoData(candidates_vcf.schema, {"AF": prior})
        candidates_vcf.with_record_from_variant(candidate, info=candidate_info)
    candidates_vcf.build().index()

    caller_builder = VariantCallerBuilderFromSampleBank(bank, self.work_dir)
    caller_builder.configuration[CANDIDATE_VARIANTS_FILE_KEY] = candidates_vcf.compressed_filename

    finished_run = caller_builder.build().run()
    callset = finished_run.get_variant_callset(self)
    self.assertEqual(callset.get_variants(), set(expected))
def test_should_have_zero_bad_reads_for_candidate_variant_with_no_reads_covering_variant(self):
    """A candidate variant is absent from the output VCF in this scenario.

    One candidate (T>C at position 30 on chromosome "1", AF 0.72) is
    written to a candidate-variants VCF.  The driver is run with MNP calls
    allowed and reads for two samples, and the output VCF is expected to
    contain no record for the candidate.

    NOTE(review): the read strings below are reproduced exactly as found;
    confirm their widths against the reference if the layout matters.
    """
    chrom = "1"

    candidates_vcf = VCFBuilder(path.join(self.work_dir, "candiate_variants.vcf"))
    candidates_vcf.schema.set_info_data('AF', 'A', 'Float', 'Allele Frequency')
    variant_1 = Variant(chrom, 30, 'T', 'C')
    candidate_info = InfoData(candidates_vcf.schema, {"AF": [0.72]})
    candidates_vcf.with_record_from_variant(variant_1, info=candidate_info)
    candidates_vcf.build().index()

    # Each step is applied to the value returned by the previous one, the
    # same way the fluent chain evaluates.
    svc_driver = SVCDriver(self).with_allow_MNP_calls(True)
    svc_driver = svc_driver.with_ref_sequence(
        "TGTTATTAATCCCTTGTCAGATGTTATTAATCCCTTGTCAGTCCCTTGTCAGT",
        chrom=chrom)
    svc_driver = svc_driver.with_read(
        "...........................C.. ......................",
        n_fwd=10,
        n_rev=10,
        sample_name='sample_1')
    svc_driver = svc_driver.with_read(
        " ",
        n_fwd=10,
        n_rev=10,
        sample_name='sample_2')
    svc_driver = svc_driver.with_candidate_variants_file(
        candidates_vcf.compressed_filename)

    expect = svc_driver.call()
    output_vcf = expect.with_output_vcf()
    output_vcf.missing_record_for_variant(variant_1)
def test_should_format_a_float_list(self):
    """A list of floats is rendered as comma-separated values by ``to_vcf``.

    The key is declared with type ``'Float'`` so that the schema agrees with
    the data under test; it was previously declared ``'Integer'``, which
    duplicated the int-list test's declaration rather than describing a
    float field.
    """
    schema = Schema()
    schema.set_info_data('K', 'A', 'Float', 'K')
    info_data = InfoData(schema, {'K': [1.0, 2.66, 3.0]})
    self.assertEqual('K=1.0,2.66,3.0', info_data.to_vcf())
def test_should_format_an_int_list(self):
    """A list of ints is rendered as comma-separated values by ``to_vcf``."""
    int_schema = Schema()
    int_schema.set_info_data('K', 'A', 'Integer', 'K')

    formatted = InfoData(int_schema, {'K': [1, 2, 3]}).to_vcf()

    self.assertEqual('K=1,2,3', formatted)
def test_should_format_a_string_list(self):
    """A list of strings is rendered as comma-separated values by ``to_vcf``."""
    string_schema = Schema()
    string_schema.set_info_data('K', 'A', 'String', 'K')

    formatted = InfoData(string_schema, {'K': ['V1', 'V2']}).to_vcf()

    self.assertEqual('K=V1,V2', formatted)
def test_should_format_a_string(self):
    """A bare string value is rendered as ``key=value`` when no schema is given."""
    raw_values = {'K': 'V'}

    formatted = InfoData(None, raw_values).to_vcf()

    self.assertEqual('K=V', formatted)
def test_should_format_no_data(self):
    """An ``InfoData`` holding no entries is rendered as a single ``'.'``."""
    empty_info = InfoData(None, {})

    formatted = empty_info.to_vcf()

    self.assertEqual('.', formatted)
def test_should_format_a_present_flag(self):
    """A flag key stored with value ``None`` is rendered as the bare key."""
    flag_schema = Schema()
    flag_schema.set_info_data('F', '0', 'Flag', 'Flag')

    formatted = InfoData(flag_schema, {"F": None}).to_vcf()

    self.assertEqual('F', formatted)
def generate_records(schema, cols):
    """Yield one ``Record`` per ALT allele of a split VCF data line.

    :param schema: schema object used to look up INFO / sample metadata and
        the list of sample names.
    :param cols: the columns of one VCF data line, indexed by the ``*_COL``
        constants.
    :return: a generator of ``Record`` objects, one for each comma-separated
        entry in the ALT column.  The last ``Record`` argument is ``True``
        when the line carried more than one ALT allele.
    :raises Exception: any error raised while parsing a sample field is
        re-raised as the same exception type with a message naming the
        field and the sample.
    """
    alt_alleles = cols[ALT_COL].split(',')
    n_alts = len(alt_alleles)

    chrom = cols[CHROM_COL]
    # One is subtracted from the POS column (presumably 1-based VCF position
    # to 0-based coordinate -- confirm against Variant).
    position = int(cols[POS_COL]) - 1
    ref_allele = cols[REF_COL]
    variants = [Variant(chrom, position, ref_allele, alt) for alt in alt_alleles]

    # ---- INFO column -----------------------------------------------------
    info_per_alt = []
    if n_alts == 1:
        # With a single alt nothing needs splitting, so parsing is deferred.
        info_per_alt.append(
            DeferredInfoData(
                schema,
                lambda: defer_parse_info_field(schema, cols[INFO_COL]),
            )
        )
    else:
        # For every INFO key build a list holding one value per alt.
        values_by_key = OrderedDict()
        for key, value in parse_info_field(cols[INFO_COL]):
            try:
                info_metadata = schema.get_info_data(key)
            except KeyError:
                # Key not declared in the schema: give each alt its own
                # deferred value built from the same raw text.
                values_by_key[key] = [
                    DeferredInfoValue(schema, key, value)
                    for _ in range(n_alts)
                ]
            else:
                raw_items = value if isinstance(value, list) else value.split(',')
                values_by_key[key] = info_metadata.split_alts(
                    raw_items, n_alts=n_alts)

        # Assemble one InfoData per alt from the per-key lists.
        for alt_index in range(n_alts):
            info_dict = OrderedDict(
                (key, per_alt_values[alt_index])
                for key, per_alt_values in values_by_key.items()
            )
            info_per_alt.append(InfoData(schema, info_dict))

    # ---- FORMAT / sample columns -----------------------------------------
    try:
        sample_format = cols[FORMAT_COL].split(':')
    except IndexError:
        # No FORMAT column: every record gets ``None`` as its sample data.
        sample_per_alt = repeat(None)
    else:
        fields_by_sample = {
            sample_name: sample_field.split(':')
            for sample_name, sample_field in zip(schema.samples, cols[SAMPLE_COL:])
        }
        # A freshly split key list is handed to each SampleData.
        sample_per_alt = [
            SampleData(cols[FORMAT_COL].split(':'), schema.samples)
            for _ in alt_alleles
        ]

        for sample_name, sample_items in fields_by_sample.items():
            values_for_sample = {}
            # Genotype seen so far for this sample; it is passed on when
            # splitting the likelihood fields.
            gt = None
            for key, item in zip(sample_format, sample_items):
                try:
                    if key == GENOTYPE_KEY:
                        gt = GenotypeCall(item)
                        values = []
                        for alt_index in range(n_alts):
                            # Recode the genotype relative to this alt:
                            # missing -> '.', ref -> '0', this alt -> '1'.
                            # Alleles pointing at another alt become '0';
                            # '.' would be the natural default, but
                            # downstream tools do not cope with it.
                            recode = {None: '.', 0: '0', 1 + alt_index: '1'}
                            values.append(GenotypeCall(
                                gt.deliminator().join(
                                    recode.get(gt_index, '0') for gt_index in gt
                                )
                            ))
                    elif key in (GENOTYPE_LIKELIHOODS_KEY,
                                 GENOTYPE_PHRED_LIKELIHOODS_KEY):
                        values = schema.get_sample_data(key).split_alts(
                            item.split(','), n_alts, gt)
                    else:
                        values = schema.get_sample_data(key).split_alts(
                            item.split(','), n_alts, None)
                    values_for_sample[key] = values
                except Exception as e:
                    raise type(e)(
                        "Error parsing field {} for sample {}: {}".format(
                            key, sample_name, e))

            # Hand each alt's slice of the values to its SampleData.
            for alt_index, sample_data in enumerate(sample_per_alt):
                for key, per_alt_values in values_for_sample.items():
                    sample_data.add_sample_data(
                        sample_name, key, per_alt_values[alt_index])

    # ---- Records -----------------------------------------------------------
    is_multi_alt = n_alts > 1
    for variant, info_data, sample_data in zip(variants, info_per_alt, sample_per_alt):
        # Parsed afresh for every record so records do not share these objects.
        qual = variant_quality_from_vcf(cols[QUALITY_COL])
        ids = variant_ids_from_vcf(cols[ID_COL])
        filts = filters_from_vcf(cols[FILTER_COL])
        yield Record(schema, variant, ids, qual, filts, info_data, sample_data, is_multi_alt)