Example #1
0
    def test_eq(self):
        """Check that ``Record`` equality is sensitive to each constructor field.

        A baseline record is compared against one identical record and then
        against records that each differ from the baseline in a single field.
        """

        def make_record(chrom="1", ids=None, quality=0.0, filters=None,
                        info=None, sample_names=None, from_multi_alt=False):
            # Fresh containers are created on every call so that no two
            # records under comparison share a mutable argument.
            return Record(
                None,
                Variant(chrom, 20, "A", "G"),
                set() if ids is None else ids,
                quality,
                set() if filters is None else filters,
                InfoData(None, {} if info is None else info),
                SampleData([], [] if sample_names is None else sample_names),
                from_multi_alt,
            )

        reference = make_record()

        # Identical construction compares equal.
        self.assertTrue(reference == make_record())

        # Each record below differs from the reference in one field only.
        self.assertFalse(reference == make_record(chrom="2"))
        # set("rs0") is the set of the string's characters, not {"rs0"};
        # either way it is non-empty and so differs from the reference.
        self.assertFalse(reference == make_record(ids=set("rs0")))
        self.assertFalse(reference == make_record(quality=5.0))
        self.assertFalse(reference == make_record(filters=set("CV")))
        self.assertFalse(reference == make_record(info={'AF': []}))
        self.assertFalse(reference == make_record(sample_names=['NA12787']))
        self.assertFalse(reference == make_record(from_multi_alt=True))
Example #2
0
 def test_should_format_multiple_values(self):
     """Several INFO keys of mixed declared types render as one ';'-joined string."""
     schema = Schema()
     for definition in (
             ('K', 'A', 'Float', 'K'),
             ('K2', 'A', 'String', 'K'),
             ('K3', '0', 'Flag', 'K'),
             ('K4', 'A', 'String', 'K')):
         schema.set_info_data(*definition)

     # The values are supplied in a different key order from the one that
     # appears in the expected output string.
     values = {
         'K3': None,
         'K2': ['S2'],
         'K': [1.0, 2.66, 3.0],
         'K4': ['S4'],
     }
     rendered = InfoData(schema, values).to_vcf()
     self.assertEqual('K=1.0,2.66,3.0;K2=S2;K3;K4=S4', rendered)
Example #3
0
    def test_doesnt_give_a_flying_damn_about_spurious_info(self):
        """A junk INFO entry in the genotyping VCF must not affect the call.

        The genotype-alleles VCF holds one record whose INFO key and value are
        strings of punctuation; the call is still expected to succeed and
        report genotype "1/1" for the sample at that variant.
        """
        chrom = "22"
        variant = Variant(chrom, 11, "A", "C")

        # Build and index the VCF that supplies the alleles to genotype.
        gv_builder = VCFBuilder(join(self.work_dir, "genotype.vcf"))
        junk_info = InfoData(
            None, {"#f$@$e%$%^&k**()7!": ["#o$@$f%$%f^&**()7!"]})
        gv_builder.with_record_from_variant(variant, info=junk_info)
        built_vcf = gv_builder.build()
        built_vcf.index()

        driver = SVCDriver(self)
        dodgy_sample = "bobs_your_uncle"

        # Configure the driver step by step, in the same order as a single
        # fluent chain would.
        with_reference = driver.with_ref_sequence(
            "ACGCCCCCTGCAAAAAAAAAA", chrom=chrom, pos_from=0)
        with_reads = with_reference.with_read(
            "...........C.........",
            n_fwd=5,
            n_rev=5,
            chrom=chrom,
            sample_name=dodgy_sample)
        with_reads.with_genotype_alleles(gv_builder.compressed_filename)

        expect = driver.call(expected_success=True)
        vcf_expect = expect.with_output_vcf()
        record_expect = vcf_expect.has_record_for_variant(variant)
        sample_expect = record_expect.with_sample(dodgy_sample)
        sample_expect.has_genotype("1/1")
Example #4
0
    def generate_record_from_variant(self, variant, **kwargs):
        """Create a ``Record`` for ``variant`` bound to this object's schema.

        Default annotations (empty id and filter sets, no quality, empty info
        and sample data, ``from_multi_alt=False``) are used unless a keyword
        argument of the same name overrides them. Keyword arguments that are
        not among the defaults are passed through to ``Record`` as well.
        """
        defaults = dict(
            variant_id=set(),
            quality=None,
            filters=set(),
            info=InfoData(self.schema, {}),
            sample_info=SampleData(
                [key for key, _ in self.schema.iter_sample_data()],
                self.schema.samples),
            from_multi_alt=False,
        )
        # Caller-supplied values win over the defaults.
        annotations = {**defaults, **kwargs}

        return Record(schema=self.schema, variant=variant, **annotations)
Example #5
0
    def calls_variants(self, ref, sequence_list, candidate_ascii_haplotypes, prior, expected_ascii_haplotypes):
        """Run the variant caller with a candidate-variants file and check its calls.

        A sample bank is built from ``ref`` and ``sequence_list``; the
        candidate haplotypes are written to an indexed VCF whose records each
        carry ``prior`` under the ``AF`` INFO key; the resulting call set is
        asserted to equal the variants derived from
        ``expected_ascii_haplotypes``.
        """
        bank = SampleBank(ref)
        bank.add_sample_with_seqs_and_quals("TEST", sequence_list, 1, 0)

        generator = AsciiVariantGenerator(bank.reference)
        candidates = generator.get_variants(candidate_ascii_haplotypes)
        expected = generator.get_variants(expected_ascii_haplotypes)

        # Write every candidate into a VCF, all with the same AF value.
        candidates_vcf = VCFBuilder(
            path.join(self.work_dir, "candiate_variants.vcf"))
        candidates_vcf.schema.set_info_data(
            'AF', 'A', 'Float', 'Allele Frequency')
        for candidate in candidates:
            af_info = InfoData(candidates_vcf.schema, {"AF": prior})
            candidates_vcf.with_record_from_variant(candidate, info=af_info)
        candidates_vcf.build().index()

        # Point the caller at the candidate file and collect what it calls.
        caller_builder = VariantCallerBuilderFromSampleBank(bank, self.work_dir)
        caller_builder.configuration[CANDIDATE_VARIANTS_FILE_KEY] = \
            candidates_vcf.compressed_filename
        caller = caller_builder.build()
        callset = caller.run().get_variant_callset(self)

        self.assertEqual(callset.get_variants(), set(expected))
Example #6
0
    def test_should_have_zero_bad_reads_for_candidate_variant_with_no_reads_covering_variant(self):
        """A candidate variant that no read covers yields no output record.

        The candidate sits at position 30; the first sample's read has a gap
        there and the second sample's read is entirely blank. The output VCF
        is asserted to contain no record for the candidate.
        """
        chrom = "1"
        variant_1 = Variant(chrom, 30, 'T', 'C')

        # Candidate-variants VCF with a single record carrying an AF value.
        candidates_vcf = VCFBuilder(
            path.join(self.work_dir, "candiate_variants.vcf"))
        candidates_vcf.schema.set_info_data(
            'AF', 'A', 'Float', 'Allele Frequency')
        af_info = InfoData(candidates_vcf.schema, {"AF": [0.72]})
        candidates_vcf.with_record_from_variant(variant_1, info=af_info)
        candidates_vcf.build().index()

        reference = "TGTTATTAATCCCTTGTCAGATGTTATTAATCCCTTGTCAGTCCCTTGTCAGT"
        gapped_read = "...........................C.. ......................"
        blank_read = "                                                     "

        # Each step rebinds the name to the value returned by the builder call.
        svc_driver = SVCDriver(self).with_allow_MNP_calls(True)
        svc_driver = svc_driver.with_ref_sequence(reference, chrom=chrom)
        svc_driver = svc_driver.with_read(
            gapped_read, n_fwd=10, n_rev=10, sample_name='sample_1')
        svc_driver = svc_driver.with_read(
            blank_read, n_fwd=10, n_rev=10, sample_name='sample_2')
        svc_driver = svc_driver.with_candidate_variants_file(
            candidates_vcf.compressed_filename)

        expect = svc_driver.call()

        vcf_expect = expect.with_output_vcf()
        vcf_expect.missing_record_for_variant(variant_1)
Example #7
0
 def test_should_format_a_float_list(self):
     """A list of floats is rendered as ``K=`` followed by comma-separated values."""
     schema = Schema()
     # K is declared as Float so that the schema agrees with the float values
     # this test formats; an 'Integer' declaration would contradict both the
     # data and the test's name.
     schema.set_info_data('K', 'A', 'Float', 'K')
     info_data = InfoData(schema, {'K': [1.0, 2.66, 3.0]})
     self.assertEqual('K=1.0,2.66,3.0', info_data.to_vcf())
Example #8
0
 def test_should_format_an_int_list(self):
     """A list of ints is rendered as ``K=`` followed by comma-separated values."""
     schema = Schema()
     schema.set_info_data('K', 'A', 'Integer', 'K')
     rendered = InfoData(schema, {'K': [1, 2, 3]}).to_vcf()
     self.assertEqual('K=1,2,3', rendered)
Example #9
0
 def test_should_format_a_string_list(self):
     """A list of strings is rendered as ``K=`` followed by comma-separated values."""
     schema = Schema()
     schema.set_info_data('K', 'A', 'String', 'K')
     rendered = InfoData(schema, {'K': ['V1', 'V2']}).to_vcf()
     self.assertEqual('K=V1,V2', rendered)
Example #10
0
 def test_should_format_a_string(self):
     """A single string value, with no schema supplied, is rendered as ``K=V``."""
     rendered = InfoData(None, {'K': 'V'}).to_vcf()
     self.assertEqual('K=V', rendered)
Example #11
0
 def test_should_format_no_data(self):
     """An empty info mapping is rendered as a single ``.``."""
     rendered = InfoData(None, {}).to_vcf()
     self.assertEqual('.', rendered)
Example #12
0
 def test_should_format_a_present_flag(self):
     """A key declared as a Flag and stored with ``None`` renders as the bare key."""
     schema = Schema()
     schema.set_info_data('F', '0', 'Flag', 'Flag')
     rendered = InfoData(schema, {"F": None}).to_vcf()
     self.assertEqual('F', rendered)
Example #13
0
def generate_records(schema, cols):
    """Yield one ``Record`` per ALT allele of an already-split VCF data line.

    ``cols`` is indexed with the module's ``*_COL`` constants. The ALT column
    is split on commas and a ``Variant`` is built for each allele, using the
    integer POS value minus one. INFO and per-sample data are divided so that
    each yielded record carries only the values belonging to its own allele.

    When the FORMAT column is absent (indexing it raises ``IndexError``) every
    record is given ``None`` as its sample data.

    Any exception raised while splitting a sample field is re-raised as the
    same exception type, with a message naming the field and the sample.
    """
    alts = cols[ALT_COL].split(',')
    n_alts = len(alts)

    chrom = cols[CHROM_COL]
    position = int(cols[POS_COL]) - 1
    ref = cols[REF_COL]
    variants = [Variant(chrom, position, ref, alt) for alt in alts]

    if n_alts == 1:
        # deferred parsing is simple with a single alt
        info_data_list = [
            DeferredInfoData(
                schema,
                lambda: defer_parse_info_field(schema, cols[INFO_COL]))
        ]
    else:
        # extract and split info data into lists of length n_alts
        values_per_key = OrderedDict()
        for key, value in parse_info_field(cols[INFO_COL]):
            try:
                info_metadata = schema.get_info_data(key)
            except KeyError:
                # Key unknown to the schema: every alt gets its own deferred
                # value built from the same raw data.
                values_per_key[key] = [
                    DeferredInfoValue(schema, key, value)
                    for _ in range(n_alts)
                ]
                continue
            raw_values = value if isinstance(value, list) else value.split(',')
            values_per_key[key] = info_metadata.split_alts(
                raw_values, n_alts=n_alts)

        # construct InfoData objects from prepared info data
        info_data_list = [
            InfoData(schema, OrderedDict(
                (key, values[alt_index])
                for key, values in values_per_key.items()))
            for alt_index in range(n_alts)
        ]

    try:
        sample_format = cols[FORMAT_COL].split(':')
    except IndexError:
        sample_data_list = repeat(None)
    else:
        # extract sample format
        fields_by_sample = {
            sample_name: sample_field.split(':')
            for sample_name, sample_field
            in zip(schema.samples, cols[SAMPLE_COL:])
        }

        # Every SampleData is handed its own freshly split FORMAT list.
        sample_data_list = [
            SampleData(cols[FORMAT_COL].split(':'), schema.samples)
            for _ in alts
        ]
        likelihood_keys = (
            GENOTYPE_LIKELIHOODS_KEY, GENOTYPE_PHRED_LIKELIHOODS_KEY)

        for sample_name, sample_items in fields_by_sample.items():
            split_items = {}

            # extract data from sample fields
            gt = None
            for key, item in zip(sample_format, sample_items):
                try:
                    if key == GENOTYPE_KEY:
                        gt = GenotypeCall(item)
                        values = []
                        for alt_index in range(n_alts):
                            # Note: default value should be '.', but
                            # downstream tools aren't good enough to use it
                            allele_codes = {
                                None: '.', 0: '0', 1 + alt_index: '1'}
                            values.append(GenotypeCall(
                                gt.deliminator().join(
                                    allele_codes.get(gt_index, '0')
                                    for gt_index in gt)))
                    else:
                        # Only the likelihood fields are split with the
                        # genotype seen so far for this sample; every other
                        # field is split with None in its place.
                        genotype_arg = gt if key in likelihood_keys else None
                        values = schema.get_sample_data(key).split_alts(
                            item.split(','), n_alts, genotype_arg)
                    split_items[key] = values
                except Exception as e:
                    raise type(e)(
                        "Error parsing field {} for sample {}: {}".format(
                            key, sample_name, e))

            # distribute data to each split sample meta-data container
            for alt_index, sample_data in enumerate(sample_data_list):
                for key, values in split_items.items():
                    sample_data.add_sample_data(
                        sample_name, key, values[alt_index])

    # generate & return record objects
    for variant, info_data, sample_data in zip(
            variants, info_data_list, sample_data_list):
        # Parsed afresh for every record so that no two records share the
        # same parsed objects.
        qual = variant_quality_from_vcf(cols[QUALITY_COL])
        ids = variant_ids_from_vcf(cols[ID_COL])
        filts = filters_from_vcf(cols[FILTER_COL])
        yield Record(
            schema, variant, ids, qual, filts, info_data, sample_data,
            n_alts > 1)