Esempio n. 1
0
    def test_walk(self):
        """Exercise ``utils.walk_together`` on identical and on staggered inputs."""
        # Three readers over the same file: every step yields three equal records.
        same_file_readers = [vcf.Reader(fh('example-4.0.vcf')) for _ in range(3)]

        n = 0
        for n, x in enumerate(utils.walk_together(*same_file_readers), 1):
            assert len(x) == 3
            assert x[0] == x[1] and x[1] == x[2]
        assert n == 5

        # Artificial case: 2 from the left, 2 from the right, 2 together,
        # 1 from the right, 1 from the left.
        # Each code maps to (left record present, right record present).
        presence = {'l': (True, False), 'r': (False, True), 't': (True, True)}
        left = vcf.Reader(fh('walk_left.vcf'))
        right = vcf.Reader(fh('example-4.0.vcf'))

        for code, recs in zip('llrrttrl', utils.walk_together(left, right)):
            want_left, want_right = presence[code]
            assert (recs[0] is not None) == want_left
            assert (recs[1] is not None) == want_right
Esempio n. 2
0
    def test_walk(self):
        """Check walk_together with matching readers and with partially overlapping ones."""
        # Identical input three times over -> always a full triple of equal records.
        readers = (
            vcf.Reader(fh('example-4.0.vcf')),
            vcf.Reader(fh('example-4.0.vcf')),
            vcf.Reader(fh('example-4.0.vcf')),
        )

        seen = 0
        for triple in utils.walk_together(*readers):
            assert len(triple) == 3
            first, second, third = triple[0], triple[1], triple[2]
            assert first == second and second == third
            seen += 1
        assert seen == 5

        # Artificial case: 2 from the left, 2 from the right, 2 together,
        # 1 from the right, 1 from the left.
        left_reader = vcf.Reader(fh('walk_left.vcf'))
        right_reader = vcf.Reader(fh('example-4.0.vcf'))
        walker = utils.walk_together(left_reader, right_reader)

        for origin, pair in zip('llrrttrl', walker):
            has_left = pair[0] is not None
            has_right = pair[1] is not None
            if origin == 'l':
                assert has_left
                assert not has_right
            elif origin == 'r':
                assert has_right
                assert not has_left
            elif origin == 't':
                assert has_left
                assert has_right
Esempio n. 3
0
    def test_walk(self):
        """Check ``utils.walk_together`` for identical, staggered and multi-chromosome inputs."""
        # easy case: all same sites
        reader1 = vcf.Reader(fh('example-4.0.vcf'))
        reader2 = vcf.Reader(fh('example-4.0.vcf'))
        reader3 = vcf.Reader(fh('example-4.0.vcf'))

        n = 0
        for x in utils.walk_together(reader1, reader2, reader3):
            self.assertEqual(len(x), 3)
            self.assertEqual(x[0], x[1])
            self.assertEqual(x[1], x[2])
            n += 1
        self.assertEqual(n, 5)

        # artificial case 2 from the left, 2 from the right, 2 together, 1 from the right, 1 from the left
        expected = 'llrrttrl'
        reader1 = vcf.Reader(fh('walk_left.vcf'))
        reader2 = vcf.Reader(fh('example-4.0.vcf'))

        for ex, recs in zip(expected, utils.walk_together(reader1, reader2)):
            # unittest assertions (unlike bare ``assert``) survive ``python -O``
            if ex == 'l':
                self.assertIsNotNone(recs[0])
                self.assertIsNone(recs[1])
            if ex == 'r':
                self.assertIsNotNone(recs[1])
                self.assertIsNone(recs[0])
            if ex == 't':
                self.assertIsNotNone(recs[0])
                self.assertIsNotNone(recs[1])

        # test files with many chromosomes, set 'vcf_record_sort_key' to define chromosome order
        # Build a real list: on Python 3 map() returns an iterator, which
        # supports neither "+" with a list nor .index().
        chr_order = [str(number) for number in range(1, 30)] + ['X', 'Y', 'M']

        def get_key(r):
            # order records by position of the chromosome in chr_order, then by POS
            return (chr_order.index(r.CHROM.replace('chr', '')), r.POS)

        reader1 = vcf.Reader(fh('issue-140-file1.vcf'))
        reader2 = vcf.Reader(fh('issue-140-file2.vcf'))
        reader3 = vcf.Reader(fh('issue-140-file3.vcf'))
        expected = "66642577752767662466"  # each char is an integer bit flag - like file permissions
        for ex, recs in zip(
                expected,
                utils.walk_together(reader1,
                                    reader2,
                                    reader3,
                                    vcf_record_sort_key=get_key)):
            ex = int(ex)
            # bit 0x4 -> first reader, 0x2 -> second, 0x1 -> third
            for i, flag in enumerate([0x4, 0x2, 0x1]):
                if ex & flag:
                    self.assertIsNotNone(recs[i])
                else:
                    self.assertIsNone(recs[i])
Esempio n. 4
0
def create_vcf_iter(*vcf_paths):
    """Open every given VCF path and walk the resulting readers together.

    Records are aligned on the key ``(CHROM, POS, REF, ALT)``.

    Returns a ``(readers, iterator)`` pair: the list of ``vcf.Reader``
    objects (one per path, in argument order) and the ``walk_together``
    iterator built over them.
    """
    def site_key(record):
        return (record.CHROM, record.POS, record.REF, record.ALT)

    vcf_readers = []
    for path in vcf_paths:
        vcf_readers.append(vcf.Reader(open(path)))

    vcf_iter = utils.walk_together(*vcf_readers, vcf_record_sort_key=site_key)
    return vcf_readers, vcf_iter
Esempio n. 5
0
    def get_variants_shared_by_trio(self):
        '''
        Count the variants shared by mother, father and son.

        The three VCF files are walked in parallel. Every site at which all
        three readers yield a record is counted, and the three records of
        that site are written to ``mother_father_son.vcf``.

        :return: number of sites present in all three files
        '''
        # Open the inputs with vcf.Reader; the readers are kept on the instance.
        self.file_mother = vcf.Reader(open(self.filename_mother, 'r'))
        self.file_father = vcf.Reader(open(self.filename_father, 'r'))
        self.file_son = vcf.Reader(open(self.filename_son, 'r'))

        anzahl = 0

        geteilt = utils.walk_together(self.file_mother, self.file_father,
                                      self.file_son)
        # The context manager guarantees the output is flushed and closed.
        with open("mother_father_son.vcf", "w") as mother_father_son:
            # Build the writer once: constructing a vcf.Writer emits the VCF
            # header, so one writer per record would repeat the header
            # throughout the file.
            writer = vcf.Writer(mother_father_son, self.file_son, "\n")
            for record in geteilt:
                # record[0] is the mother, record[1] the father, record[2] the
                # son; an entry is None when that file has no record here.
                if (record[0] is not None and record[1] is not None
                        and record[2] is not None):
                    anzahl += 1
                    for eintrag in record:
                        writer.write_record(eintrag)

        return anzahl
Esempio n. 6
0
    def get_variants_shared_by_father_and_son(self):
        '''
        Count the variants shared by father and son.

        Both VCF files are walked in parallel. Every site at which both
        readers yield a record is counted, and the two records of that site
        are written to ``father_son.vcf``.

        :return: number of sites present in both files
        '''
        # Open the inputs with vcf.Reader; the readers are kept on the instance.
        self.file_father = vcf.Reader(open(self.filename_father, 'r'))
        self.file_son = vcf.Reader(open(self.filename_son, 'r'))

        anzahl = 0

        # utils.walk_together iterates over several VCF files in step.
        geteilt = utils.walk_together(self.file_father, self.file_son)

        # The context manager guarantees the output is flushed and closed.
        with open("father_son.vcf", "w") as father_son:
            # Build the writer once: constructing a vcf.Writer emits the VCF
            # header, so one writer per record would repeat the header
            # throughout the file.
            writer = vcf.Writer(father_son, self.file_son, "\n")
            for record in geteilt:
                # record[0] is the father, record[1] the son; an entry is None
                # when that file has no record at the current site.
                if record[0] is not None and record[1] is not None:
                    anzahl += 1
                    # Not only the count but the variants themselves are
                    # wanted, so both records are written out.
                    for eintrag in record:
                        writer.write_record(eintrag)

        return anzahl
Esempio n. 7
0
def diff(input_handles, output_handle, precision=10):
    """
    Calculate the Jaccard distance between two VCF files.

    Only sites that have a record in both files, neither of which is an
    indel, are considered. The value written is the fraction of those sites
    at which the first alternative allele differs between the two files.

    :arg input_handles: List of two open readable handles to VCF files.
    :type input_handles: list(stream)
    :arg output_handle: An open writable handle.
    :type output_handle: stream
    :arg precision: Number of decimals in the output.
    :type precision: int
    :raises ZeroDivisionError: If no site qualifies for the comparison.
    """
    first_vcf = vcf.Reader(input_handles[0])
    second_vcf = vcf.Reader(input_handles[1])
    symmetric_difference = 0
    total = 0

    walker = vcfutils.walk_together(first_vcf, second_vcf)
    for first_record, second_record in walker:
        # Skip sites that are missing from one of the files.
        if not (first_record and second_record):
            continue
        if first_record.is_indel or second_record.is_indel:
            continue
        # alleles[0] is the reference; alleles[1] is the first alternative.
        if (first_record.alleles[1].sequence !=
                second_record.alleles[1].sequence):
            symmetric_difference += 1
        total += 1

    # float() keeps the ratio fractional under Python 2, where "/" on two
    # ints truncates to 0 or 1.
    output_handle.write('{value:.{precision}f}\n'.format(
        value=float(symmetric_difference) / total, precision=precision))
Esempio n. 8
0
    def test_walk(self):
        """Test ``utils.walk_together`` on equal files, offset files and a custom sort key."""
        # easy case: all same sites
        reader1 = vcf.Reader(fh('example-4.0.vcf'))
        reader2 = vcf.Reader(fh('example-4.0.vcf'))
        reader3 = vcf.Reader(fh('example-4.0.vcf'))

        n = 0
        for x in utils.walk_together(reader1, reader2, reader3):
            self.assertEqual(len(x), 3)
            self.assertEqual(x[0], x[1])
            self.assertEqual(x[1], x[2])
            n += 1
        self.assertEqual(n, 5)

        # artificial case 2 from the left, 2 from the right, 2 together, 1 from the right, 1 from the left
        expected = 'llrrttrl'
        reader1 = vcf.Reader(fh('walk_left.vcf'))
        reader2 = vcf.Reader(fh('example-4.0.vcf'))

        for ex, recs in zip(expected, utils.walk_together(reader1, reader2)):
            # identity checks via unittest, so they are not stripped by ``python -O``
            if ex == 'l':
                self.assertIsNotNone(recs[0])
                self.assertIsNone(recs[1])
            if ex == 'r':
                self.assertIsNotNone(recs[1])
                self.assertIsNone(recs[0])
            if ex == 't':
                self.assertIsNotNone(recs[0])
                self.assertIsNotNone(recs[1])

        # test files with many chromosomes, set 'vcf_record_sort_key' to define chromosome order
        # list(...) is required on Python 3, where map() yields an iterator
        # that cannot be concatenated with a list or searched with .index().
        chr_order = list(map(str, range(1, 30))) + ['X', 'Y', 'M']
        get_key = lambda r: (chr_order.index(r.CHROM.replace('chr', '')), r.POS)
        reader1 = vcf.Reader(fh('issue-140-file1.vcf'))
        reader2 = vcf.Reader(fh('issue-140-file2.vcf'))
        reader3 = vcf.Reader(fh('issue-140-file3.vcf'))
        expected = "66642577752767662466"  # each char is an integer bit flag - like file permissions
        walker = utils.walk_together(reader1, reader2, reader3,
                                     vcf_record_sort_key=get_key)
        for ex, recs in zip(expected, walker):
            ex = int(ex)
            # flags 0x4 / 0x2 / 0x1 stand for the first / second / third reader
            for i, flag in enumerate([0x4, 0x2, 0x1]):
                if ex & flag:
                    self.assertIsNotNone(recs[i])
                else:
                    self.assertIsNone(recs[i])
Esempio n. 9
0
 def get_variants_shared_by_mother_and_son(self):
     """Return how many sites have a record in both the mother's and the son's VCF."""
     self.file_mother = vcf.Reader(open(self.filename_mother, 'r'))
     self.file_son = vcf.Reader(open(self.filename_son, 'r'))
     paired = utils.walk_together(self.file_mother, self.file_son)
     # An entry is None when that file has no record at the current site.
     return sum(1 for mother_rec, son_rec in paired
                if mother_rec is not None and son_rec is not None)
Esempio n. 10
0
 def get_variants_shared_by_mother_and_son(self):
     """Print and return the number of sites found in both the mother's and the son's VCF."""
     print("\n+++++++++++++++++++\nReturn the number of identified variants shared by mother and son:")
     mother_reader = vcf.Reader(open(file_mother, "r"))
     son_reader = vcf.Reader(open(file_son, "r"))

     count = 0
     for mother_rec, son_rec in utils.walk_together(mother_reader, son_reader):
         # skip sites that are missing from either file
         if mother_rec is None or son_rec is None:
             continue
         count += 1

     print(count)
     return count
Esempio n. 11
0
    def __init__(self, out_dir, dna_vcf, *rna_vcfs):
        """Open all inputs and outputs, then scan the DNA and RNA calls together.

        Creates ``out_dir`` if needed, opens one ``.results`` file per RNA VCF
        plus ``overlap.results``, and walks the DNA reader alongside all RNA
        readers keyed on ``(CHROM, POS, REF)``. Heterozygous DNA calls that
        are also present in at least one RNA VCF are reported on stderr and
        passed to ``write_per_rna``; those present in every RNA VCF are
        additionally passed to ``write_overlap``.
        """
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        self.out_dir = out_dir

        self.dna_reader = vcf.Reader(filename=dna_vcf)
        self.dna_name = os.path.basename(dna_vcf)

        readers, names = [], []
        sample_outs = OrderedDict()
        for rna_vcf in rna_vcfs:
            name = os.path.basename(rna_vcf)
            names.append(name)
            readers.append(vcf.Reader(filename=rna_vcf))
            # Each RNA sample gets its own results table; the column header
            # is written immediately.
            stem = os.path.splitext(name)[0]
            handle = open(os.path.join(out_dir, stem) + '.results', 'w')
            handle.write('\t'.join(PERSAMPLE_COLS) + '\n')
            sample_outs[name] = handle

        self.rna_readers = readers
        self.rna_names = names
        self.outs_per_sample = sample_outs

        overlap_out = open(os.path.join(out_dir, 'overlap.results'), 'w')
        overlap_out.write('#RNA_SAMPLE_ORDER: ' + ', '.join(names) + '\n')
        overlap_out.write('\t'.join(OVERLAP_COLS) + '\n')
        self.outs_overlap = overlap_out

        def site_key(rec):
            return (rec.CHROM, rec.POS, rec.REF)

        walker = utils.walk_together(self.dna_reader, *self.rna_readers,
                                     vcf_record_sort_key=site_key)
        for calls in walker:
            dna_call = calls[0]
            # keep only heterozygous DNA calls that appear in at least one RNA
            if dna_call is None:
                continue
            if not (dna_call.samples[0].is_het and any(calls[1:])):
                continue

            # gene annotation
            if 'ANN' in dna_call.INFO:
                gene = ','.join(dna_call.INFO['ANN'])
            else:
                gene = '?'

            if all(calls[1:]):
                self.write_overlap(gene, dna_call, calls[1:])
                print('Found in DNA and all RNA:', dna_call.CHROM,
                      dna_call.POS, gene, file=sys.stderr)
            else:
                rna_hits = sum(1 for call in calls[1:] if call)
                print('Found in DNA and {0} RNA:'.format(rna_hits),
                      dna_call.CHROM, dna_call.POS, gene, file=sys.stderr)

            self.write_per_rna(gene, dna_call, calls[1:])
def create_vcf_walktogether(vcf_files):
    """Walk several VCF readers in step, aligned on CHROM, POS, REF and ALT.

    Arguments:
        vcf_files: A list of `vcf.Reader` objects

    Returns:
        The generator produced by `vcf.utils.walk_together`
    """
    return pyvcf_utils.walk_together(
        *vcf_files,
        vcf_record_sort_key=lambda rec: (rec.CHROM, rec.POS, rec.REF, rec.ALT)
    )
Esempio n. 13
0
    def merge_mother_father_son_into_one_vcf(self):
        """Write every record of the mother, father and son VCFs to ``merge.vcf``.

        The three inputs are walked together; at each site every record that
        is present is written, using the mother's VCF as the header template.
        """
        print("\n+++++++++++++++++++\nCreates one VCF containing all variants of the trio (merge VCFs):")

        # Context managers make sure the output is flushed and every handle
        # is closed, even if writing fails part-way.
        with open("merge.vcf", "w") as merge, \
                open(file_mother, "r") as mother_handle, \
                open(file_father, "r") as father_handle, \
                open(file_son, "r") as son_handle:
            # One reader for the mother serves both as the writer's template
            # and as the first walked input.
            mother_reader = vcf.Reader(mother_handle)
            writer = vcf.Writer(merge, mother_reader, "\n")
            for lines in utils.walk_together(mother_reader,
                                             vcf.Reader(father_handle),
                                             vcf.Reader(son_handle)):
                for entry in lines:
                    # None marks a file without a record at this site
                    if entry is not None:
                        writer.write_record(entry)
        print("merge files ok")
 def get_variants_shared_by_mother_and_son(self):
     """Count the walk steps at which the mother's and the son's entries compare equal."""
     self.mother_vcf = vcf.Reader(open("AmpliseqExome.20141120.NA24143.vcf"))
     self.son_vcf = vcf.Reader(open("AmpliseqExome.20141120.NA24385.vcf"))

     matches = 0
     for mother_rec, son_rec in utils.walk_together(self.mother_vcf, self.son_vcf):
         if mother_rec == son_rec:
             matches += 1
     return matches
 def get_variants_shared_by_trio(self):
     """Count the walk steps at which father, son and mother entries all compare equal."""
     self.father_vcf = vcf.Reader(open("AmpliseqExome.20141120.NA24149.vcf"))
     self.mother_vcf = vcf.Reader(open("AmpliseqExome.20141120.NA24143.vcf"))
     self.son_vcf = vcf.Reader(open("AmpliseqExome.20141120.NA24385.vcf"))

     # walk order is father, son, mother
     walker = utils.walk_together(self.father_vcf, self.son_vcf, self.mother_vcf)
     return sum(1 for father_rec, son_rec, mother_rec in walker
                if father_rec == son_rec == mother_rec)
Esempio n. 16
0
 def get_variants_shared_by_trio(self):
     """Return how many sites have a record in the mother's, father's and son's VCF alike."""
     self.file_mother = vcf.Reader(open(self.filename_mother, 'r'))
     self.file_father = vcf.Reader(open(self.filename_father, 'r'))
     self.file_son = vcf.Reader(open(self.filename_son, 'r'))

     walker = utils.walk_together(self.file_mother, self.file_father,
                                  self.file_son)
     trio = 0
     for mother_rec, father_rec, son_rec in walker:
         # an entry is None when that file has no record at the current site
         if mother_rec is None or father_rec is None or son_rec is None:
             continue
         trio += 1
     return trio
Esempio n. 17
0
 def get_variants_shared_by_father_and_son(self):
     """Return how many sites have a record in both the father's and the son's VCF."""
     self.file_father = vcf.Reader(open(self.filename_father, 'r'))
     self.file_son = vcf.Reader(open(self.filename_son, 'r'))

     # walk_together yields one list per site: position 0 is the father's
     # record, position 1 the son's, None where a file has no record.
     shared_flags = [
         father_rec is not None and son_rec is not None
         for father_rec, son_rec in utils.walk_together(self.file_father,
                                                        self.file_son)
     ]
     return shared_flags.count(True)
Esempio n. 18
0
 def merge_mother_father_son_into_one_vcf(self):
     """Write every record of the mother, father and son VCFs to ``trio_file.vcf``.

     The mother's reader is used as the header template for the output.

     :return: a confirmation message string
     """
     self.file_mother = vcf.Reader(open(self.filename_mother, 'r'))
     self.file_father = vcf.Reader(open(self.filename_father, 'r'))
     self.file_son = vcf.Reader(open(self.filename_son, 'r'))
     # The context manager flushes and closes the output before success is
     # reported.
     with open("trio_file.vcf", 'w') as trio_file:
         # vcf.Writer takes the output stream, a template reader (the mother)
         # and a line terminator.
         writer = vcf.Writer(trio_file, self.file_mother, "\n")
         for record in utils.walk_together(self.file_mother, self.file_father,
                                           self.file_son):
             for entry in record:
                 if entry is not None:  # None marks a file without a record here
                     writer.write_record(entry)
     result = "The files have been merged into trio_file.vcf"
     return result
Esempio n. 19
0
def merge_hc_mity(fhc, fmity, fout, priority):
    """Merge a HaplotypeCaller VCF and a mity VCF into a new VCF.

    Records on contig ``MT`` are taken from the mity file; records on every
    other contig are taken from the HaplotypeCaller file. The mity header,
    extended with the HaplotypeCaller INFO definitions and metadata, is used
    as the output template.

    :param fhc: stream handed to ``vcf.Reader`` for the HaplotypeCaller calls
    :param fmity: stream handed to ``vcf.Reader`` for the mity calls
    :param fout: stream handed to ``vcf.Writer`` for the merged output
    :param priority: not used by this function
    :raises ValueError: if samples, contigs or FORMAT ids differ between inputs
    """
    hc = vcf.Reader(fhc)
    mity = vcf.Reader(fmity)

    # some sanity checks
    # TODO: possible to make it handle different samples in the two VCFs?
    if sorted(hc.samples) != sorted(mity.samples):
        raise ValueError(
            "Input VCF files must have the same sample column headers.")
    if sorted(hc.contigs.keys()) != sorted(mity.contigs.keys()):
        raise ValueError("Input VCF files must denote the same contigs.")
    if sorted(hc.formats.keys()) != sorted(mity.formats.keys()):
        raise ValueError("Input VCF files must contain the same formats.")

    # NOTE: arbitrarily picking mity as the base template ~ we're doing
    # dict updates, so the hc values will take precedence
    # merge infos
    mity.infos.update(hc.infos)
    # merge formats ~ not necessary since they're equal
    # TODO: merge filters?
    # merge metadata: keep each caller's command line under its own key
    if 'GATKCommandLine' in mity.metadata:
        mity.metadata['UnifiedGenotyperCommandLine'] = \
            mity.metadata['GATKCommandLine']
    if 'GATKCommandLine' in hc.metadata:
        mity.metadata['HaplotypeCallerCommandLine'] = \
            hc.metadata['GATKCommandLine']
    # pop() rather than del: the key may be absent from either header
    mity.metadata.pop('GATKCommandLine', None)
    hc.metadata.pop('GATKCommandLine', None)
    mity.metadata.update(hc.metadata)
    # add custom INFO field, denoting the variant caller for each variant
    mity.infos['GATKCaller'] = _Info(
        'GATKCaller', '.', 'String', 'GATK '
        'variant caller used to call the variant')

    out_writer = vcf.Writer(fout, mity)
    for hc_rec, mity_rec in walk_together(hc, mity):
        # walk_together yields None for a file that has no record at the
        # current site, so each record is checked before it is dereferenced.
        if hc_rec is not None and hc_rec.CHROM != "MT":
            out_writer.write_record(hc_rec)
        elif mity_rec is not None and mity_rec.CHROM == "MT":
            out_writer.write_record(mity_rec)
        # Anything else is an MT call seen only by hc, or a non-MT call seen
        # only by mity; neither belongs in the merged output.
    def merge_mother_father_son_into_one_vcf(self):
        '''
        Creates one VCF containing all variants of the trio (merge VCFs).

        Every record present at each walked site is written to
        ``merge_file.vcf``; the mother's VCF provides the header template.
        :return:
        '''

        print("\n---------------\nMerging files..")

        # Context managers flush the output and close every handle, even if
        # writing fails part-way.
        with open(self.vcf_son, 'r') as son_handle, \
                open(self.vcf_mother, 'r') as mother_handle, \
                open(self.vcf_father, 'r') as father_handle, \
                open("merge_file.vcf", "w") as merge_file:
            vcf_readerson = vcf.Reader(son_handle)
            vcf_readermother = vcf.Reader(mother_handle)
            vcf_readerfather = vcf.Reader(father_handle)

            writer = vcf.Writer(merge_file, vcf_readermother, "\n")

            for records in utils.walk_together(vcf_readermother,
                                               vcf_readerfather,
                                               vcf_readerson):
                for entry in records:
                    # None marks a file without a record at this site
                    if entry is not None:
                        writer.write_record(entry)

        print("Successfully merged files: Outputfile = merge_file.vcf")
    def merge_mother_father_son_into_one_vcf(self):
        """Write one record per walked site of the trio to ``FSM.vcf`` and print the total.

        The inputs are walked in the order father, son, mother. At each site
        exactly one of the available records is written, using the son's VCF
        as the header template.
        """
        self.father_vcf = vcf.Reader(open("AmpliseqExome.20141120.NA24149.vcf"))
        self.mother_vcf = vcf.Reader(open("AmpliseqExome.20141120.NA24143.vcf"))
        self.son_vcf = vcf.Reader(open("AmpliseqExome.20141120.NA24385.vcf"))
        out_handle = open("FSM.vcf", "w")
        out_writer = vcf.Writer(out_handle, self.son_vcf, "\n")

        count = 0
        walker = utils.walk_together(self.father_vcf, self.son_vcf, self.mother_vcf)
        for father, son, mother in walker:
            if not (father or son or mother):
                continue
            # Pick the single record to emit for this site:
            #   father missing          -> son if present, otherwise mother
            #   father only, or no son  -> mother if present, otherwise father
            #   father and son present  -> father
            if father is None:
                chosen = son if son is not None else mother
            elif son is None:
                chosen = father if mother is None else mother
            else:
                chosen = father
            out_writer.write_record(chosen)
            count += 1

        out_writer.close()
        out_handle.close()
        print("the three have {0} variants\n they are saved in FSM.vcf".format(count))  # 53227
        return
        '''
Esempio n. 22
0
    def merge_mother_father_son_into_one_vcf(self):
        '''
        Creates one VCF containing all variants of the trio (merge VCFs)

        Every record present at each walked site is written to
        ``trio_file.vcf``; the son's reader provides the header template.
        :return: a confirmation message string
        '''
        # vcf.Writer(stream, template, lineterminator) needs a template from
        # which the metadata is taken; the son's reader is used for that.
        # The context manager flushes and closes the output stream before
        # success is reported.
        with open("trio_file.vcf", "w") as trio_file:
            writer = vcf.Writer(trio_file, self.file_son, "\n")
            # vcf.utils.walk_together processes several VCF files in step.
            # All three family members must be walked: mother, father, son.
            # NOTE(review): assumes self.file_mother holds the mother's
            # reader, set up like self.file_father / self.file_son - confirm.
            for records in utils.walk_together(self.file_mother,
                                               self.file_father,
                                               self.file_son):
                # every entry that is not None is written to the VCF
                for eintrag in records:
                    if eintrag is not None:
                        writer.write_record(eintrag)

        success = "The file has been merged successfully to trio_file.vcf"

        return success
Esempio n. 23
0
def _alt_allele_frequency(record):
    """Return the alt-allele frequency among the called genotypes of ``record``.

    Returns None when the record has no hom-alt, hom-ref or het calls,
    because the frequency is then undefined.
    """
    num_hom_alts = len(record.get_hom_alts())
    num_hom_refs = len(record.get_hom_refs())
    num_hets = len(record.get_hets())
    total_genotypes = num_hom_alts + num_hom_refs + num_hets
    if total_genotypes == 0:
        return None
    # each hom-alt call contributes two alt alleles, each het call one
    minor_allele_count = (num_hom_alts * 2) + num_hets
    return float(minor_allele_count) / float(total_genotypes * 2)


def _save_af_scatter(x_values, y_values, title, out_path):
    """Save a log-log scatter plot of paired allele frequencies to ``out_path``."""
    fig = plt.figure()
    ax = plt.gca()
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.scatter(x_values, y_values, color='black', s=8)
    ax.set_xlabel('Allele Frequency Sample Set Broad')
    ax.set_ylabel('Allele Frequency Sample Set Not Broad')
    ax.grid()
    # diagonal reference line: equal frequency in both sample sets
    ax.plot([0.0, 1.0], [0.0, 1.0], color="red")
    plt.xticks([0.001, 0.01, 0.1, 1])
    plt.yticks([0.001, 0.01, 0.1, 1])
    plt.title(title)
    fig.savefig(out_path)
    plt.close(fig)


def main(args):
    """Compare per-site allele frequencies between two VCF files.

    Sites present in both files are collected (sites with a frequency of 0
    in either file are dropped) and split into "rare" (either frequency
    <= 0.01) and "common" (both > 0.01). Two-sample KS statistics are
    written to a TSV file and three scatter plots (all, rare, common) are
    saved as PNG files. Options come from ``parse_args()``; ``args`` itself
    is not read.
    """
    options = parse_args()

    x_list = []
    y_list = []
    rare_x_list = []
    rare_y_list = []
    common_x_list = []
    common_y_list = []

    # Context managers close both inputs once the walk is finished.
    with open(options.inVCFA, 'r') as handle_a, \
            open(options.inVCFB, 'r') as handle_b:
        vcf_reader_A = vcf.Reader(handle_a)
        vcf_reader_B = vcf.Reader(handle_b)
        for records in vcfutils.walk_together(vcf_reader_A, vcf_reader_B):
            # Report and skip sites missing from one file; an identity test
            # avoids invoking record equality against None.
            if any(record is None for record in records):
                print(records)
                continue
            minor_af_a = _alt_allele_frequency(records[0])
            minor_af_b = _alt_allele_frequency(records[1])
            # no called genotypes in one file: frequency undefined, skip
            if minor_af_a is None or minor_af_b is None:
                continue
            if minor_af_a == 0.0 or minor_af_b == 0.0:
                continue
            x_list.append(minor_af_a)
            y_list.append(minor_af_b)
            # "rare" means either frequency is at most 0.01
            if minor_af_a <= 0.01 or minor_af_b <= 0.01:
                rare_x_list.append(minor_af_a)
                rare_y_list.append(minor_af_b)
            else:
                common_x_list.append(minor_af_a)
                common_y_list.append(minor_af_b)

    # Values below the smallest axis tick fall left of / below the tick range.
    if min(x_list) < 0.001:
        print("WARNING an x value is less than 0.001")

    if min(y_list) < 0.001:
        print("WARNING an y value is less than 0.001")

    # Calculate ks stats
    total_ks_stat, total_ks_pvalue = ks_2samp(x_list, y_list)
    rare_ks_stat, rare_ks_pvalue = ks_2samp(rare_x_list, rare_y_list)
    common_ks_stat, common_ks_pvalue = ks_2samp(common_x_list, common_y_list)
    with open('AF_comparison.raw_AF.{}.ks_stats.tsv'.format(options.outReport),
              'w') as ks_output:
        ks_output.write('variant_type\tsample_size\tks_stat\tks_pvalue\n')
        ks_output.write('total\t{}\t{}\t{}\n'.format(len(x_list),
                                                     total_ks_stat,
                                                     total_ks_pvalue))
        ks_output.write('rare\t{}\t{}\t{}\n'.format(len(rare_x_list),
                                                    rare_ks_stat,
                                                    rare_ks_pvalue))
        ks_output.write('common\t{}\t{}\t{}\n'.format(len(common_x_list),
                                                      common_ks_stat,
                                                      common_ks_pvalue))

    title = "Allele frequency for {}".format(options.outReport)

    # Plot total variant figure
    _save_af_scatter(
        x_list, y_list, title,
        "AF_comparison.raw_AF.{}.png".format(options.outReport))

    # Plot rare variant figure
    _save_af_scatter(
        rare_x_list, rare_y_list, title,
        "AF_comparison.raw_AF.either_less_than_or_equal_0.01.{}.png".format(
            options.outReport))

    # Plot common variant figure
    _save_af_scatter(
        common_x_list, common_y_list, title,
        "AF_comparison.raw_AF.both_more_than_0.01.{}.png".format(
            options.outReport))
Esempio n. 24
0
    def test_walk(self):
        """Cover walk_together for equal inputs, staggered inputs and a custom ``eq_func``."""
        # easy case: all same sites
        reader1, reader2, reader3 = [
            vcf.Reader(fh(name)) for name in ['example-4.0.vcf'] * 3
        ]

        n = 0
        for x in utils.walk_together(reader1, reader2, reader3):
            assert len(x) == 3
            assert x[0] == x[1] == x[2]
            n += 1
        assert n == 5

        # artificial case 2 from the left, 2 from the right, 2 together,
        # 1 from the right, 1 from the left
        # each code maps to (left record present, right record present)
        presence = {'l': (True, False), 'r': (False, True), 't': (True, True)}
        reader1 = vcf.Reader(fh('walk_left.vcf'))
        reader2 = vcf.Reader(fh('example-4.0.vcf'))

        for ex, recs in zip('llrrttrl', utils.walk_together(reader1, reader2)):
            want_left, want_right = presence[ex]
            assert (recs[0] is not None) == want_left
            assert (recs[1] is not None) == want_right

        # case with working custom equality function

        # without custom function, exception should be raised
        reader1 = vcf.Reader(fh('example-4.0.vcf'))
        reader2 = vcf.Reader(fh('walk_refcall.vcf'))
        default_walker = utils.walk_together(reader1, reader2)
        with self.assertRaises(AttributeError):
            next(default_walker)

        # with custom function, iteration works
        reader1 = vcf.Reader(fh('example-4.0.vcf'))
        reader2 = vcf.Reader(fh('walk_refcall.vcf'))

        def custom_eq(rec1, rec2):
            # a missing record never matches; otherwise only CHROM, POS and
            # REF are compared
            if rec1 is None or rec2 is None:
                return False
            if rec1.CHROM != rec2.CHROM:
                return False
            if rec1.POS != rec2.POS:
                return False
            return rec1.REF == rec2.REF

        nrecs, ncomps = 0, 0
        for x in utils.walk_together(reader1, reader2, eq_func=custom_eq):
            assert len(x) == 2
            # every step counts towards the total, whether or not both
            # files contribute a record
            nrecs += 1
            if x[0] is None or x[1] is None:
                continue
            assert custom_eq(x[0], x[1]) and custom_eq(x[1], x[0])
            ncomps += 1
        # check number of records total
        assert nrecs == 5
        # check how many records found in all files
        assert ncomps == 4
# Command-line options: the human VCF (-v/--vcf) and the Neanderthal VCF (-n/--nea).
parser.add_option("-v",
                  "--vcf",
                  action="store",
                  type="string",
                  dest="vcf_filename")
parser.add_option("-n",
                  "--nea",
                  action="store",
                  type="string",
                  dest="nea_filename")
(options, args) = parser.parse_args()

vcf_reader = vcf.Reader(open(options.vcf_filename, 'r'))
nea_reader = vcf.Reader(open(options.nea_filename, 'r'))

# Report sites present in both files that are rare in AFR (< 0.01), not rare
# in EAS or EUR (> 0.01), and where the AltaiNea genotype carries a base
# other than the human reference.
for record in utils.walk_together(vcf_reader, nea_reader):
    human_record = record[0]
    neand_record = record[1]
    # None marks a file without a record at the current site
    if human_record is None or neand_record is None:
        continue
    neand_gt = neand_record.genotype('AltaiNea').gt_bases
    if neand_gt is None:
        continue

    afr_af = human_record.INFO['AFR_AF'][0]
    eas_af = human_record.INFO['EAS_AF'][0]
    eur_af = human_record.INFO['EUR_AF'][0]
    if not (afr_af < 0.01 and (eas_af > 0.01 or eur_af > 0.01)):
        continue

    # Characters 0 and 2 of gt_bases are read as the two called bases.
    # Compare by value: "is not" tests object identity, which is not a
    # reliable equality check for strings.
    if neand_gt[0] != human_record.REF or neand_gt[2] != human_record.REF:
        fields = [
            human_record.CHROM, human_record.POS, human_record.POS,
            human_record.ID, human_record.REF, human_record.ALT[0],
            human_record.QUAL,
            neand_gt[0], neand_gt[2], neand_record.QUAL,
            afr_af, eur_af, eas_af,
        ]
        # one space-separated output line per reported site
        print(" ".join(str(field) for field in fields))
Esempio n. 26
0
    def test_walk(self):
        """Walk identical, staggered and ref-call inputs, the last with a custom ``eq_func``."""
        def open_reader(name):
            return vcf.Reader(fh(name))

        # easy case: all same sites
        first = open_reader('example-4.0.vcf')
        second = open_reader('example-4.0.vcf')
        third = open_reader('example-4.0.vcf')

        steps = 0
        for triple in utils.walk_together(first, second, third):
            assert len(triple) == 3
            assert triple[0] == triple[1] and triple[1] == triple[2]
            steps += 1
        assert steps == 5

        # artificial case 2 from the left, 2 from the right, 2 together,
        # 1 from the right, 1 from the left
        left = open_reader('walk_left.vcf')
        right = open_reader('example-4.0.vcf')

        for origin, pair in zip('llrrttrl', utils.walk_together(left, right)):
            left_present = pair[0] is not None
            right_present = pair[1] is not None
            if origin == 'l':
                assert left_present
                assert not right_present
            elif origin == 'r':
                assert right_present
                assert not left_present
            elif origin == 't':
                assert left_present
                assert right_present

        # case with working custom equality function

        # without custom function, exception should be raised
        first = open_reader('example-4.0.vcf')
        second = open_reader('walk_refcall.vcf')
        self.assertRaises(AttributeError, next,
                          utils.walk_together(first, second))

        # with custom function, iteration works
        first = open_reader('example-4.0.vcf')
        second = open_reader('walk_refcall.vcf')

        def custom_eq(rec1, rec2):
            # equality is judged on CHROM, POS and REF only; a missing
            # record never matches
            if rec1 is not None and rec2 is not None:
                return (rec1.CHROM == rec2.CHROM and rec1.POS == rec2.POS
                        and rec1.REF == rec2.REF)
            return False

        nrecs = 0
        ncomps = 0
        for pair in utils.walk_together(first, second, eq_func=custom_eq):
            assert len(pair) == 2
            both_present = pair[0] is not None and pair[1] is not None
            # only compare when neither side is None
            if both_present:
                assert custom_eq(pair[0], pair[1]) and custom_eq(pair[1], pair[0])
                ncomps += 1
            # count every step to make sure the iteration ran to the end
            nrecs += 1
        # check number of records total
        assert nrecs == 5
        # check how many records found in all files
        assert ncomps == 4
Esempio n. 27
0
def main():
    """Compare per-sample DP/GQ metrics between a truth VCF and a dev VCF.

    Command-line arguments (read from ``sys.argv``):
        1. path to the truth VCF
        2. path to the dev VCF
        3. (optional) sample name; when omitted, the truth file's base name
           up to the first extension separator is used.

    Both files are walked in lockstep with ``walk_together``.  For sites where
    the two records compare equal, DP and GQ of the chosen sample are compared
    and tallied on a ``Comparison`` summary; sites where they do not compare
    equal are collected and printed at the end, followed by histograms of the
    DP differences.

    Exits with a usage message when fewer than two paths are supplied.
    """
    if len(sys.argv) < 3:
        sys.exit('usage: {} TRUTH_VCF DEV_VCF [SAMPLE]'.format(sys.argv[0]))

    truth_vcf = sys.argv[1]
    dev_vcf = sys.argv[2]

    print('Sample Comparison')
    dev_reader = vcf.Reader(filename=dev_vcf)
    truth_reader = vcf.Reader(filename=truth_vcf)

    # The sample name is used to look up genotypes in both files.  For
    # replicates of the same sample stored under different file names the
    # correct sample name must be passed explicitly, otherwise the lookup
    # will fail.
    if len(sys.argv) == 4:
        sample = sys.argv[3]
    else:
        sample = os.path.basename(truth_vcf).split(os.extsep)[0]

    print(sample)
    summary = Comparison()
    records_dont_match = []
    call_difference = []
    percent_difference = []
    dp_range = []

    def site_of(rec):
        # Compact (CHROM, POS, REF, ALT) description of a record, or None when
        # that reader had no record at the current site.
        if rec is None:
            return None
        return (rec.CHROM, rec.POS, rec.REF, rec.ALT)

    # walk_together reads both VCFs at the same time, one site per iteration.
    for dev_rec, truth_rec in walk_together(dev_reader, truth_reader):
        if dev_rec != truth_rec:
            # Store the mismatching sites so they can be reported all
            # together at the end.
            summary.diff += 1
            records_dont_match.append({
                "truth": site_of(truth_rec),
                "dev": site_of(dev_rec),
            })
            continue

        # Records compare equal: check the per-sample metric differences.
        try:
            test_dp = dev_rec.genotype(sample)['DP']
            true_dp = truth_rec.genotype(sample)['DP']

            if test_dp != true_dp:
                summary.diff_metrics += 1
                print('')
                print(dev_rec.CHROM, dev_rec.POS, dev_rec.REF, dev_rec.ALT,
                      dev_rec.QUAL)
                print(
                    '--------------------------------------------------------------'
                )
                print('\t'.join(dev_rec.FORMAT.split(':')))

                for entry in truth_rec.genotype(sample).data:
                    print(entry, end='\t')
                print('')

                for entry in dev_rec.genotype(sample).data:
                    print(entry, end='\t')

                dp_range.append(true_dp)
                dp_range.append(test_dp)

                if true_dp == 0:
                    # The percent calculation divides by the truth DP, so a
                    # zero truth DP is reported as 0% rather than dividing.
                    difference = 0
                else:
                    difference = round(
                        abs((test_dp - true_dp) / true_dp * 100), 4)

                percent_difference.append(difference)

                dp_diff = true_dp - test_dp
                call_difference.append(dp_diff)

                print('\nDP difference {}'.format(dp_diff))
                print(difference, '%')
                print('')

                # Attribute a low GQ (<= 20) to dev first, otherwise to truth.
                if dev_rec.genotype(sample)['GQ'] <= 20:
                    summary.dev_GQ += 1
                elif truth_rec.genotype(sample)['GQ'] <= 20:
                    summary.truth_GQ += 1

            else:
                summary.matches += 1

            if dev_rec.genotype(sample)['GQ'] != truth_rec.genotype(
                    sample)['GQ']:
                summary.diff_GQ += 1

        except AttributeError:
            # Raised when an expected attribute/field is missing on the
            # record or its genotype call; count it as a no-format record.
            summary.no_format_count += 1
            summary.no_formats.append([dev_rec, dev_rec.INFO])
            print('No format fields {} at position:{}'.format(
                dev_rec.CHROM, dev_rec.POS))

    summary.total_count()

    stats1, stats2 = summary.get_stats()

    # Print the summary metrics for the entirety of the VCF files.
    summary.print_metrics(round(stats1, 4), round(stats2, 4))

    print("\nRecords that didn't match first is truth, second is dev")
    for mismatch in records_dont_match:
        print(mismatch)

    # call_difference, percent_difference and dp_range are all filled in the
    # same branch above, so they are either all empty or all populated.
    # max()/min() raise ValueError on an empty list, so skip the histograms
    # and the range when no DP difference was observed.
    if call_difference:
        # Output histogram values for DP difference and percent difference.
        calculate_hist(call_difference,
                       [1, 2, 3, 4, 5, 6, 7, 8, 9,
                        max(call_difference)])
        print(
            '***Please note last bin contains values of difference greater than 8 calls***'
        )
        calculate_hist(percent_difference, [
            0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0,
            max(percent_difference)
        ])
        print(
            '***Please note last bin contains entities with a percent change greater than 5% ***'
        )

        print('\nDP range:', min(dp_range), max(dp_range))
    else:
        print('\nNo DP differences found between matching records')