def test_walk(self):
    """Check ``utils.walk_together`` on identical and on interleaved inputs."""
    # Identical inputs: three readers over the same file advance in lockstep.
    same_file_readers = [vcf.Reader(fh('example-4.0.vcf')) for _ in range(3)]
    sites_seen = 0
    for triple in utils.walk_together(*same_file_readers):
        assert len(triple) == 3
        assert (triple[0] == triple[1]) and (triple[1] == triple[2])
        sites_seen += 1
    assert sites_seen == 5

    # Interleaved inputs: 2 left-only, 2 right-only, 2 shared, 1 right-only,
    # 1 left-only ('l' = left, 'r' = right, 't' = together).
    expected = 'llrrttrl'
    left_reader = vcf.Reader(fh('walk_left.vcf'))
    right_reader = vcf.Reader(fh('example-4.0.vcf'))
    paired = utils.walk_together(left_reader, right_reader)
    for side, pair in zip(expected, paired):
        left_rec, right_rec = pair[0], pair[1]
        if side == 'l':
            assert left_rec is not None
            assert right_rec is None
        if side == 'r':
            assert right_rec is not None
            assert left_rec is None
        if side == 't':
            assert left_rec is not None
            assert right_rec is not None
def test_walk(self):
    """Exercise ``utils.walk_together`` in three scenarios.

    1. Three readers over the same file advance in lockstep.
    2. Two different files interleave in the expected left/right/together
       pattern.
    3. Files with many chromosomes are walked with a custom
       ``vcf_record_sort_key`` that defines the chromosome order.
    """
    # easy case: all same sites
    reader1 = vcf.Reader(fh('example-4.0.vcf'))
    reader2 = vcf.Reader(fh('example-4.0.vcf'))
    reader3 = vcf.Reader(fh('example-4.0.vcf'))

    n = 0
    for x in utils.walk_together(reader1, reader2, reader3):
        self.assertEqual(len(x), 3)
        self.assertEqual(x[0], x[1])
        self.assertEqual(x[1], x[2])
        n += 1
    self.assertEqual(n, 5)

    # artificial case: 2 from the left, 2 from the right, 2 together,
    # 1 from the right, 1 from the left
    expected = 'llrrttrl'
    reader1 = vcf.Reader(fh('walk_left.vcf'))
    reader2 = vcf.Reader(fh('example-4.0.vcf'))

    for ex, recs in zip(expected, utils.walk_together(reader1, reader2)):
        if ex == 'l':
            self.assertIsNotNone(recs[0])
            self.assertIsNone(recs[1])
        if ex == 'r':
            self.assertIsNotNone(recs[1])
            self.assertIsNone(recs[0])
        if ex == 't':
            self.assertIsNotNone(recs[0])
            self.assertIsNotNone(recs[1])

    # test files with many chromosomes, set 'vcf_record_sort_key' to define
    # the chromosome order.
    # A list is built explicitly: on Python 3 ``map`` returns an iterator,
    # which supports neither ``+`` nor ``.index``.
    chr_order = [str(i) for i in range(1, 30)] + ['X', 'Y', 'M']
    get_key = lambda r: (chr_order.index(r.CHROM.replace('chr', '')), r.POS)
    reader1 = vcf.Reader(fh('issue-140-file1.vcf'))
    reader2 = vcf.Reader(fh('issue-140-file2.vcf'))
    reader3 = vcf.Reader(fh('issue-140-file3.vcf'))

    # each char is an integer bit flag - like file permissions:
    # 4 = record expected from file1, 2 = from file2, 1 = from file3
    expected = "66642577752767662466"
    walker = utils.walk_together(reader1, reader2, reader3,
                                 vcf_record_sort_key=get_key)
    for ex, recs in zip(expected, walker):
        ex = int(ex)
        for i, flag in enumerate([0x4, 0x2, 0x1]):
            if ex & flag:
                self.assertIsNotNone(recs[i])
            else:
                self.assertIsNone(recs[i])
def create_vcf_iter(*vcf_paths):
    """Open each path as a VCF reader and walk all readers together.

    Returns a ``(readers, iterator)`` pair: the list of ``vcf.Reader``
    objects, one per path in argument order, and the
    ``utils.walk_together`` iterator over them keyed on CHROM, POS, REF
    and ALT.
    """
    def record_key(record):
        return (record.CHROM, record.POS, record.REF, record.ALT)

    vcf_readers = []
    for path in vcf_paths:
        vcf_readers.append(vcf.Reader(open(path)))

    vcf_iter = utils.walk_together(*vcf_readers, vcf_record_sort_key=record_key)
    return vcf_readers, vcf_iter
def get_variants_shared_by_trio(self):
    '''
    Count the sites that have a record in the mother, father and son VCFs.

    The records of every shared site (one per family member) are also
    written to ``mother_father_son.vcf`` in the current working directory.

    :return: number of sites for which all three files have a record
    '''
    # Open the inputs with vcf.Reader; the readers are kept on the instance.
    self.file_mother = vcf.Reader(open(self.filename_mother, 'r'))
    self.file_father = vcf.Reader(open(self.filename_father, 'r'))
    self.file_son = vcf.Reader(open(self.filename_son, 'r'))

    anzahl = 0
    geteilt = utils.walk_together(self.file_mother, self.file_father, self.file_son)

    # ``with`` guarantees the output is flushed and closed, also on errors.
    with open("mother_father_son.vcf", "w") as mother_father_son:
        # Build the writer once: constructing a vcf.Writer emits the VCF
        # header, so creating one per record repeated the header in the
        # middle of the output file.
        writer = vcf.Writer(mother_father_son, self.file_son, "\n")
        for record in geteilt:
            # record[0] is the mother, record[1] the father and record[2]
            # the son; an entry is None when that file has no record at
            # the current site.
            if (record[0] is not None and record[1] is not None
                    and record[2] is not None):
                anzahl += 1
                for eintrag in record:
                    writer.write_record(eintrag)
    return anzahl
def get_variants_shared_by_father_and_son(self):
    '''
    Count the sites that have a record in both the father and the son VCF.

    The records of every shared site (one per family member) are also
    written to ``father_son.vcf`` in the current working directory.

    :return: number of sites for which both files have a record
    '''
    # Open the inputs with vcf.Reader; the readers are kept on the instance.
    self.file_father = vcf.Reader(open(self.filename_father, 'r'))
    self.file_son = vcf.Reader(open(self.filename_son, 'r'))

    anzahl = 0
    # utils.walk_together iterates over several VCF files in parallel.
    geteilt = utils.walk_together(self.file_father, self.file_son)

    # ``with`` guarantees the output is flushed and closed, also on errors.
    with open("father_son.vcf", "w") as father_son:
        # Build the writer once: constructing a vcf.Writer emits the VCF
        # header, so creating one per record repeated the header in the
        # middle of the output file.
        writer = vcf.Writer(father_son, self.file_son, "\n")
        for record in geteilt:
            # record[0] is the father and record[1] the son; an entry is
            # None when that file has no record at the current site.
            if record[0] is not None and record[1] is not None:
                anzahl += 1
                # Not only the number of shared variants is wanted but the
                # variants themselves, so they are written out here.
                for eintrag in record:
                    writer.write_record(eintrag)
    return anzahl
def diff(input_handles, output_handle, precision=10):
    """
    Calculate the Jaccard distance between two VCF files.

    The files are walked together; only sites that have a record in both
    files and where neither record is an indel are compared. A compared
    site counts towards the distance when the first alternative alleles of
    the two records differ. The ratio is written to ``output_handle`` as a
    single line.

    :arg input_handles: List of two open readable handles to VCF files.
    :type input_handles: list(stream)
    :arg output_handle: An open writable handle.
    :type output_handle: stream
    :arg precision: Number of decimals in the output.
    :type precision: int

    :raises ZeroDivisionError: If the files share no comparable site.
    """
    first_vcf = vcf.Reader(input_handles[0])
    second_vcf = vcf.Reader(input_handles[1])
    symmetric_difference = 0
    total = 0

    for first_record, second_record in vcfutils.walk_together(first_vcf, second_vcf):
        # Skip sites missing from one file and sites involving an indel.
        if not (first_record and second_record):
            continue
        if first_record.is_indel or second_record.is_indel:
            continue

        if (first_record.alleles[1].sequence !=
                second_record.alleles[1].sequence):
            symmetric_difference += 1
        # NOTE(review): the original indentation was lost; this assumes
        # ``total`` counts every compared site, not only differing ones.
        total += 1

    # float() forces true division: under Python 2, int / int truncates and
    # would report 0 for every distance below 1.
    output_handle.write('{value:.{precision}f}\n'.format(
        value=float(symmetric_difference) / total, precision=precision))
def test_walk(self):
    """Exercise ``utils.walk_together`` in three scenarios.

    1. Three readers over the same file advance in lockstep.
    2. Two different files interleave in the expected left/right/together
       pattern.
    3. Files with many chromosomes are walked with a custom
       ``vcf_record_sort_key`` that defines the chromosome order.
    """
    # easy case: all same sites
    reader1 = vcf.Reader(fh('example-4.0.vcf'))
    reader2 = vcf.Reader(fh('example-4.0.vcf'))
    reader3 = vcf.Reader(fh('example-4.0.vcf'))

    n = 0
    for x in utils.walk_together(reader1, reader2, reader3):
        self.assertEqual(len(x), 3)
        self.assertEqual(x[0], x[1])
        self.assertEqual(x[1], x[2])
        n += 1
    self.assertEqual(n, 5)

    # artificial case: 2 from the left, 2 from the right, 2 together,
    # 1 from the right, 1 from the left
    expected = 'llrrttrl'
    reader1 = vcf.Reader(fh('walk_left.vcf'))
    reader2 = vcf.Reader(fh('example-4.0.vcf'))

    for ex, recs in zip(expected, utils.walk_together(reader1, reader2)):
        if ex == 'l':
            self.assertIsNotNone(recs[0])
            self.assertIsNone(recs[1])
        if ex == 'r':
            self.assertIsNotNone(recs[1])
            self.assertIsNone(recs[0])
        if ex == 't':
            self.assertIsNotNone(recs[0])
            self.assertIsNotNone(recs[1])

    # test files with many chromosomes, set 'vcf_record_sort_key' to define
    # the chromosome order.
    # ``list(...)`` is required on Python 3, where ``map`` returns an
    # iterator that supports neither ``+`` nor ``.index``.
    chr_order = list(map(str, range(1, 30))) + ['X', 'Y', 'M']
    get_key = lambda r: (chr_order.index(r.CHROM.replace('chr', '')), r.POS)
    reader1 = vcf.Reader(fh('issue-140-file1.vcf'))
    reader2 = vcf.Reader(fh('issue-140-file2.vcf'))
    reader3 = vcf.Reader(fh('issue-140-file3.vcf'))

    # each char is an integer bit flag - like file permissions:
    # 4 = record expected from file1, 2 = from file2, 1 = from file3
    expected = "66642577752767662466"
    walker = utils.walk_together(reader1, reader2, reader3,
                                 vcf_record_sort_key=get_key)
    for ex, recs in zip(expected, walker):
        ex = int(ex)
        for i, flag in enumerate([0x4, 0x2, 0x1]):
            if ex & flag:
                self.assertIsNotNone(recs[i])
            else:
                self.assertIsNone(recs[i])
def get_variants_shared_by_mother_and_son(self):
    """Return how many sites have a record in both the mother and son VCF."""
    self.file_mother = vcf.Reader(open(self.filename_mother, 'r'))
    self.file_son = vcf.Reader(open(self.filename_son, 'r'))

    shared_sites = 0
    # Same approach as for father and son: walk both files in parallel and
    # skip every site that is missing (None) on either side.
    for pair in utils.walk_together(self.file_mother, self.file_son):
        if pair[0] is None or pair[1] is None:
            continue
        shared_sites += 1
    return shared_sites
def get_variants_shared_by_mother_and_son(self):
    """Print and return the number of sites present in both mother and son.

    The VCF paths are read from the module-level names ``file_mother`` and
    ``file_son``. Both files are closed again before returning.

    :return: number of sites for which both files have a record
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("\n+++++++++++++++++++\nReturn the number of identified variants shared by mother and son:")
    count = 0
    with open(file_mother, "r") as mother_handle, open(file_son, "r") as son_handle:
        lines = utils.walk_together(vcf.Reader(mother_handle), vcf.Reader(son_handle))
        for entry in lines:
            # An entry is None when that file has no record at the site.
            if entry[0] is not None and entry[1] is not None:
                count += 1
    print(count)
    return count
def __init__(self, out_dir, dna_vcf, *rna_vcfs):
    """Open all inputs and result files, then compare DNA calls with the RNA calls.

    The comparison itself runs inside the constructor: the DNA reader and
    all RNA readers are walked together and matching sites are reported.

    Args:
        out_dir: Directory for the result files; created if it is missing.
        dna_vcf: Path of the DNA VCF file.
        *rna_vcfs: Paths of the RNA VCF files compared against the DNA calls.
    """
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    self.out_dir = out_dir
    self.dna_reader = vcf.Reader(filename=dna_vcf)
    self.dna_name = os.path.basename(dna_vcf)
    rna_readers = []
    rna_names = []
    # Maps RNA VCF basename -> open '<basename without extension>.results' handle.
    outs_per_sample = OrderedDict()
    for rna_vcf in rna_vcfs:
        basename = os.path.basename(rna_vcf)
        rna_names.append(basename)
        rna_readers.append(vcf.Reader(filename=rna_vcf))
        # also create open file handles for each sample to write results to
        out_file = os.path.join(out_dir, os.path.splitext(basename)[0])
        out_file = out_file + '.results'
        outf = open(out_file, 'w')
        outf.write('\t'.join(PERSAMPLE_COLS) + '\n')
        outs_per_sample[basename] = outf
    self.rna_readers = rna_readers
    self.rna_names = rna_names
    self.outs_per_sample= outs_per_sample
    # Overlap report; its first line records the order of the RNA samples.
    outov = open(os.path.join(out_dir, 'overlap.results'), 'w')
    outov.write('#RNA_SAMPLE_ORDER: ' + ', '.join(rna_names) + '\n')
    outov.write('\t'.join(OVERLAP_COLS) + '\n')
    self.outs_overlap = outov
    # NOTE(review): none of the handles opened above is closed in this
    # method - confirm they are closed elsewhere.

    # Records are treated as the same site when CHROM, POS and REF agree.
    samerec = lambda rec: (rec.CHROM, rec.POS, rec.REF)
    # calls[0] is the DNA record, calls[1:] the RNA records in rna_vcfs order;
    # an entry is None when that file has no record at the current site.
    for calls in utils.walk_together(self.dna_reader, *self.rna_readers,
                                     vcf_record_sort_key=samerec):
        # select for heterozygous calls present in DNA and any one of the RNAs
        if calls[0] is not None and calls[0].samples[0].is_het and \
                any(calls[1:]):
            # gene annotation
            gene = '?'
            if 'ANN' in calls[0].INFO:
                gene = ','.join(calls[0].INFO['ANN'])
            if all(calls[1:]):
                self.write_overlap(gene, calls[0], calls[1:])
                print('Found in DNA and all RNA:', calls[0].CHROM, calls[0].POS, gene, file=sys.stderr)
            else:
                print('Found in DNA and {0} RNA:'.format(len([x for x in calls[1:] if x])), calls[0].CHROM, calls[0].POS, gene, file=sys.stderr)
            # NOTE(review): the original indentation was lost; this assumes
            # write_per_rna runs for every selected site, not only in the
            # ``else`` branch - confirm against the original file.
            self.write_per_rna(gene, calls[0], calls[1:])
def create_vcf_walktogether(vcf_files):
    """Build a generator that walks several VCF readers in parallel.

    Arguments:
        vcf_files: A list of `vcf.Reader` objects

    Returns:
        A `vcf.utils.walk_together` generator whose records are keyed on
        CHROM, POS, REF and ALT
    """
    def site_key(record):
        chrom, pos = record.CHROM, record.POS
        ref, alt = record.REF, record.ALT
        return chrom, pos, ref, alt

    walker = pyvcf_utils.walk_together(*vcf_files, vcf_record_sort_key=site_key)
    return walker
def merge_mother_father_son_into_one_vcf(self):
    """Merge the mother, father and son VCFs into ``merge.vcf``.

    The input paths are read from the module-level names ``file_mother``,
    ``file_father`` and ``file_son``. The mother file is the header template
    for the output. Every record of every site is written, in
    mother/father/son order. All files are closed before returning, so the
    output is fully flushed.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("\n+++++++++++++++++++\nCreates one VCF containing all variants of the trio (merge VCFs):")
    with open(file_mother, "r") as mother_handle, \
            open(file_father, "r") as father_handle, \
            open(file_son, "r") as son_handle, \
            open("merge.vcf", "w") as merge:
        mother_reader = vcf.Reader(mother_handle)
        # The mother reader serves as header template and is then walked
        # itself, so the mother file only has to be opened once.
        writer = vcf.Writer(merge, mother_reader, "\n")
        walker = utils.walk_together(mother_reader,
                                     vcf.Reader(father_handle),
                                     vcf.Reader(son_handle))
        for lines in walker:
            for entry in lines:
                # None marks a file without a record at the current site.
                if entry is not None:
                    writer.write_record(entry)
    print("merge files ok")
def get_variants_shared_by_mother_and_son(self):
    """Return the number of sites where the mother and son records compare equal."""
    self.mother_vcf = vcf.Reader(open("AmpliseqExome.20141120.NA24143.vcf"))
    self.son_vcf = vcf.Reader(open("AmpliseqExome.20141120.NA24385.vcf"))
    paired_records = utils.walk_together(self.mother_vcf, self.son_vcf)
    # Count the pairs whose two entries are equal.
    return sum(1 for pair in paired_records if pair[0] == pair[1])
def get_variants_shared_by_trio(self):
    """Return the number of sites where father, son and mother records compare equal."""
    self.father_vcf = vcf.Reader(open("AmpliseqExome.20141120.NA24149.vcf"))
    self.mother_vcf = vcf.Reader(open("AmpliseqExome.20141120.NA24143.vcf"))
    self.son_vcf = vcf.Reader(open("AmpliseqExome.20141120.NA24385.vcf"))
    # Entries arrive in father, son, mother order.
    triples = utils.walk_together(self.father_vcf, self.son_vcf, self.mother_vcf)
    # The chained comparison makes sure all three variants are the same.
    return sum(1 for triple in triples if triple[0] == triple[1] == triple[2])
def get_variants_shared_by_trio(self):
    """Return how many sites have a record in the mother, father and son VCF."""
    self.file_mother = vcf.Reader(open(self.filename_mother, 'r'))
    self.file_father = vcf.Reader(open(self.filename_father, 'r'))
    self.file_son = vcf.Reader(open(self.filename_son, 'r'))

    walker = utils.walk_together(self.file_mother, self.file_father, self.file_son)
    shared_sites = 0
    # Same idea as the pairwise counts, with one more file to check: a site
    # is shared only when none of the three entries is missing (None).
    for triple in walker:
        if triple[0] is None or triple[1] is None or triple[2] is None:
            continue
        shared_sites += 1
    return shared_sites
def get_variants_shared_by_father_and_son(self):
    """Return how many sites have a record in both the father and son VCF."""
    self.file_father = vcf.Reader(open(self.filename_father, 'r'))
    self.file_son = vcf.Reader(open(self.filename_son, 'r'))

    # utils.walk_together iterates both files in parallel; each item holds
    # the father's record at index 0 and the son's record at index 1.
    walker = utils.walk_together(self.file_father, self.file_son)
    shared_sites = 0
    for pair in walker:
        father_missing = pair[0] is None
        son_missing = pair[1] is None
        # Count the site only when neither side is empty.
        if not (father_missing or son_missing):
            shared_sites += 1
    return shared_sites
def merge_mother_father_son_into_one_vcf(self):
    """Merge the mother, father and son VCFs into ``trio_file.vcf``.

    The mother file is the header template for the output. Every record of
    every site is written, in mother/father/son order. The output file is
    closed before returning, so it is fully flushed to disk.

    :return: a confirmation message
    """
    self.file_mother = vcf.Reader(open(self.filename_mother, 'r'))
    self.file_father = vcf.Reader(open(self.filename_father, 'r'))
    self.file_son = vcf.Reader(open(self.filename_son, 'r'))

    # ``with`` guarantees the merged file is flushed and closed, also when
    # writing a record fails.
    with open("trio_file.vcf", 'w') as trio_file:
        # vcf.Writer needs the output stream, a template reader (the mother)
        # and a line terminator ("\n").
        writer = vcf.Writer(trio_file, self.file_mother, "\n")
        for record in utils.walk_together(self.file_mother, self.file_father,
                                          self.file_son):
            for entry in record:
                # None marks a file without a record at the current site.
                if entry is not None:
                    writer.write_record(entry)

    result = "The files have been merged into trio_file.vcf"
    return result
def merge_hc_mity(fhc, fmity, fout, priority):
    """Merge a HaplotypeCaller VCF and a mity VCF into a new VCF.

    Records on contigs other than ``MT`` are taken from the HaplotypeCaller
    file; ``MT`` records are taken from the mity file. The mity header is the
    template for the output, updated with the HaplotypeCaller INFO and
    metadata entries.

    Args:
        fhc: Input stream of the HaplotypeCaller VCF (passed to ``vcf.Reader``).
        fmity: Input stream of the mity VCF (passed to ``vcf.Reader``).
        fout: Writable stream for the merged VCF.
        priority: Not used by this implementation; kept so existing callers
            keep working.

    Raises:
        ValueError: If the two files differ in samples, contigs or formats.
    """
    hc = vcf.Reader(fhc)
    mity = vcf.Reader(fmity)

    # some sanity checks
    # TODO: possible to make it handle different samples in the two VCFs?
    if sorted(hc.samples) != sorted(mity.samples):
        raise ValueError(
            "Input VCF files must have the same sample column headers.")
    if sorted(hc.contigs.keys()) != sorted(mity.contigs.keys()):
        raise ValueError("Input VCF files must denote the same contigs.")
    if sorted(hc.formats.keys()) != sorted(mity.formats.keys()):
        raise ValueError("Input VCF files must contain the same formats.")

    # NOTE: arbitrarily picking mity as the base template ~ we're doing
    #       dict updates, so the hc values will take precedence
    # merge infos
    mity.infos.update(hc.infos)
    # merge formats ~ not necessary since they're equal
    # TODO: merge filters?
    # merge metadata: move each file's generic command-line entry to a
    # caller-specific key. pop() both reads and removes it, and nothing is
    # deleted when a file lacks the entry (a bare ``del`` raises KeyError).
    if 'GATKCommandLine' in mity.metadata:
        mity.metadata['UnifiedGenotyperCommandLine'] = \
            mity.metadata.pop('GATKCommandLine')
    if 'GATKCommandLine' in hc.metadata:
        mity.metadata['HaplotypeCallerCommandLine'] = \
            hc.metadata.pop('GATKCommandLine')
    mity.metadata.update(hc.metadata)

    # add custom INFO field, denoting the variant caller for each variant
    mity.infos['GATKCaller'] = _Info(
        'GATKCaller', '.', 'String', 'GATK '
        'variant caller used to call the variant')

    out_writer = vcf.Writer(fout, mity)
    for hc_rec, mity_rec in walk_together(hc, mity):
        # walk_together yields None for a file that has no record at the
        # current site, so each record is checked before it is dereferenced.
        if hc_rec is not None and hc_rec.CHROM != "MT":
            out_writer.write_record(hc_rec)
        elif mity_rec is not None and mity_rec.CHROM == "MT":
            out_writer.write_record(mity_rec)
        # Anything else is an MT call seen only by HaplotypeCaller or a
        # non-MT call seen only by mity; neither belongs in the output.
def merge_mother_father_son_into_one_vcf(self):
    '''
    Creates one VCF containing all variants of the trio (merge VCFs).

    The merged records are written to ``merge_file.vcf``, with the mother
    file as header template. Every record of every site is written, in
    mother/father/son order. All files are closed before returning.

    :return: None
    '''
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("\n---------------\nMerging files..")
    with open(self.vcf_son, 'r') as son_handle, \
            open(self.vcf_mother, 'r') as mother_handle, \
            open(self.vcf_father, 'r') as father_handle, \
            open("merge_file.vcf", "w") as merge_file:
        vcf_readerson = vcf.Reader(son_handle)
        vcf_readermother = vcf.Reader(mother_handle)
        vcf_readerfather = vcf.Reader(father_handle)

        writer = vcf.Writer(merge_file, vcf_readermother, "\n")
        for records in utils.walk_together(vcf_readermother, vcf_readerfather,
                                           vcf_readerson):
            for entry in records:
                # None marks a file without a record at the current site.
                if entry is not None:
                    writer.write_record(entry)
    print("Successfully merged files: Outputfile = merge_file.vcf")
def merge_mother_father_son_into_one_vcf(self): self.father_vcf = vcf.Reader(open("AmpliseqExome.20141120.NA24149.vcf")) self.mother_vcf = vcf.Reader(open("AmpliseqExome.20141120.NA24143.vcf")) self.son_vcf = vcf.Reader(open("AmpliseqExome.20141120.NA24385.vcf")) FS = open("FSM.vcf", "w") FSVCF = vcf.Writer(FS, self.son_vcf, "\n") shared = utils.walk_together(self.father_vcf, self.son_vcf, self.mother_vcf) count = 0 for i in shared: if i[0] or i[1] or i[2]: # if i[0] is None: if i[1] is None: FSVCF.write_record(i[2]);count += 1 elif i[2] is None: FSVCF.write_record(i[1]);count += 1 else: #if i[2] and i[1] are true FSVCF.write_record(i[1]);count += 1 elif i[1] is None: if i[0] is None: FSVCF.write_record(i[2]);count += 1 elif i[2] is None: FSVCF.write_record(i[0]);count += 1 else: #if i[2] and i[0] are true FSVCF.write_record(i[2]);count += 1 elif i[2] is None: if i[0] is None: FSVCF.write_record(i[1]);count += 1 elif i[1] is None: FSVCF.write_record(i[0]);count += 1 else: # if i[1] and i[0] are true FSVCF.write_record(i[0]);count += 1 else: FSVCF.write_record(i[0]);count += 1 FSVCF.close() FS.close() print("the three have " + str(count) + " variants\n they are saved in FSM.vcf") #53227 return '''
def merge_mother_father_son_into_one_vcf(self):
    '''
    Creates one VCF containing all variants of the trio (merge VCFs).

    Uses the readers already stored on the instance (``self.file_mother``,
    ``self.file_father``, ``self.file_son``) and writes every record of
    every site to ``trio_file.vcf``.

    :return: a confirmation message
    '''
    # vcf.Writer(stream, template, lineterminator) needs a template reader
    # from which the metadata is taken; the son file is used here.
    # ``with`` guarantees the output is flushed and closed, also on errors.
    with open("trio_file.vcf", "w") as trio_file:
        writer = vcf.Writer(trio_file, self.file_son, "\n")

        # vcf.utils.walk_together iterates over several VCF files at once.
        # The father reader used to be passed twice here, so the mother's
        # variants were missing from the merged file.
        for records in utils.walk_together(self.file_mother, self.file_father,
                                           self.file_son):
            # Every entry that is not None is written to the output.
            for eintrag in records:
                if eintrag is not None:
                    writer.write_record(eintrag)

    success = "The file has been merged successfully to trio_file.vcf"
    return success
def _alt_allele_frequency(record):
    """Return the alt-allele frequency among called genotypes, or None if none are called."""
    num_hom_alts = len(record.get_hom_alts())
    num_hom_refs = len(record.get_hom_refs())
    num_hets = len(record.get_hets())
    total_genotypes = num_hom_alts + num_hom_refs + num_hets
    if total_genotypes == 0:
        return None
    minor_allele_count = (num_hom_alts * 2) + num_hets
    return float(minor_allele_count) / float(total_genotypes * 2)


def _save_af_scatter(x_values, y_values, report_name, png_path):
    """Save a log-log scatter plot of paired allele frequencies to png_path."""
    fig = plt.figure()
    ax = plt.gca()
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.scatter(x_values, y_values, color='black', s=8)
    ax.set_xlabel('Allele Frequency Sample Set Broad')
    ax.set_ylabel('Allele Frequency Sample Set Not Broad')
    ax.grid()
    ax.plot([0.0, 1.0], [0.0, 1.0], color="red")
    plt.xticks([0.001, 0.01, 0.1, 1])
    plt.yticks([0.001, 0.01, 0.1, 1])
    plt.title("Allele frequency for {}".format(report_name))
    fig.savefig(png_path)
    plt.close(fig)


def main(args):
    """Compare allele frequencies of the sites shared by two VCF files.

    For every site present in both files the alt-allele frequency is computed
    per file. Sites with a frequency of 0 in either file, or with no called
    genotypes, are skipped. The remaining pairs are split into "rare" (either
    frequency <= 0.01) and "common" (both > 0.01). Two-sample KS statistics
    are written to a TSV file and three scatter plots (all, rare, common)
    are saved as PNG files.

    :param args: not used; the options are obtained from ``parse_args()``.
    """
    options = parse_args()

    x_list = list()
    y_list = list()
    rare_x_list = list()
    rare_y_list = list()
    common_x_list = list()
    common_y_list = list()

    # ``with`` closes both inputs once the walk is complete.
    with open(options.inVCFA, 'r') as handle_a, open(options.inVCFB, 'r') as handle_b:
        vcf_reader_A = vcf.Reader(handle_a)
        vcf_reader_B = vcf.Reader(handle_b)
        for records in vcfutils.walk_together(vcf_reader_A, vcf_reader_B):
            # Sites missing from one file are reported and skipped.
            if None in records:
                print(records)
                continue

            minor_af_a = _alt_allele_frequency(records[0])
            minor_af_b = _alt_allele_frequency(records[1])
            # A site without any called genotype has no defined frequency
            # (this used to raise ZeroDivisionError).
            if minor_af_a is None or minor_af_b is None:
                continue
            if minor_af_a == 0.0 or minor_af_b == 0.0:
                continue

            x_list.append(minor_af_a)
            y_list.append(minor_af_b)
            # Rare means either frequency is at most 0.01. The first test
            # used ``==``, which sent rare sites of sample A to the
            # "common" set.
            if minor_af_a <= 0.01 or minor_af_b <= 0.01:
                rare_x_list.append(minor_af_a)
                rare_y_list.append(minor_af_b)
            else:
                common_x_list.append(minor_af_a)
                common_y_list.append(minor_af_b)

    adjust_figure = False
    if min(x_list) < 0.001:
        print("WARNING an x value is less than 0.001")
        adjust_figure = True
    if min(y_list) < 0.001:
        print("WARNING an y value is less than 0.001")
        adjust_figure = True

    # Calculate ks stats
    total_ks_stat, total_ks_pvalue = ks_2samp(x_list, y_list)
    rare_ks_stat, rare_ks_pvalue = ks_2samp(rare_x_list, rare_y_list)
    common_ks_stat, common_ks_pvalue = ks_2samp(common_x_list, common_y_list)
    with open('AF_comparison.raw_AF.{}.ks_stats.tsv'.format(options.outReport), 'w') as ks_output:
        ks_output.write('variant_type\tsample_size\tks_stat\tks_pvalue\n')
        ks_output.write('total\t{}\t{}\t{}\n'.format(len(x_list), total_ks_stat, total_ks_pvalue))
        ks_output.write('rare\t{}\t{}\t{}\n'.format(len(rare_x_list), rare_ks_stat, rare_ks_pvalue))
        ks_output.write('common\t{}\t{}\t{}\n'.format(len(common_x_list), common_ks_stat, common_ks_pvalue))

    # NOTE(review): this drops into the interactive debugger whenever a
    # frequency is below the lowest axis tick; it blocks unattended runs.
    # Kept because it may be a deliberate manual checkpoint - confirm.
    if adjust_figure:
        pdb.set_trace()

    # Plot total variant figure
    _save_af_scatter(
        x_list, y_list, options.outReport,
        "AF_comparison.raw_AF.{}.png".format(options.outReport))

    # Plot rare variant figure
    _save_af_scatter(
        rare_x_list, rare_y_list, options.outReport,
        "AF_comparison.raw_AF.either_less_than_or_equal_0.01.{}.png".format(
            options.outReport))

    # Plot common variant figure
    _save_af_scatter(
        common_x_list, common_y_list, options.outReport,
        "AF_comparison.raw_AF.both_more_than_0.01.{}.png".format(
            options.outReport))
def test_walk(self):
    """Check ``utils.walk_together`` with default and with custom record equality."""
    # Identical inputs: three readers over the same file advance in lockstep.
    same_file_readers = [vcf.Reader(fh('example-4.0.vcf')) for _ in range(3)]
    sites_seen = 0
    for triple in utils.walk_together(*same_file_readers):
        assert len(triple) == 3
        assert (triple[0] == triple[1]) and (triple[1] == triple[2])
        sites_seen += 1
    assert sites_seen == 5

    # Interleaved inputs: 2 left-only, 2 right-only, 2 shared, 1 right-only,
    # 1 left-only.
    expected = 'llrrttrl'
    left_reader = vcf.Reader(fh('walk_left.vcf'))
    right_reader = vcf.Reader(fh('example-4.0.vcf'))
    for side, pair in zip(expected, utils.walk_together(left_reader, right_reader)):
        left_rec, right_rec = pair[0], pair[1]
        if side == 'l':
            assert left_rec is not None
            assert right_rec is None
        if side == 'r':
            assert right_rec is not None
            assert left_rec is None
        if side == 't':
            assert left_rec is not None
            assert right_rec is not None

    # Default equality: the first step over these two files is expected to
    # raise AttributeError.
    default_walk = utils.walk_together(vcf.Reader(fh('example-4.0.vcf')),
                                       vcf.Reader(fh('walk_refcall.vcf')))
    self.assertRaises(AttributeError, next, default_walk)

    # Custom equality: the same two files can be walked to the end.
    reader1 = vcf.Reader(fh('example-4.0.vcf'))
    reader2 = vcf.Reader(fh('walk_refcall.vcf'))

    def same_site(rec1, rec2):
        # Records match on CHROM, POS and REF only; a missing record never
        # matches anything.
        if rec1 is None or rec2 is None:
            return False
        return (rec1.CHROM, rec1.POS, rec1.REF) == (rec2.CHROM, rec2.POS, rec2.REF)

    total_rows = 0
    matched_rows = 0
    for pair in utils.walk_together(reader1, reader2, eq_func=same_site):
        assert len(pair) == 2
        both_present = pair[0] is not None and pair[1] is not None
        # Only compare when neither record is None.
        if both_present:
            assert (same_site(pair[0], pair[1]) and same_site(pair[1], pair[0]))
            matched_rows += 1
        # Counted for every row, so the test proves the walk ran to the end.
        total_rows += 1
    # check number of records total
    assert total_rows == 5
    # check how many records found in all files
    assert matched_rows == 4
parser.add_option("-v", "--vcf", action="store", type="string", dest="vcf_filename") parser.add_option("-n", "--nea", action="store", type="string", dest="nea_filename") (options, args) = parser.parse_args() vcf_reader = vcf.Reader(open(options.vcf_filename, 'r')) nea_reader = vcf.Reader(open(options.nea_filename, 'r')) for record in utils.walk_together(vcf_reader, nea_reader): human_record = record[0] neand_record = record[1] if (human_record is not None) & (neand_record is not None): neand_gt = neand_record.genotype('AltaiNea').gt_bases if (neand_gt is not None): if ((human_record.INFO['AFR_AF'][0] < 0.01) & ((human_record.INFO['EAS_AF'][0] > 0.01) | (human_record.INFO['EUR_AF'][0] > 0.01))): if ((neand_gt[0] is not human_record.REF) | (neand_gt[2] is not human_record.REF)): print human_record.CHROM, human_record.POS, human_record.POS, human_record.ID, human_record.REF, human_record.ALT[ 0], human_record.QUAL, print neand_gt[0], neand_gt[2], neand_record.QUAL, print human_record.INFO['AFR_AF'][0], human_record.INFO[ 'EUR_AF'][0], human_record.INFO['EAS_AF'][0]
def test_walk(self):
    """Verify ``utils.walk_together`` for lockstep, interleaved and custom-equality walks."""
    # --- lockstep: the same file three times ---
    first = vcf.Reader(fh('example-4.0.vcf'))
    second = vcf.Reader(fh('example-4.0.vcf'))
    third = vcf.Reader(fh('example-4.0.vcf'))
    row_count = 0
    for row_count, row in enumerate(utils.walk_together(first, second, third), 1):
        assert len(row) == 3
        assert (row[0] == row[1]) and (row[1] == row[2])
    assert row_count == 5

    # --- interleaved: 2 left, 2 right, 2 together, 1 right, 1 left ---
    expected = 'llrrttrl'
    left = vcf.Reader(fh('walk_left.vcf'))
    right = vcf.Reader(fh('example-4.0.vcf'))
    for code, recs in zip(expected, utils.walk_together(left, right)):
        if code == 'l':
            assert recs[0] is not None
            assert recs[1] is None
        if code == 'r':
            assert recs[1] is not None
            assert recs[0] is None
        if code == 't':
            assert recs[0] is not None
            assert recs[1] is not None

    # --- custom equality function ---
    # With the default comparison the first step is expected to raise.
    plain = vcf.Reader(fh('example-4.0.vcf'))
    refcall = vcf.Reader(fh('walk_refcall.vcf'))
    self.assertRaises(AttributeError, next,
                      utils.walk_together(plain, refcall))

    # With a custom comparison the iteration completes.
    plain = vcf.Reader(fh('example-4.0.vcf'))
    refcall = vcf.Reader(fh('walk_refcall.vcf'))

    def custom_eq(rec1, rec2):
        # Equality is decided on CHROM, POS and REF only.
        if rec1 is None or rec2 is None:
            return False
        if rec1.CHROM != rec2.CHROM:
            return False
        if rec1.POS != rec2.POS:
            return False
        return rec1.REF == rec2.REF

    nrecs = 0
    ncomps = 0
    walker = utils.walk_together(plain, refcall, eq_func=custom_eq)
    # enumerate keeps nrecs equal to the number of rows walked so far.
    for nrecs, x in enumerate(walker, 1):
        assert len(x) == 2
        # Skip the comparison when one of the records is None.
        if x[0] is None or x[1] is None:
            continue
        assert (custom_eq(x[0], x[1]) and custom_eq(x[1], x[0]))
        ncomps += 1
    # check number of records total
    assert nrecs == 5
    # check how many records found in all files
    assert ncomps == 4
def main():
    """Compare a "dev" VCF against a "truth" VCF and print a report.

    Command line: ``sys.argv[1]`` is the truth VCF, ``sys.argv[2]`` the dev
    VCF and the optional ``sys.argv[3]`` the sample name. Without a sample
    name, the truth file's basename up to the first ``os.extsep`` is used.

    Both files are walked together. For records that compare equal, the
    sample's DP and GQ values are compared and tallied on a ``Comparison``
    summary. Records that do not compare equal are collected and printed at
    the end, followed by histograms of the DP differences.
    """
    #dev_vcf = '/Users/atoutoud/Projects/testCompare/data/NA12891.dev_short.vcf'
    #truth_vcf = '/Users/atoutoud/Projects/testCompare/data/NA12891.truth_short.vcf'
    truth_vcf = sys.argv[1]
    dev_vcf = sys.argv[2]
    #dev_reader= vcf.Reader(open(dev_vcf, 'r'))
    #truth_reader = vcf.Reader(open(truth_vcf,'r'))
    print('Sample Comparison')
    dev_reader = vcf.Reader(filename=dev_vcf)
    truth_reader = vcf.Reader(filename=truth_vcf)
    # Checks whether a sample name is provided. pyVCF looks the sample up by
    # the name in the header line; for replicates of the same sample with
    # different filenames the correct name must be given, otherwise the
    # lookup fails.
    if len(sys.argv) == 4:
        sample = sys.argv[3]
    else:
        sample = os.path.basename(truth_vcf).split(os.extsep)[0]
    print(sample)
    summary = Comparison()
    records_dont_match = []
    call_difference = []
    percent_difference = []
    DP_range = []
    # walk_together is a pyVCF built-in that reads two VCFs at the same time;
    # a side is None when that file has no record at the current site.
    for dev_rec, truth_rec in walk_together(dev_reader, truth_reader):
        # A record corresponds to [CHROM,POS,REF,ALT]; when both records are
        # the same, the differences in their metrics are checked.
        if dev_rec == truth_rec:
            # NOTE(review): the original indentation was lost. The nesting of
            # the GQ checks and of ``else: summary.matches`` below is a
            # best-effort reconstruction - confirm against the original file.
            try:
                # If the DP is different between the records.
                if dev_rec.genotype(sample)['DP'] != truth_rec.genotype(
                        sample)['DP']:
                    summary.diff_metrics += 1
                    #count_metrics += 1
                    print('')
                    print(dev_rec.CHROM, dev_rec.POS, dev_rec.REF, dev_rec.ALT,
                          dev_rec.QUAL)
                    print(
                        '--------------------------------------------------------------'
                    )
                    print('\t'.join(dev_rec.FORMAT.split(':')))
                    # First data row: truth values; second row: dev values.
                    for entry in truth_rec.genotype(sample).data:
                        print(entry, end='\t')
                    print('')
                    for entry in dev_rec.genotype(sample).data:
                        print(entry, end='\t')
                    true_DP = truth_rec.genotype(sample)['DP']
                    test_DP = dev_rec.genotype(sample)['DP']
                    DP_range.append(true_DP)
                    DP_range.append(test_DP)
                    if true_DP == 0:
                        difference = 0  # had to set, as the % different calculation divides by the true_DP, so if that is 0 it breaks
                    else:
                        difference = round(
                            abs((test_DP - true_DP) / true_DP * 100), 4)
                    percent_difference.append(difference)
                    DP_diff = true_DP - test_DP
                    call_difference.append(DP_diff)
                    print('\nDP difference {}'.format(DP_diff))
                    print(difference, '%')
                    print('')
                    if dev_rec.genotype(sample)['GQ'] <= 20:
                        summary.dev_GQ += 1
                    elif truth_rec.genotype(sample)['GQ'] <= 20:
                        summary.truth_GQ += 1
                else:
                    summary.matches += 1
                if dev_rec.genotype(sample)['GQ'] != truth_rec.genotype(
                        sample)['GQ']:
                    summary.diff_GQ += 1
            #
            except AttributeError:
                # Counted as a record without FORMAT fields.
                summary.no_format_count += 1
                summary.no_formats.append([dev_rec, dev_rec.INFO])
                print('No format fields {} at position:{}'.format(
                    dev_rec.CHROM, dev_rec.POS))
        else:
            #count_no_match +=1
            summary.diff += 1
            # Stores the differing records so they can be reported all
            # together at the end; a missing side is stored as None.
            if truth_rec is None:
                records_dont_match.append({
                    "truth": (truth_rec),
                    "dev": (dev_rec.CHROM, dev_rec.POS, dev_rec.REF,
                            dev_rec.ALT)
                })
            elif dev_rec is None:
                records_dont_match.append({
                    "truth": (truth_rec.CHROM, truth_rec.POS, truth_rec.REF,
                              truth_rec.ALT),
                    "dev": (dev_rec)
                })
            else:
                records_dont_match.append({
                    "truth": (truth_rec.CHROM, truth_rec.POS, truth_rec.REF,
                              truth_rec.ALT),
                    "dev": (dev_rec.CHROM, dev_rec.POS, dev_rec.REF,
                            dev_rec.ALT)
                })
            #print ('** Records do not match **',dev_rec,truth_rec)
    summary.total_count()
    stats1, stats2 = summary.get_stats()
    #summary.output_no_format()
    # Prints the summary metrics for the entirety of the vcf files
    summary.print_metrics(round(stats1, 4), round(stats2, 4))
    print("\nRecords that didn't match first is truth, second is dev")
    for i in records_dont_match:
        print(i)
    # Outputs histogram values for DP difference and percent difference.
    # NOTE(review): call_difference, percent_difference and DP_range are only
    # filled when some DP differs; max()/min() raise ValueError on an empty
    # list, so a run without any DP difference fails here.
    calculate_hist(call_difference,
                   [1, 2, 3, 4, 5, 6, 7, 8, 9,
                    max(call_difference)])
    print(
        '***Please note last bin contains values of difference greater than 8 calls***'
    )
    calculate_hist(percent_difference, [
        0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0,
        max(percent_difference)
    ])
    print(
        '***Please note last bin contains entities with a percent change greater than 5% ***'
    )
    print('\nDP range:', min(DP_range), max(DP_range))