def summary_haplotype_block(vcf, haplotype, outfile, sample='FCM'):
    """Write a per-phased-block summary table for ``vcf`` to ``outfile``.

    Every record yielded by ``VCF.lines(vcf)`` is assigned to a block keyed
    by ``"<CHROM>_<block id>"``.  The allele selected by the first character
    of the record's genotype (``0`` picks REF, ``1`` picks ALT) is compared
    with the two alleles stored in ``haplotype`` for that site and flagged:

    * ``0``  -- equals ``haplotype[site][0]``
    * ``1``  -- equals ``haplotype[site][1]``
    * ``-1`` -- site absent from ``haplotype``, or neither allele matches

    One tab-separated line is written per block::

        block  n_snps  length  start  end  n_flag0  n_flag1  n_unmatched

    Args:
        vcf: passed unchanged to ``VCF.lines``.
        haplotype: mapping from ``"CHROM:POS"`` to an indexable pair of
            alleles.
        outfile: path of the table to create (truncated if it exists).
        sample: key of the record entry that holds the genotype field (item
            0) and the block field (item 1).  Defaults to ``'FCM'``.

    Raises:
        ValueError: if the first genotype character is not a digit.
        IndexError: if that digit is neither 0 nor 1.
    """
    # Project-local reader, imported at call time exactly as before.
    import VCF

    block_positions = defaultdict(list)
    block_flags = defaultdict(list)

    for v in VCF.lines(vcf):
        fields = v[sample]
        # The block id is the second ':'-separated token of item 1; the
        # genotype is the first ':'-separated token of item 0.
        block_id = fields[1].split(':')[1]
        genotype = fields[0].split(':')[0]

        block_idx = '{}_{}'.format(v['CHROM'], block_id)
        snp_idx = '{}:{}'.format(v['CHROM'], v['POS'])

        # Allele carried by the first phase of this record.
        hap1_10x = (v['REF'], v['ALT'])[int(genotype[0])]

        haplotype_flag = -1
        if snp_idx in haplotype:
            known = haplotype[snp_idx]
            if hap1_10x == known[0]:
                haplotype_flag = 0
            elif hap1_10x == known[1]:
                haplotype_flag = 1

        block_positions[block_idx].append(int(v['POS']))
        block_flags[block_idx].append(haplotype_flag)

    # The context manager guarantees the handle is closed even if formatting
    # or writing fails part-way through.
    with open(outfile, 'w') as ofile:
        for blc, positions in block_positions.items():
            flags = block_flags[blc]
            start = min(positions)
            end = max(positions)
            ofile.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                blc,
                len(positions),
                end - start + 1,
                start,
                end,
                flags.count(0),
                flags.count(1),
                flags.count(-1),
            ))
def check_vcf( input ):
    """Print the VCF header and its records, splitting some 2-base events.

    The metadata lines of ``VCF(input)`` are printed first.  Each record is
    then either printed unchanged or, for the two shapes handled below,
    replaced by two derived records that both start one base further right.

    Raises:
        Exception: when a record has a multi-base REF and more than one ALT.

    NOTE(review): the indentation of this function was reconstructed from a
    flattened source; in particular the pairing of the three trailing
    ``else`` branches should be confirmed against the original file.
    """
    v = VCF( input )
    debug = False
    print "\n".join(v.metadata)
    for line in v.lines():
        r = line.ref
        a = line.alt_list
        if len(r) > 1:
            if len(a) > 1: raise Exception("WARNING: multi-allelic change not coded for")
            # work on copies so the record being iterated is left untouched
            line1 = copy.deepcopy( line )
            line2 = copy.deepcopy( line )
            if len(a[0]) > 1:
                if len(r) < len(a[0]):
                    # insertion
                    if len(r) == 2:
                        # line1: second REF base -> that base plus ALT[2:]
                        line1.ref = r[1]
                        line1.alt = r[1]+a[0][2:]
                        line1.pos += 1
                        # line2: second REF base -> second ALT base
                        line2.ref = r[1]
                        line2.alt = a[0][1]
                        line2.pos += 1
                        if debug: print "===="
                        print line1
                        print line2
                        if debug: print line
                    # NOTE(review): as reconstructed, insertions whose REF is
                    # longer than 2 bases print nothing -- confirm intent.
                elif len(a[0]) < len(r):
                    # ALT shorter than REF
                    if len(a[0]) == 2:
                        # line1: REF[1:] -> second REF base
                        line1.ref = r[1:]
                        line1.alt = r[1]
                        line1.pos += 1
                        # line2: second REF base -> second ALT base
                        line2.ref = r[1]
                        line2.alt = a[0][1]
                        line2.pos += 1
                        if debug: print "===="
                        print line1
                        print line2
                        if debug: print line
                    # NOTE(review): as reconstructed, ALT lengths other than 2
                    # print nothing in this branch -- confirm intent.
                else:
                    # REF and ALT have equal length: pass through unchanged
                    print line
            else:
                # single-base ALT: pass through unchanged
                print line
        else:
            # single-base REF: pass through unchanged
            print line
def sort_vcf( input, reference, output ):
    """Rewrite the VCF ``input`` to ``output`` in reference contig order.

    The contig order comes from ``get_contig_list(reference)``.  The header
    (``metadata``) is copied first; then, contig by contig, the VCF is
    positioned with ``seek`` and every record whose chromosome equals that
    contig is appended.  Progress messages go to stdout.

    Args:
        input: VCF to read; opened with indexing requested.
        reference: handed to ``get_contig_list``.
        output: path of the file to create.
    """
    contigs = get_contig_list( reference )
    print("read {} contigs".format(len(contigs)))

    # request index of VCF upon open
    vcf = VCF( input, True )

    with open( output, 'w' ) as sink:
        for header_line in vcf.metadata:
            sink.write( header_line + "\n" )

        for contig in contigs:
            print("writing entries for contig {}".format(contig))

            # a negative seek result means the contig is absent from the VCF
            if vcf.seek(contig) < 0:
                print("skipped {} because it's not in the VCF".format( contig ))
                continue

            def on_this_contig( raw_line ):
                # keep only records that belong to the contig being written
                return VCFLine(raw_line).chr == contig

            written = 0
            for record in vcf.lines( True, on_this_contig ):
                sink.write( record.line + "\n" )
                written += 1
            print("wrote {} entries for {}".format( written, contig ))
import VCF
import cyvcf2

# Script: collect the positions of every record per chromosome and group
# them into runs of consecutive integers.

try:
    ## MUST BE A UNCOMPRESSED VCF file
    finput = argv[1]
except IndexError as ie:
    exit("{}\nUSAGE: $0 $vcf_file ".format(ie))

if not path.exists(finput):
    msg = "ERROR: FNF {}".format(finput)
    raise IOError(msg)

# chromosome -> list of POS values, in file order
d = {}
for v in VCF.lines(finput):
    d.setdefault(v['CHROM'], []).append(v['POS'])

with open("{}.consPos.txt".format(finput), 'wt') as of:
    for key, val in d.items():
        ## make sure all positions are integer ; if not raise error
        try:
            data = [int(i) for i in val]
        except ValueError as ve:
            # Report the exception that was actually caught; the previous
            # code formatted an undefined name and died with NameError.
            exit("ERROR: {}".format(ve))
        # https://stackoverflow.com/questions/2361945/detecting-consecutive-integers-in-a-list
        # index - value is constant within a run of consecutive integers,
        # so grouping on it yields one group per run.
        for k, g in groupby(enumerate(data), lambda ix: ix[0] - ix[1]):
            cn = list(map(itemgetter(1), g))
            # NOTE(review): nothing visible here uses `cn` or writes to
            # `of`; the remainder of the loop body appears to be missing.
def summary_haplotype_block(vcf, haplotype, outfile_up, outfile_down, sample='FCM'):
    """Assign phased blocks to a known haplotype and emit them on two tracks.

    Records from ``VCF.lines(vcf)`` are grouped into blocks keyed by
    ``"<CHROM>_<block id>"``.  For each record the allele selected by the
    first genotype character (``0`` picks REF, ``1`` picks ALT) is compared
    with ``haplotype["CHROM:POS"]`` and flagged ``0`` (equals item 0),
    ``1`` (equals item 1) or ``-1`` (site unknown or no match).

    For every block a summary line and a few diagnostic values are printed
    to stdout.  A block whose dominant flag accounts for more than 95% of
    its matched sites is additionally written as::

        chrom  start  end  hap  color  +

    with ``hap``/``color`` being ``1``/``orange`` when flag 0 dominates and
    ``2``/``blue`` otherwise.  Blocks with an odd rank go to ``outfile_up``
    and the others to ``outfile_down``.

    Args:
        vcf: passed unchanged to ``VCF.lines``.
        haplotype: mapping from ``"CHROM:POS"`` to an indexable pair of
            alleles.
        outfile_up: writable file-like object for odd-ranked blocks.
        outfile_down: writable file-like object for even-ranked blocks.
        sample: key of the record entry that holds the genotype field (item
            0) and the block field (item 1).  Defaults to ``'FCM'``.

    Raises:
        ValueError: if the first genotype character is not a digit.
        IndexError: if that digit is neither 0 nor 1.
    """
    # Project-local reader, imported at call time exactly as before.
    import VCF

    block_positions = defaultdict(list)
    block_flags = defaultdict(list)
    block_rank = {}
    # The chromosome is remembered per block rather than recovered by
    # splitting the block key on '_', which fails for contig names that
    # themselves contain underscores.
    block_chrom = {}
    count = 0

    for v in VCF.lines(vcf):
        fields = v[sample]
        block_id = fields[1].split(':')[1]
        genotype = fields[0].split(':')[0]

        chrom = v['CHROM']
        block_idx = '{}_{}'.format(chrom, block_id)
        snp_idx = '{}:{}'.format(chrom, v['POS'])

        # Allele carried by the first phase of this record.
        hap1_10x = (v['REF'], v['ALT'])[int(genotype[0])]

        haplotype_flag = -1
        if snp_idx in haplotype:
            known = haplotype[snp_idx]
            if hap1_10x == known[0]:
                haplotype_flag = 0
            elif hap1_10x == known[1]:
                haplotype_flag = 1

        if block_idx not in block_positions:
            count += 1
        block_positions[block_idx].append(int(v['POS']))
        block_flags[block_idx].append(haplotype_flag)
        # The rank is refreshed on every record, so it holds the number of
        # distinct blocks seen when the block's latest record was read.
        block_rank[block_idx] = count
        block_chrom[block_idx] = chrom

    min_snp = 1
    for blc, positions in block_positions.items():
        flags = block_flags[blc]
        chrs = block_chrom[blc]
        snps = len(positions)
        start = min(positions)
        end = max(positions)
        length = end - start + 1
        hap1_n = flags.count(0)
        hap2_n = flags.count(1)
        hap0_n = flags.count(-1)
        print('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
            blc, snps, length, chrs, start, end, hap1_n, hap2_n, hap0_n))

        # Share of matched sites that agree with the dominant haplotype;
        # stays 0.0 when nothing matched or the two flags are tied.
        ratio = 0.0
        if hap1_n >= min_snp and hap1_n == snps - hap0_n:
            ratio = 1
        elif hap2_n >= min_snp and hap2_n == snps - hap0_n:
            ratio = 1
        elif hap1_n >= min_snp and hap1_n > hap2_n:
            ratio = float(hap1_n) / (float(hap1_n) + float(hap2_n))
        elif hap2_n >= min_snp and hap2_n > hap1_n:
            ratio = float(hap2_n) / (float(hap1_n) + float(hap2_n))
        print(ratio)

        # Only confidently assigned blocks are reported.
        if not ratio > 0.95:
            continue

        print('{} {}'.format(hap1_n, hap2_n))
        if hap1_n > hap2_n:
            color = 'orange'
            hap = 1
        else:
            color = 'blue'
            hap = 2
        print(color)

        # Alternate between the two tracks according to the block's rank.
        target = outfile_up if block_rank[blc] % 2 == 1 else outfile_down
        target.write('{}\t{}\t{}\t{}\t{}\t+\n'.format(chrs, start, end, hap, color))
def nr_sensitivity( input, sample, truth, minqual=0, misses=None, debug=False ):
    """Count non-reference truth sites that are found and matched in ``input``.

    Both VCFs are walked once, in step: for each truth record with a
    non-reference genotype the eval iterator is advanced to the same
    chromosome and then to the first position that is not smaller.  A truth
    site is a *miss* when the eval file is exhausted, the chromosome or
    position differs, or the eval QUAL is numeric and below ``minqual``.
    Otherwise it may count as a *site hit* (the eval genotype keeps at least
    one allele index after a single '0' is removed) and as a *concordance*
    (every non-reference truth allele is present in the eval ALT list).

    Progress and a final summary line are printed to stdout; diagnostics go
    to stderr.

    Args:
        input: call set to evaluate; passed to ``VCF``.
        sample: sample name looked up in both files' ``sample_names``.
        truth: truth set; passed to ``VCF``.
        minqual: QUAL threshold below which an eval record is a miss.
        misses: optional path; when given, the truth header (plus one
            provenance line) and every missed truth record are written there.
        debug: when true, emit per-site tracing on stderr.

    Raises:
        Exception: no ``#CHROM`` line in the truth metadata (only checked
            when ``misses`` is given); QUAL is '.' while ``minqual`` > 0;
            REF differs between matching records; shared chromosomes appear
            in a different order in the two files.

    NOTE(review): indentation was reconstructed from a flattened source;
    confirm the nesting, in particular the extent of the
    ``if eval_genotype != "."`` body.
    NOTE(review): ``last_truth`` / ``last_eval`` are never reassigned in this
    function, so ``check_sort_order`` always receives ``(None, None)``.
    NOTE(review): the summary divides by ``n_truth_lines`` and raises
    ZeroDivisionError when no truth record was counted.
    """
    print >>sys.stderr,"input={}\ntruth={}\nminqual={}".format(input,truth,minqual)

    truth_vcf = VCF( truth )
    eval_vcf = VCF( input )

    if misses:
        misses_fd = open( misses, 'w' )
        found = False
        # write the truth metadata, adding a provenance line just before the
        # #CHROM column header
        for line in truth_vcf.metadata:
            if line[:len("#CHROM")]=="#CHROM":
                misses_fd.write( '##nr_concordance="comment={subset of '\
                    'missed sites created by DISCOVAR release bundle Python program '\
                    'nr_concordance.py}"\n')
                found = True
            misses_fd.write(line+"\n")
        if not found:
            raise Exception("program bug? made it this far without #CHROM in truth VCF?")
    else:
        misses_fd = None

    eval_sample_index = eval_vcf.sample_names.index(sample)
    truth_sample_index = truth_vcf.sample_names.index(sample)

    # chromosomes in order of first appearance in each file
    eval_chr=[]
    truth_chr=[]

    # prime the eval iterator with its first record
    eval_gen = eval_vcf.lines()
    eval_line = eval_gen.next()
    eval_chr.append( eval_line.chr )
    eval_done = False

    n_truth_lines = 0
    n_site_hits = 0
    n_site_concords = 0

    last_truth=(None,None)
    last_eval=(None,None)

    for truth_line in truth_vcf.lines():
        check_sort_order( truth, last_truth, truth_line.chr, truth_line.pos )
        truth_genotype=truth_line.get_sample_dict(truth_sample_index)["GT"]
        # only truth sites with a non-reference, called genotype are scored
        if truth_genotype == "0/0" or truth_genotype=="0|0" or truth_genotype == ".":
            continue
        if debug: print >>sys.stderr,"seeking {}:{}".format( truth_line.chr, truth_line.pos )
        n_truth_lines += 1
        if n_truth_lines % 1000 == 0: print n_truth_lines

        # skip to the correct chromosome
        if len(truth_chr) == 0 or truth_chr[-1] != truth_line.chr:
            truth_chr.append( truth_line.chr )

        # if we've already passed this chr in the eval file, then spin
        if not eval_done and truth_line.chr != eval_line.chr and truth_line.chr in eval_chr:
            continue

        # if we've not already passed this chr, then find it
        while not eval_done and eval_line.chr != truth_line.chr:
            try:
                eval_line = eval_gen.next()
                check_sort_order( input, last_eval, eval_line.chr, eval_line.pos )
                if debug: print >>sys.stderr,"...next chr={}".format( eval_line.chr )
                if eval_chr[-1] != eval_line.chr:
                    eval_chr.append(eval_line.chr)
                    print eval_line.chr
            except StopIteration:
                eval_done=True

        # try to find the correct position
        while not eval_done and eval_line.pos < truth_line.pos \
                and eval_line.chr == truth_line.chr:
            try:
                eval_line = eval_gen.next()
                check_sort_order( input, last_eval, eval_line.chr, eval_line.pos )
                if debug: print >>sys.stderr,"...next chr:pos={}:{}".format( eval_line.chr, eval_line.pos )
                if eval_chr[-1] != eval_line.chr:
                    eval_chr.append(eval_line.chr)
                    print eval_line.chr
            except StopIteration:
                eval_done=True

        if minqual > 0 and eval_line.qual == '.':
            raise Exception("not sure what to do here, we're filtering on qual, but qual is '.'")

        if eval_done or eval_line.pos != truth_line.pos \
                or eval_line.chr != truth_line.chr \
                or ( eval_line.qual != '.' and float(eval_line.qual) < minqual ):
            # miss: no usable eval record at this truth site
            if misses_fd:
                misses_fd.write(truth_line.line+"\n")
        else:
            if truth_line.ref != eval_line.ref:
                raise Exception(""" Your truth set does not seem to be called on the same reference as your call set. We're done here. 
truth={} truth_pos={}:{} truth_ref={} input={} input_pos={}:{} input_ref={} """.format( truth, truth_line.chr, truth_line.pos, truth_line.ref, input, eval_line.chr, eval_line.pos, eval_line.ref ) )

            if debug: print >>sys.stderr,""" Evaluating: truth_pos={}:{} truth_ref={} truth_alt={} input_pos={}:{} input_ref={} input_alt={} """.format( truth_line.chr, truth_line.pos, truth_line.ref, truth_line.alt, eval_line.chr, eval_line.pos, eval_line.ref, eval_line.alt )

            # grab truth NR bases and eval NR bases
            eval_genotype=eval_line.get_sample_dict( eval_sample_index)["GT"]
            if eval_genotype != ".":
                # split on '/' first; if that did not split, retry with '|'
                eval_calls_idx = eval_genotype.split("/")
                if eval_calls_idx[0] == eval_genotype:
                    eval_calls_idx = eval_genotype.split("|")
                # NOTE(review): list.remove drops a single '0' entry, so a
                # "0/0" eval genotype still leaves one index -- confirm intent
                if '0' in eval_calls_idx: eval_calls_idx.remove('0')
                eval_calls_idx = map(int, eval_calls_idx )

                truth_calls_idx = truth_genotype.split("/")
                if truth_calls_idx[0] == truth_genotype:
                    truth_calls_idx = truth_genotype.split("|")
                if '0' in truth_calls_idx: truth_calls_idx.remove('0')
                truth_calls_idx = map(int, truth_calls_idx )

                if len(eval_calls_idx) > 0:
                    n_site_hits += 1
                    if debug: print >>sys.stderr, "accepting site hit at {}:{}".format(eval_line.chr, eval_line.pos)
                elif debug: print >>sys.stderr, "no NR calls"

                # allele index i refers to alt_list[i-1]
                for truth_nr in [ truth_line.alt_list[i-1] for i in truth_calls_idx ]:
                    # if any truth non-reference call is not found on
                    # the eval line, then we break without counting the
                    # concordance
                    if not truth_nr in eval_line.alt_list:
                        if debug: print >>sys.stderr,""" non-concordant: pos={}:{} truth_ref={} eval_ref={} truth_alt={} eval_alt={} """.format( truth_line.chr, truth_line.pos, truth_line.ref, eval_line.ref, truth_line.alt_list, eval_line.alt_list )
                        break
                else:
                    # normal termination of the for loop, so count the
                    # concordance
                    if debug: print >>sys.stderr,""" CONCORDANT: pos={}:{} truth_ref={} eval_ref={} truth_alt={} eval_alt={} """.format( truth_line.chr, truth_line.pos, truth_line.ref, eval_line.ref, truth_line.alt_list, eval_line.alt_list )
                    n_site_concords += 1

    print "n_truth_lines={}, n_site_hits={}, n_site_concords={}, site_hit_frac={}, site_concord_frac={}".format( n_truth_lines, n_site_hits, n_site_concords, n_site_hits/float(n_truth_lines), n_site_concords/float(n_truth_lines) )

    # check that for the eval chromosomes that are also in the truth
    # set, that they come in the same order

    # first form intersection set
    overlap_chr = set(truth_chr).intersection(set(eval_chr))

    # reversed so the next expected chromosome is always the last element
    truth_chr_rev = [ chr for chr in truth_chr if chr in overlap_chr ]
    truth_chr_rev.reverse()

    for chr in eval_chr:
        if chr not in overlap_chr: continue
        if chr != truth_chr_rev[-1]:
            raise Exception(""" input chromosome ordering doesn't match truth chromosome ordering: input={} truth={} """.format( eval_chr, truth_chr ) )
        truth_chr_rev.pop()

    if misses_fd:
        misses_fd.close()