Beispiel #1
0
def summary_haplotype_block(vcf, haplotype, outfile):
    """Write a per-phase-block summary of the records in `vcf` to `outfile`.

    Each record is assigned to a block keyed by '<CHROM>_<block id>' and is
    classified by comparing the base of its first genotype allele against
    the reference haplotypes.

    Args:
        vcf: passed straight to ``VCF.lines``.
        haplotype: mapping from 'CHROM:POS' to an indexable whose elements
            [0] and [1] are the bases of haplotype 1 and haplotype 2.
        outfile: path of the tab-separated report to create.

    Output columns, one line per block:
        block, number of SNPs, length (end - start + 1), start, end,
        SNPs matching haplotype 1, SNPs matching haplotype 2,
        SNPs that are absent from `haplotype` or match neither base.
    """
    import VCF

    block_positions = defaultdict(list)
    block_flags = defaultdict(list)

    for v in VCF.lines(vcf):
        # NOTE(review): the layout of v['FCM'] is defined by the project's
        # VCF module; element 0 is read as 'GT:...' and element 1 as
        # '<x>:<block id>:...' -- confirm against that module.
        block_id = v['FCM'][1].split(':')[1]
        genotype = v['FCM'][0].split(':')[0]
        block_idx = '{}_{}'.format(v['CHROM'], block_id)
        snp_idx = '{}:{}'.format(v['CHROM'], v['POS'])

        # Base carried by the first allele of the genotype (0 = REF, 1 = ALT).
        bases = [v['REF'], v['ALT']]
        hap1_10x = bases[int(genotype[0])]

        # -1: unknown / no match, 0: matches haplotype 1, 1: matches haplotype 2
        haplotype_flag = -1
        if snp_idx in haplotype:
            if hap1_10x == haplotype[snp_idx][0]:
                haplotype_flag = 0
            elif hap1_10x == haplotype[snp_idx][1]:
                haplotype_flag = 1

        block_positions[block_idx].append(int(v['POS']))
        block_flags[block_idx].append(haplotype_flag)

    # The file is opened only after parsing succeeded, and the context
    # manager guarantees it is closed even if formatting a row fails.
    with open(outfile, 'w') as ofile:
        for blc in block_positions:
            positions = block_positions[blc]
            flags = block_flags[blc]
            snps = len(positions)
            start = min(positions)
            end = max(positions)
            length = end - start + 1
            hap1_n = flags.count(0)
            hap2_n = flags.count(1)
            hap0_n = flags.count(-1)
            ofile.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(blc, snps, length, start, end, hap1_n, hap2_n, hap0_n) + '\n')
def check_vcf( input ):
    """Print the VCF at `input` to stdout, splitting some compound records.

    The metadata lines are printed first, joined by newlines.  Each record
    is then printed unchanged unless its REF is longer than one base, its
    single ALT is also longer than one base, and REF and ALT differ in
    length.  For those records two deep copies are printed in place of the
    original; the copies are rewritten (ref/alt replaced, pos advanced by
    one) only when the shorter of REF and ALT is exactly two bases long.

    Raises:
        Exception: if a record with a multi-base REF has more than one ALT.
    """
    v = VCF( input )

    debug = False

    print("\n".join(v.metadata))

    for record in v.lines():
        ref = record.ref
        alts = record.alt_list

        # Single-base REF: nothing to split.
        if len(ref) <= 1:
            print(record)
            continue

        if len(alts) > 1:
            raise Exception("WARNING: multi-allelic change not coded for")

        first = copy.deepcopy( record )
        second = copy.deepcopy( record )
        alt = alts[0]

        # Single-base ALT, or REF and ALT of equal length: pass through.
        if len(alt) <= 1 or len(alt) == len(ref):
            print(record)
            continue

        # NOTE(review): when the shorter allele is not exactly two bases
        # long, neither copy is modified, so the original record is
        # emitted twice -- confirm this is intended.
        if len(ref) < len(alt):          # insertion
            if len(ref) == 2:
                first.ref = ref[1]
                first.alt = ref[1]+alt[2:]
                first.pos += 1
                second.ref = ref[1]
                second.alt = alt[1]
                second.pos += 1
        elif len(alt) == 2:              # deletion
            first.ref = ref[1:]
            first.alt = ref[1]
            first.pos += 1
            second.ref = ref[1]
            second.alt = alt[1]
            second.pos += 1

        if debug:
            print("====")
        print(first)
        print(second)
        if debug:
            print(record)
def sort_vcf( input, reference, output ):
    """Rewrite the VCF `input` to `output`, grouped in reference contig order.

    The contig order comes from ``get_contig_list( reference )``.  The
    metadata lines are written first; then, for every contig the VCF can
    seek to, all records whose chromosome equals that contig are written.
    Progress messages go to stdout.
    """
    contig_list = get_contig_list( reference )

    print("read {} contigs".format(len(contig_list)))

    v = VCF( input, True )      # request index of VCF upon open

    with open( output, 'w') as fd_out:
        fd_out.write( "".join( meta+"\n" for meta in v.metadata ) )

        for contig in contig_list:
            print("writing entries for contig {}".format(contig))

            # A negative seek result is reported as "contig not in the VCF".
            if v.seek(contig) < 0:
                print("skipped {} because it's not in the VCF".format( contig ))
                continue

            def on_this_contig( raw_line ):
                # keep only raw lines that belong to the current contig
                return VCFLine(raw_line).chr == contig

            count = 0
            for count, line in enumerate( v.lines( True, on_this_contig ), 1 ):
                fd_out.write( line.line+"\n" )
            print("wrote {} entries for {}".format( count, contig ))
import VCF
import cyvcf2

# Script: collect the positions of every record per chromosome and group
# them into runs of consecutive integers.
try:
    ## MUST BE A UNCOMPRESSED VCF file
    finput = argv[1]
except IndexError as ie:
    exit("{}\nUSAGE: $0 $vcf_file ".format(ie))

if not path.exists(finput):
    msg = "ERROR: FNF {}".format(finput)
    raise IOError(msg)

# chromosome -> list of positions, in file order
d = {}

for v in VCF.lines(finput):
    d.setdefault(v['CHROM'], []).append(v['POS'])

with open("{}.consPos.txt".format(finput), 'wt') as of:
    for key, val in d.items():
        ## make sure all positions are integer ; if not raise error
        try:
            data = [int(i) for i in val]
        except ValueError as ve:
            # The handler previously formatted the undefined name `e`,
            # which turned this error path into a NameError.
            exit("ERROR: {}".format(ve))
        # https://stackoverflow.com/questions/2361945/detecting-consecutive-integers-in-a-list
        # index - value is constant within a run of consecutive integers
        for k, g in groupby(enumerate(data), lambda ix: ix[0] - ix[1]):
            cn = list(map(itemgetter(1), g))
            # NOTE(review): `cn` is never used and nothing is written to
            # `of` in the visible code; the excerpt appears to end here.
Beispiel #5
0
def summary_haplotype_block(vcf, haplotype, outfile_up, outfile_down):
    """Summarise phase blocks of `vcf` and emit confidently assigned blocks.

    Each record is assigned to a block keyed by '<CHROM>_<block id>' and is
    classified by comparing the base of its first genotype allele against
    the reference haplotypes.  A summary line, the majority ratio and some
    diagnostics are printed to stdout for every block.  Blocks whose
    majority ratio exceeds 0.95 are additionally written as
    'chrom, start, end, hap, color, +' rows, alternating between
    `outfile_up` (odd block rank) and `outfile_down` (even block rank).

    Args:
        vcf: passed straight to ``VCF.lines``.
        haplotype: mapping from 'CHROM:POS' to an indexable whose elements
            [0] and [1] are the bases of haplotype 1 and haplotype 2.
        outfile_up: writable file-like object for odd-ranked blocks.
        outfile_down: writable file-like object for even-ranked blocks.
    """
    import VCF

    phased_block = defaultdict(list)        # block -> positions
    phased_block_com = defaultdict(list)    # block -> haplotype flags
    phased_block_rank = {}                  # block -> running block counter
    # The chromosome is remembered per block instead of being re-parsed
    # from the key: splitting '<CHROM>_<id>' on '_' fails for contig names
    # that themselves contain underscores (e.g. 'chr1_gl000191_random').
    block_chrom = {}
    count = 0

    for v in VCF.lines(vcf):
        # NOTE(review): the layout of v['FCM'] is defined by the project's
        # VCF module; element 0 is read as 'GT:...' and element 1 as
        # '<x>:<block id>:...' -- confirm against that module.
        block_id = v['FCM'][1].split(':')[1]
        genotype = v['FCM'][0].split(':')[0]
        block_idx = '{}_{}'.format(v['CHROM'], block_id)
        snp_idx = '{}:{}'.format(v['CHROM'], v['POS'])

        # Base carried by the first allele of the genotype (0 = REF, 1 = ALT).
        bases = [v['REF'], v['ALT']]
        hap1_10x = bases[int(genotype[0])]

        # -1: unknown / no match, 0: matches haplotype 1, 1: matches haplotype 2
        haplotype_flag = -1
        if snp_idx in haplotype:
            if hap1_10x == haplotype[snp_idx][0]:
                haplotype_flag = 0
            elif hap1_10x == haplotype[snp_idx][1]:
                haplotype_flag = 1

        if block_idx not in phased_block:
            count += 1
        phased_block[block_idx].append(int(v['POS']))
        phased_block_com[block_idx].append(haplotype_flag)
        # The rank is refreshed on every record, so a block keeps the
        # counter value seen at its last record.
        phased_block_rank[block_idx] = count
        block_chrom[block_idx] = v['CHROM']

    min_snp = 1
    for blc in phased_block:
        chrs = block_chrom[blc]
        positions = phased_block[blc]
        flags = phased_block_com[blc]
        snps = len(positions)
        start = min(positions)
        end = max(positions)
        length = end - start + 1
        hap1_n = flags.count(0)
        hap2_n = flags.count(1)
        hap0_n = flags.count(-1)
        print('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(blc, snps, length, chrs, start, end, hap1_n, hap2_n, hap0_n))

        # Fraction of classified SNPs that agree with the majority
        # haplotype; stays 0.0 when nothing is classified or on a tie.
        ratio = 0.0
        if hap1_n >= min_snp and hap1_n == snps - hap0_n:
            ratio = 1
        elif hap2_n >= min_snp and hap2_n == snps - hap0_n:
            ratio = 1
        elif hap1_n >= min_snp and hap1_n > hap2_n:
            ratio = float(hap1_n)/(float(hap1_n)+float(hap2_n))
        elif hap2_n >= min_snp and hap2_n > hap1_n:
            ratio = float(hap2_n)/(float(hap1_n)+float(hap2_n))

        print(ratio)
        if ratio > 0.95:
            print('{} {}'.format(hap1_n, hap2_n))
            if hap1_n > hap2_n:
                color = 'orange'
                hap = 1
            else:
                color = 'blue'
                hap = 2
            print(color)
            row = '{}\t{}\t{}\t{}\t{}\t+'.format(chrs, start, end, hap, color)
            if phased_block_rank[blc]%2 == 1:
                outfile_up.write(row + '\n')
            else:
                outfile_down.write(row + '\n')
Beispiel #6
0
def _nr_allele_indices( genotype ):
    """Return the non-reference allele indices called in a GT string.

    Both '/' and '|' separators are accepted and missing alleles ('.') are
    ignored.  Returns None when no allele is called at all (e.g. '.' or
    './.'), otherwise a possibly empty list of allele indices > 0.
    """
    called = [ a for a in genotype.replace("|", "/").split("/") if a != "." ]
    if not called:
        return None
    return [ idx for idx in map(int, called) if idx != 0 ]


def _write_misses_header( misses_fd, metadata ):
    """Copy the truth metadata to the misses file, tagged just before #CHROM."""
    found = False
    for line in metadata:
        if line[:len("#CHROM")]=="#CHROM":
            misses_fd.write( '##nr_concordance="comment={subset of '\
                'missed sites created by DISCOVAR release bundle Python program '\
                'nr_concordance.py}"\n')
            found = True
        misses_fd.write(line+"\n")
    if not found:
        raise Exception("program bug? made it this far without #CHROM in truth VCF?")


def _check_chr_order( eval_chr, truth_chr ):
    """Raise if the chromosomes shared by both files appear in different orders."""
    overlap_chr = set(truth_chr).intersection(set(eval_chr))

    truth_chr_rev = [ c for c in truth_chr if c in overlap_chr ]
    truth_chr_rev.reverse()

    for c in eval_chr:
        if c not in overlap_chr: continue
        # An exhausted truth list means eval revisits a chromosome; report
        # it as an ordering mismatch instead of failing with IndexError.
        if not truth_chr_rev or c != truth_chr_rev[-1]:
            raise Exception("""
            input chromosome ordering doesn't match truth chromosome ordering:
            input={}
            truth={}
            """.format( eval_chr, truth_chr ) )
        truth_chr_rev.pop()


def _nr_sensitivity_scan( input, sample, truth, minqual, debug,
        truth_vcf, eval_vcf, misses_fd ):
    """Walk truth and eval records in lockstep, count hits and print the summary."""
    eval_sample_index = eval_vcf.sample_names.index(sample)
    truth_sample_index = truth_vcf.sample_names.index(sample)

    eval_chr=[]
    truth_chr=[]

    eval_gen = eval_vcf.lines()
    eval_done = False
    try:
        eval_line = next(eval_gen)
    except StopIteration:
        # A call set without records: every truth site becomes a miss.
        eval_line = None
        eval_done = True
    else:
        eval_chr.append( eval_line.chr )

    n_truth_lines = 0
    n_site_hits = 0
    n_site_concords = 0

    # NOTE(review): these tuples are never reassigned here, so
    # check_sort_order always receives (None, None) unless it tracks the
    # previous position itself -- confirm against its definition.
    last_truth=(None,None)
    last_eval=(None,None)

    for truth_line in truth_vcf.lines():

        check_sort_order( truth, last_truth, truth_line.chr, truth_line.pos )

        truth_genotype=truth_line.get_sample_dict(truth_sample_index)["GT"]
        truth_calls_idx = _nr_allele_indices( truth_genotype )
        # Only truth sites carrying at least one non-reference allele are
        # evaluated; hom-ref and no-call genotypes are skipped.
        if not truth_calls_idx: continue
        if debug: sys.stderr.write("seeking {}:{}".format( truth_line.chr, truth_line.pos ) + "\n")
        n_truth_lines += 1
        if n_truth_lines % 1000 == 0: print(n_truth_lines)

        # record the order in which truth chromosomes appear
        if len(truth_chr) == 0 or truth_chr[-1] != truth_line.chr:
            truth_chr.append( truth_line.chr )

        # if we've already passed this chr in the eval file, then spin
        if not eval_done and truth_line.chr != eval_line.chr and truth_line.chr in eval_chr:
            continue

        # if we've not already passed this chr, then find it
        while not eval_done and eval_line.chr != truth_line.chr:
            try:
                eval_line = next(eval_gen)
                check_sort_order( input, last_eval, eval_line.chr, eval_line.pos )
                if debug: sys.stderr.write("...next chr={}".format( eval_line.chr ) + "\n")
                if eval_chr[-1] != eval_line.chr:
                    eval_chr.append(eval_line.chr)
                    print(eval_line.chr)
            except StopIteration: eval_done=True

        # try to find the correct position
        while not eval_done and eval_line.pos < truth_line.pos \
                and eval_line.chr == truth_line.chr:
            try:
                eval_line = next(eval_gen)
                check_sort_order( input, last_eval, eval_line.chr, eval_line.pos )
                if debug: sys.stderr.write("...next chr:pos={}:{}".format( eval_line.chr, eval_line.pos ) + "\n")
                if eval_chr[-1] != eval_line.chr:
                    eval_chr.append(eval_line.chr)
                    print(eval_line.chr)
            except StopIteration: eval_done=True

        # eval_line is None only when the call set had no records at all
        if eval_line is not None and minqual > 0 and eval_line.qual == '.':
            raise Exception("not sure what to do here, we're filtering on qual, but qual is '.'")

        if eval_done or eval_line.pos != truth_line.pos \
                or  eval_line.chr != truth_line.chr \
                or  ( eval_line.qual != '.' and float(eval_line.qual) < minqual ):
            if misses_fd: misses_fd.write(truth_line.line+"\n")
        else:
            if truth_line.ref != eval_line.ref:
                raise Exception("""
                    Your truth set does not seem to be called on the
                    same reference as your call set.  We're done here.
                    truth={}
                    truth_pos={}:{}
                    truth_ref={}

                    input={}
                    input_pos={}:{}
                    input_ref={}
                    """.format( truth, truth_line.chr, truth_line.pos,
                        truth_line.ref, input, eval_line.chr,
                        eval_line.pos, eval_line.ref ) )

            if debug:
                sys.stderr.write("""
                Evaluating:
                truth_pos={}:{}
                truth_ref={}
                truth_alt={}

                input_pos={}:{}
                input_ref={}
                input_alt={}

                """.format( truth_line.chr, truth_line.pos,
                        truth_line.ref, truth_line.alt, eval_line.chr,
                        eval_line.pos, eval_line.ref, eval_line.alt ) + "\n")

            # grab truth NR bases and eval NR bases
            eval_genotype=eval_line.get_sample_dict( eval_sample_index)["GT"]
            eval_calls_idx = _nr_allele_indices( eval_genotype )
            if eval_calls_idx is not None:
                # A site hit needs at least one non-reference eval allele;
                # every '0' is dropped, so a hom-ref '0/0' is not a hit.
                if len(eval_calls_idx) > 0:
                    n_site_hits += 1
                    if debug: sys.stderr.write("accepting site hit at {}:{}".format(eval_line.chr, eval_line.pos) + "\n")
                elif debug:
                    sys.stderr.write("no NR calls" + "\n")

                for truth_nr in [ truth_line.alt_list[i-1] for i in truth_calls_idx ]:
                    # if any truth non-reference call is not found on
                    # the eval line, then we break without counting the
                    # concordance
                    if not truth_nr in eval_line.alt_list:
                        if debug: sys.stderr.write("""
                            non-concordant:
                            pos={}:{}
                            truth_ref={}
                            eval_ref={}
                            truth_alt={}
                            eval_alt={}
                            """.format( truth_line.chr, truth_line.pos,
                                    truth_line.ref, eval_line.ref,
                                    truth_line.alt_list,
                                    eval_line.alt_list ) + "\n")
                        break

                else:
                    # normal termination of the for loop, so count the
                    # concordance
                    if debug: sys.stderr.write("""
                            CONCORDANT:
                            pos={}:{}
                            truth_ref={}
                            eval_ref={}
                            truth_alt={}
                            eval_alt={}
                            """.format( truth_line.chr, truth_line.pos,
                                    truth_line.ref, eval_line.ref,
                                    truth_line.alt_list,
                                    eval_line.alt_list ) + "\n")
                    n_site_concords += 1

    # With no evaluated truth sites the fractions are undefined; report
    # nan instead of dividing by zero.
    if n_truth_lines:
        site_hit_frac = n_site_hits/float(n_truth_lines)
        site_concord_frac = n_site_concords/float(n_truth_lines)
    else:
        site_hit_frac = site_concord_frac = float('nan')

    print("n_truth_lines={}, n_site_hits={}, n_site_concords={}, site_hit_frac={}, site_concord_frac={}".format(
            n_truth_lines, n_site_hits, n_site_concords,
            site_hit_frac,
            site_concord_frac
            ))

    # check that for the eval chromosomes that are also in the truth
    # set, that they come in the same order
    _check_chr_order( eval_chr, truth_chr )


def nr_sensitivity( input, sample, truth, minqual=0, misses=None, debug=False ):
    """Report non-reference (NR) sensitivity of a call set against a truth set.

    Both VCFs are walked in file order.  Every truth record whose genotype
    for `sample` carries at least one non-reference allele is looked up in
    the call set by chromosome and position.  A located site counts as a
    "hit" when the call set genotype has a non-reference allele, and as
    "concordant" when every truth non-reference allele appears among the
    call set ALT alleles.  Counts and fractions are printed to stdout,
    progress and debug output go to stdout/stderr.

    Args:
        input: path of the call set VCF to evaluate.
        sample: sample name; must be present in both VCFs.
        truth: path of the truth VCF.
        minqual: call set records with a numeric QUAL below this are
            treated as missing.
        misses: optional path; truth records that were not located are
            written there, preceded by the truth metadata.
        debug: emit verbose tracing on stderr.

    Raises:
        Exception: if the truth metadata lacks a #CHROM line (when `misses`
            is given), if `minqual` > 0 and a call set QUAL is '.', if REF
            differs at a shared site, or if shared chromosomes appear in a
            different order in the two files.
        ValueError: if `sample` is missing from either VCF.
    """
    sys.stderr.write("input={}\ntruth={}\nminqual={}".format(input,truth,minqual) + "\n")

    truth_vcf = VCF( truth )
    eval_vcf = VCF( input )

    misses_fd = open( misses, 'w' ) if misses else None
    try:
        if misses_fd:
            _write_misses_header( misses_fd, truth_vcf.metadata )
        _nr_sensitivity_scan( input, sample, truth, minqual, debug,
                truth_vcf, eval_vcf, misses_fd )
    finally:
        # close the misses file even when the scan raises
        if misses_fd: misses_fd.close()