def make_snp2gene_file(genepred, snptable, outfile_mpileup, outfile_snp2genes, include_overlap=False):
	"""
	Assign SNPs from a SNP table to the genes whose exons contain them and write two output files.

	genepred: annotation file; lines starting with '#' are skipped, all others are parsed with fromannotationline()
	snptable: SNP table read through dr_tools.splitlines(); only rows whose column 11 equals 'single' are used
	outfile_mpileup: output path, receives one 'chromosome<TAB>position' line per SNP-to-gene assignment
	outfile_snp2genes: output path, receives one dr_tools.join() line per gene: gene, number of SNPs, sorted SNPs joined by ';'
	include_overlap: when False, a SNP is only kept if its overlapping exons belong to exactly one gene
	"""
	# register every exon, tagged with its gene name, so that point lookups can find it below
	with open(genepred) as annotation_fh:
		for annotation_line in annotation_fh:
			if annotation_line.startswith('#'):
				continue
			chromosome, strand, cdsstart, cdsend, exonstarts, exonends, genename, ID, inferred_strand = fromannotationline(annotation_line)
			for exon_start, exon_end in zip(exonstarts, exonends):
				region = dr_tools.Cregion(chromosome, exon_start, exon_end)
				region.gene = genename
				region.addtowindows()

	gene_to_snps = defaultdict(list)
	mpileup_lines = []

	for row in dr_tools.splitlines(snptable):
		# rows look like a UCSC snp table line: bin, chromosome, start, end, rs-name, ..., alleles in column 9, class in column 11
		if row[11] != 'single':
			continue # ignore non-SNPs

		# column 2 is used as the (0-based) lookup position, column 3 as the position written to the outputs
		hit_genes = set(exon.gene for exon in dr_tools.Cregion.overlappingpoint(row[1], int(row[2])))
		if not include_overlap and len(hit_genes) != 1:
			continue

		snp_label = '%s:%s|%s'%(row[1], row[3], row[9])
		position_line = '%s\t%s'%(row[1], row[3])
		for gene in hit_genes:
			gene_to_snps[gene].append(snp_label)
			mpileup_lines.append(position_line)

	with open(outfile_snp2genes, 'w') as outfh:
		for gene, gene_snps in gene_to_snps.items():
			print >>outfh, dr_tools.join(gene, len(gene_snps), ';'.join(sorted(gene_snps)))

	with open(outfile_mpileup, 'w') as outfh:
		for position_line in mpileup_lines:
			print >>outfh, position_line
def _parse_allele_counts(field):
	"""Turn 'a,b;c,d;...' into [(a, b), (c, d), ...] with int values, skipping empty entries."""
	pairs = []
	for entry in field.split(';'):
		if not entry: continue
		values = entry.split(',')
		pairs.append((int(values[0]), int(values[1])))
	return pairs

def _pair_at(pairs, index):
	"""Return pairs[index], or (0, 0) if pairs has no entry at that position."""
	return pairs[index] if index < len(pairs) else (0, 0)

def _add_pairs(previous, pairs):
	"""Add pairs element-wise onto previous; positions missing from previous count as (0, 0)."""
	return [(_pair_at(previous, i)[0] + a, _pair_at(previous, i)[1] + b) for i, (a, b) in enumerate(pairs)]

def _add_presence(previous, pairs):
	"""For each position and allele, add 1 to previous if the new sample has at least one read."""
	return [(_pair_at(previous, i)[0] + (a >= 1), _pair_at(previous, i)[1] + (b >= 1)) for i, (a, b) in enumerate(pairs)]

def find_heterozygous(hitcounts_files, snp2gene_file_in, snp2gene_file_out, o):
	"""
	Filter the SNPs of each gene down to those that look heterozygous from allele read counts.

	hitcounts_files: gzip-compressed tab-separated files; column 0 is the gene, column 1 holds one
		'count1,count2' entry per SNP, separated by ';'. The sample name is the file name up to '.counts'.
	snp2gene_file_in: tab-separated file; column 0 is the gene, column 2 the ';'-separated SNPs
	snp2gene_file_out: output path; one dr_tools.join() line per input gene: gene, number of kept SNPs, kept SNPs
	o: options object, the attributes read are
		minreadsH: minimum summed reads per allele in the primary samples
		samplesH: names of the primary samples (empty/None: every sample is primary)
		maxratioH: maximum ratio between the two alleles' summed primary reads
		minothersamplesH: minimum number of non-primary samples with >= 1 read, per allele
		minothersamplereadsH: minimum summed reads in non-primary samples, per allele

	SNPs without any count data are treated as having (0, 0) reads.
	"""
	minreads_allele = o.minreadsH
	samples_prim = set(o.samplesH) if o.samplesH else set()

	snps_per_gene_in = dict()
	gene_order = []
	with open(snp2gene_file_in, 'r') as infh:
		for line in infh:
			p = line.rstrip('\r\n').split('\t')
			snps_per_gene_in[p[0]] = p[2].split(';')
			gene_order.append(p[0])

	reads_per_gene = dict() # gene -> summed (allele1, allele2) reads per SNP, primary samples
	sec_reads_per_gene = dict() # same, for the non-primary samples
	sec_samples_count = dict() # gene -> per SNP, number of non-primary samples with reads for (allele1, allele2)
	for inf in hitcounts_files:
		sample = inf.split('/')[-1].split('.counts')[0]
		is_primary = (not samples_prim) or sample in samples_prim
		# non-primary samples are not needed when no requirement is put on them
		if o.minothersamplesH == 0 and o.minothersamplereadsH == 0 and not is_primary: continue
		with gzip.open(inf, 'r') as infh:
			for line in infh:
				p = line.rstrip('\r\n').split('\t')
				gene = p[0]
				pairs = _parse_allele_counts(p[1])
				if is_primary:
					reads_per_gene[gene] = _add_pairs(reads_per_gene.get(gene, []), pairs)
				else:
					# the comparison has to be made per allele before adding, to count samples rather than reads
					sec_samples_count[gene] = _add_presence(sec_samples_count.get(gene, []), pairs)
					# accumulate onto the non-primary totals, not onto the primary ones
					sec_reads_per_gene[gene] = _add_pairs(sec_reads_per_gene.get(gene, []), pairs)

	with open(snp2gene_file_out, 'w') as outfh:
		for gene in gene_order:
			prim_reads = reads_per_gene.get(gene, [])
			sec_counts = sec_samples_count.get(gene, [])
			sec_reads = sec_reads_per_gene.get(gene, [])
			ok_snps = []
			# walk the SNPs by index so that missing count data defaults to (0, 0) instead of dropping the SNP
			for i, snpinfo in enumerate(snps_per_gene_in[gene]):
				reads = _pair_at(prim_reads, i)
				sec_s_count = _pair_at(sec_counts, i)
				reads_sec = _pair_at(sec_reads, i)
				if reads[0] < minreads_allele or reads[1] < minreads_allele: continue
				# float() so that the ratio is not floored by integer division
				if reads[1] != 0 and float(reads[0])/reads[1] > o.maxratioH: continue
				if reads[0] != 0 and float(reads[1])/reads[0] > o.maxratioH: continue
				if sec_s_count[0] < o.minothersamplesH or sec_s_count[1] < o.minothersamplesH: continue
				if reads_sec[0] < o.minothersamplereadsH or reads_sec[1] < o.minothersamplereadsH: continue
				ok_snps.append(snpinfo)
			print >>outfh, dr_tools.join(gene, len(ok_snps), ';'.join(ok_snps))
def counts_to_SNPallelehits(counts_file, samplename, snp2gene_file_in, outputpath):
	"""
	Write the per-SNP allele counts of one sample as a table with four '#' header lines.

	counts_file: gzip-compressed tab-separated file; column 0 is the gene, column 1 holds
		';'-separated 'count1,count2' entries (empty entries are skipped)
	samplename: used to build the two column names samplename+'_c57only' and samplename+'_castonly'
	snp2gene_file_in: tab-separated file; column 0 is the gene, column 2 the ';'-separated SNPs
	outputpath: output file; after the header, one dr_tools.join() line per SNP with
		gene, SNP, 0, 0 and the (count1, count2) tuple

	Raises KeyError if a gene from snp2gene_file_in is absent from counts_file.
	"""
	genes_in_order = []
	snps_by_gene = {}
	with open(snp2gene_file_in, 'r') as snp_fh:
		for row in snp_fh:
			fields = row.rstrip('\r\n').split('\t')
			snps_by_gene[fields[0]] = fields[2].split(';')
			genes_in_order.append(fields[0])

	counts_by_gene = {}
	with gzip.open(counts_file, 'r') as counts_fh:
		for row in counts_fh:
			fields = row.rstrip('\r\n').split('\t')
			allele_pairs = []
			for entry in fields[1].split(';'):
				if not entry:
					continue
				values = entry.split(',')
				allele_pairs.append((int(values[0]), int(values[1])))
			counts_by_gene[fields[0]] = allele_pairs

	with open(outputpath, 'w') as outfh:
		# header lines
		print >>outfh, dr_tools.join('#samples', samplename+'_c57only', samplename+'_castonly')
		print >>outfh, dr_tools.join('#allmappedreads', 0, 0)
		print >>outfh, dr_tools.join('#normalizationreads', 0, 0)
		print >>outfh, dr_tools.join('#arguments', ' '.join(sys.argv), 'time: '+time.asctime())
		# one line per SNP, genes in the order of snp2gene_file_in
		for gene in genes_in_order:
			for snpinfo, allele_pair in zip(snps_by_gene[gene], counts_by_gene[gene]):
				print >>outfh, dr_tools.join(gene, snpinfo, 0, 0, allele_pair)
コード例 #4
0
def collapse_mirnas(molc_file):
	"""
	Copy a molecule count table, summing the rows that share a gene name starting with 'hsa'.

	molc_file: tab-separated input; column 0 is the gene name, column 1 the transcript IDs,
		the remaining columns are numeric counts. Lines starting with '#' are header lines.

	The output path is taken from the module-level options object (o.out_molc_files).
	Header lines and rows of other genes are copied as they are, in input order. The summed
	'hsa' rows follow after them, with counts rounded to 2 decimals and with the transcript
	IDs of the last input row seen for that gene name.
	"""
	gene2molc = {}
	gene2trid = {}
	with open(o.out_molc_files, 'w') as outfh:
		# context manager, so that the input file is closed as well
		with open(molc_file, 'r') as infh:
			for line in infh:
				# rstrip rather than line[:-1]: a last line without newline must not lose a character
				line = line.rstrip('\n')
				if line.startswith('#'):
					print >> outfh, line
					continue

				p = line.split('\t')
				trans_ids = p[1]; genename = p[0]
				gene2trid[genename] = trans_ids

				if genename.startswith("hsa"): #selects only mirbase mirnas from the expression table
					molc_counts = [float(v) for v in p[2:]]
					previous = gene2molc.get(genename, [0]*len(molc_counts))
					gene2molc[genename] = [i+j for i,j in zip(previous, molc_counts)]
				else:
					print >> outfh, line

		for gene in gene2molc:
			print >> outfh, dr_tools.join(gene, gene2trid[gene], [round(m, 2) for m in gene2molc[gene]])
		if o.rpkmf_genes:
			symbols_set = dict((s,i) for i,s in enumerate(symbols))
			new_sample_values = dict()
			for name in sample_order:
				new_sample_values[name] = []
				for i, symbol in enumerate(expr['symbols']):
					if symbol in symbols_set:
						new_sample_values[name].append(sample_values[name][symbols_set[symbol]])
					else:
						new_sample_values[name].append('0 0')
			sample_values = new_sample_values
			symbols = expr['symbols']
			IDs = expr['IDs']
	elif o.rpkmf_genes: raise Exception
	
	# write to file
	with open(o.outf, 'w') as outfh:
		print >>outfh, dr_tools.join('#samples', ['%s_c57only\t%s_castonly'%(s,s) for s in sample_order])
		print >>outfh,  dr_tools.join('#allmappedreads', ['0\t0' for s in sample_order])
		print >>outfh,  dr_tools.join('#normalizationreads', ['0\t0' for s in sample_order])
		print >>outfh,  dr_tools.join('#arguments', ' '.join(sys.argv), 'time: '+time.asctime())
		for i in range(len(symbols)):
			#if IDs[i] == '0 0':
			#	print symbols[i]
			#	IDs[i] = 'NA'
			if o.noNA and IDs[i] == 'NA': continue
			try: print >>outfh, dr_tools.join(symbols[i], IDs[i], ['0\t0' for name in sample_order], [swap_order(sample_values[name][i]) for name in sample_order])
			except:
				print symbols[i], sample_values[name][i]
				raise
import dr_tools, argparse

# Prefix the sample names in the first line of an expression file with the name of the
# sample list (clone) they belong to; all other lines are copied unchanged.
if '__main__' == __name__:
	parser = argparse.ArgumentParser()
	parser.add_argument('-i', '--rpkmf_in', required=True)
	parser.add_argument('-o', '--rpkmf_out', required=True)
	parser.add_argument('-s', '--sample_lists', nargs='+', required=True)
	o = parser.parse_args()
	
	with open(o.rpkmf_out, 'w') as outfh:
		with open(o.rpkmf_in, 'r') as infh:
			for li, line in enumerate(infh):
				if li == 0:
					p = line.rstrip('\r\n').split('\t')
					# sample name -> path of the sample list file that contains it
					sample_to_clone = dict((sample, filename) for filename in o.sample_lists for sample in dr_tools.loadlist(filename))
					for i, name in enumerate(p):
						if i==0: continue
						for suffix in ('', '_c57only', '_castonly'):
							if not name.endswith(suffix): continue
							# not name[:-len(suffix)]: for the empty suffix that is name[:0], i.e. ''
							basename = name[:len(name)-len(suffix)]
							if basename in sample_to_clone:
								# the clone name is the list's file name without directory and '.txt'
								clone_name = sample_to_clone[basename].split('/')[-1].split('.txt')[0]
								p[i] = clone_name + '-' + name
					print >>outfh, dr_tools.join(p)
				else:
					outfh.write(line)
                               args.addminreads) + subtract(
                                   expr2[s2][gene_i], args.F2,
                                   expr2[s1][gene_i], 0, args.round)
            '''
			s1e = subtract(expr1[s1][gene_i], args.F1, expr1[s2][gene_i], args.addminreads)
			s2e = subtract(expr1[s2][gene_i], args.F1, expr1[s1][gene_i], args.addminreads)
			if args.inf2 is not None:
				s1e = max(s1e, subtract(expr2[s1][gene_i], args.F2, expr2[s2][gene_i], 0))
				s2e = max(s2e, subtract(expr2[s2][gene_i], args.F2, expr2[s1][gene_i], 0))
			'''
            gene_counts_out[gene_i].extend([s1e, s2e])

    with open(args.outf, 'w') as outfh:
        for i, p in enumerate(dr_tools.splitlines(args.inf1)):
            if i < 3:
                print >> outfh, dr_tools.join(p)
            elif i == 3:
                assert p[0] == '#arguments'
                print >> outfh, dr_tools.join(p, ' '.join(sys.argv),
                                              'time: ' + time.asctime())
            else:
                gene_i = i - 4
                # replace the expression values according to the change in read counts
                new_expressions = [
                    old_rpkm if old_rpkm <= 0 else
                    0.0 if old_count == 0 else new_count / old_count * old_rpkm
                    for new_count, old_count, old_rpkm in zip(
                        gene_counts_out[gene_i],
                        map(float, p[2:2 + len(samples)]),
                        map(float, p[2 + len(samples):2 + 2 * len(samples)]))
                ]
import dr_tools, argparse

# Prefix the sample names in the first line of an expression file with the name of the
# sample list (clone) they belong to; all other lines are copied unchanged.
if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--rpkmf_in', required=True)
    parser.add_argument('-o', '--rpkmf_out', required=True)
    parser.add_argument('-s', '--sample_lists', nargs='+', required=True)
    o = parser.parse_args()

    with open(o.rpkmf_out, 'w') as outfh:
        with open(o.rpkmf_in, 'r') as infh:
            for li, line in enumerate(infh):
                if li == 0:
                    p = line.rstrip('\r\n').split('\t')
                    # sample name -> path of the sample list file that contains it
                    sample_to_clone = dict(
                        (sample, filename) for filename in o.sample_lists
                        for sample in dr_tools.loadlist(filename))
                    for i, name in enumerate(p):
                        if i == 0: continue
                        for suffix in ('', '_c57only', '_castonly'):
                            if not name.endswith(suffix): continue
                            # not name[:-len(suffix)]: for the empty suffix
                            # that is name[:0], i.e. ''
                            basename = name[:len(name) - len(suffix)]
                            if basename in sample_to_clone:
                                # the clone name is the list's file name
                                # without directory and '.txt'
                                clone_name = sample_to_clone[basename].split(
                                    '/')[-1].split('.txt')[0]
                                p[i] = clone_name + '-' + name
                    print >> outfh, dr_tools.join(p)
                else:
                    outfh.write(line)
				s2e = subtract(expr1[s2][gene_i]-expr2[s2][gene_i], args.F1, expr1[s1][gene_i]-expr2[s1][gene_i], args.addminreads) + subtract(expr2[s2][gene_i], args.F2, expr2[s1][gene_i], 0, args.round)
			
			
			'''
			s1e = subtract(expr1[s1][gene_i], args.F1, expr1[s2][gene_i], args.addminreads)
			s2e = subtract(expr1[s2][gene_i], args.F1, expr1[s1][gene_i], args.addminreads)
			if args.inf2 is not None:
				s1e = max(s1e, subtract(expr2[s1][gene_i], args.F2, expr2[s2][gene_i], 0))
				s2e = max(s2e, subtract(expr2[s2][gene_i], args.F2, expr2[s1][gene_i], 0))
			'''
			gene_counts_out[gene_i].extend([s1e, s2e])
	
	with open(args.outf, 'w') as outfh:
		for i, p in enumerate(dr_tools.splitlines(args.inf1)):
			if i < 3:
				print >>outfh, dr_tools.join(p)
			elif i == 3:
				assert p[0] == '#arguments'
				print >>outfh, dr_tools.join(p, ' '.join(sys.argv), 'time: '+time.asctime())
			else:
				gene_i = i - 4
				# replace the expression values according to the change in read counts
				new_expressions = [old_rpkm if old_rpkm <= 0 else 0.0 if old_count == 0 else new_count/old_count*old_rpkm for new_count, old_count, old_rpkm in zip(gene_counts_out[gene_i], map(float, p[2:2+len(samples)]), map(float, p[2+len(samples):2+2*len(samples)]))]
				print >>outfh, dr_tools.join(p[0], p[1], new_expressions, ['%.12g'%c for c in gene_counts_out[gene_i]])
	
	
'''
Results:

danielr@rna ~/casthybrid/one_chr_reads $ python threshold_strainspec.py -i1 oorefv15_trainingvals.txt -i2 oorefv13_trainingvals.txt -F1 0.0 -F2 0.001 --addminreads 1 |python ../maternal_tx/maternal_fraction.py /dev/stdin
8cell_8-1 0.0732625841351 0.0
	parser.add_argument('rpkmf_total')
	o = parser.parse_args()
	
	exprt = dr_tools.loadexpr(o.rpkmf_total, counts=False)
	counts = dr_tools.loadexpr(o.rpkmf_total, counts=True)
	
	if o.rpkmf_alleles:
		expra = dr_tools.loadexpr(o.rpkmf_alleles, counts=True)
	
	
		AiD = dict((ti, expra.ID_to_index[ID]) for ti, ID in enumerate(exprt['IDs']) if ID in expra.ID_to_index)
	
		for s in exprt.samples:
			if s+'_castonly' not in expra.samples: continue
			with open(s + '_expression.txt', 'w') as outfh:
				print >>outfh, dr_tools.join('#Gene_symbol', 'Refseq_IDs', 'RPKM', 'reads', 'CAST_hits', 'C57_hits')
				for ti in range(len(exprt['IDs'])):
					if ti in AiD:
						ai = AiD[ti]
						cast = int(expra[s+'_castonly'][ai])
						c57 = int(expra[s+'_c57only'][ai])
					else:
						cast = 0
						c57 = 0
					rpkm = exprt[s][ti]
					reads = int(round(counts[s][ti]))
					symbol = exprt['symbols'][ti].replace('+','|')
					ID = exprt['IDs'][ti].replace('+','|')
					print >>outfh, dr_tools.join(symbol, ID, rpkm, reads, cast, c57)
	else:
		for s in exprt.samples:
コード例 #11
0
            if coord not in database_snps:
                # strange, it should be (in database_snps)
                # unless o.snp_validatedbefore was used
                continue
            snp = database_snps[coord]
            snp.c57count += int(p[snp.c57index])
            snp.castcount += int(p[snp.castindex])

    ratios = []
    with open(o.outfile, 'w') as outfh:
        for coord, snpinfo in database_snps.items():
            reads = snpinfo.c57count + snpinfo.castcount
            if reads == 0: ratio = 0
            else: ratio = snpinfo.c57count / reads
            if o.minratio <= ratio <= (
                    1 - o.minratio) and reads >= o.minreads_sum and min(
                        snpinfo.c57count,
                        snpinfo.castcount) >= o.minreads_allele:
                print >> outfh, dr_tools.join(coord, snpinfo.bases, '0',
                                              '1.00', '1.00', '1.00', '1.00',
                                              '1.00')
                ratios.append(ratio)

    if o.figure:
        import pylab
        step = 0.005
        xarr, yarr = dr_tools.bin(ratios, -step, 1 + step, step, 1)
        #yarr = [y/len(ratios) for y in yarr]
        pylab.plot(xarr, yarr, 'k-')
        pylab.savefig(o.figure)
コード例 #12
0
	expr_out = dr_tools.Parsed_rpkms([], False)
	normalization_factors = []
	
	for s in expr_in.samples:
		Y_k = expr_in[s]
		N_k = sum(Y_k)
		nonzero = [gi for gi in range(len(expr_in['symbols'])) if Y_k[gi] > 0 and Y_r[gi] > 0]
		A_distr = sorted((A(gi, Y_k, Y_r, N_k, N_r), gi) for gi in nonzero)
		M_distr = sorted((M(gi, Y_k, Y_r, N_k, N_r), gi) for gi in nonzero)
		
		Gstar = set(gi for A_val,gi in A_distr[int(0.05*len(A_distr)):-int(0.05*len(A_distr))]) & set(gi for M_val,gi in M_distr[int(0.3*len(M_distr)):-int(0.3*len(M_distr))])
		
		if len(nonzero) == 0: f_k = 1
		else:
			log2TMM = sum(w(gi, Y_k, Y_r, N_k, N_r) * M(gi, Y_k, Y_r, N_k, N_r) for gi in Gstar)/sum(w(gi, Y_k, Y_r, N_k, N_r) for gi in Gstar)
			f_k = 2**log2TMM # multipy non-reference by this value
		
		#print s, f_k
		
		expr_out[s] = [Y_k[gi]*f_k for gi in range(len(expr_in['symbols']))]
		normalization_factors.append(f_k)
	expr_out.allmappedreads = expr_in.allmappedreads
	expr_out.normalizationreads = expr_in.normalizationreads
	expr_out.samples = expr_in.samples
	expr_out['symbols'] = expr_in['symbols']
	expr_out['IDs'] = expr_in['IDs']
	
	dr_tools.writeexpr(o.outfile, expr_out, counts_expr=(dr_tools.loadexpr(o.infile, counts=True) if o.copy_counts else None), extra_comment_lines=[dr_tools.join('#TMM_normalization_factors', normalization_factors)])
	

    exprt = dr_tools.loadexpr(o.rpkmf_total, counts=False)
    counts = dr_tools.loadexpr(o.rpkmf_total, counts=True)

    if o.rpkmf_alleles:
        expra = dr_tools.loadexpr(o.rpkmf_alleles, counts=True)

        AiD = dict((ti, expra.ID_to_index[ID])
                   for ti, ID in enumerate(exprt['IDs'])
                   if ID in expra.ID_to_index)

        for s in exprt.samples:
            if s + '_castonly' not in expra.samples: continue
            with open(s + '_expression.txt', 'w') as outfh:
                print >> outfh, dr_tools.join('#Gene_symbol', 'Refseq_IDs',
                                              'RPKM', 'reads', 'CAST_hits',
                                              'C57_hits')
                for ti in range(len(exprt['IDs'])):
                    if ti in AiD:
                        ai = AiD[ti]
                        cast = int(expra[s + '_castonly'][ai])
                        c57 = int(expra[s + '_c57only'][ai])
                    else:
                        cast = 0
                        c57 = 0
                    rpkm = exprt[s][ti]
                    reads = int(round(counts[s][ti]))
                    symbol = exprt['symbols'][ti].replace('+', '|')
                    ID = exprt['IDs'][ti].replace('+', '|')
                    print >> outfh, dr_tools.join(symbol, ID, rpkm, reads,
                                                  cast, c57)
    marker_order = [m for m in marker_order if m in gene_to_marker.values()]
    if not o.shuffle_patterns:
        pop_cytof_pattern = dict(
            (pop, [markers[m][popi] for m in marker_order])
            for popi, pop in enumerate(header))
    else:
        pop_cytof_pattern = dict(
            (pop, random.shuffle([markers[m][popi] for m in marker_order]))
            for popi, pop in enumerate(header))
    exprt = dr_tools.loadexpr(o.rpkmfile)
    random.seed(0)

    midexpr_symi_all_D = dict()
    for symi, sym in enumerate(exprt['symbols']):
        if sym not in gene_to_marker:
            raise Exception(dr_tools.join(sym, 'sym'))
        if gene_to_marker[sym] not in markers:
            raise Exception(dr_tools.join(gene_to_marker[sym], 'cytof'))
        midexpr_symi_all_D[gene_to_marker[sym]] = (numpy.mean(
            [exprt[s][symi] for s in exprt.samples]), symi)
    midexpr_symi_all = [midexpr_symi_all_D[m] for m in marker_order]
    sym_order = [midexpr_symi_all_D[m][1] for m in marker_order]

    pop_counts = dict((pop, 0) for pop in pop_cytof_pattern)
    pop_samples = defaultdict(list)

    for sample in exprt.samples:
        relexpr = [
            exprt[sample][symi] / midall for midall, symi in midexpr_symi_all
        ]
コード例 #15
0
                for i, symbol in enumerate(expr['symbols']):
                    if symbol in symbols_set:
                        new_sample_values[name].append(
                            sample_values[name][symbols_set[symbol]])
                    else:
                        new_sample_values[name].append('0 0')
            sample_values = new_sample_values
            symbols = expr['symbols']
            IDs = expr['IDs']
    elif o.rpkmf_genes:
        raise Exception

    # write to file
    with open(o.outf, 'w') as outfh:
        print >> outfh, dr_tools.join(
            '#samples',
            ['%s_c57only\t%s_castonly' % (s, s) for s in sample_order])
        print >> outfh, dr_tools.join('#allmappedreads',
                                      ['0\t0' for s in sample_order])
        print >> outfh, dr_tools.join('#normalizationreads',
                                      ['0\t0' for s in sample_order])
        print >> outfh, dr_tools.join('#arguments', ' '.join(sys.argv),
                                      'time: ' + time.asctime())
        for i in range(len(symbols)):
            #if IDs[i] == '0 0':
            #	print symbols[i]
            #	IDs[i] = 'NA'
            if o.noNA and IDs[i] == 'NA': continue
            try:
                print >> outfh, dr_tools.join(
                    symbols[i], IDs[i], ['0\t0' for name in sample_order], [
コード例 #16
0
file1 = '/mnt/crick/rickards/projects/hsa_snp_calling/snp_stats_ac2.txt'
file2 = '/mnt/kauffman/danielr/Xandclones_late2014/Tcell/male_P1299_YFV2001_newsnpcall/SNP_list/SNPs_per_gene.txt'  # created by make_allelecalls.py -P using the -a and -s arguments
output = '/mnt/kauffman/danielr/Xandclones_late2014/Tcell/male_P1299_YFV2001_newsnpcall/SNP_list/heterozygous_SNPs_per_gene.txt'

import dr_tools

positions = set()
for p in dr_tools.splitlines(file1):  # for each SNP line in the file
    if float(p[-2]) < 0.9:  # if second last column's value is <0.9
        positions.add('%s:%s' % (p[0], p[1]))  # add to allowed SNP list

print len(positions)
c = 0
outfh = open(output, 'w')
for p in dr_tools.splitlines(file2):  # for each gene
    snps = []
    for snpinfo in p[2].split(';'):  # go through the SNPs for the gene
        if snpinfo.split('|')[0] in positions:  # see if on allowed list
            snps.append(snpinfo)  # add to SNPs to print to output
            c += 1
    print >> outfh, dr_tools.join(
        p[0], len(snps), ';'.join(snps))  # output the SNPs for the gene
outfh.close()
print c
file1 = '/mnt/crick/rickards/projects/hsa_snp_calling/snp_stats_ac2.txt'
file2 = '/mnt/kauffman/danielr/Xandclones_late2014/Tcell/male_P1299_YFV2001_newsnpcall/SNP_list/SNPs_per_gene.txt' # created by make_allelecalls.py -P using the -a and -s arguments
output = '/mnt/kauffman/danielr/Xandclones_late2014/Tcell/male_P1299_YFV2001_newsnpcall/SNP_list/heterozygous_SNPs_per_gene.txt'


import dr_tools

positions = set()
for p in dr_tools.splitlines(file1): # for each SNP line in the file
	if float(p[-2]) < 0.9: # if second last column's value is <0.9
		positions.add('%s:%s'%(p[0], p[1])) # add to allowed SNP list

print len(positions)
c=0
outfh = open(output, 'w')
for p in dr_tools.splitlines(file2): # for each gene
	snps = []
	for snpinfo in p[2].split(';'): # go through the SNPs for the gene
		if snpinfo.split('|')[0] in positions: # see if on allowed list
			snps.append(snpinfo) # add to SNPs to print to output
			c+=1
	print >>outfh, dr_tools.join(p[0], len(snps), ';'.join(snps)) # output the SNPs for the gene
outfh.close()
print c
    o = parser.parse_args()

    samples = []
    for li, p in enumerate(dr_tools.splitlines(o.cellnumbertable)):
        samples.append(Sample(str(li + 1), p[0], int(p[1]) if p[1] else None))

    for csv_path in o.allcells_csv:
        name = csv_path.split('/')[-1].split('_1.')[0].split('.txt')[0]
        matching_samples = [s for s in samples if name in s.names]
        if len(matching_samples) > 1: raise Exception
        if len(matching_samples) == 0:
            if o.vocal:
                print name, 'A'
            continue
        matching_samples[0].cells_cytof_all = count_cells(csv_path)

    for csv_path in o.Bcells_csv:
        name = csv_path.split('/')[-1].split('.')[0]
        matching_samples = [s for s in samples if name in s.names]
        if len(matching_samples) > 1: raise Exception
        elif len(matching_samples) == 0:
            print name, 'B'
            continue
        matching_samples[0].cells_cytof_B = count_cells(csv_path)

    for sample in samples:
        if sample.has_all_info():
            print dr_tools.join(sample.names[0], sample.est_Bcells())
        elif o.vocal:
            print sample.names, 'C'
コード例 #19
0
    snps_per_gene = defaultdict(list)

    for p in dr_tools.splitlines(o.snplist):
        # e.g. chr11   117883408       C       A       0       1.00    -1.00   0.90    0.10    0.71    0.29

        if ',' in p[2] or ',' in p[3]:
            continue  # added 18 Dec, since snp_stats2.py -S removes these SNPs anyway

        chromosome = p[0]
        position = int(p[1]) - 1
        genes = set(exon.gene for exon in dr_tools.Cregion.overlappingpoint(
            chromosome, position))
        if o.include_overlap:
            for gene in genes:
                snps_per_gene[gene].append('%s:%s' % (p[0], p[1]))
        else:
            if len(genes
                   ) == 1:  # don't allow overlapping genes, exclude those SNPs
                gene = list(genes)[0]
                try:
                    snps_per_gene[gene].append('%s:%s' % (p[0], p[1]))
                except:
                    print p
                    raise

    with open(o.outfile, 'w') as outfh:
        for gene, snps in snps_per_gene.items():
            print >> outfh, dr_tools.join(gene, len(snps),
                                          ';'.join(sorted(snps)))
コード例 #20
0
ファイル: count_smallrnas.py プロジェクト: snijesh/smallseq
def bam_to_windows(inbam):
	"""
	Count, per annotated gene ID, the reads of one BAM file and write the counts to a text file.

	inbam: path of the BAM file; the second-to-last path component is used as the sample name.

	Output paths are built from inbam by dropping its third-to-last path component in favour
	of o.outdir and swapping the file extension for '_tmpCount.txt' (intermediate table with one
	line per alignment) and '_Count.txt' (final table).

	Relies on module-level names: o, betweenRE, coord2geneid, geneidlist, geneid2name.
	"""
	p = inbam.split("/")
	samplename = p[-2]
	outbamTmp = "/".join(p[:-3]+[o.outdir]+p[-2:])
	countfile_prefix = ".".join(outbamTmp.split(".")[:-1])
	tempCountfile = countfile_prefix + "_tmpCount.txt"
	finalCountfile = countfile_prefix + "_Count.txt"
	read2overlapCoords=defaultdict(list)

	inbamPysamObj = pysam.Samfile(inbam, "rb" )
	try:
		for read in inbamPysamObj:
			readchr = inbamPysamObj.getrname(read.rname)
			strand = "-" if read.is_reverse else "+"
			# the alignment is assigned by its midpoint
			midpos = (int(read.pos) + read.aend)//2

			#one list of overlapping coordinates per alignment, grouped by read name
			read2overlapCoords[read.qname].append(betweenRE.overlappingpoint(readchr, midpos, strand))
	finally:
		inbamPysamObj.close()

	with open(tempCountfile, "w") as outfh:
		for read in read2overlapCoords:
			coordsList = read2overlapCoords[read]
			readCount = len(coordsList)
			annotatedCount = readCount-coordsList.count([])
			#len(coordsList) is never zero
			for coord in coordsList:
				if len(coord) == 0:
					print >> outfh, dr_tools.join(read, "NA", readCount, annotatedCount)
				else:
					###coord[1] will be double-counting
					coord = str(coord[0])  #otherwise I got keyError. it was "instance" type variable
					geneid = coord2geneid.get(coord, 'NA')
					print >> outfh, dr_tools.join(read, geneid, readCount, annotatedCount)

	## readCount, annotatedCount scenarios
	# 1, 1  unique map, annotated to single gene, counts as 1
	# 2, 1  multi map, annotated to single gene, count as 1, discard other alignment
	# n, n  multi map, annotated to two genes, count as 1/n
	# k, m  where k>m and m>1, multi map, annotated to multi genes, count 1/m, discard other alignment
	#
	#formula is always: count = 1/annotatedCount
	geneid2counts={}
	unannotReadsDict={}
	with open(tempCountfile, "r") as infh:
		for line in infh:
			read, geneid, readCount, annotatedCount = line.split()
			# both are parsed as numbers, so that they are not compared as int against str
			readCount = int(readCount)
			annotatedCount = int(annotatedCount)
			geneid2counts.setdefault(geneid, 0)
			if annotatedCount > 0:
				# 1.0 so that the fraction is not floored by integer division
				geneid2counts[geneid] += 1.0/annotatedCount
			if annotatedCount < readCount and annotatedCount == 0:
				unannotReadsDict[read] = 1

	num_unannot = len(unannotReadsDict)
	num_annot = 0
	for geneid in geneid2counts:
		if "P-cel" in geneid: continue
		if geneid == "NA": continue
		num_annot += geneid2counts[geneid]

	with open(finalCountfile, "w") as outfh2:
		print >> outfh2, dr_tools.join("#samples", samplename)
		print >> outfh2, dr_tools.join("#unannotatedmolc", num_unannot)
		print >> outfh2, dr_tools.join("#annotatedmolc", num_annot)
		for geneid in geneidlist:
			print >> outfh2, dr_tools.join(geneid2name[geneid], geneid, geneid2counts.get(geneid, "0"))
		database_snps[coord] = SNPinfo(c57base, castbase)
	
	for filepath in o.cellsums_files:
		for p in dr_tools.splitlines(filepath):
			coord = '%s\t%s'%(p[0], p[1])
			if coord not in database_snps:
				# strange, it should be (in database_snps)
				# unless o.snp_validatedbefore was used
				continue
			snp = database_snps[coord]
			snp.c57count += int(p[snp.c57index])
			snp.castcount += int(p[snp.castindex])
	
	ratios = []
	with open(o.outfile, 'w') as outfh:
		for coord, snpinfo in database_snps.items():
			reads = snpinfo.c57count+snpinfo.castcount
			if reads == 0: ratio = 0
			else: ratio = snpinfo.c57count/reads
			if o.minratio <= ratio <= (1-o.minratio) and reads >= o.minreads_sum and min(snpinfo.c57count, snpinfo.castcount) >= o.minreads_allele:
				print >>outfh, dr_tools.join(coord, snpinfo.bases, '0', '1.00', '1.00', '1.00', '1.00', '1.00')
				ratios.append(ratio)
	
	if o.figure:
		import pylab
		step = 0.005
		xarr, yarr = dr_tools.bin(ratios, -step, 1+step, step, 1)
		#yarr = [y/len(ratios) for y in yarr]
		pylab.plot(xarr, yarr, 'k-')
		pylab.savefig(o.figure)
	set2 = set(entries2)
	set1_unique_c = len(set(entries1[sym] for sym in (set1-set2)))
	set2_unique_c = len(set(entries2[sym] for sym in (set2-set1)))
	common_c = len(set(entries1[sym] for sym in (set2&set1)))
	common_c2 = len(set(entries2[sym] for sym in (set2&set1)))
	if not common_c == common_c2: raise Exception
	saygenes = []
	for genes in set(entries2[sym] for sym in (set2&set1)):
		saygenes.append(';'.join(list(genes)))
	return set1_unique_c, common_c, set2_unique_c, ', '.join(saygenes)

if '__main__' == __name__:
	parser = argparse.ArgumentParser()
	parser.add_argument('-A', '--annotationfile', default='/mnt/crick/danielr/Xandclones_BR/BR_fibroblasts/snp-call/more_formats/mm9_ensembl_refseq_norandom_11Apr2012_genesymbols.txt')
	parser.add_argument('-a', '--set1', required=True)
	parser.add_argument('-b', '--set2', required=True)
	parser.add_argument('-ge', '--disallowedgenes', nargs='+')
	o = parser.parse_args()
	
	if o.disallowedgenes:
		disallowedgenes = set()
		for filename in o.disallowedgenes:
			disallowedgenes.update(set(dr_tools.loadlist(filename)))
	else:
		disallowedgenes = None
	
	ID_to_symbol = dict((p[1], p[12]) for p in dr_tools.splitlines(o.annotationfile) if disallowedgenes is None or p[12] not in disallowedgenes)
	
	print dr_tools.join(overlap_of_2(load_geneset(ID_to_symbol, o.set1), load_geneset(ID_to_symbol, o.set2)))
	print len(set(ID_to_symbol.values()))
						# comes when len(V1)!=len(V2), maybe because some samples have zero cells meeting the filter requirements?
						print >>sys.stderr, combo
						stat = float('nan')
						p = float('nan')
					try:stat = float(stat) # to deal with 1-element array values
					except TypeError: pass
					n = len(V1)
					if o.output_summary_values:
						n = repr(zip(V1, V2))
			else:
				transformed_x, clean_y = zip(*[(distfunc(x),y) for x,y in zip(locations_x, locations_y) if not math.isnan(x) and not math.isnan(distfunc(x))])
				stat, p = correlation(transformed_x, clean_y)
				n = len(clean_y)
			if str(p) == 'nan':
				#print combo, len(V1), len(V2), V1[:10], V2[:10]
				continue
			pvals.append(PVal(p, dr_tools.join(comboname, distname, str(stat), abundance, n))) # different from _v4: n
			if isinstance(stat, tuple):
				pvals[-1].r = numpy.mean(stat)
			else:
				pvals[-1].r = stat
	
	# false discovery rate and output
	for test_inst, q in zip(pvals, dr_tools.globalFDR([test_inst.p for test_inst in pvals])):
		test_inst.q = q
	for test_inst in sorted(pvals, key=lambda obj: (obj.p, -abs(obj.r)), reverse=False):
		if test_inst.q < o.maxq or o.maxq >= 1: print test_inst
	
	if o.saycombinedP:
		print 'combined P:', dr_tools.combinedP([test_inst.p for test_inst in pvals])
                yarr = [y - mid_y for y in yarr]
            locations_x.extend(xarr)
            locations_y.extend(yarr)
            patient_by_sample.extend([patient for x in xarr])

        if o.shuffle:
            random.shuffle(locations_y)

        table[comboname] = locations_y
        table['time_from_baseline_months'] = locations_x
        table['CMM_ID'] = patient_by_sample
        if o.shuffle_name:
            random.shuffle(table['CMM_ID'])
        column_order.append(comboname)

    print dr_tools.join(column_order)
    transposed_table = zip(*[table[c] for c in column_order])

    last_patient = ''
    for patient in set_order_patients:
        if patient == last_patient:
            print dr_tools.join(patient)
        else:
            for row in transposed_table:
                if row[0] == patient:
                    print dr_tools.join(row)
                    break
            else:
                print patient
        last_patient = patient