Beispiel #1
0
def getAlleleCount(bamfile, snpfile, outfile):
	"""Run the `bamrc` tool on a BAM file and tabulate allele counts per SNP.

	The raw tool output is redirected to `outfile + '.tmp'` and then
	reformatted into `outfile` with the columns
	Chrm, pos, A, C, G, T, Total, refCount, mutCount.
	`writeHead()` is not called here, so this function does not emit a
	header row itself.

	The tool output and the SNP file are consumed in lockstep: for every
	output row the SNP file is advanced until a SNP with the same chromosome
	and position is found.  This presumably relies on both files listing
	positions in the same order -- TODO confirm with the caller.

	Relies on the module-level names `ref` (passed as `-f`) and `bamrc`
	(the executable).

	Args:
		bamfile: BAM file passed to `bamrc` as its positional argument.
		snpfile: Headerless SNP table, also passed to `bamrc` via `-l`.
			Column 0 is the chromosome, column 2 the (end) position,
			columns 6 and 7 the reference and mutant alleles.
		outfile: Path of the reformatted count table.
	"""
	brcparams   = Box()
	brcparams.f = ref
	brcparams.w = 0
	brcparams.l = snpfile

	brcparams[''] = bamfile
	tmpfile = outfile + '.tmp'
	cmd = '{bamrc} {args} > {outfile!r}'.format(
		bamrc = bamrc, args = cmdargs(brcparams, equal = ' '), outfile = tmpfile)
	runcmd(cmd)

	# reformat the raw output to the desired format
	reader = TsvReader(tmpfile, cnames = False)
	snper  = TsvReader(snpfile, cnames = False)
	# Raw row layout:
	#chr1	564773	C	14	=:0:...	A:0:...	C:14:...	G:0:...	T:0:...	N:0:...
	writer = TsvWriter(outfile)
	writer.cnames = ['Chrm', 'pos', 'A', 'C', 'G', 'T', 'Total', 'refCount', 'mutCount']

	try:
		exhausted = False
		for r in reader:
			while True:
				try:
					snp = next(snper)
				except StopIteration:
					# No SNPs left: nothing more can ever match.
					exhausted = True
					break
				# use the end position, in case it's 0-based
				if snp[0] != r[0] or snp[2] != r[1]:
					# not this row's SNP, try the next one
					continue
				# Each base field looks like "A:<count>:..."; keep the count.
				counts = dict(
					A = r[5].split(':', 2)[1],
					C = r[6].split(':', 2)[1],
					G = r[7].split(':', 2)[1],
					T = r[8].split(':', 2)[1]
				)
				rec       = TsvRecord()
				rec.Chrm  = r[0]
				rec.pos   = r[1]
				rec.Total = r[3]
				rec.A = counts['A']
				rec.C = counts['C']
				rec.G = counts['G']
				rec.T = counts['T']
				# if reference allele is unknown, assuming all are ref alleles
				rec.refCount = counts.get(snp[6].upper(), r[3])
				# if mut allele is unknown, assuming no mutations happened
				rec.mutCount = counts.get(snp[7].upper(), 0)
				writer.write(rec)
				# matched: move on to the next output row
				break
			if exhausted:
				break
	finally:
		# Release all file handles even if a row fails to parse.
		reader.close()
		snper.close()
		writer.close()
Beispiel #2
0
def main(opts):
    """Write the union of two TF-gene mappings as `[tf, gene]` rows.

    Both `opts.origin` and `opts.addition` are loaded with `read_tfgenes`
    (presumably a mapping of gene -> set of TFs; verify against that helper).
    Genes from the origin mapping are written first, merged with any TFs the
    addition mapping has for the same gene; genes that appear only in the
    addition mapping follow.
    """
    origin = read_tfgenes(opts.origin)
    extra = read_tfgenes(opts.addition)
    out = TsvWriter(opts.outfile)
    logger.info('Writing the union set to %s ...', opts.outfile)

    def union_rows():
        # Genes known to the origin set; popping removes them from `extra`
        # so that only addition-only genes remain for the second pass.
        for gene, tfs in origin.items():
            merged = tfs | extra.pop(gene, set())
            for tf in merged:
                yield [tf, gene]
        # Whatever is left exists only in the addition set.
        for gene, tfs in extra.items():
            for tf in tfs:
                yield [tf, gene]

    for row in union_rows():
        out.write(row)
    out.close()
    logger.info('Done.')
Beispiel #3
0
"""
	S1	S2	..	Sn
G1	...
G2	...
"""
# Keep only the expression rows whose first column is a gene or TF of
# interest. The names are collected into a set once so that every row
# costs a single hash lookup.
wanted     = set(genes) | set(tfs)
expreader  = TsvReader(expfile)
expdata    = [r for r in expreader if r[0] in wanted]
expreader.close()

# Write the kept rows transposed: the first input column (names) becomes
# the header, and every other input column becomes one output row that
# starts with that column's name.
datawriter = TsvWriter(outdata)
for i, cname in enumerate(expreader.cnames):
	if i == 0:
		# genes + tfs
		datawriter.cnames = [r[0] for r in expdata]
		datawriter.writeHead()
	else:
		datawriter.write([cname] + [r[i] for r in expdata])
datawriter.close()
del expdata

# Restrict genes/TFs to those actually present in the expression data,
# preserving their original order.
expnames = set(datawriter.cnames or ())
genes = [g for g in genes if g in expnames]
tfs   = [g for g in tfs if g in expnames]

# Drop gene->TF links that reference anything filtered out above.
tfset   = set(tfs)
geneset = set(genes)
genetfs = {g: [tf for tf in gtfs if tf in tfset] for g, gtfs in genetfs.items() if g in geneset}

# save the group file
# mutfile
"""
	S1	S2	..	Sn
M1	... (0/1/2/NA)
M2	...
"""
mutreader = TsvReader(mutfile)
Beispiel #4
0
# Collect the distinct SNPs (column 0) and genes (column 1) of the input.
reader = TsvReader(infile, cnames=False)
allsnps = set(reader.dump(0))
reader.rewind()
allgenes = set(reader.dump(1))
reader.close()

# Assign each SNP a random integer slot in [0, ngenes * snppergene).
# Slot s belongs to the gene with enumeration index s // snppergene.
nsnps = len(allsnps)
ngenes = len(allgenes)
snp_probs = dict(zip(allsnps, random.choices(range(ngenes * snppergene),
                                             k=nsnps)))

# Bucket the SNPs by gene index once, instead of re-scanning the whole
# mapping for every gene. Dict iteration order is kept inside each bucket,
# so SNP positions come out exactly as a per-gene scan would produce them.
snps_by_gene = {}
for snp, slot in snp_probs.items():
    snps_by_gene.setdefault(slot // snppergene, []).append(snp)

genebed = TsvWriter(genefile)
snpbed = TsvWriter(snpfile)

geneperchr = math.ceil(float(ngenes) / float(nchr))
for i, gene in enumerate(allgenes):
    # Genes are dealt round-robin onto chr1..chr<nchr>, `dist` apart.
    chrname = 'chr' + str(int(i % nchr) + 1)
    start = (int(i / nchr) + 1) * dist
    end = start + 1
    # This gene's SNPs are laid out consecutively from this position.
    first_snp_pos = int(start - dist/2.0 - snppergene)
    snps = snps_by_gene.get(i, ())
    genebed.write([chrname, start, end, gene, 0, '+'])
    for j, snp in enumerate(snps):
        snppos = first_snp_pos + j
        snpbed.write([chrname, snppos, snppos, snp, 0, '+'])

genebed.close()
snpbed.close()
Beispiel #5
0
            (r.IID, r))).dump())
else:
    metadata = None

logger.info('Reading genotype matrix ...')
# Input layout: one row per SNP, one genotype column per sample
# (snp1 gt1s1 gt1s2 ...); everything after the first header cell is a sample.
inreader = TsvReader(infile, cnames=True)
samples = inreader.meta[1:]

logger.info('Writing tfam file ...')
tfamWriter = TsvWriter(tfamfile)
tfamWriter.meta = ['FID', 'IID', 'PID', 'MID', 'Sex', 'Pheno']
# writeHead() is deliberately not called here.

# Fallback values for the optional per-sample metadata fields, in column order.
tfam_defaults = (('PID', '0'), ('MID', '0'), ('Sex', 'other'), ('Pheno', '-9'))
for sample in samples:
    if not metadata or sample not in metadata:
        # Nothing known about this sample: family ID = sample ID, rest default.
        row = [sample, sample, '0', '0', 'other', '-9']
    else:
        info = metadata[sample]
        # FID is taken verbatim when present (even if empty); the other
        # fields fall back to their default when missing or empty.
        row = [info.FID if 'FID' in info else sample, sample]
        for field, default in tfam_defaults:
            if field in info:
                row.append(getattr(info, field) or default)
            else:
                row.append(default)
    tfamWriter.write(row)
tfamWriter.close()
Beispiel #6
0
	for key in sorted(attrs.keys()):
		if key in writer.cnames:
			continue
		if 'id' in key.lower():
			return attrs[key]
		if 'name' in key.lower():
			return attrs[key]
		return attrs[key]

# Convert every GFF record into one TSV row.
gff = Gff(infile)
# (output column, GFF record key), in the order the columns are assigned.
columns = (
	('CHR',    'seqid'),
	('START',  'start'),
	('END',    'end'),
	('SCORE',  'score'),
	('STRAND', 'strand'),
)
for feature in gff:
	row = TsvRecord()
	for column, key in columns:
		setattr(row, column, feature[key])
	# The fixed columns are merged into the attribute dict before the name
	# lookup, so getNameFromAttrs sees them next to the GFF attributes.
	attrs = feature['attributes']
	attrs.update({column: getattr(row, column) for column, _ in columns})
	row.NAME = getNameFromAttrs(attrs)
	if keepinfo:
		# Keep every attribute that is not already an output column.
		extras = ['{}={}'.format(k, v) for k, v in attrs.items() if k not in writer.cnames]
		row.ORIGINAL = '; '.join(extras)
	writer.write(row)
	
Beispiel #7
0
# Reduce two tables to the columns they have in common and write each
# reduced table to its own output file with a header.
indata1 = TsvReader(infile1, **inopts1)
indata2 = TsvReader(infile2, **inopts2)

# When a file carries row names, its first header cell is not a data column.
cnames1 = indata1.meta if not rnames1 else indata1.meta[1:]
cnames2 = indata2.meta if not rnames2 else indata2.meta[1:]

# Shared column names, listed in the order they appear in the first file.
# Building the list straight from a set intersection would make the output
# column order depend on string hashing, which can change between runs.
shared = set(cnames1) & set(cnames2)
paired = []
for cname in cnames1:
	if cname in shared:
		# discard() so that a repeated header name is only taken once
		shared.discard(cname)
		paired.append(cname)
cnames1 = cnames2 = paired

# Put the row-name column back in front where the input had one.
if rnames1:
	cnames1 = [indata1.meta[0]] + cnames1
if rnames2:
	cnames2 = [indata2.meta[0]] + cnames2

# Positions of the kept columns within each input row.
cindex1 = [indata1.meta.index(c) for c in cnames1]
cindex2 = [indata2.meta.index(c) for c in cnames2]

outdata1 = TsvWriter(outfile1)
outdata2 = TsvWriter(outfile2)
outdata1.meta = cnames1
outdata2.meta = cnames2
outdata1.writeHead()
outdata2.writeHead()

for r1 in indata1:
	outdata1.write(r1[i] for i in cindex1)
indata1.close()
outdata1.close()

for r2 in indata2:
	outdata2.write(r2[i] for i in cindex2)
indata2.close()
outdata2.close()
Beispiel #8
0
from bioprocs.utils.tsvio2 import TsvReader, TsvWriter

# Values filled in by the template engine before this script runs.
infile = {{i.infile | quote}}
outfile = {{o.outfile | quote}}
inopts = {{args.inopts | repr}}
infmt = {{args.infmt | quote}}
cutoff = {{args.cutoff | repr}}

# node -> number of edge endpoints seen for that node
degrees = defaultdict(lambda: 0)
if infmt.startswith('pair'):
    # Each row is one edge: node1, node2 and, optionally, a score.
    reader = TsvReader(infile, **inopts)
    for r in reader:
        if cutoff:
            try:
                score = float(r[2])
            except (TypeError, ValueError) as exc:
                # float() raises ValueError for non-numeric text and
                # TypeError for non-string/non-number values; report both
                # with the same explanation, keeping the exception type.
                raise type(exc)(
                    'The 3rd column should be a score for apply the cutoff.')
            if score < cutoff:
                continue
        degrees[r[0]] += 1
        degrees[r[1]] += 1
    reader.close()

    writer = TsvWriter(outfile)
    # Highest degree first.
    for node in sorted(degrees.keys(), key=lambda x: degrees[x], reverse=True):
        if infmt.endswith('complete'):
            # The count is halved for '...complete' input, presumably
            # because each pair is listed in both directions -- TODO confirm.
            writer.write([node, int(int(degrees[node]) / 2)])
        else:
            writer.write([node, degrees[node]])
    writer.close()
else:
    raise ValueError('Input format other than "pair" not supported yet.')
Beispiel #9
0
	writer.cnames = ['CHROM', 'START', 'END', 'NAME', 'SCORE', 'STRAND']
	writer.writeHead()
# For each input gene, write a strand-aware region around one end of the
# gene: `region.up` bases on the upstream side and `region.down` bases on
# the downstream side, optionally extended to cover the whole gene body.
for r in reader:
	gene = r[genecol]
	if gene not in genes:
		msg = 'Gene does not exist: {}'.format(gene)
		if notfound == 'error':
			raise ValueError(msg)
		# Log the formatted message itself (not a '{msg}' placeholder) so
		# that the warning names the missing gene.
		log2pyppl(msg, 'warning')
		continue
	# genes[gene] is a 7-field record; only these four fields are used.
	chrom, _, _, start, end, _, strand = genes[gene]
	start, end = int(start), int(end)
	if strand == '-':
		# Minus strand: the anchor is `end`, upstream is towards larger
		# coordinates.
		record = [
			chrom,
			min(start, end - region.down) if region.withbody else end - region.down,
			end + region.up,
			gene, 0, strand
		]
	else:
		# Plus strand (or anything else): the anchor is `start`.
		record = [
			chrom,
			start - region.up,
			max(end, start + region.down) if region.withbody else start + region.down,
			gene, 0, strand
		]
	writer.write(record)
writer.close()

Beispiel #10
0
    # Query the snp<dbsnpver> table in batches of 1000 names so that each
    # SQL statement (and its IN (...) list) stays bounded in size.
    # NOTE(review): the statement is built by string interpolation of the
    # SNP names; if `snplist` can contain untrusted text, switch to a
    # parameterized query if `g.sql` supports one.
    for i in range(0, len(snplist), 1000):
        chunk = snplist[i:i + 1000]
        sql = 'SELECT chrom, chromStart, chromEnd, name, score, strand, refUCSC, alleles, alleleFreqs FROM snp{dbsnpver} WHERE name in ({snps})'.format(
            dbsnpver=dbsnpver, snps=', '.join("'{}'".format(s) for s in chunk))
        result = g.sql(sql)
        for r in result:
            # Pair every allele with its frequency (both comma-separated).
            allfreqs = dict(zip(r.alleles.split(','),
                                r.alleleFreqs.split(',')))
            # The reference allele's frequency is reported first; '0' when
            # the reference allele is not among the listed alleles.
            reffreq = allfreqs.get(r.refUCSC, '0')
            if r.refUCSC in allfreqs:
                del allfreqs[r.refUCSC]
            # Drop the empty allele key (presumably produced by a trailing
            # comma in the stored lists -- TODO confirm).
            if '' in allfreqs:
                del allfreqs['']
            # Last three columns: ref allele, remaining alleles, and the
            # frequencies ordered as ref first, then the remaining alleles.
            writer.write([
                r.chrom, r.chromStart, r.chromEnd, r.name, r.score, r.strand,
                r.refUCSC, ','.join(allfreqs.keys()),
                ','.join([reffreq] + list(allfreqs.values()))
            ])
    writer.close()

else:
    # snps
    # Extract the SNP column of `snpfile` into a one-column list file
    # placed in the job's input directory.
    snplist = path.join(jobindir, path.basename(snpfile) + '.list')
    reader = TsvReader(snpfile, cnames=False)
    writer = TsvWriter(snplist)
    for r in reader:
        writer.write([r[snpcol]])
    reader.close()
    writer.close()

    # Register the configured vcftools executable with the shell helper
    # (presumably so later `shell` calls resolve it -- TODO confirm).
    shell.TOOLS.vcftools = vcftools
Beispiel #11
0
		raise ValueError('Method %s not supported yet.' % method)

def numpval(pval):
	"""Convert a p-value to float, falling back to 1.0 when it is not numeric.

	Args:
		pval: The p-value as read from the input (number, string or None).

	Returns:
		`float(pval)` when the conversion succeeds, otherwise 1.0 (treated
		as "not significant").
	"""
	try:
		return float(pval)
	except (TypeError, ValueError):
		# TypeError: None or another non-convertible object.
		# ValueError: non-numeric text such as 'NA' or an empty string.
		return 1.0

# Aggregate the p-values of consecutive rows that belong to the same SNP.
# The SNP name is the part of the `Case` column before the first '.', and
# rows of one SNP are expected to be adjacent: a group is flushed as soon
# as the SNP name changes.
reader    = TsvReader(infile)
writer    = TsvWriter(outfile)
prevsnp   = None
prevpvals = []
for r in reader:
	snp = r.Case.split('.')[0]
	if snp != prevsnp:
		# `is not None` rather than truthiness, so that a group with an
		# empty SNP name is flushed like any other.
		if prevsnp is not None:
			writer.write([
				prevsnp,
				aggregate(prevpvals, method)
			])
		prevsnp   = snp
		prevpvals = [numpval(r.Pval)]
	else:
		prevpvals.append(numpval(r.Pval))
# Flush the last group; with empty input there is no group to write.
if prevsnp is not None:
	writer.write([
		prevsnp,
		aggregate(prevpvals, method)
	])

reader.close()
writer.close()
Beispiel #12
0
	# Split the SNP file into per-thread chunks, run getAlleleCount on each
	# chunk in parallel, then start the merged output file.
	# get number of lines of affysnps file
	total  = wc_l(affysnps)
	# presumably a list of per-thread line counts summing to `total` --
	# TODO confirm against `distribute`
	dists  = distribute(total, nthread)
	reader = TsvReader(affysnps, cnames = False)
	# dir to save the split file and result file
	thdir  = path.join(outdir, 'bamrc.nthreads')
	if not path.exists(thdir):
		makedirs(thdir)
	
	# file name up to the first '.', used as prefix of the chunk files
	asbname = path.basename(affysnps).split('.')[0]
	for i, dist in enumerate(dists):
		# chunk i receives the next `dist` rows of the SNP file
		writer = TsvWriter(path.join(thdir, '{bname}.thread{i}.snp'.format(
			bname = asbname, i = i
		)))
		for _ in range(dist):
			writer.write(next(reader))
		writer.close()
	# NOTE(review): `reader` is not closed in the code visible here.
	
	# One job per thread index: (bam file, chunk SNP file, chunk output file).
	# NOTE(review): this iterates range(nthread) while the chunk files above
	# come from enumerate(dists); the two agree only if `distribute` returns
	# exactly `nthread` entries -- confirm.
	para   = Parallel(nthread, raiseExc = True)
	para.run(getAlleleCount, [
		(tumbam, path.join(
			thdir, '{bname}.thread{i}.snp'.format(bname = asbname, i = i)
		), path.join(
			thdir, '{tumbn}.thread{i}.bamrc'.format(tumbn = path.basename(tumbam), i = i)
		)) for i in range(nthread)
	])
	# merge to tumsnp
	writer = TsvWriter(tumsnp)
	writer.cnames = ['Chrm', 'pos', 'A', 'C', 'G', 'T', 'Total', 'refCount', 'mutCount']
	# header line is written as '#'-prefixed, tab-joined column names
	writer.writeHead(lambda cn: "#" + "\t".join(cn))
	for i in range(nthread):