Ejemplo n.º 1
0
def main(opts):
    """Write the union of two TF-gene mappings as (tf, gene) rows.

    `opts.origin` and `opts.addition` are loaded with `read_tfgenes`; the
    merged pairs are written to `opts.outfile`. Genes of the origin set come
    first (with any extra TFs from the addition set merged in), followed by
    genes that only occur in the addition set.
    """
    base = read_tfgenes(opts.origin)
    extra = read_tfgenes(opts.addition)
    out = TsvWriter(opts.outfile)
    logger.info('Writing the union set to %s ...', opts.outfile)

    for target, regulators in base.items():
        # pop() removes the gene from `extra`, so it is not emitted twice below
        merged = regulators | extra.pop(target, set())
        for regulator in merged:
            out.write([regulator, target])

    # whatever is still in `extra` belongs to genes absent from the origin set
    for target, regulators in extra.items():
        for regulator in regulators:
            out.write([regulator, target])

    out.close()
    logger.info('Done.')
Ejemplo n.º 2
0
def getAlleleCount(bamfile, snpfile, outfile):
	"""Count A/C/G/T reads at the sites listed in `snpfile` for `bamfile`.

	The external `bamrc` command (name taken from the enclosing scope;
	presumably bam-readcount, judging by the sample output line below --
	TODO confirm) is run with its output redirected to `outfile + '.tmp'`.
	That raw output is then joined with `snpfile` and rewritten to `outfile`
	with the columns Chrm, pos, A, C, G, T, Total, refCount, mutCount.

	Columns read from `snpfile`: 0 = chromosome, 2 = end position,
	6 = reference allele, 7 = mutant allele.

	Notes:
	- `ref` and `bamrc` are not parameters; they are read from the
	  enclosing/module scope.
	- The SNP reader is only ever advanced, never rewound, so matching relies
	  on both files listing sites in the same order.
	- The per-base counts are kept as the strings parsed from the raw output,
	  while the `mutCount` fallback is the integer 0.
	- The '.tmp' file is not removed here.
	"""
	brcparams   = Box()
	brcparams.f = ref
	brcparams.w = 0
	brcparams.l = snpfile

	# the empty key carries the positional argument (the bam file)
	brcparams[''] = bamfile
	cmd = '{bamrc} {args} > {outfile!r}'.format(
		bamrc = bamrc, args = cmdargs(brcparams, equal = ' '), outfile = outfile + '.tmp')
	runcmd(cmd)

	# reformat the raw output into the desired format
	reader = TsvReader(outfile + '.tmp', cnames = False)
	snper  = TsvReader(snpfile, cnames = False)
	# sample line of the raw output:
	#chr1	564773	C	14	=:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00	A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00	C:14:...	G:0:...	T:0:...	N:0:...
	writer = TsvWriter(outfile)
	writer.cnames = ['Chrm', 'pos', 'A', 'C', 'G', 'T', 'Total', 'refCount', 'mutCount']

	for r in reader:
		# advance through the snp file until the site of this record is found
		while True:
			try:
				snp   = next(snper)
			except StopIteration:
				# no snps left: nothing can match this record
				break
			# use the end position, in case it's 0-based
			if snp[0] == r[0] and snp[2] == r[1]:
				# each base field looks like "<base>:<count>:..."; keep the count
				counts = dict(
					A = r[5].split(':', 2)[1],
					C = r[6].split(':', 2)[1],
					G = r[7].split(':', 2)[1],
					T = r[8].split(':', 2)[1]
				)
				rec    = TsvRecord()
				rec.Chrm  = r[0]
				rec.pos   = r[1]
				rec.Total = r[3]
				rec.A = counts['A']
				rec.C = counts['C']
				rec.G = counts['G']
				rec.T = counts['T']
				# if reference allele is unknown, assuming all are ref alleles
				rec.refCount = counts.get(snp[6].upper(), r[3])
				# if mut allele is unknown, assuming no mutations happened
				rec.mutCount = counts.get(snp[7].upper(), 0)
				writer.write(rec)
				# matched: leave the while loop, i.e. go on to the next record r
				break
			else:
				# no match: stay in the while loop and try the next snp
				continue
	writer.close()
Ejemplo n.º 3
0
random.seed(seed)

# Collect the distinct SNPs (column 0) and genes (column 1) of the input.
reader = TsvReader(infile, cnames=False)
allsnps = set(reader.dump(0))
reader.rewind()
allgenes = set(reader.dump(1))
reader.close()

# Give every SNP a random integer slot in [0, ngenes * snppergene);
# slot // snppergene is the index of the gene the SNP is placed next to.
# NOTE(review): allsnps/allgenes are sets, so their iteration order (and with
# it the generated layout) may vary between interpreter runs even though the
# random seed is fixed -- confirm whether that matters.
nsnps = len(allsnps)
ngenes = len(allgenes)
snp_probs = dict(zip(allsnps, random.choices(range(ngenes * snppergene),
                                             k=nsnps)))

# Bucket the SNPs by gene index once, instead of rescanning every SNP for
# every gene. Order inside a bucket follows snp_probs, exactly as the
# per-gene filter over snp_probs did.
snps_by_gene = {}
for snp, slot in snp_probs.items():
    snps_by_gene.setdefault(slot // snppergene, []).append(snp)

genebed = TsvWriter(genefile)
snpbed = TsvWriter(snpfile)

geneperchr = math.ceil(float(ngenes) / float(nchr))
for i, gene in enumerate(allgenes):
    # genes are dealt round-robin over the chromosomes, `dist` apart
    chrname = 'chr' + str(int(i % nchr) + 1)
    start = (int(i / nchr) + 1) * dist
    end = start + 1
    # the SNPs of a gene sit on consecutive positions starting here
    first_snp_pos = int(start - dist/2.0 - snppergene)
    snps = snps_by_gene.get(i, [])
    genebed.write([chrname, start, end, gene, 0, '+'])
    for j, snp in enumerate(snps):
        snppos = first_snp_pos + j
        snpbed.write([chrname, snppos, snppos, snp, 0, '+'])
Ejemplo n.º 4
0
# FID, IID, PID, MID, Sex, Pheno
if metafile:
    logger.info('Reading metafile ...')
    # Each metafile row is turned into an (IID, row) pair by the `row`
    # callback, so the resulting dict maps a sample ID to its row
    # (assumes dump() returns those pairs as a sequence -- TODO confirm).
    metadata = dict(
        TsvReader(metafile, cnames=True, row=lambda r: tuple(
            (r.IID, r))).dump())
else:
    metadata = None

logger.info('Reading genotype matrix ...')
# snp1 gt1s1 gt1s2 ...
inreader = TsvReader(infile, cnames=True)
# every header field after the first one is taken as a sample name
samples = inreader.meta[1:]

logger.info('Writing tfam file ...')
tfamWriter = TsvWriter(tfamfile)
# column names of the tfam rows written below
tfamWriter.meta = ['FID', 'IID', 'PID', 'MID', 'Sex', 'Pheno']
#tfamWriter.writeHead(callback = lambda meta: '#' + '\t'.join(meta))
if not metadata:
    for s in samples:
        tfamWriter.write([s, s, '0', '0', 'other', '-9'])
else:
    for s in samples:
        tfamWriter.write([
            metadata[s].FID if s in metadata and 'FID' in metadata[s] else s,
            s, (metadata[s].PID or '0')
            if s in metadata and 'PID' in metadata[s] else '0',
            (metadata[s].MID or '0')
            if s in metadata and 'MID' in metadata[s] else '0',
            (metadata[s].Sex or 'other')
            if s in metadata and 'Sex' in metadata[s] else 'other',
Ejemplo n.º 5
0
from bioprocs.utils.tsvio2 import TsvWriter, TsvRecord
from gff import Gff

infile    = {{i.infile | quote}}
outfile   = {{o.outfile | quote}}
attr2name = {{args.attr2name}}
keepinfo  = {{args.keepinfo | repr}}

writer = TsvWriter(outfile)
writer.cnames = ['CHR', 'START', 'END', 'NAME', 'SCORE', 'STRAND']
if keepinfo:
	writer.cnames.append('ORIGINAL')

def getNameFromAttrs(attrs):
	"""Pick the value to use as NAME from a record's attributes.

	If the module-level `attr2name` callback is set, it is called with the
	attributes as keyword arguments and its result is returned. Otherwise the
	attribute keys (sorted, skipping keys that collide with the output column
	names in `writer.cnames`) are searched in this order of preference:

	1. the first key containing "id" (case-insensitive),
	2. the first key containing "name" (case-insensitive),
	3. the first remaining key.

	Returns None when no usable attribute exists.

	The previous implementation returned unconditionally on the first loop
	iteration, so the id/name preference never took effect.
	"""
	if attr2name:
		return attr2name(**attrs)
	candidates = [key for key in sorted(attrs.keys()) if key not in writer.cnames]
	for hint in ('id', 'name'):
		for key in candidates:
			if hint in key.lower():
				return attrs[key]
	# nothing looks like an id or a name: fall back to the first attribute
	return attrs[candidates[0]] if candidates else None

gff = Gff(infile)
for record in gff:
	r        = TsvRecord()
	r.CHR    = record['seqid']
	r.START  = record['start']
	r.END    = record['end']
Ejemplo n.º 6
0
indata1 = TsvReader(infile1, **inopts1)
indata2 = TsvReader(infile2, **inopts2)

# data columns of each file (the first header field is the row-name column
# when rnames1/rnames2 is set)
cnames1 = indata1.meta if not rnames1 else indata1.meta[1:]
cnames2 = indata2.meta if not rnames2 else indata2.meta[1:]
# Columns present in both files, each listed once, in the order they appear
# in file 1. A bare set intersection has arbitrary order, which made the
# column order of the outputs differ from run to run.
paired  = sorted(set(cnames1) & set(cnames2), key = list(cnames1).index)
# independent copies so the two writers never share one list object
cnames1 = list(paired)
cnames2 = list(paired)

if rnames1:
	cnames1 = [indata1.meta[0]] + cnames1
if rnames2:
	cnames2 = [indata2.meta[0]] + cnames2

# positions of the selected columns in each input file
cindex1 = [indata1.meta.index(c) for c in cnames1]
cindex2 = [indata2.meta.index(c) for c in cnames2]

outdata1 = TsvWriter(outfile1)
outdata2 = TsvWriter(outfile2)
outdata1.meta = cnames1
outdata2.meta = cnames2
outdata1.writeHead()
outdata2.writeHead()

for r1 in indata1:
	outdata1.write(r1[i] for i in cindex1)
outdata1.close()

for r2 in indata2:
	outdata2.write(r2[i] for i in cindex2)
outdata2.close()
Ejemplo n.º 7
0
shell.TOOLS.bedtools = bedtools
bedtools = shell.Shell(subcmd = True, dash = '-', equal = ' ').bedtools

params['g']   = gsize
params['i']   = infile

# At least one flank length (l, r or b) has to be supplied.
# The previous test `not 'l' and not 'r' and not 'b' in params` negated the
# non-empty string literals themselves, so it was always False and this
# validation could never trigger.
if not any(key in params for key in ('l', 'r', 'b')):
	raise ValueError('You have to define a length to flank (args.params.l, args.params.r or args.params.b)')

if args.extend:
	left   = params.get('l', params.get('b', 0))
	right  = params.get('r', params.get('b', 0))
	stdns  = params.get('s', False)
	reader = TsvReader(infile, cnames = False)
	writer = TsvWriter(outfile)
	for r in reader:
		if not stdns or r[5] == '+':
			left2, right2 = left, right
		else:
			left2, right2 = right, left
		if params.pct:
			length   = r[2] - r[1]
			r[1] -= round(length * left2)
			r[2] += round(length * right2)
		else:
			r[1] -= left2
			r[2] += right2
		writer.write(r)
else:
	params._stdout = outfile
Ejemplo n.º 8
0
from bioprocs.utils.tsvio2 import TsvReader, TsvWriter

infile = {{i.infile | quote}}
outfile = {{o.outfile | quote}}
inopts = {{args.inopts | repr}}
infmt = {{args.infmt | quote}}
cutoff = {{args.cutoff | repr}}

# node -> number of (retained) edges touching it
degrees = defaultdict(int)
if infmt.startswith('pair'):
    reader = TsvReader(infile, **inopts)
    for r in reader:
        if cutoff:
            try:
                score = float(r[2])
            except (TypeError, ValueError, IndexError):
                # float() raises ValueError (not TypeError) for non-numeric
                # text, and a row may have no 3rd column at all; report all
                # of these with the same, explicit error.
                raise TypeError(
                    'The 3rd column should be a score for apply the cutoff.')
            if score < cutoff:
                continue
        degrees[r[0]] += 1
        degrees[r[1]] += 1
    writer = TsvWriter(outfile)
    # highest degree first
    for node in sorted(degrees, key=degrees.get, reverse=True):
        if infmt.endswith('complete'):
            # a "complete" pair list has every edge in both directions
            writer.write([node, degrees[node] // 2])
        else:
            writer.write([node, degrees[node]])
    writer.close()
else:
    raise ValueError('Input format other than "pair" not supported yet.')
Ejemplo n.º 9
0
outfile = {{ o.outfile | quote}}
outdir  = {{ o.outdir | quote}}
params = {{ args.params | repr}}
idxfile = {{ args.idxfile | quote}}
kallisto = {{ args.kallisto | quote}}
nthread = {{ args.nthread | repr}}

shell.TOOLS.kallisto = kallisto
params.i = idxfile
params.o = outdir
params.t = nthread
params._ = [fq1, fq2]

kallisto = shell.Shell(subcmd = True).kallisto
kallisto.quant(**params).run()

# Reduce the abundance table in `outdir` to two columns:
# target_id (truncated at the first '::') and est_counts rounded to an integer.
imfile        = path.join(outdir, 'abundance.tsv')
reader        = TsvReader(imfile)
writer        = TsvWriter(outfile)
writer.cnames = ['target_id', 'est_counts']
writer.writeHead()

for r in reader:
	r.target_id = r.target_id.split('::')[0]
	try:
		r.est_counts = int(round(float(r.est_counts)))
	except (TypeError, ValueError, OverflowError):
		# float() raises ValueError for non-numeric text; converting NaN to
		# an integer raises ValueError and infinity raises OverflowError.
		# Treat all of them like a missing value.
		r.est_counts = 0
	writer.write(r)
writer.close()
Ejemplo n.º 10
0
inopts   = {{args.inopts | repr}}
outopts  = {{args.outopts | repr}}
refgene  = {{args.refgene | quote}}
genecol  = {{args.inopts.genecol | repr}}
ocnames  = {{args.outopts.cnames | repr}}
del inopts['genecol']
del outopts['cnames']
if region.down is None:
	region.down = region.up

# get all genes' TSS and strand
reader = TsvReader(refgene, cnames = False, delimit = '"')
genes  = {r[1]:r[0].split("\t")[:7] for r in reader}

reader = TsvReader(infile, **inopts)
writer = TsvWriter(outfile, **outopts)
if ocnames:
	writer.cnames = ['CHROM', 'START', 'END', 'NAME', 'SCORE', 'STRAND']
	writer.writeHead()
for r in reader:
	gene = r[genecol]
	if gene not in genes:
		msg = 'Gene does not exist: {}'.format(gene)
		if notfound == 'error':
			raise ValueError(msg)
		else:
			log2pyppl('Gene does not exist: {msg}', 'warning')
			continue
	chrom, _, _, start, end, _, strand = genes[gene]
	start, end = int(start), int(end)
	if strand == '-':
Ejemplo n.º 11
0
ncol    = {{args.ncol}}
nameopt = {{args.name | quote}}

def rowFactory(row):
	"""Turn one genotype-matrix row into a BED-like record.

	`row[0]` is the row name, either `chrom_pos_ref_alt` or
	`chrom_pos_name_ref_alt`. The record name depends on the module-level
	`nameopt`:
	- 4-part row names: `chrom_pos` when nameopt is 'neat', otherwise the
	  full row name;
	- 5-part row names: the embedded name (`chrom_pos` if it is 'NOVEL'),
	  or the full row name when nameopt is 'full'.

	The module-level `ncol` selects the layout:
	- 3: [chrom, pos, pos + 1]
	- 6: the above plus [name, 0, '+']
	- otherwise: the 6 columns plus 'ref,alt' and the remaining fields of the
	  row joined by ','.

	Raises ValueError when the row name has neither 4 nor 5 '_'-separated
	parts.
	"""
	rparts = row[0].split('_')
	if len(rparts) == 4:
		(chrom, pos, ref, alt) = rparts
		name = chrom + '_' + pos if nameopt == 'neat' else row[0]
	elif len(rparts) == 5:
		(chrom, pos, name, ref, alt) = rparts
		if name == 'NOVEL':
			name = chrom + '_' + pos
		if nameopt == 'full':
			name = row[0]
	else:
		raise ValueError('Malformat genotype matrix, expect 4 or 5 items in row names.')
	end = int(pos) + 1
	if ncol == 3:
		return [chrom, pos, end]
	if ncol == 6:
		return [chrom, pos, end, name, 0, '+']
	# extended layout; this used to reference the undefined name `rows`
	# and raised NameError
	return [chrom, pos, end, name, 0, '+', ref + ',' + alt, ','.join(row[1:])]

reader = TsvReader(infile, cnames = True, attach = False, row = rowFactory)
writer = TsvWriter(outfile)
for r in reader:
	writer.write(r)
writer.close()


Ejemplo n.º 12
0
		alleles='C,T,',	
		alleleNs='2634.000000,2374.000000,',	
		alleleFreqs='0.525958,0.474042,',	
		bitfields=set(['maf-5-all-pops', 'maf-5-some-pop'])	
	)	
	'''

    # snps
    reader = TsvReader(snpfile, cnames=False)
    snplist = list(set(r[snpcol] for r in reader))
    reader.close()

    from cruzdb import Genome
    g = Genome(genome)
    outfiletmp = outfile + '.tmp'
    writer = TsvWriter(outfiletmp)
    for i in range(0, len(snplist), 1000):
        chunk = snplist[i:i + 1000]
        sql = 'SELECT chrom, chromStart, chromEnd, name, score, strand, refUCSC, alleles, alleleFreqs FROM snp{dbsnpver} WHERE name in ({snps})'.format(
            dbsnpver=dbsnpver, snps=', '.join("'{}'".format(s) for s in chunk))
        result = g.sql(sql)
        for r in result:
            allfreqs = dict(zip(r.alleles.split(','),
                                r.alleleFreqs.split(',')))
            reffreq = allfreqs.get(r.refUCSC, '0')
            if r.refUCSC in allfreqs:
                del allfreqs[r.refUCSC]
            if '' in allfreqs:
                del allfreqs['']
            writer.write([
                r.chrom, r.chromStart, r.chromEnd, r.name, r.score, r.strand,
Ejemplo n.º 13
0
mnames = [m.name for m in MemeReader(tfmotifs)]
motifs = {k:v for k, v in motifns.items() if k in mnames}
logger.info('%s motifs loaded', len(motifs))

if tool == 'meme':
	cmdparams        = []
	params.thresh    = pval
	params.verbosity = 4
	for motif, name in motifs.items():
		params.oc    = path.join(outdir, name + '.' + re.sub(r'[^\w_]', '', motif))
		params.motif = motif
		params[""]   = [tfmotifs, sfile]
		cmdparams.append((meme, cmdargs(params, dash = '--', equal = ' ')))
	Parallel(nthread, raiseExc = True).run('{} {}', cmdparams)

	writer = TsvWriter(outfile)
	writer.cnames = [
		"CHR", "START", "END", "NAME", "SCORE", "STRAND", "MOTIF", "SEQ", "STARTONSEQ",
		"STOPONSEQ", "RAWSCORE", "PVAL", "QVAL", "MATCHEDSEQ", "UCSCLINK"
	]
	writer.writeHead(callback = lambda cnames: "#" + "\t".join(cnames))

	def rowfactory(r):
		r.PVAL       = float(r['p-value'])
		if r.PVAL >= pval:
			return None
		r.RAWSCORE = r.score
		try:
			r.SCORE = int(float(r.score) * 10)
		except TypeError:
			r.SCORE = 0
Ejemplo n.º 14
0
		try:
			reduce
		except NameError:
			from functools import reduce
		return reduce(mul, pvals, 1) ** (1.0/float(len(pvals)))
	else:
		raise ValueError('Method %s not supported yet.' % method)

def numpval(pval):
	"""Return `pval` as a float, or 1.0 when it cannot be read as a number.

	float() raises TypeError for non-string, non-numeric objects such as
	None, and ValueError for text that is not a number (e.g. 'NA' or an
	empty field); both cases fall back to 1.0.
	"""
	try:
		return float(pval)
	except (TypeError, ValueError):
		return 1.0

reader    = TsvReader(infile)
writer    = TsvWriter(outfile)
prevsnp   = None
prevpvals = []
for r in reader:
	snp = r.Case.split('.')[0]
	if snp != prevsnp:
		if prevsnp:
			writer.write([
				prevsnp,
				aggregate(prevpvals, method)
			])
		prevsnp   = snp
		prevpvals = [numpval(r.Pval)]
	else:
		prevpvals.append(numpval(r.Pval))
writer.write([
Ejemplo n.º 15
0
	open(outdata, 'w').close()
	open(outgroup, 'w').close()
	open(outcase, 'w').close()
	exit(0)

# save the data file
# expfile
"""
	S1	S2	..	Sn
G1	...
G2	...
"""
expreader  = TsvReader(expfile)
# Keep only the rows of the requested genes/TFs. One set lookup per row
# replaces a scan of both collections for every expression row.
wanted     = set(genes) | set(tfs)
expdata    = [r for r in expreader if r[0] in wanted]
expreader.close()
datawriter = TsvWriter(outdata)
# Write the matrix transposed: the header holds the kept genes/TFs and each
# following line is one input column (its name, then its value per kept row).
for i, cname in enumerate(expreader.cnames):
	if i == 0:
		# genes + tfs
		datawriter.cnames = [r[0] for r in expdata]
		datawriter.writeHead()
	else:
		datawriter.write([cname] + [r[i] for r in expdata])
datawriter.close()
del expdata
# Drop the genes/TFs that have no expression data (input order is kept).
present = set(datawriter.cnames)
genes = [g for g in genes if g in present]
tfs   = [g for g in tfs if g in present]

# Restrict the gene -> TFs mapping to the genes and TFs that survived.
geneset, tfset = set(genes), set(tfs)
genetfs = {g: [tf for tf in gtfs if tf in tfset] for g, gtfs in genetfs.items() if g in geneset}

# save the group file
Ejemplo n.º 16
0
        writer.writeMeta()
        for _ in range(joblist[i]):
            try:
                writer.write(reader.next())
            except StopIteration:
                break
        writer.close()
        thparams = params.copy()
        thparams[""] = [qfile, mfile2]
        thparams.thresh = qval
        thparams.oc = ocdir
        cmdps.append((tomtom, cmdargs(thparams, dash='-', equal=' ')))
    reader.close()
    Parallel(nthread, raiseExc=True).run('{} {}', cmdps)

    writer = TsvWriter(outfile)
    reader = TsvReader(path.join(ocdirs[0], 'tomtom.txt'),
                       comment='##',
                       cnames=lambda header: header[1:].strip().split("\t"))
    writer.cnames = reader.cnames
    writer.writeHead(lambda cnames: "#" + "\t".join(cnames))
    reader.close()
    for ocdir in ocdirs:
        reader = TsvReader(
            path.join(ocdir, 'tomtom.txt'),
            comment='##',
            cnames=lambda header: header[1:].strip().split("\t"))
        for r in reader:
            writer.write(r)
        reader.close()
    writer.close()
Ejemplo n.º 17
0
	getAlleleCount(normbam, affysnps, normsnp)
else:
	# try to split the affysnps into N files and distribute the jobs to nthreads
	# get number of lines of affysnps file
	total  = wc_l(affysnps)
	dists  = distribute(total, nthread)
	reader = TsvReader(affysnps, cnames = False)
	# dir to save the split file and result file
	thdir  = path.join(outdir, 'bamrc.nthreads')
	if not path.exists(thdir):
		makedirs(thdir)
	
	asbname = path.basename(affysnps).split('.')[0]
	for i, dist in enumerate(dists):
		writer = TsvWriter(path.join(thdir, '{bname}.thread{i}.snp'.format(
			bname = asbname, i = i
		)))
		for _ in range(dist):
			writer.write(next(reader))
		writer.close()
	
	para   = Parallel(nthread, raiseExc = True)
	para.run(getAlleleCount, [
		(tumbam, path.join(
			thdir, '{bname}.thread{i}.snp'.format(bname = asbname, i = i)
		), path.join(
			thdir, '{tumbn}.thread{i}.bamrc'.format(tumbn = path.basename(tumbam), i = i)
		)) for i in range(nthread)
	])
	# merge to tumsnp
	writer = TsvWriter(tumsnp)