def getAlleleCount(bamfile, snpfile, outfile):
    """Run the read-count tool on a BAM file and reformat its per-site output.

    The module-level ``bamrc`` executable is invoked with ``f = ref``,
    ``w = 0`` and ``l = snpfile`` on ``bamfile``; its raw output is redirected
    to ``outfile + '.tmp'``.  That raw output is then walked together with
    ``snpfile`` and one record per matched site is written to ``outfile``
    with the fields
    ``Chrm, pos, A, C, G, T, Total, refCount, mutCount``.
    No header line is written here.

    Both files are consumed in a single forward pass: SNP rows that are read
    while looking for a match are never revisited, so ``snpfile`` and the
    raw output must list sites in the same order.

    Args:
        bamfile: Path of the BAM file handed to ``bamrc``.
        snpfile: Header-less, tab-separated site list.  Column 0 is compared
            with the chromosome, column 2 with the position, and columns 6
            and 7 are used as the reference and mutant alleles.
        outfile: Path of the reformatted output; ``outfile + '.tmp'`` holds
            the raw tool output and is left in place.
    """
    brcparams = Box()
    brcparams.f = ref
    brcparams.w = 0
    brcparams.l = snpfile
    # the empty key carries the positional argument (the BAM file)
    brcparams[''] = bamfile

    cmd = '{bamrc} {args} > {outfile!r}'.format(
        bamrc = bamrc,
        args = cmdargs(brcparams, equal = ' '),
        outfile = outfile + '.tmp')
    runcmd(cmd)

    # reformat the raw output to the desired format; a raw line looks like
    # chr1 564773 C 14 =:0:... A:0:... C:14:... G:0:... T:0:... N:0:...
    reader = TsvReader(outfile + '.tmp', cnames = False)
    snper = TsvReader(snpfile, cnames = False)
    writer = TsvWriter(outfile)
    writer.cnames = ['Chrm', 'pos', 'A', 'C', 'G', 'T', 'Total', 'refCount', 'mutCount']

    try:
        for r in reader:
            while True:
                try:
                    snp = next(snper)
                except StopIteration:
                    # no SNPs left: nothing can match this record
                    break
                # use the end position, in case it's 0-based
                if snp[0] == r[0] and snp[2] == r[1]:
                    # each base field is "<base>:<count>:..."; keep the count
                    counts = dict(
                        A = r[5].split(':', 2)[1],
                        C = r[6].split(':', 2)[1],
                        G = r[7].split(':', 2)[1],
                        T = r[8].split(':', 2)[1]
                    )
                    rec = TsvRecord()
                    rec.Chrm = r[0]
                    rec.pos = r[1]
                    rec.Total = r[3]
                    rec.A = counts['A']
                    rec.C = counts['C']
                    rec.G = counts['G']
                    rec.T = counts['T']
                    # if reference allele is unknown, assuming all are ref alleles
                    rec.refCount = counts.get(snp[6].upper(), r[3])
                    # if mut allele is unknown, assuming no mutations happened
                    rec.mutCount = counts.get(snp[7].upper(), 0)
                    writer.write(rec)
                    # matched: leave the SNP loop and move on to the next record
                    break
                # no match: keep the same record and try the next SNP
    finally:
        # release all file handles even if parsing fails half-way
        reader.close()
        snper.close()
        writer.close()
# save the data file # expfile """ S1 S2 .. Sn G1 ... G2 ... """ expreader = TsvReader(expfile) expdata = [r for r in expreader if r[0] in genes or r[0] in tfs] expreader.close() datawriter = TsvWriter(outdata) for i, cname in enumerate(expreader.cnames): if i == 0: # genes + tfs datawriter.cnames = [r[0] for r in expdata] datawriter.writeHead() else: datawriter.write([cname] + [r[i] for r in expdata]) datawriter.close() del expdata genes = [g for g in genes if g in datawriter.cnames] tfs = [g for g in tfs if g in datawriter.cnames] genetfs = {g: [tf for tf in gtfs if tf in tfs] for g, gtfs in genetfs.items() if g in genes} # save the group file # mutfile """ S1 S2 .. Sn M1 ... (0/1/2/NA)
from bioprocs.utils.tsvio2 import TsvWriter, TsvRecord from gff import Gff infile = {{i.infile | quote}} outfile = {{o.outfile | quote}} attr2name = {{args.attr2name}} keepinfo = {{args.keepinfo | repr}} writer = TsvWriter(outfile) writer.cnames = ['CHR', 'START', 'END', 'NAME', 'SCORE', 'STRAND'] if keepinfo: writer.cnames.append('ORIGINAL') def getNameFromAttrs(attrs): if attr2name: return attr2name(**attrs) for key in sorted(attrs.keys()): if key in writer.cnames: continue if 'id' in key.lower(): return attrs[key] if 'name' in key.lower(): return attrs[key] return attrs[key] gff = Gff(infile) for record in gff: r = TsvRecord() r.CHR = record['seqid'] r.START = record['start'] r.END = record['end']
# Template-rendered settings for this job.
outfile = {{ o.outfile | quote}}
outdir = {{ o.outdir | quote}}
params = {{ args.params | repr}}
idxfile = {{ args.idxfile | quote}}
kallisto = {{ args.kallisto | quote}}
nthread = {{ args.nthread | repr}}

# Register the executable path, then fill in the `kallisto quant` arguments.
shell.TOOLS.kallisto = kallisto
params.i = idxfile
params.o = outdir
params.t = nthread
# positional arguments: the two fastq files
params._ = [fq1, fq2]

# NOTE: from here on `kallisto` is the shell command wrapper, not the path.
kallisto = shell.Shell(subcmd = True).kallisto
kallisto.quant(**params).run()

# Reduce <outdir>/abundance.tsv to two columns: target_id and integer est_counts.
imfile = path.join(outdir, 'abundance.tsv')
reader = TsvReader(imfile)
writer = TsvWriter(outfile)
writer.cnames = ['target_id', 'est_counts']
writer.writeHead()
for r in reader:
    # keep only the part of the id before the first '::'
    r.target_id = r.target_id.split('::')[0]
    try:
        r.est_counts = int(round(float(r.est_counts)))
    except (TypeError, ValueError, OverflowError):
        # Unparseable or non-finite counts fall back to 0: float() raises
        # ValueError for non-numeric text, and converting nan/inf to an
        # integer raises ValueError/OverflowError.
        r.est_counts = 0
    writer.write(r)
reader.close()
writer.close()
logger.info('%s motifs loaded', len(motifs)) if tool == 'meme': cmdparams = [] params.thresh = pval params.verbosity = 4 for motif, name in motifs.items(): params.oc = path.join(outdir, name + '.' + re.sub(r'[^\w_]', '', motif)) params.motif = motif params[""] = [tfmotifs, sfile] cmdparams.append((meme, cmdargs(params, dash = '--', equal = ' '))) Parallel(nthread, raiseExc = True).run('{} {}', cmdparams) writer = TsvWriter(outfile) writer.cnames = [ "CHR", "START", "END", "NAME", "SCORE", "STRAND", "MOTIF", "SEQ", "STARTONSEQ", "STOPONSEQ", "RAWSCORE", "PVAL", "QVAL", "MATCHEDSEQ", "UCSCLINK" ] writer.writeHead(callback = lambda cnames: "#" + "\t".join(cnames)) def rowfactory(r): r.PVAL = float(r['p-value']) if r.PVAL >= pval: return None r.RAWSCORE = r.score try: r.SCORE = int(float(r.score) * 10) except TypeError: r.SCORE = 0 r.STRAND = r.strand r.MOTIF = r.motif_id # split motif_alt_id
))) for _ in range(dist): writer.write(next(reader)) writer.close() para = Parallel(nthread, raiseExc = True) para.run(getAlleleCount, [ (tumbam, path.join( thdir, '{bname}.thread{i}.snp'.format(bname = asbname, i = i) ), path.join( thdir, '{tumbn}.thread{i}.bamrc'.format(tumbn = path.basename(tumbam), i = i) )) for i in range(nthread) ]) # merge to tumsnp writer = TsvWriter(tumsnp) writer.cnames = ['Chrm', 'pos', 'A', 'C', 'G', 'T', 'Total', 'refCount', 'mutCount'] writer.writeHead(lambda cn: "#" + "\t".join(cn)) for i in range(nthread): subrc = path.join( thdir, '{tumbn}.thread{i}.bamrc'.format(tumbn = path.basename(tumbam), i = i) ) reader = TsvReader(subrc, cnames = False) for r in reader: writer.write(r.values()) reader.close() writer.close() # normal para.run(getAlleleCount, [ (normbam, path.join( thdir, '{bname}.thread{i}.snp'.format(bname = asbname, i = i)