def read_tfgenes(tgfile):
    """Read TF-gene pairs"""
    logger.info('Reading TF-gene pairs in %s ...', tgfile)
    reader = TsvReader(tgfile, cnames=False)
    ret = {}  # gene => set of TFs
    for row in reader:
        ret.setdefault(row[1], set()).add(row[0])
    reader.close()
    return ret
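# --- illustration only, not part of the original script ---
# A minimal sketch of the same gene => {TFs} inversion, using made-up in-memory
# pairs instead of a TsvReader, just to show what read_tfgenes returns.
pairs = [('SP1', 'TP53'), ('E2F1', 'TP53'), ('SP1', 'MYC')]
gene2tfs = {}
for tf, gene in pairs:
    gene2tfs.setdefault(gene, set()).add(tf)
# gene2tfs == {'TP53': {'SP1', 'E2F1'}, 'MYC': {'SP1'}}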
def getAlleleCount(bamfile, snpfile, outfile):
    brcparams = Box()
    brcparams.f = ref
    brcparams.w = 0
    brcparams.l = snpfile
    brcparams[''] = bamfile
    cmd = '{bamrc} {args} > {outfile!r}'.format(
        bamrc   = bamrc,
        args    = cmdargs(brcparams, equal = ' '),
        outfile = outfile + '.tmp')
    runcmd(cmd)

    # reformat the raw output into the desired format
    reader = TsvReader(outfile + '.tmp', cnames = False)
    snper  = TsvReader(snpfile, cnames = False)
    # sample raw line:
    # chr1 564773 C 14 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:14:... G:0:... T:0:... N:0:...
    writer = TsvWriter(outfile)
    writer.cnames = ['Chrm', 'pos', 'A', 'C', 'G', 'T', 'Total', 'refCount', 'mutCount']
    for r in reader:
        while True:
            try:
                snp = next(snper)
            except StopIteration:
                break
            # use the end position, in case it's 0-based
            if snp[0] == r[0] and snp[2] == r[1]:
                counts = dict(
                    A = r[5].split(':', 2)[1],
                    C = r[6].split(':', 2)[1],
                    G = r[7].split(':', 2)[1],
                    T = r[8].split(':', 2)[1]
                )
                rec = TsvRecord()
                rec.Chrm  = r[0]
                rec.pos   = r[1]
                rec.Total = r[3]
                rec.A = counts['A']
                rec.C = counts['C']
                rec.G = counts['G']
                rec.T = counts['T']
                # if the reference allele is unknown, assume all reads are reference
                rec.refCount = counts.get(snp[6].upper(), r[3])
                # if the mutant allele is unknown, assume no mutation happened
                rec.mutCount = counts.get(snp[7].upper(), 0)
                writer.write(rec)
                # matched: move on to the next output row
                break
            else:
                # no match: advance to the next snp
                continue
    writer.close()
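# --- illustration only, assumed field layout ---
# How the `split(':', 2)[1]` calls above pull read counts out of
# bam-readcount-style per-base fields; the example strings mirror the sample
# line in the comment above and are otherwise made up.
def base_counts(fields):
    """['A:0:...', 'C:14:...'] -> {'A': '0', 'C': '14'}"""
    counts = {}
    for field in fields:
        base, count = field.split(':', 2)[:2]
        counts[base] = count
    return counts

# base_counts(['A:0:0.00', 'C:14:0.00', 'G:0:0.00', 'T:0:0.00'])
# -> {'A': '0', 'C': '14', 'G': '0', 'T': '0'}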
def __init__(self, sfile):
    reader = TsvReader(sfile)
    self.samcol = reader.cnames[0]
    if self.samcol == 'ROWNAMES':
        self.samcol = 'Sample'
        reader.cnames[0] = 'Sample'
    self.data = reader.dump()
    self.nrow = len(self.data)
    self.ncol = len(reader.cnames)
    self.colnames = reader.cnames
    self.rownames = [row[self.samcol] for row in self.data]
    expectColnames = ['Sample', 'Patient', 'Group', 'Batch']
    if not set(expectColnames) & set(self.colnames):
        raise SampleInfoException('Unexpected column names: %s.' % str(self.colnames))
def _read(self, sifile):
    standard_cnames = ["", "Sample", "Patient", "Group", "Batch"]
    reader = TsvReader(sifile)
    self.cnames = reader.cnames
    if not self.cnames:
        raise SampleInfoException(
            'Headers for the sample information file are required.')
    if any(cname not in standard_cnames for cname in self.cnames):
        raise SampleInfoException(
            'Headers should be a subset of {!r}'.format(
                ', '.join(standard_cnames)))
    if "" in self.cnames:
        self.cnames[self.cnames.index("")] = "Sample"
    self.mat = reader.dump()
def assertFileEqual(self, first, second, filetype=None,
                    firstInopts=None, secondInopts=None, msg=None):
    if self.maxDiff is not None:
        self.maxDiff = max(self.maxDiff or 5000, 5000)
    filetype1 = filetype or ('text' if istext(first) else 'nontext')
    filetype2 = filetype or ('text' if istext(second) else 'nontext')
    if filetype1 != filetype2:
        standardMsg = 'Files differ, because file1 is {0} but file2 is {1}'.format(
            filetype1, filetype2)
        self.fail(self._formatMessage(msg, standardMsg))
    elif filetype1 == 'text':  # and filetype2 == 'text'
        reader1 = TsvReader(first, **firstInopts) if firstInopts else TsvReader(first)
        reader2 = TsvReader(second, **secondInopts) if secondInopts else TsvReader(second)
        rindex = 0
        for r1 in reader1:
            rindex += 1
            try:
                r2 = next(reader2)
            except StopIteration:
                standardMsg = ('File1 and file2 are different.\n'
                               'File1: {2}\nFile2: {3}\n'
                               'Row {0} of file1 is: {1}, but there is no row {0} in file2.').format(
                                   rindex, r1, first, second)
                self.fail(self._formatMessage(msg, standardMsg))
            if r1 != r2:
                standardMsg = ('File1 and file2 are different.\n'
                               'File1: {3}\nFile2: {4}\n'
                               'Row {0} of file1: {1}\nRow {0} of file2: {2}').format(
                                   rindex, r1, r2, first, second)
                self.fail(self._formatMessage(msg, standardMsg))
    else:  # filetype1 == 'nontext' and filetype2 == 'nontext': binary comparison
        import filecmp
        if not filecmp.cmp(first, second, shallow=False):
            standardMsg = 'Binary files are different:\n{}\n{}'.format(first, second)
            self.fail(self._formatMessage(msg, standardMsg))
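# --- illustration only ---
# The binary branch above relies on filecmp.cmp with shallow=False, which
# compares file contents byte by byte (shallow=True would only compare os.stat
# signatures). A tiny self-contained demonstration with made-up contents:
import filecmp
import tempfile

with tempfile.NamedTemporaryFile(delete=False) as f1, \
     tempfile.NamedTemporaryFile(delete=False) as f2:
    f1.write(b'\x00\x01\x02')
    f2.write(b'\x00\x01\x03')
print(filecmp.cmp(f1.name, f2.name, shallow=False))  # False: contents differ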
if inopts1.headCallback:
    inopts1.headCallback = eval(inopts1.headCallback)
if inopts2.headCallback:
    inopts2.headCallback = eval(inopts2.headCallback)
inopts1.attach = False
inopts2.attach = False

rnames1 = inopts1.get('rnames', True)
rnames2 = inopts2.get('rnames', True)
if 'rnames' in inopts1:
    del inopts1['rnames']
if 'rnames' in inopts2:
    del inopts2['rnames']

indata1 = TsvReader(infile1, **inopts1)
indata2 = TsvReader(infile2, **inopts2)
cnames1 = indata1.meta if not rnames1 else indata1.meta[1:]
cnames2 = indata2.meta if not rnames2 else indata2.meta[1:]
# keep only the columns shared by both files
paired = list(set(cnames1) & set(cnames2))
cnames1 = cnames2 = paired
if rnames1:
    cnames1 = [indata1.meta[0]] + cnames1
if rnames2:
    cnames2 = [indata2.meta[0]] + cnames2
cindex1 = [indata1.meta.index(c) for c in cnames1]
cindex2 = [indata2.meta.index(c) for c in cnames2]
bedtools = {{ args.bedtools | quote}}
shell.TOOLS.bedtools = bedtools
bedtools = shell.Shell(subcmd = True, dash = '-', equal = ' ').bedtools

params['g'] = gsize
params['i'] = infile
if 'l' not in params and 'r' not in params and 'b' not in params:
    raise ValueError(
        'You have to define a length to flank (args.params.l, args.params.r or args.params.b).')

if args.extend:
    left = params.get('l', params.get('b', 0))
    right = params.get('r', params.get('b', 0))
    stdns = params.get('s', False)
    reader = TsvReader(infile, cnames = False)
    writer = TsvWriter(outfile)
    for r in reader:
        if not stdns or r[5] == '+':
            left2, right2 = left, right
        else:
            left2, right2 = right, left
        if params.pct:
            length = r[2] - r[1]
            r[1] -= round(length * left2)
            r[2] += round(length * right2)
        else:
            r[1] -= left2
            r[2] += right2
        writer.write(r)
else:
covfile = {{i.covfile | quote}}
genes = {{i.genes.split(',') | repr}}
tfs = {{i.tfs.split(';') | repr}}
outdata = {{o.outdata | quote}}
outgroup = {{o.outgroup | quote}}
outcase = {{o.outcase | quote}}

genetfs = {g: tfs[i].split(',') for i, g in enumerate(genes)}

# get gene, snp pairs
"""
chr1 12463073 12463074 AADACL4 0 + chr1 12463073 12463074 chr1_12463073_rs7547740_A_G 0 +
chr1 12480504 12480505 AADACL4 0 + chr1 12480504 12480505 chr1_12480504_rs6660365_T_C 0 +
chr1 12496021 12496022 AADACL4 0 + chr1 12496021 12496022 chr1_12496021_rs6541023_T_C 0 +
"""
mutgenes = defaultdict(list)
intereader = TsvReader(interfile)
genes = set()
for r in intereader:
    if r[3] not in genetfs:
        continue
    mutgenes[r[9]].append(r[3])
    genes.add(r[3])
intereader.close()

# shrink the sets
genetfs = {g: genetfs[g] for g in genes}
tfs = list({tf for gtfs in genetfs.values() for tf in gtfs})

# nothing to do: write empty files
if not mutgenes or not genes:
    open(outdata, 'w').close()
import math
import random
from bioprocs.utils.tsvio2 import TsvReader, TsvWriter

# input format (sorted by gene):
# snp   gene
# SNP1  Gene10
infile = {{i.infile | quote}}
snpfile = {{o.snpfile | quote}}
genefile = {{o.genefile | quote}}
snppergene = {{args.snppergene | repr}}
nchr = {{args.nchr | repr}}
seed = {{args.seed | repr}}
# distance between genes
dist = {{args.dist | repr}}

random.seed(seed)

reader = TsvReader(infile, cnames=False)
allsnps = set(reader.dump(0))
reader.rewind()
allgenes = set(reader.dump(1))
reader.close()

# assign a probability to each snp
nsnps = len(allsnps)
ngenes = len(allgenes)
snp_probs = dict(zip(allsnps, random.choices(range(ngenes * snppergene), k=nsnps)))

genebed = TsvWriter(genefile)
snpbed = TsvWriter(snpfile)
geneperchr = math.ceil(float(ngenes) / float(nchr))
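# --- illustration only ---
# The seed above is what makes the simulated SNP assignment reproducible:
# with the same seed, random.choices returns the same draws every run.
import random

random.seed(8525)
first = random.choices(range(10), k=5)
random.seed(8525)
second = random.choices(range(10), k=5)
assert first == second  # identical draws under the same seed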
ncol = {{args.ncol}}
nameopt = {{args.name | quote}}

def rowFactory(row):
    rparts = row[0].split('_')
    if len(rparts) == 4:
        (chrom, pos, ref, alt) = rparts
        name = chrom + '_' + pos if nameopt == 'neat' else row[0]
    elif len(rparts) == 5:
        (chrom, pos, name, ref, alt) = rparts
        if name == 'NOVEL':
            name = chrom + '_' + pos
        if nameopt == 'full':
            name = row[0]
    else:
        raise ValueError('Malformed genotype matrix: expect 4 or 5 items in row names.')
    if ncol == 3:
        return [chrom, pos, int(pos) + 1]
    if ncol == 6:
        return [chrom, pos, int(pos) + 1, name, 0, '+']
    return [chrom, pos, int(pos) + 1, name, 0, '+', ref + ',' + alt, ','.join(row[1:])]

reader = TsvReader(infile, cnames = True, attach = False, row = rowFactory)
writer = TsvWriter(outfile)
for r in reader:
    writer.write(r)
writer.close()
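# --- illustration only, invented IDs ---
# What rowFactory does to the two row-name layouts it accepts, reduced to the
# 6-column case for brevity:
def demo_factory(rowname, nameopt='neat'):
    parts = rowname.split('_')
    if len(parts) == 4:                      # chrom_pos_ref_alt
        chrom, pos, ref, alt = parts
        name = chrom + '_' + pos if nameopt == 'neat' else rowname
    else:                                    # chrom_pos_rsid_ref_alt
        chrom, pos, name, ref, alt = parts
    return [chrom, pos, int(pos) + 1, name, 0, '+']

# demo_factory('chr1_12345_A_G')      -> ['chr1', '12345', 12346, 'chr1_12345', 0, '+']
# demo_factory('chr1_12345_rs99_A_G') -> ['chr1', '12345', 12346, 'rs99', 0, '+']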
    func=set(['ncRNA']),
    locType=u'exact',
    weight=1L,
    exceptions=set([]),
    submitterCount=26,
    submitters='1000GENOMES,ABI,BCM-HGSC-SUB,BCM_SSAHASNP,BGI,BL,BUSHMAN,COMPLETE_GENOMICS,DDI,ENSEMBL,EVA-GONL,EVA_DECODE,EVA_GENOME_DK,EVA_UK10K_ALSPAC,EVA_UK10K_TWINSUK,GMI,HAMMER_LAB,HGSV,HUMANGENOME_JCVI,ILLUMINA-UK,JMKIDD_LAB,PJP,SSAHASNP,SSMP,TISHKOFF,WEILL_CORNELL_DGM,',
    alleleFreqCount=2,
    alleles='C,T,',
    alleleNs='2634.000000,2374.000000,',
    alleleFreqs='0.525958,0.474042,',
    bitfields=set(['maf-5-all-pops', 'maf-5-some-pop'])
)
'''

# snps
reader = TsvReader(snpfile, cnames=False)
snplist = list(set(r[snpcol] for r in reader))
reader.close()

from cruzdb import Genome
g = Genome(genome)

outfiletmp = outfile + '.tmp'
writer = TsvWriter(outfiletmp)
for i in range(0, len(snplist), 1000):
    chunk = snplist[i:i + 1000]
    sql = 'SELECT chrom, chromStart, chromEnd, name, score, strand, refUCSC, alleles, alleleFreqs FROM snp{dbsnpver} WHERE name in ({snps})'.format(
        dbsnpver=dbsnpver,
        snps=', '.join("'{}'".format(s) for s in chunk))
    result = g.sql(sql)
    for r in result:
        allfreqs = dict(zip(r.alleles.split(','), r.alleleFreqs.split(',')))
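# --- illustration only, made-up IDs ---
# Why the snp list is queried in slices of 1000 above: chunking keeps each
# IN (...) clause, and hence each SQL statement, at a manageable size.
snplist_demo = ['rs{}'.format(i) for i in range(2500)]
chunks = [snplist_demo[i:i + 1000] for i in range(0, len(snplist_demo), 1000)]
# len(chunks) == 3; chunk sizes are 1000, 1000 and 500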
from bioprocs.gene import pPromoters
from bioprocs.bed import pBedIntersect
from bioprocs.tsv import pTsvMerge
from bioprocs.stats import pChow
from bioprocs.vcfnext import pGTMat2Bed
from bioprocs.utils.parallel import distributeList
from bioprocs.utils.tsvio2 import TsvReader

if __name__ == '__main__':
    params = params.parse()
    logger.logger.info('Reading tfhits ...')

    # get tf-gene pairs
    genes = defaultdict(set)
    reader = TsvReader(params.tfhits, cnames=False)
    allgenes = set()
    npairs = 0
    if params.tfhits.endswith('.bed'):
        for r in reader:
            tf, gene = r[3].split('::')
            genes[gene].add(tf)
            allgenes.add(gene)
            npairs += 1
    else:
        for r in reader:
            tf, gene = r[:2]
            genes[gene].add(tf)
            allgenes.add(gene)
            npairs += 1
    reader.close()
tffile = {{i.tffile | quote}}
sfile = {{i.sfile | quote}}
outfile = {{o.outfile | quote}}
outdir = {{o.outdir | quote}}
tool = {{args.tool | quote}}
meme = {{args.meme | quote}}
params = {{args.params | repr}}
tfmotifs = {{args.tfmotifs | quote}}
pval = {{args.pval | repr}}
ucsclink = {{args.ucsclink | quote}}
nthread = {{args.nthread | repr}}

# get all motifs
logger.info('Loading motif names ...')
reader = TsvReader(tffile, cnames = False)
ncol = len(next(reader))
reader.rewind()
if ncol == 1:
    motifns = {r[0]: r[0] for r in reader}
else:
    motifns = {r[0]: r[1] for r in reader}
logger.info('%s motif names read.', len(motifns))

# match motifs in the database
logger.info('Matching motifs in database ...')
mnames = [m.name for m in MemeReader(tfmotifs)]
motifs = {k: v for k, v in motifns.items() if k in mnames}
logger.info('%s motifs loaded.', len(motifs))

if tool == 'meme':
        # geometric mean of the p-values
        from operator import mul
        try:
            reduce
        except NameError:
            from functools import reduce
        return reduce(mul, pvals, 1) ** (1.0 / float(len(pvals)))
    else:
        raise ValueError('Method %s not supported yet.' % method)

def numpval(pval):
    try:
        return float(pval)
    except (TypeError, ValueError):
        return 1.0

reader = TsvReader(infile)
writer = TsvWriter(outfile)
prevsnp = None
prevpvals = []
for r in reader:
    snp = r.Case.split('.')[0]
    if snp != prevsnp:
        if prevsnp:
            writer.write([prevsnp, aggregate(prevpvals, method)])
        prevsnp = snp
        prevpvals = [numpval(r.Pval)]
    else:
        prevpvals.append(numpval(r.Pval))
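# --- illustration only, made-up numbers ---
# A worked example of the geometric-mean aggregation above:
# 0.01 * 0.04 == 0.0004 and 0.0004 ** (1/2) == 0.02.
from functools import reduce
from operator import mul

pvals_demo = [0.01, 0.04]
geo_mean = reduce(mul, pvals_demo, 1) ** (1.0 / len(pvals_demo))
assert abs(geo_mean - 0.02) < 1e-12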
infile = {{i.infile | quote}}
outfile = {{o.outfile | quote}}
region = {{args.region | repr}}
notfound = {{args.notfound | quote}}
inopts = {{args.inopts | repr}}
outopts = {{args.outopts | repr}}
refgene = {{args.refgene | quote}}
genecol = {{args.inopts.genecol | repr}}
ocnames = {{args.outopts.cnames | repr}}
del inopts['genecol']
del outopts['cnames']

if region.down is None:
    region.down = region.up

# get every gene's TSS and strand
reader = TsvReader(refgene, cnames = False, delimit = '"')
genes = {r[1]: r[0].split("\t")[:7] for r in reader}

reader = TsvReader(infile, **inopts)
writer = TsvWriter(outfile, **outopts)
if ocnames:
    writer.cnames = ['CHROM', 'START', 'END', 'NAME', 'SCORE', 'STRAND']
    writer.writeHead()

for r in reader:
    gene = r[genecol]
    if gene not in genes:
        msg = 'Gene does not exist: {}'.format(gene)
        if notfound == 'error':
            raise ValueError(msg)
        else:
            log2pyppl(msg, 'warning')
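# --- illustration only, assumed convention ---
# The part of this script that turns a refgene record into a promoter interval
# is not shown in the excerpt above. A common convention (an assumption, not
# necessarily what this script does) is to extend `up` bases upstream and
# `down` bases downstream of the TSS, respecting strand:
def promoter_region(chrom, tx_start, tx_end, strand, up, down):
    if strand == '+':
        tss = int(tx_start)
        return chrom, max(0, tss - up), tss + down, strand
    tss = int(tx_end)
    return chrom, max(0, tss - down), tss + up, strand

# promoter_region('chr1', 1000, 5000, '+', up=2000, down=100) -> ('chr1', 0, 1100, '+')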
infile = {{i.infile | quote}}
metafile = {{i.metafile | quote}}
outdir = {{o.outdir | quote}}
plink = {{args.plink | quote}}
keeptxt = {{args.keeptxt | repr}}
chrmaps = {{args.chrmaps | repr}}

prefix = path.join(outdir, {{i.infile | fn2 | quote}})
tpedfile = prefix + ".tped"
tfamfile = prefix + ".tfam"

# metafile column names could be: FID, IID, PID, MID, Sex, Pheno
if metafile:
    logger.info('Reading metafile ...')
    metadata = dict(
        TsvReader(metafile, cnames=True, row=lambda r: tuple((r.IID, r))).dump())
else:
    metadata = None

logger.info('Reading genotype matrix ...')
# snp1  gt1s1  gt1s2  ...
inreader = TsvReader(infile, cnames=True)
samples = inreader.meta[1:]

logger.info('Writing tfam file ...')
tfamWriter = TsvWriter(tfamfile)
tfamWriter.meta = ['FID', 'IID', 'PID', 'MID', 'Sex', 'Pheno']
#tfamWriter.writeHead(callback = lambda meta: '#' + '\t'.join(meta))
if not metadata:
    for s in samples:
        tfamWriter.write([s, s, '0', '0', 'other', '-9'])
outfile = {{ o.outfile | quote}}
outdir = {{ o.outdir | quote}}
params = {{ args.params | repr}}
idxfile = {{ args.idxfile | quote}}
kallisto = {{ args.kallisto | quote}}
nthread = {{ args.nthread | repr}}

shell.TOOLS.kallisto = kallisto
params.i = idxfile
params.o = outdir
params.t = nthread
params._ = [fq1, fq2]
kallisto = shell.Shell(subcmd = True).kallisto
kallisto.quant(**params).run()

# reduce the abundance table to target_id / est_counts
imfile = path.join(outdir, 'abundance.tsv')
reader = TsvReader(imfile)
writer = TsvWriter(outfile)
writer.cnames = ['target_id', 'est_counts']
writer.writeHead()
for r in reader:
    r.target_id = r.target_id.split('::')[0]
    try:
        r.est_counts = int(round(float(r.est_counts)))
    except (TypeError, ValueError):
        r.est_counts = 0
    writer.write(r)
writer.close()
dbs = {{args.libs | alwaysList | repr}}
plot = {{args.plot | repr}}
nthread = {{args.nthread | repr}}
Rscript = {{args.Rscript | repr}}
cutoff = {{args.cutoff | repr}}
devpars = {{args.devpars | repr}}
pathview = {{args.pathview | repr}}

shell.TOOLS.Rscript = Rscript

if isinstance(cutoff, dict):
    if cutoff['by'] == 'p':
        cutoff['by'] = 'Pval'
    if cutoff['by'] == 'q':
        cutoff['by'] = 'AdjPval'

reader = TsvReader(infile, **inopts)
genes = [r[genecol] for r in reader]

en = Enrichr(cutoff = cutoff, top = top, Rscript = Rscript)
en.addList(genes, description = path.basename(infile))

para = Parallel(nthread = nthread)
runPathview = lambda r, hsa: shell.Shell().Rscript(r, hsa).run()

for db in dbs:
    outfile = path.join(outdir, prefix + '.' + db + '.txt')
    en.enrich(db)
    en.export(outfile, top = 100)
    if plot:
        plotfile = path.join(outdir, prefix + '.' + db + '.png')
        en.plot(plotfile, res = devpars.res, width = devpars.width, height = devpars.height)
    if pathview and 'KEGG' in db:
from gff import Gff
from bioprocs.utils.tsvio2 import TsvWriter, TsvReader, TsvRecord
from bioprocs.utils import logger

infile = {{i.infile | quote}}
outfile = {{o.outfile | quote}}
notfound = {{args.notfound | quote}}
genecol = {{args.genecol or 0 | repr}}
inopts = {{args.inopts | repr}}
refgene = {{args.refgene | quote}}

if not path.isfile(refgene):
    raise OSError('Refgene file does not exist: {}'.format(refgene))

# get genes
genes = TsvReader(infile, **inopts).dump(genecol)
genes = dict(zip(genes, [False] * len(genes)))

writer = TsvWriter(outfile)
writer.cnames = ['CHR', 'START', 'END', 'NAME', 'SCORE', 'STRAND']

gff = Gff(refgene)
for g in gff:
    attrs = g['attributes']
    if attrs['gene_id'] not in genes:
        continue
    r = TsvRecord()
    r.CHR = g['seqid']
    r.START = g['start']
    r.END = g['end']
    r.SCORE = g['score']
    r.STRAND = g['strand']
            try:
                writer.write(next(reader))
            except StopIteration:
                break
        writer.close()

        thparams = params.copy()
        thparams[""] = [qfile, mfile2]
        thparams.thresh = qval
        thparams.oc = ocdir
        cmdps.append((tomtom, cmdargs(thparams, dash='-', equal=' ')))
    reader.close()

    Parallel(nthread, raiseExc=True).run('{} {}', cmdps)

    # merge the per-thread tomtom outputs
    writer = TsvWriter(outfile)
    reader = TsvReader(path.join(ocdirs[0], 'tomtom.txt'), comment='##',
                       cnames=lambda header: header[1:].strip().split("\t"))
    writer.cnames = reader.cnames
    writer.writeHead(lambda cnames: "#" + "\t".join(cnames))
    reader.close()
    for ocdir in ocdirs:
        reader = TsvReader(path.join(ocdir, 'tomtom.txt'), comment='##',
                           cnames=lambda header: header[1:].strip().split("\t"))
        for r in reader:
            writer.write(r)
        reader.close()
    writer.close()
else:
    params[""] = [mfile1, mfile2]
                # matched: move on to the next output row
                break
            else:
                # no match: advance to the next snp
                continue
    writer.close()

if nthread == 1:
    getAlleleCount(tumbam, affysnps, tumsnp)
    getAlleleCount(normbam, affysnps, normsnp)
else:
    # split the affysnps file into chunks and distribute the jobs over nthread threads
    # get the number of lines in the affysnps file
    total = wc_l(affysnps)
    dists = distribute(total, nthread)
    reader = TsvReader(affysnps, cnames = False)

    # dir to save the split files and the result files
    thdir = path.join(outdir, 'bamrc.nthreads')
    if not path.exists(thdir):
        makedirs(thdir)

    asbname = path.basename(affysnps).split('.')[0]
    for i, dist in enumerate(dists):
        writer = TsvWriter(path.join(thdir, '{bname}.thread{i}.snp'.format(
            bname = asbname, i = i)))
        for _ in range(dist):
            writer.write(next(reader))
        writer.close()

    para = Parallel(nthread, raiseExc = True)
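# --- illustration only, assumed behaviour ---
# `distribute` comes from bioprocs.utils.parallel and is not shown here; a
# plausible behaviour is an even split of `total` lines over `nthread` chunks,
# with the remainder spread over the first chunks:
def distribute_demo(total, nthread):
    base, extra = divmod(total, nthread)
    return [base + (1 if i < extra else 0) for i in range(nthread)]

# distribute_demo(10, 3) -> [4, 3, 3]; chunk sizes always sum back to `total`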