def getAlleleCount(bamfile, snpfile, outfile):
    """Count A/C/G/T reads at each SNP position and write them to `outfile`.

    The read counter (`bamrc`) is run through the shell with the parameters
    f=ref, w=0, l=snpfile and `bamfile` as the positional argument; its raw
    output goes to `outfile + '.tmp'`.  That raw output is then walked in
    parallel with `snpfile` and one record is written per matched position
    with the fields: Chrm, pos, A, C, G, T, Total, refCount, mutCount.

    Both files are consumed in a single forward pass, so the matching relies
    on the rows of the raw output appearing in the same relative order as
    the rows of `snpfile` (presumably both sorted the same way - verify
    against the caller).

    Args:
        bamfile: Alignment file handed to the read counter.
        snpfile: Headerless SNP table; column 0 is the chromosome, column 2
            the (end) position, column 6 the reference allele and column 7
            the mutant allele.
        outfile: Path of the reformatted count table.
    """
    brcparams = Box()
    brcparams.f = ref
    brcparams.w = 0
    brcparams.l = snpfile
    brcparams[''] = bamfile
    cmd = '{bamrc} {args} > {outfile!r}'.format(
        bamrc=bamrc,
        args=cmdargs(brcparams, equal=' '),
        outfile=outfile + '.tmp')
    runcmd(cmd)

    # Reformat the raw output into the desired layout.  A raw row looks like:
    # chr1 564773 C 14 =:0:0.00:... A:0:0.00:... C:14:... G:0:... T:0:... N:0:...
    reader = TsvReader(outfile + '.tmp', cnames=False)
    snper = TsvReader(snpfile, cnames=False)
    writer = TsvWriter(outfile)
    writer.cnames = ['Chrm', 'pos', 'A', 'C', 'G', 'T', 'Total', 'refCount', 'mutCount']

    snps_exhausted = False
    try:
        for r in reader:
            # Advance through the SNP list until the SNP for this row shows up.
            while True:
                try:
                    snp = next(snper)
                except StopIteration:
                    snps_exhausted = True
                    break
                # use the end position, in case it's 0-based
                if snp[0] != r[0] or snp[2] != r[1]:
                    # not the SNP of this row: try the next SNP
                    continue
                # Each base column is "<base>:<count>:<stats...>".
                counts = dict(
                    A=r[5].split(':', 2)[1],
                    C=r[6].split(':', 2)[1],
                    G=r[7].split(':', 2)[1],
                    T=r[8].split(':', 2)[1]
                )
                rec = TsvRecord()
                rec.Chrm = r[0]
                rec.pos = r[1]
                rec.Total = r[3]
                rec.A = counts['A']
                rec.C = counts['C']
                rec.G = counts['G']
                rec.T = counts['T']
                # if reference allele is unknown, assuming all are ref alleles
                rec.refCount = counts.get(snp[6].upper(), r[3])
                # if mut allele is unknown, assuming no mutations happened
                rec.mutCount = counts.get(snp[7].upper(), 0)
                writer.write(rec)
                # matched: move on to the next row of the raw output
                break
            if snps_exhausted:
                # No SNP is left, so no remaining row can ever match.
                break
    finally:
        # Close the output even when a malformed row raises.
        writer.close()
def main(opts):
    """Write the union of two TF-gene collections as (tf, gene) rows.

    `opts.origin` and `opts.addition` are each loaded with `read_tfgenes`;
    the results are used as mappings from a gene to a set of TFs (they are
    combined with `|` and iterated with `.items()`).  Rows go to
    `opts.outfile`: first every gene of the origin collection with the TFs
    of both collections merged, then the genes that only occur in the
    addition collection.
    """
    primary = read_tfgenes(opts.origin)
    supplement = read_tfgenes(opts.addition)

    def union_pairs():
        """Yield (tf, gene) for every pair present in either collection."""
        for gene, tfs in primary.items():
            # Popping leaves only the addition-only genes for the second pass.
            extra = supplement.pop(gene, set())
            for tf in (tfs | extra):
                yield tf, gene
        for gene, tfs in supplement.items():
            for tf in tfs:
                yield tf, gene

    writer = TsvWriter(opts.outfile)
    logger.info('Writing the union set to %s ...', opts.outfile)
    for tf, gene in union_pairs():
        writer.write([tf, gene])
    writer.close()
    logger.info('Done.')
S1 S2 .. Sn G1 ... G2 ... """ expreader = TsvReader(expfile) expdata = [r for r in expreader if r[0] in genes or r[0] in tfs] expreader.close() datawriter = TsvWriter(outdata) for i, cname in enumerate(expreader.cnames): if i == 0: # genes + tfs datawriter.cnames = [r[0] for r in expdata] datawriter.writeHead() else: datawriter.write([cname] + [r[i] for r in expdata]) datawriter.close() del expdata genes = [g for g in genes if g in datawriter.cnames] tfs = [g for g in tfs if g in datawriter.cnames] genetfs = {g: [tf for tf in gtfs if tf in tfs] for g, gtfs in genetfs.items() if g in genes} # save the group file # mutfile """ S1 S2 .. Sn M1 ... (0/1/2/NA) M2 ... """ mutreader = TsvReader(mutfile) mutdata = [r for r in mutreader if r[0] in mutgenes]
# Collect the distinct SNPs (column 0) and genes (column 1) of the input.
reader = TsvReader(infile, cnames=False)
allsnps = set(reader.dump(0))
reader.rewind()
allgenes = set(reader.dump(1))
reader.close()

# assign a probability to each snp
# Every SNP draws a random slot in [0, ngenes * snppergene); the i-th gene
# owns the slots [i * snppergene, (i + 1) * snppergene).
nsnps = len(allsnps)
ngenes = len(allgenes)
snp_probs = dict(zip(allsnps, random.choices(range(ngenes * snppergene), k=nsnps)))

# Bucket the SNPs by owning gene index once, instead of rescanning all of
# snp_probs for every gene.  Insertion order is kept, so each gene sees its
# SNPs in the same order as a filtered scan over snp_probs would give.
snps_by_gene = {}
for snp, slot in snp_probs.items():
    snps_by_gene.setdefault(slot // snppergene, []).append(snp)

genebed = TsvWriter(genefile)
snpbed = TsvWriter(snpfile)
# NOTE(review): geneperchr is not read anywhere in the code visible here.
geneperchr = math.ceil(float(ngenes) / float(nchr))
for i, gene in enumerate(allgenes):
    # Genes are dealt round-robin over chr1..chr<nchr>; genes that land on
    # the same chromosome are placed `dist` apart.
    chrname = 'chr' + str(int(i % nchr) + 1)
    start = (int(i / nchr) + 1) * dist
    end = start + 1
    # The SNPs of a gene take consecutive positions starting here.
    first_snp_pos = int(start - dist/2.0 - snppergene)
    genebed.write([chrname, start, end, gene, 0, '+'])
    for j, snp in enumerate(snps_by_gene.get(i, ())):
        snppos = first_snp_pos + j
        snpbed.write([chrname, snppos, snppos, snp, 0, '+'])
genebed.close()
snpbed.close()
for s in samples: tfamWriter.write([s, s, '0', '0', 'other', '-9']) else: for s in samples: tfamWriter.write([ metadata[s].FID if s in metadata and 'FID' in metadata[s] else s, s, (metadata[s].PID or '0') if s in metadata and 'PID' in metadata[s] else '0', (metadata[s].MID or '0') if s in metadata and 'MID' in metadata[s] else '0', (metadata[s].Sex or 'other') if s in metadata and 'Sex' in metadata[s] else 'other', (metadata[s].Pheno or '-9') if s in metadata and 'Pheno' in metadata[s] else '-9' ]) tfamWriter.close() def getCompondGT(gt, ref, alt): compGTs = { "0": ref + ' ' + ref, "1": ref + ' ' + alt, "2": alt + ' ' + alt } return compGTs.get(gt, '0 0') logger.info('Writing tped file ...') tpedWriter = TsvWriter(tpedfile) for r in inreader: (chrom, pos, _, ref, alt) = r[0].split('_')
# Keep only the columns whose names occur in both input files and write each
# file's selection to its own output.
indata1 = TsvReader(infile1, **inopts1)
indata2 = TsvReader(infile2, **inopts2)

# When a file has row names, its first header field labels that column and
# is not a data column.
cnames1 = indata1.meta if not rnames1 else indata1.meta[1:]
cnames2 = indata2.meta if not rnames2 else indata2.meta[1:]

# Shared columns, in the order they appear in the first file.  Going through
# list(set & set) would make the output column order arbitrary and, for
# string names, different from one run to the next.
shared = set(cnames1) & set(cnames2)
paired = []
for cname in cnames1:
    if cname in shared:
        # emit every shared name once, even if the header repeats it
        shared.discard(cname)
        paired.append(cname)

cnames1 = cnames2 = paired
if rnames1:
    cnames1 = [indata1.meta[0]] + cnames1
if rnames2:
    cnames2 = [indata2.meta[0]] + cnames2

# Positions of the selected columns within each input row.
cindex1 = [indata1.meta.index(c) for c in cnames1]
cindex2 = [indata2.meta.index(c) for c in cnames2]

outdata1 = TsvWriter(outfile1)
outdata2 = TsvWriter(outfile2)
outdata1.meta = cnames1
outdata2.meta = cnames2
outdata1.writeHead()
outdata2.writeHead()

for r1 in indata1:
    outdata1.write([r1[i] for i in cindex1])
outdata1.close()

for r2 in indata2:
    outdata2.write([r2[i] for i in cindex2])
outdata2.close()
from bioprocs.utils.tsvio2 import TsvReader, TsvWriter

infile = {{i.infile | quote}}
outfile = {{o.outfile | quote}}
inopts = {{args.inopts | repr}}
infmt = {{args.infmt | quote}}
cutoff = {{args.cutoff | repr}}

# Count, for every node, how many input pairs it takes part in and write the
# nodes with their degree, highest degree first.
if not infmt.startswith('pair'):
    raise ValueError('Input format other than "pair" not supported yet.')

degrees = defaultdict(int)
reader = TsvReader(infile, **inopts)
for r in reader:
    if cutoff:
        # A falsy cutoff (None, 0) disables score filtering altogether.
        try:
            score = float(r[2])
        except (TypeError, ValueError, IndexError):
            # float() raises ValueError for a non-numeric string and
            # TypeError for a non-string/non-number; a row without a third
            # field fails on the lookup.  Report all of them the same way.
            raise TypeError(
                'The 3rd column should be a score for apply the cutoff.')
        if score < cutoff:
            continue
    degrees[r[0]] += 1
    degrees[r[1]] += 1

writer = TsvWriter(outfile)
# In a "complete" pair list the degree is halved before writing
# (presumably because every pair is listed in both directions - confirm).
halve = infmt.endswith('complete')
for node in sorted(degrees, key=degrees.get, reverse=True):
    writer.write([node, degrees[node] // 2 if halve else degrees[node]])
writer.close()