def colselect(infile, col, cut, rev, tag):
    """Build the process chain that extracts one value column from `infile`.

    Args:
        infile: Path of the input file.
        col: Column to keep next to the first column. If `str(col)` is all
            digits it is used as a column index, otherwise as a column name
            that is written to a column file for the selection process.
        cut: Optional numeric cutoff. Below 1, a row filter keeping values
            `< cut` is appended. Above 1, the rows are sorted on the second
            field and the first `int(cut) + 1` lines are kept. A falsy value
            (or exactly 1) adds neither step.
        rev: When truthy, the sort is not reversed and a final step prefixes
            the selected value with "-".
        tag: Tag passed to every process copy created here.

    Returns:
        A `(start, end)` tuple with the first and the last process of the
        chain.
    """
    pTsvColSelect1 = pTsvColSelect.copy(tag=tag)
    if str(col).isdigit():
        pTsvColSelect1.input = [infile]
        pTsvColSelect1.args.cols = [0, int(col)]
        start = end = pTsvColSelect1
    else:
        pTsvHeader1 = pTsvHeader.copy(tag=tag)
        pTsvHeader1.input = [infile]
        start = pTsvHeader1

        # Build the column file: first line of the header output plus `col`.
        pShell1 = pShell.copy(tag=tag)
        pShell1.depends = pTsvHeader1
        pShell1.args.cmd = """
        head -1 {{i.infile | quote}} > {{o.outfile | quote}}
        echo "%s" >> {{o.outfile | quote}}
        """ % col

        pTsvColSelect1.depends = pShell1
        # Two assignments on purpose: input keys first, then the input data.
        pTsvColSelect1.input = 'infile:file, colfile:file'
        pTsvColSelect1.input = lambda ch: ch.insert(0, infile)
        end = pTsvColSelect1

    if cut and cut < 1:
        pTsv1 = pTsv.copy(tag=tag)
        pTsv1.depends = pTsvColSelect1
        # repr() of a float keeps full precision; "%f" would render a cutoff
        # such as 1e-8 as "0.000000" and thereby reject every row.
        # NOTE(review): confirm row[col] resolves when `col` is numeric, as
        # the selected file then only holds the columns [0, col].
        pTsv1.args.row = 'lambda row: float(row[%r]) < %r' % (col, float(cut))
        end = pTsv1
    elif cut and cut > 1:
        pSort1 = pSort.copy(tag=tag)
        pSort1.depends = pTsvColSelect1
        pSort1.args.inopts.skip = 1
        # General-numeric key on field 2; reversed unless `rev` is set.
        pSort1.args.params.k = '2g%s' % ('' if rev else 'r')

        pShell2 = pShell.copy(tag=tag)
        pShell2.depends = pSort1
        # +1 line; presumably to keep the header row -- TODO confirm.
        pShell2.args.cmd = "head -n %d {{i.infile | quote}} > {{o.outfile | quote}}" % (
            int(cut) + 1)
        end = pShell2

    if rev:
        pTsv2 = pTsv.copy(tag=tag)
        pTsv2.depends = end
        pTsv2.args.row = 'lambda row: row.__setitem__({0!r}, "-" + row[{0!r}])'.format(
            col)
        end = pTsv2
    return start, end
def colselect(infile, col, cut):
    """Build the process chain that extracts and filters one column.

    Args:
        infile: Path of the input file. The part of its stem before the
            first "." is used as the tag of every process copy created here.
        col: Column to keep next to the first column. If `str(col)` is all
            digits it is used as a column index, otherwise as a column name
            that is written to a column file for the selection process.
        cut: Numeric cutoff; rows are kept when the second field, converted
            to float, is `< cut`.

    Returns:
        A tuple `([start], [end])` of one-element lists holding the first and
        the last process of the chain.
    """
    tag = Path(infile).stem.split(".")[0]
    pTsvColSelect1 = pTsvColSelect.copy(tag=tag)
    if str(col).isdigit():
        pTsvColSelect1.input = [infile]
        pTsvColSelect1.args.cols = [0, int(col)]
        start = pTsvColSelect1
    else:
        pTsvHeader1 = pTsvHeader.copy(tag=tag)
        pTsvHeader1.input = [infile]
        start = pTsvHeader1

        # Build the column file: first line of the header output plus `col`.
        pShell1 = pShell.copy(tag=tag)
        pShell1.depends = pTsvHeader1
        pShell1.args.cmd = """
        head -1 {{i.infile | quote}} > {{o.outfile | quote}}
        echo "%s" >> {{o.outfile | quote}}
        """ % col

        pTsvColSelect1.depends = pShell1
        pTsvColSelect1.input = lambda ch: ch.insert(0, infile)

    pTsv1 = pTsv.copy(tag=tag)
    pTsv1.depends = pTsvColSelect1
    # repr() of a float keeps full precision; "%f" would render a cutoff such
    # as 1e-8 as "0.000000" and thereby reject every row.
    pTsv1.args.row = "lambda row: float(row[1]) < %r" % float(cut)
    return [start], [pTsv1]
def ceQTL_filter_row(args, pCefile, hasAdjPval, starts):
    """Optionally chain a row filter behind `pCefile`.

    Args:
        args: Object providing `row` (the filter expression) and `helper`.
        pCefile: Upstream process; returned untouched when `args.row` is
            falsy.
        hasAdjPval: Whether the reader should also cast `AdjPval` to float,
            in addition to `Pval`.
        starts: Not used by this function.

    Returns:
        The new filter process, or `pCefile` when no filter was requested.
    """
    if args.row:
        # Reader-side hook that converts the p-value field(s) to float.
        cast_pvals = (
            'lambda r: setattr(r, "Pval", float(r.Pval)) or setattr(r, "AdjPval", float(r.AdjPval)) or r'
            if hasAdjPval
            else 'lambda r: setattr(r, "Pval", float(r.Pval)) or r'
        )
        # Variable name kept as-is: the pipeline framework may derive the
        # process id from it.
        pFilterRow = pTsv.copy()
        pFilterRow.depends = pCefile
        pFilterRow.args.helper = args.helper
        pFilterRow.args.row = args.row
        pFilterRow.args.inopts.row = cast_pvals
        return pFilterRow
    return pCefile
def ceQTL_atsnp(args):
    """Configure the module-level processes around `pAtSnp` and run them.

    Reads `args.cefile`, `args.tflist`, `args.motifdb`, `args.nthread`,
    `args.outfile`, `args.man` and `args.hifile`. When `args.man` is truthy,
    extra processes ending in `pManhattan` are chained behind `pAtSnp`.
    The pipeline is started from `pGTMat2Bed` and `pSortByTF`; nothing is
    returned.
    """
    # Branch 1: cefile -> pGTMat2Bed -> unique sort.
    pGTMat2Bed.input = [args.cefile]
    pGTMat2Bed.args.inopts.cnames = False
    pGTMat2Bed.args.inopts.skip = 1
    pGTMat2Bed.args.inopts.delimit = '.'
    pGTMat2Bed.args.name = 'full'
    pGTMat2Bed.args.ncol = 8
    pSortSnp = pSort.copy()
    pSortSnp.depends = pGTMat2Bed
    pSortSnp.args.unique = True
    # Branch 2: cefile -> pSortByTF -> join against args.tflist.
    pSortByTF.input = [args.cefile]
    pFilterTFs = pTsvJoin.copy()
    pFilterTFs.depends = pSortByTF
    # args.tflist is inserted as the first input, ahead of the pSortByTF output.
    pFilterTFs.input = lambda ch: [ch.insert(0, args.tflist).flatten()]
    pFilterTFs.args.inopts.cnames = False
    # Per-input reader options: one entry for each of the two joined files.
    pFilterTFs.args.inopts.skip = [0, 1]
    pFilterTFs.args.inopts.delimit = ['\t', '.']
    pFilterTFs.args.outopts.cnames = False
    pFilterTFs.args.match = 'lambda r1, r2: TsvJoin.compare(r1[1], r2[1])'
    # Only the record of the first input is written out on a match.
    pFilterTFs.args.do = 'lambda out, r1, r2: out.write(r1)'
    # Both branches feed pAtSnp.
    pAtSnp.depends = pFilterTFs, pSortSnp
    pAtSnp.args.tfmotifs = args.motifdb
    pAtSnp.args.fdr = False
    pAtSnp.args.plot = False
    pAtSnp.args.nthread = args.nthread
    setOutfile(pAtSnp, args.outfile)
    if args.man:
        # Optional chain: pAtSnp -> pToMan -> pBedSort -> pManhattan.
        pToMan = pTsv.copy()
        pToMan.depends = pAtSnp
        pToMan.args.outopts.cnames = False
        # [chr1, 12496021, rs6541023, 0.04]
        # snprec builds [x[0], int(x[1]) - 1, x[1], x[2], 0, "+"] from the
        # first three "_"-separated parts of r.Snp; r.Pval_Diff is appended.
        pToMan.args.helper = 'snprec = lambda x: [x[0], int(x[1]) - 1, x[1], x[2], 0, "+"]'
        pToMan.args.row = 'lambda r: snprec(r.Snp.split("_")[:3]) + [r.Pval_Diff]'
        pBedSort.depends = pToMan
        pBedSort.args.chrorder = params.chrorder.value
        pManhattan.depends = pBedSort
        if args.hifile:
            # Bind args.hifile as an additional input column.
            pManhattan.input = lambda ch: ch.cbind(args.hifile)
        pManhattan.args.gsize = params.gsize.value
        setOutfile(pManhattan, args.man)
    PyPPL().start(pGTMat2Bed, pSortByTF).run()
def pipeline(opts):
    """Construct the pipeline.

    Chains the column selection of `opts.infile` with a two-column table
    built from `opts.gold`, binds both together and feeds the result to
    `pHypergeom`.

    Reads `opts.infile`, `opts.col`, `opts.cut`, `opts.gold` and `opts.bign`.
    `opts.bign` is used directly as N when it is all digits; otherwise it is
    passed to `wc -l` and the first field of the output is used.

    Returns:
        A list `[pTsvGold, start]`, where `start` is the first item returned
        by `colselect`.
    """
    start, end = colselect(opts.infile, opts.col, opts.cut)
    # duplicate gold standard columns:
    # every gold record becomes [first field, 1]
    pTsvGold = pTsv.copy()
    pTsvGold.input = [opts.gold]
    pTsvGold.args.inopts.cnames = False
    pTsvGold.args.row = 'lambda row: [row[0], 1]'
    start = [pTsvGold, start]
    # add header
    pTsvReplaceHeader.depends = pTsvGold
    pTsvReplaceHeader.args.inopts.cnames = False
    pTsvReplaceHeader.args.cnames = ['ROWNAME', 'GOLD']
    pTsvCbind.depends = pTsvReplaceHeader, end
    # collect the outputs of all dependencies into one list for a single job
    pTsvCbind.input = (lambda *chs: [[ch.get() for ch in chs]])
    # prepare file for hypergeometric test
    # replace NA with 0
    # replace pvalues with 1 (presence)
    # (field 1 becomes int(field == "1"), field 2 becomes int(field != "NA"))
    pPrepHG = pTsv.copy()
    pPrepHG.depends = pTsvCbind
    pPrepHG.args.row = ('lambda row: row.__setitem__(1, int(row[1] == "1")) '
                        'or row.__setitem__(2, int(row[2] != "NA"))')
    pHypergeom.depends = pPrepHG
    pHypergeom.args.intype = 'raw'
    if str(opts.bign).isdigit():
        pHypergeom.args.N = int(opts.bign)
    else:
        # opts.bign is not a number: count the lines of that file instead
        bign = cmdy.wc(l = opts.bign).strip().split()[0]
        pHypergeom.args.N = int(bign)
    return start
def pipeline(opts):
    """Construct the pipeline.

    Builds one `colselect` chain per entry of `opts.infiles`, turns
    `opts.gold` into a two-column table and connects everything to `pROC`.
    With `opts.sep` falsy, the selected columns are joined into one table
    whose header is `["ROWNAME", "GOLD"] + opts.names`; otherwise each
    selected column is bound to the gold table separately.

    Reads `opts.infiles`, `opts.cols`, `opts.cuts`, `opts.rev`, `opts.gold`,
    `opts.sep`, `opts.names` and `opts.outdir`.

    Returns:
        The list of start processes: one per input file, then `pTsvGold`.
    """
    starts = []
    ends = []
    for i, infile in enumerate(opts.infiles):
        # Chains are tagged "in1", "in2", ...; the cutoff is None when
        # opts.cuts is falsy.
        start, end = colselect(infile, opts.cols[i],
                               opts.cuts[i] if opts.cuts else None, opts.rev,
                               "in%s" % (i + 1))
        starts.append(start)
        ends.append(end)
    # duplicate gold standard columns:
    # every gold record becomes [first field, 1]
    pTsvGold = pTsv.copy()
    pTsvGold.input = [opts.gold]
    pTsvGold.args.inopts.cnames = False
    pTsvGold.args.row = 'lambda row: [row[0], 1]'
    starts.append(pTsvGold)
    pROC.args.params.bestCut = False
    pROC.args.ggs.theme_bw = {}
    pTsvCbind.args.inopts.dup = 'ignore'
    if not opts.sep:
        pSort.depends = ends
        pSort.input = lambda *chs: [ch.get() for ch in chs]
        pSort.args.inopts.skip = 1
        pTsvJoin.depends = pSort
        pTsvJoin.input = lambda ch: [ch.flatten()]
        #pTsvJoin.args.outopts.cnames = ["ROWNAME"] + [Path(infile).stem for infile in opts.infiles]
        pTsvJoin.args.inopts.cnames = True
        # output: first field of the first record, then field 1 of every record
        pTsvJoin.args.do = "lambda writer, *rs: writer.write([rs[0][0]] + [r[1] for r in rs])"
        pTsvCbind.depends = pTsvGold, pTsvJoin
        pTsvCbind.input = lambda ch1, ch2: [ch1.cbind(ch2).flatten()]
        pTsvCbind.args.inopts.cnames = False
        pTsvCbind.args.inopts.rnames = True
        pTsvCbind.args.fill = True
        pTsvCbind.args.fn2cname = "NULL"
        # set the non-hit record to 0
        # and remove NAs from predictions
        pTsvGoldFalse = pTsv.copy()
        pTsvGoldFalse.depends = pTsvCbind
        pTsvGoldFalse.args.row = ('lambda row: False '
                                  'if any(r == "NA" for r in row[2:]) '
                                  'else row.__setitem__(1, 0) '
                                  'if row[1] == "NA" '
                                  'else None')
        pTsvReplaceHeader.depends = pTsvGoldFalse
        pTsvReplaceHeader.args.cnames = ["ROWNAME", "GOLD"] + opts.names
        pROC.depends = pTsvReplaceHeader
    else:
        # add header
        pTsvReplaceHeader.depends = pTsvGold
        pTsvReplaceHeader.args.inopts.cnames = False
        pTsvReplaceHeader.args.cnames = ['ROWNAME', 'GOLD']
        # the gold table goes first so the input callback receives it as ch_gold
        ends.insert(0, pTsvReplaceHeader)
        pTsvCbind.depends = ends
        pTsvCbind.output = 'outfile:file:{{i.infiles | [-1] | stem2 }}.cbound.txt'
        # bind the gold table to each selected column separately
        pTsvCbind.input = (
            lambda ch_gold, *chs: [ch_gold.cbind(ch).flatten() for ch in chs])
        # same NA handling as in the joined branch
        pTsvGoldFalse = pTsv.copy()
        pTsvGoldFalse.depends = pTsvCbind
        pTsvGoldFalse.config.export_dir = opts.outdir
        pTsvGoldFalse.args.row = ('lambda row: False '
                                  'if any(r == "NA" for r in row[2:]) '
                                  'else row.__setitem__(1, 0) '
                                  'if row[1] == "NA" '
                                  'else None')
        pROC.depends = pTsvGoldFalse
    # NOTE(review): nesting is not recoverable from the collapsed source;
    # assumed to apply to both branches -- confirm against the original file.
    pROC.config.export_dir = opts.outdir
    return starts