def markDup(cmdset, runmode='test'):
    """Create one Picard MarkDuplicates job per sample.

    Every job sources the seqtools setup script, runs ``MarkDuplicates.jar``
    on ``<inputpath><sample>/<bam>``, runs ``samtools index`` on the
    ``.mdup.bam`` output and finally moves the job script into the sample
    directory.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'sample', 'prefix',
            'inputpath', 'programpath', 'bam' and 'overwrite' are removed from
            it; the remaining entries are filtered through
            ``availParas['MarkDuplicates.jar']`` and passed to the jar.
        runmode: kept for interface parity with the other builders; no call
            in this function passes a ``create`` flag, so it has no effect.

    Returns:
        The ``jobFactory.jobManager`` the jobs were added to.
    """
    cmd, mem, time, samples, prefix = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    programpath = cmdGenerator.checkPath(cmdset.pop('programpath'))
    bam = configRobot.popParas(cmdset, 'bam')
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))

    # If a single sample arrives as a bare string, wrap it so the loop below
    # does not iterate over its characters.
    if not isinstance(samples, (list, tuple)):
        samples = [samples]

    # Java heap is the requested memory (e.g. '8G') minus one gigabyte.
    javacmd = 'java -Xmx%dg -jar' % (int(mem.replace('G', '')) - 1)
    mdupjar = 'MarkDuplicates.jar'
    idxcmd = 'samtools index'

    for sample in samples:
        jobname = prefix + '_' + sample
        sampledir = inputpath + sample

        paraset = copy.deepcopy(cmdset)
        # Values carry a leading '=' so they render as KEY=value.
        paraset['INPUT'] = '=%s/%s' % (sampledir, bam)
        paraset['OUTPUT'] = paraset['INPUT'].replace('.bam', '.mdup.bam')
        paraset['METRICS_FILE'] = '=%s/%s' % (sampledir,
                                              prefix + '_mdupmetrics.txt')
        paraset = configRobot.validParas(paraset, availParas[mdupjar])

        CMDs = [
            cmdGenerator.formatCmd('source ~/libraries/setup_seqtools'),
            cmdGenerator.formatCmd(javacmd, programpath + mdupjar, paraset),
            cmdGenerator.formatCmd(idxcmd, paraset['OUTPUT'].strip('=')),
            cmdGenerator.formatCmd(
                'mv ./%s%s %s' % (jobname, jobmanager.ext, sampledir)),
        ]
        jobmanager.createJob(jobname, CMDs, outpath=sampledir, outfn=jobname)
    return jobmanager
def HTSeqCount(cmdset, runmode='test'):
    """Create one HTSeq read-counting job per sample.

    Each job pipes ``samtools view -h`` of the sample's alignment into
    ``python -m HTSeq.scripts.count`` and redirects the result to a
    ``.count`` file under the output path.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'sample', 'prefix',
            'bam', 'outputpath', 'inputpath', 'GTF' and 'overwrite' are
            removed from it; what is left is passed to the HTSeq command.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output directories.

    Returns:
        The ``jobFactory.jobManager`` the jobs were added to.
    """
    createpath = runmode != 'test'
    cmd, mem, time, samples, prefix, bam = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix', 'bam'])
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'),
                                        create=createpath)
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    gtf = configRobot.popParas(cmdset, ['GTF'])
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    setuppathcmd = cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')

    if type(samples) not in (list, tuple):
        samples = [samples]

    htseq = 'python -m HTSeq.scripts.count -q '
    for sample in samples:
        paraset = copy.deepcopy(cmdset)
        jobfnprefix = prefix + '_' + sample

        if bam == '=sample':
            # The sample name itself is the alignment file.
            inputfile = sample
        else:
            inputfile = sample + '/' + bam
            # Counts land in a per-sample sub-directory of the output path.
            cmdGenerator.checkPath(outputpath + sample, create=createpath)
        outputfile = inputfile + '.count'

        samcmd = 'samtools view -h %s | ' % (inputpath + inputfile)
        countcmd = cmdGenerator.formatCmd(
            samcmd, htseq, paraset, '-', gtf,
            ' > %s' % (outputpath + outputfile))
        mvscriptcmd = cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobfnprefix, jobmanager.ext, outputpath))
        jobmanager.createJob(jobfnprefix,
                             [setuppathcmd, countcmd, mvscriptcmd],
                             outpath=outputpath, outfn=jobfnprefix)
    return jobmanager
def gmfit(cmdset, runmode='test'):
    """Create one MATLAB job per (data file, K) combination.

    For every data file and every value in ``Ks`` a job is built that
    evaluates ``call('<inputpath><fn>', <k>, '<outputpath><jobname>.mat');``
    through the module-level ``matlabcmd`` template and then moves the job
    script into the output path.

    Args:
        cmdset: configuration dict.  'cmd', 'call', 'mem', 'time', 'prefix',
            'Ks', 'datafn', 'inputpath', 'outputpath' and 'overwrite' are
            removed from it.  'Ks' and 'datafn' may each be a single value or
            a list/tuple of values.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output directory.

    Returns:
        The ``jobFactory.jobManager`` the jobs were added to.
    """
    createpath = runmode != 'test'
    cmd, call, mem, time, prefix, Ks, datafn = configRobot.popParas(
        cmdset, ['cmd', 'call', 'mem', 'time', 'prefix', 'Ks', 'datafn'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'))
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    cmdGenerator.checkPath(outputpath, create=createpath)

    # Accept scalars for both loop inputs; a bare file name would otherwise
    # be iterated character by character.
    if not isinstance(Ks, (list, tuple)):
        Ks = [Ks]
    if not isinstance(datafn, (list, tuple)):
        datafn = [datafn]

    for fn in datafn:
        for k in Ks:
            jobname = prefix + fn.replace('.mat', '') + 'k%03d' % (int(k))
            functionCall = call + "('%s', %s, '%s');" % (
                inputpath + fn, k, outputpath + jobname + '.mat')
            CMD = [
                cmdGenerator.formatCmd(matlabcmd % (functionCall)),
                cmdGenerator.formatCmd(
                    'mv ./%s%s %s' % (jobname, jobmanager.ext, outputpath)),
            ]
            jobmanager.createJob(jobname, CMD, outpath=outputpath,
                                 outfn=jobname, trackcmd=False)
    return jobmanager
def cuffcompare(cmdset, runmode='test'):
    """Create a single cuffcompare job covering all samples.

    The per-sample GTF paths (``<inputpath><sample>/<gtf>``) are appended,
    space separated, after the validated parameters of the configured
    command.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'sample', 'prefix',
            'gtf', 'inputpath', 'outputpath' and 'overwrite' are removed from
            it; the rest (which must include '-o') is filtered through
            ``availParas['cuffcompare']``.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output directory.

    Returns:
        The ``jobFactory.jobManager`` holding the job.
    """
    createpath = runmode != 'test'
    cmd, mem, time, samples, prefix, gtf = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix', 'gtf'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'),
                                        create=createpath)
    if type(samples) not in (list, tuple):
        samples = [samples]

    # Every entry keeps its trailing blank, exactly as a running
    # concatenation would produce.
    sampletext = ''.join(
        '%s%s/%s ' % (inputpath, sample, gtf) for sample in samples)

    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    jobname = prefix

    setupcmd = cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')
    paraset = copy.deepcopy(cmdset)
    # The configured '-o' is a name relative to the output path.
    paraset['-o'] = outputpath + paraset['-o']
    paraset = configRobot.validParas(paraset, availParas['cuffcompare'])
    comparecmd = cmdGenerator.formatCmd(cmd, paraset, sampletext)
    mvscriptcmd = cmdGenerator.formatCmd(
        'mv ./%s%s %s' % (jobname, jobmanager.ext, outputpath))

    jobmanager.createJob(jobname, [setupcmd, comparecmd, mvscriptcmd],
                         outpath=outputpath, outfn=jobname, trackcmd=False)
    return jobmanager
def RSeQC(cmdset, runmode='test'):
    """Create one RSeQC job per sample running a fixed list of scripts.

    For every sample the job runs inner_distance, junction_annotation,
    junction_saturation, read_GC and read_duplication on
    ``<inputpath><sample>/<bam>``.  Each script's parameters are filtered
    through ``availParas``; when '-o' does not survive the filtering the
    output target is passed under the '>' key instead.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'sample', 'bam',
            'prefix', 'inputpath', 'outputpath', 'programpath' and
            'overwrite' are removed from it; the rest is offered to every
            script.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output directory.

    Returns:
        The ``jobFactory.jobManager`` the jobs were added to.
    """
    createpath = runmode != 'test'
    cmd, mem, time = configRobot.popParas(cmdset, ['cmd', 'mem', 'time'])
    samples, bam, prefix = configRobot.popParas(
        cmdset, ['sample', 'bam', 'prefix'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'),
                                        create=createpath)
    programpath = cmdGenerator.checkPath(cmdset.pop('programpath'))
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))

    # If a single sample arrives as a bare string, wrap it so the loop below
    # does not iterate over its characters.
    if not isinstance(samples, (list, tuple)):
        samples = [samples]

    programs = ['inner_distance.py', 'junction_annotation.py',
                'junction_saturation.py', 'read_GC.py',
                'read_duplication.py']

    for sample in samples:
        jobname = prefix + '_' + sample
        CMDs = [cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')]
        for prog in programs:
            target = outputpath + sample + '.%s' % (prog.replace('.py', ''))
            paraset = copy.deepcopy(cmdset)
            paraset['-i'] = inputpath + sample + '/' + bam
            paraset['-o'] = target
            paraset = configRobot.validParas(paraset, availParas[prog])
            if '-o' not in paraset:
                # '-o' was filtered out for this script: hand the target
                # over under '>' instead.
                paraset['>'] = target
            CMDs.append(
                cmdGenerator.formatCmd('python', programpath + prog, paraset))
        CMDs.append(cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobname, jobmanager.ext, outputpath)))
        jobmanager.createJob(jobname, CMDs, outpath=outputpath, outfn=jobname)
    return jobmanager
def gseaTestAnt2Pheno(cmdset, runmode='test'):
    """Create the MATLAB jobs of a split GSEA run.

    'dataset', 'pheno', 'feature' and 'njob' are read as parallel sequences.
    Only entries whose dataset equals 'CCLE' produce jobs; such an entry is
    split into ``njob`` jobs numbered from 1, each evaluating ``call`` through
    the module-level ``matlabcmd`` template.

    Args:
        cmdset: configuration dict.  'cmd', 'call', 'mem', 'time', 'prefix',
            'dataset', 'pheno', 'feature', 'njob', 'outputpath' and
            'overwrite' are removed from it.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output directory.

    Returns:
        The ``jobFactory.jobManager`` the jobs were added to.
    """
    createpath = runmode != 'test'
    (cmd, call, mem, time, prefix,
     datasets, phenos, features, njobs) = configRobot.popParas(
        cmdset, ['cmd', 'call', 'mem', 'time', 'prefix',
                 'dataset', 'pheno', 'feature', 'njob'])
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'))
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    cmdGenerator.checkPath(outputpath, create=createpath)

    calltemplate = ("(%d, %d, '%s', 'dataset', '%s', "
                    "'pheno', '%s', 'feature', '%s');")
    for di, dataset in enumerate(datasets):
        # The job count is parsed before the dataset filter is applied.
        numjobs = int(njobs[di])
        if dataset != 'CCLE':
            continue
        jobidx = 1
        while jobidx <= numjobs:
            jobname = prefix + dataset + '%02d' % jobidx
            functionCall = call + calltemplate % (
                jobidx, numjobs, outputpath + prefix,
                dataset, phenos[di], features[di])
            runcmd = cmdGenerator.formatCmd(matlabcmd % (functionCall))
            mvscriptcmd = cmdGenerator.formatCmd(
                'mv ./%s%s %s' % (jobname, jobmanager.ext, outputpath))
            jobmanager.createJob(jobname, [runcmd, mvscriptcmd],
                                 outpath=outputpath, outfn=jobname,
                                 trackcmd=False)
            jobidx += 1
    return jobmanager
def cuffmerge(cmdset, runmode='test'):
    """Create a single cuffmerge job covering all samples.

    The job echoes the per-sample GTF paths into ``<-o>.samples``, runs the
    configured command on that list file, moves the job script into the
    output directory and removes the list file again.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'sample', 'prefix',
            'gtf', 'inputpath', 'outputpath' and 'overwrite' are removed from
            it; the rest (which must include '-o') is filtered through
            ``availParas['cuffmerge']``.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output directories.

    Returns:
        The ``jobFactory.jobManager`` holding the job.
    """
    createpath = runmode != 'test'
    cmd, mem, time, samples, prefix, gtf = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix', 'gtf'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'),
                                        create=createpath)
    if not isinstance(samples, (list, tuple)):
        samples = [samples]

    # Double-quoted argument for echo; each path is followed by a literal
    # backslash-n sequence in the generated script.
    sampletext = '"' + ''.join(
        '%s%s/%s\\n' % (inputpath, sample, gtf) for sample in samples) + '"'

    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    jobname = prefix

    CMD = [cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')]
    paraset = copy.deepcopy(cmdset)
    # The configured '-o' is a name relative to the output path.
    paraset['-o'] = outputpath + paraset['-o']
    CMD.append(cmdGenerator.formatCmd(
        'echo', sampletext, '>', paraset['-o'] + '.samples'))
    paraset = configRobot.validParas(paraset, availParas['cuffmerge'])
    cmdGenerator.checkPath(paraset['-o'], create=createpath)
    CMD.append(cmdGenerator.formatCmd(
        cmd, paraset, paraset['-o'] + '.samples'))
    CMD.append(cmdGenerator.formatCmd(
        'mv ./%s%s %s' % (jobname, jobmanager.ext, paraset['-o'])))
    CMD.append(cmdGenerator.formatCmd('rm -f', paraset['-o'] + '.samples'))

    # Request a parallel environment when more than one thread is configured.
    # '-p' takes precedence; the long option is read under its own key (it
    # used to be looked up as '-p', which raised KeyError).
    sgeopt = []
    if '-p' in paraset:
        threads = paraset['-p']
    elif '--num-threads' in paraset:
        threads = paraset['--num-threads']
    else:
        threads = None
    if threads is not None and int(threads) > 1:
        sgeopt = ['-pe smp ' + threads]

    jobmanager.createJob(jobname, CMD, outpath=paraset['-o'], outfn=jobname,
                         sgeopt=sgeopt, trackcmd=False)
    return jobmanager
def tophat(cmdset, runmode='test'):
    """Create one TopHat alignment job per sample.

    Paired-end samples read ``<inputpath><sample>_1<readext>`` and
    ``..._2<readext>``; single-end samples read
    ``<inputpath><sample><readext>``.  Output goes to
    ``<outputpath><prefix>/<sample>``.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'sample', 'prefix',
            'paired', 'readext', 'outputpath', 'genome', 'inputpath' and
            'overwrite' are removed from it; the rest is passed to the
            configured command.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output directories.

    Returns:
        The ``jobFactory.jobManager`` the jobs were added to.

    Raises:
        ValueError: if 'paired' is not one of 'paired', 'yes', 'single'
            or 'no'.
    """
    createpath = runmode != 'test'
    cmd, mem, time, samples, prefix = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix'])
    paired = configRobot.popParas(cmdset, ['paired'])
    readext, outpath, genome = configRobot.popParas(
        cmdset, ['readext', 'outputpath', 'genome'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    setuppathcmd = cmdGenerator.formatCmd(
        'source ~/libraries/setup_seqtools\necho $BOWTIE2_INDEXES')

    if not isinstance(samples, (list, tuple)):
        samples = [samples]

    cmdGenerator.checkPath(outpath + '%s/' % prefix, create=createpath)

    for sample in samples:
        if paired == 'paired' or paired == 'yes':
            # A list comprehension replaces the tuple-parameter lambda,
            # which is a syntax error outside Python 2.
            reads = [inputpath + sample + '_%d' % mate + readext
                     for mate in (1, 2)]
        elif paired == 'single' or paired == 'no':
            reads = inputpath + sample + readext
        else:
            # 'reads' used to be left undefined here, ending in a NameError.
            raise ValueError(
                "tophat: 'paired' must be 'paired', 'yes', 'single' or "
                "'no', got %r" % (paired,))

        paraset = copy.deepcopy(cmdset)
        paraset['-o'] = outpath + '%s/' % prefix + sample
        jobfnprefix = prefix + '_' + sample

        tophatcmd = cmdGenerator.formatCmd(cmd, paraset, genome, reads)
        mvscriptcmd = cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobfnprefix, jobmanager.ext, paraset['-o']))

        # multiple threads per job; a missing '-p' means no parallel
        # environment is requested
        if '-p' in paraset and int(paraset['-p']) > 1:
            sgeopt = ['-pe smp ' + paraset['-p']]
        else:
            sgeopt = []

        # need to create the output directory first, otherwise SGE complains
        # it cannot put the stdout in its path
        cmdGenerator.checkPath(paraset['-o'], create=createpath)
        jobmanager.createJob(jobfnprefix,
                             [setuppathcmd, tophatcmd, mvscriptcmd],
                             outpath=paraset['-o'], outfn=jobfnprefix,
                             sgeopt=sgeopt)
    return jobmanager
def DESeqPair(cmdset, runmode='test'):
    """Create one DESeq job per (group1[i], group2[i]) pair.

    For every pair the R template is filled in with the parameter dict and
    written to ``./<prefix>_<gr1>_<gr2>.R`` in the current directory (this
    happens in every run mode).  The job runs that script with ``Rscript``
    and then moves the R script and the job script into the output path.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'group1',
            'group2', 'template', 'meta' and 'overwrite' are consumed;
            'inputpath', 'outputpath', 'prefix' and 'meta' are written back
            in the form the template expects.  'countfnprefix' and
            'countfnsuffix' must be present; a value of two single quotes
            is turned into the empty string.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output directory.

    Returns:
        The ``jobFactory.jobManager`` the jobs were added to.
    """
    createpath = runmode != 'test'
    cmdset = configRobot.makeParasList(cmdset, ['meta', 'group1', 'group2'])
    cmd, mem, time, prefix = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'prefix'])
    group1, group2 = configRobot.popParas(cmdset, ['group1', 'group2'])

    # Read the template through a context manager so the handle is closed.
    with open(cmdset.pop('template')) as templatefile:
        template = templatefile.read()

    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    cmdset['inputpath'] = inputpath
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'),
                                        create=createpath)
    cmdset['outputpath'] = outputpath
    cmdset['prefix'] = prefix

    # Render 'meta' as an R character vector; a lone '' entry means empty.
    meta = configRobot.popParas(cmdset, ['meta'])
    if meta[0] == "''" and len(meta) == 1:
        cmdset['meta'] = 'c()'
    else:
        cmdset['meta'] = "c('" + "', '".join(meta) + "')"

    for key in ('countfnprefix', 'countfnsuffix'):
        if cmdset[key] == "''":
            cmdset[key] = ''

    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    setuppathcmd = cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')

    for i, gr1 in enumerate(group1):
        gr2 = group2[i]
        paraset = copy.deepcopy(cmdset)
        paraset['gr1'] = gr1
        paraset['gr2'] = gr2
        jobfnprefix = prefix + '_' + gr1 + '_' + gr2

        with open('./%s.R' % (jobfnprefix), 'w') as rscript:
            rscript.write(template % paraset)

        deseqcmd = cmdGenerator.formatCmd('Rscript', './%s.R' % (jobfnprefix))
        mvRscriptcmd = cmdGenerator.formatCmd(
            'mv ./%s.R %s' % (jobfnprefix, outputpath))
        mvscriptcmd = cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobfnprefix, jobmanager.ext, outputpath))
        jobmanager.createJob(
            jobfnprefix,
            [setuppathcmd, deseqcmd, mvRscriptcmd, mvscriptcmd],
            outpath=outputpath, outfn=jobfnprefix)
    return jobmanager
def cuffdiff_v1(cmdset, runmode='test'):
    """Create a single cuffdiff (cufflinks 1 build) job over all samples.

    The job runs the hard-coded cufflinks_1 ``cuffdiff`` binary with the
    validated parameters, the GTF and the space-separated per-sample
    alignment paths (``<inputpath><sample>/<bam>``).  The configured 'cmd'
    is consumed but not used.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'sample', 'prefix',
            'gtf', 'bam', 'inputpath' and 'overwrite' are removed from it;
            the rest is filtered through ``availParas['cuffdiff']`` and must
            still provide '--output-dir'.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output directory.

    Returns:
        The ``jobFactory.jobManager`` holding the job.
    """
    createpath = runmode != 'test'
    cmd, mem, time, samples, prefix, gtf, bam = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix', 'gtf', 'bam'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    if not isinstance(samples, (list, tuple)):
        samples = [samples]

    # Every entry keeps its trailing blank.
    sampletext = ''.join(
        '%s%s/%s ' % (inputpath, sample, bam) for sample in samples)

    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    jobname = prefix

    CMD = [cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')]
    paraset = copy.deepcopy(cmdset)
    paraset = configRobot.validParas(paraset, availParas['cuffdiff'])
    cmdGenerator.checkPath(paraset['--output-dir'], create=createpath)
    CMD.append(cmdGenerator.formatCmd(
        '/ifs/home/c2b2/dp_lab/bc2252/SeqTool/cufflinks_1/cuffdiff',
        paraset, gtf, sampletext))
    CMD.append(cmdGenerator.formatCmd(
        'mv ./%s%s %s' % (jobname, jobmanager.ext, paraset['--output-dir'])))

    # Request a parallel environment when more than one thread is configured.
    # '-p' takes precedence; the long option is read under its own key (it
    # used to be looked up as '-p', which raised KeyError).
    sgeopt = []
    if '-p' in paraset:
        threads = paraset['-p']
    elif '--num-threads' in paraset:
        threads = paraset['--num-threads']
    else:
        threads = None
    if threads is not None and int(threads) > 1:
        sgeopt = ['-pe smp ' + threads]

    jobmanager.createJob(jobname, CMD, outpath=paraset['--output-dir'],
                         outfn=jobname, sgeopt=sgeopt, trackcmd=False)
    return jobmanager
def picardReorderSam(cmdset, runmode='test'):
    """Create one Picard ReorderSam job per sample.

    The input is ``<inputpath><sample>`` when 'bam' equals '=sample' and
    ``<inputpath><sample>/<bam>`` otherwise; the output is the same relative
    name under the output path with ``.bam`` removed and ``.reorder.bam``
    appended.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'sample', 'prefix',
            'inputpath', 'outputpath', 'programpath', 'bam', 'overwrite' and
            the optional '-Djava.io.tmpdir' are removed from it; the rest is
            filtered through ``availParas['ReorderSam.jar']``.
        runmode: kept for interface parity with the other builders; no call
            in this function passes a ``create`` flag, so it has no effect.

    Returns:
        The ``jobFactory.jobManager`` the jobs were added to.
    """
    cmd, mem, time, samples, prefix = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'))
    programpath = cmdGenerator.checkPath(cmdset.pop('programpath'))
    bam = configRobot.popParas(cmdset, 'bam')
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))

    # If a single sample arrives as a bare string, wrap it so the loop below
    # does not iterate over its characters.
    if not isinstance(samples, (list, tuple)):
        samples = [samples]

    javacmd = 'java'
    if '-Djava.io.tmpdir' in cmdset:
        # The checked value is appended directly to the option name.
        javacmd = 'java ' + '-Djava.io.tmpdir' + cmdGenerator.checkPath(
            cmdset.pop('-Djava.io.tmpdir'))
    # Java heap is the requested memory (e.g. '8G') minus two gigabytes.
    javacmd = javacmd + ' -Xmx%dg -jar' % (int(mem.replace('G', '')) - 2)
    reorder = programpath + 'ReorderSam.jar VALIDATION_STRINGENCY=LENIENT'

    for sample in samples:
        jobname = prefix + '_' + sample
        if bam == '=sample':
            inputfile = sample
        else:
            inputfile = sample + '/' + bam

        # reorder by chrm
        paraset = copy.deepcopy(cmdset)
        paraset['INPUT'] = '=%s' % (inputpath + inputfile)
        paraset['OUTPUT'] = '=%s.reorder.bam' % (
            outputpath + inputfile.replace('.bam', ''))
        paraset = configRobot.validParas(paraset, availParas['ReorderSam.jar'])

        CMDs = [
            cmdGenerator.formatCmd('source ~/libraries/setup_seqtools'),
            cmdGenerator.formatCmd(javacmd, reorder, paraset),
            cmdGenerator.formatCmd(
                'mv ./%s%s %s' % (jobname, jobmanager.ext, outputpath)),
        ]
        jobmanager.createJob(jobname, CMDs, outpath=outputpath, outfn=jobname)
    return jobmanager
def RNASeQC(cmdset, runmode='test'):
    """Create a single RNA-SeQC job covering all samples.

    The job echoes a tab-separated sample table (header plus one row per
    sample: id, ``<inputpath><sample>/<bam>``, id) into ``<prefix>.samples``,
    runs the configured jar with '-s' set to that file and '-o' set to
    ``<inputpath>RNA-SeQC/``, removes the table and moves the job script
    into the output directory.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'sample', 'bam',
            'prefix', 'inputpath', 'programpath', 'overwrite' and the
            optional '-Djava.io.tmpdir' are removed from it; '-s' and '-o'
            are added, and the resulting dict is passed to the jar as is.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output directory.

    Returns:
        The ``jobFactory.jobManager`` holding the job.
    """
    createpath = runmode != 'test'
    cmd, mem, time = configRobot.popParas(cmdset, ['cmd', 'mem', 'time'])
    samples, bam, prefix = configRobot.popParas(
        cmdset, ['sample', 'bam', 'prefix'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    programpath = cmdGenerator.checkPath(cmdset.pop('programpath'))

    # If a single sample arrives as a bare string, wrap it so the table below
    # does not get one row per character.
    if not isinstance(samples, (list, tuple)):
        samples = [samples]

    javacmd = 'java'
    if '-Djava.io.tmpdir' in cmdset:
        # The checked value is appended directly to the option name.
        javacmd = 'java ' + '-Djava.io.tmpdir' + cmdGenerator.checkPath(
            cmdset.pop('-Djava.io.tmpdir'))
    # Java heap is the requested memory (e.g. '8G') minus two gigabytes.
    javacmd = javacmd + ' -Xmx%dg -jar' % (int(mem.replace('G', '')) - 2)

    # need to generate -s, -o
    # The sample table is a temporary file written by the job itself; tabs
    # and newlines stay as literal backslash sequences in the echo argument.
    rows = ''.join(
        '%s\\t%s\\t%s\\n' % (sample, inputpath + sample + '/' + bam, sample)
        for sample in samples)
    samplestr = '"Sample ID\\tBam File\\tNotes\\n' + rows + '"'
    samplefile = '%s.samples' % prefix
    cmdset['-s'] = samplefile
    cmdset['-o'] = inputpath + 'RNA-SeQC/'

    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    setupcmd = cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')
    createsamplefile = cmdGenerator.formatCmd(
        'echo', samplestr, '>', samplefile)
    removesamplefile = cmdGenerator.formatCmd('rm -f', samplefile)
    mvscriptcmd = cmdGenerator.formatCmd(
        'mv %s%s %s' % (prefix, jobmanager.ext, cmdset['-o']))
    qccmd = cmdGenerator.formatCmd(javacmd, programpath + cmd, cmdset)

    cmdGenerator.checkPath(cmdset['-o'], create=createpath)
    jobmanager.createJob(
        prefix,
        [setupcmd, createsamplefile, qccmd, removesamplefile, mvscriptcmd],
        outfn=prefix, outpath=cmdset['-o'], trackcmd=False)
    return jobmanager
def cufflinks(cmdset, runmode='test'):
    """Create one cufflinks job per sample.

    Each job runs the configured command on ``<inputpath><sample>/<bam>``
    with '-o' set to ``<outputpath><prefix>/<sample>`` and then moves the
    job script into that directory.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'sample', 'prefix',
            'bam', 'inputpath', 'outputpath' and 'overwrite' are removed from
            it; the rest is filtered through ``availParas['cufflinks']``.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output directories.

    Returns:
        The ``jobFactory.jobManager`` the jobs were added to.
    """
    createpath = runmode != 'test'
    cmd, mem, time, samples, prefix, bam = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix', 'bam'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'),
                                        create=createpath)
    outputpath = cmdGenerator.checkPath(outputpath + '%s/' % prefix,
                                        create=createpath)
    if not isinstance(samples, (list, tuple)):
        samples = [samples]
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))

    for sample in samples:
        jobname = prefix + '_' + sample
        CMD = [cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')]

        paraset = copy.deepcopy(cmdset)
        paraset['-o'] = outputpath + sample
        paraset = configRobot.validParas(paraset, availParas['cufflinks'])
        cmdGenerator.checkPath(paraset['-o'], create=createpath)

        CMD.append(cmdGenerator.formatCmd(
            cmd, paraset, inputpath + '%s/' % sample + bam))
        CMD.append(cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobname, jobmanager.ext, paraset['-o'])))

        # Request a parallel environment when more than one thread is
        # configured.  '-p' takes precedence; the long option is read under
        # its own key (it used to be looked up as '-p', which raised
        # KeyError).
        sgeopt = []
        if '-p' in paraset:
            threads = paraset['-p']
        elif '--num-threads' in paraset:
            threads = paraset['--num-threads']
        else:
            threads = None
        if threads is not None and int(threads) > 1:
            sgeopt = ['-pe smp ' + threads]

        jobmanager.createJob(jobname, CMD, outpath=paraset['-o'],
                             outfn=jobname, sgeopt=sgeopt)
    return jobmanager
def picardQC(cmdset, runmode='test'):
    """Create one Picard metrics job per sample.

    Each job runs, in this order, CollectRnaSeqMetrics,
    CollectMultipleMetrics, EstimateLibraryComplexity and
    CollectGcBiasMetrics on the sample's alignment, using a per-sample
    temporary directory that is removed at the end of the job.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'bam', 'prefix',
            'inputpath', 'outputpath', 'programpath', 'sample' and
            'overwrite' are removed from it.  'TMP_DIR' must be present; the
            remaining entries are filtered per jar through ``availParas``.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output and temporary directories.

    Returns:
        The ``jobFactory.jobManager`` the jobs were added to.
    """
    createpath = runmode != 'test'
    cmd, mem, time, bam, prefix = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'bam', 'prefix'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'),
                                        create=createpath)
    programpath = cmdGenerator.checkPath(cmdset.pop('programpath'))
    samples = cmdset.pop('sample')
    # If a single sample arrives as a bare string, wrap it so the loop below
    # does not iterate over its characters.
    if not isinstance(samples, (list, tuple)):
        samples = [samples]

    # Java heap is the requested memory (e.g. '8G') minus two gigabytes.
    javacmd = 'java -Xmx%dg -jar' % (int(mem.replace('G', '')) - 2)
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))

    # (jar, output tag) in execution order.
    metrics = (
        ('CollectRnaSeqMetrics.jar', 'RnaSeq'),
        ('CollectMultipleMetrics.jar', ''),
        ('EstimateLibraryComplexity.jar', 'Lib'),
        ('CollectGcBiasMetrics.jar', 'GC'),
    )

    for sample in samples:
        jobname = prefix + '_' + sample
        allcmds = [cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')]

        paraset = copy.deepcopy(cmdset)
        # Values carry a leading '=' so they render as KEY=value.
        if bam == '=sample':
            paraset['INPUT'] = '=%s' % (inputpath + sample)
        else:
            paraset['INPUT'] = '=%s/%s' % (inputpath + sample, bam)
        paraset['TMP_DIR'] = paraset['TMP_DIR'] + prefix + '_' + sample + '/'
        tmpdir = paraset['TMP_DIR'].strip('=')
        cmdGenerator.checkPath(tmpdir, create=createpath)

        for metric, tag in metrics:
            if 'MultipleMetrics' in metric:
                # This jar gets a bare base name, without the '.txt' suffix.
                paraset['OUTPUT'] = '=%s' % (outputpath + sample + tag)
            else:
                paraset['OUTPUT'] = '=%s.txt' % (
                    outputpath + sample + '.' + tag)
            paraset['CHART_OUTPUT'] = '%s' % (
                paraset['OUTPUT'].replace('.txt', '.pdf'))
            paraset['SUMMARY_OUTPUT'] = '%s' % (
                paraset['OUTPUT'].replace('.txt', '.summary.txt'))
            # filter out parameters that are not supported
            metricparaset = configRobot.validParas(paraset, availParas[metric])
            allcmds.append(cmdGenerator.formatCmd(
                javacmd, programpath + metric, metricparaset))

        allcmds.append(cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobname, jobmanager.ext, outputpath)))
        allcmds.append(cmdGenerator.formatCmd('rm -Rf', tmpdir))
        jobmanager.createJob(jobname, allcmds, outpath=outputpath,
                             outfn=jobname)
    return jobmanager
def GATK_genotyper(cmdset, runmode='test'):
    """Create a single GATK UnifiedGenotyper job over all samples.

    All sample alignments (``<inputpath><sample>/<bam>``) are passed to one
    ``GenomeAnalysisTK.jar -T UnifiedGenotyper`` call as repeated '-I'
    arguments.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'sample', 'prefix',
            'inputpath', 'outputpath', 'gatkpath', 'bam', 'overwrite' and the
            optional '-Djava.io.tmpdir' are removed from it; the rest is
            passed to GATK unfiltered.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output directory.

    Returns:
        The ``jobFactory.jobManager`` holding the job.

    Raises:
        IndexError: if no sample is configured.
    """
    createpath = runmode != 'test'
    cmd, mem, time, samples, prefix = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'),
                                        create=createpath)
    gatkpath = cmdGenerator.checkPath(cmdset.pop('gatkpath'))
    bam = configRobot.popParas(cmdset, 'bam')
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))

    # If a single sample arrives as a bare string, wrap it; indexing the
    # string would otherwise pick out single characters.
    if not isinstance(samples, (list, tuple)):
        samples = [samples]
    if not samples:
        raise IndexError('GATK_genotyper needs at least one sample')

    javacmd = 'java'
    if '-Djava.io.tmpdir' in cmdset:
        # The checked value is appended directly to the option name.
        javacmd = 'java ' + '-Djava.io.tmpdir' + cmdGenerator.checkPath(
            cmdset.pop('-Djava.io.tmpdir'))
    # Java heap is the requested memory (e.g. '8G') minus two gigabytes.
    javacmd = javacmd + ' -Xmx%dg -jar' % (int(mem.replace('G', '')) - 2)
    GATK = gatkpath + 'GenomeAnalysisTK.jar '
    genotyper = '-T UnifiedGenotyper '

    jobname = prefix
    paraset = copy.deepcopy(cmdset)
    # The first path is the value of '-I'; the others are chained on with
    # their own ' -I ' so they appear as repeated options.
    paraset['-I'] = ' -I '.join(
        inputpath + sample + '/' + bam for sample in samples)

    CMDs = [
        cmdGenerator.formatCmd('source ~/libraries/setup_seqtools'),
        cmdGenerator.formatCmd(javacmd, GATK + genotyper, paraset),
        cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobname, jobmanager.ext, outputpath)),
    ]
    jobmanager.createJob(jobname, CMDs, outpath=outputpath, outfn=jobname)
    return jobmanager
def RSeQC(cmdset, runmode='test'):
    """Create one RSeQC job per sample running a fixed list of scripts.

    For every sample the job runs inner_distance, junction_annotation,
    junction_saturation, read_GC and read_duplication on
    ``<inputpath><sample>/<bam>``.  Each script's parameters are filtered
    through ``availParas``; when '-o' does not survive the filtering the
    output target is passed under the '>' key instead.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'sample', 'bam',
            'prefix', 'inputpath', 'outputpath', 'programpath' and
            'overwrite' are removed from it; the rest is offered to every
            script.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output directory.

    Returns:
        The ``jobFactory.jobManager`` the jobs were added to.
    """
    createpath = runmode != 'test'
    cmd, mem, time = configRobot.popParas(cmdset, ['cmd', 'mem', 'time'])
    samples, bam, prefix = configRobot.popParas(
        cmdset, ['sample', 'bam', 'prefix'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'),
                                        create=createpath)
    programpath = cmdGenerator.checkPath(cmdset.pop('programpath'))
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))

    # If a single sample arrives as a bare string, wrap it so the loop below
    # does not iterate over its characters.
    if not isinstance(samples, (list, tuple)):
        samples = [samples]

    programs = ['inner_distance.py', 'junction_annotation.py',
                'junction_saturation.py', 'read_GC.py',
                'read_duplication.py']

    for sample in samples:
        jobname = prefix + '_' + sample
        CMDs = [cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')]
        for prog in programs:
            target = outputpath + sample + '.%s' % (prog.replace('.py', ''))
            paraset = copy.deepcopy(cmdset)
            paraset['-i'] = inputpath + sample + '/' + bam
            paraset['-o'] = target
            paraset = configRobot.validParas(paraset, availParas[prog])
            if '-o' not in paraset:
                # '-o' was filtered out for this script: hand the target
                # over under '>' instead.
                paraset['>'] = target
            CMDs.append(
                cmdGenerator.formatCmd('python', programpath + prog, paraset))
        CMDs.append(cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobname, jobmanager.ext, outputpath)))
        jobmanager.createJob(jobname, CMDs, outpath=outputpath, outfn=jobname)
    return jobmanager
def markDup(cmdset, runmode='test'):
    """Create one Picard MarkDuplicates job per sample.

    Every job sources the seqtools setup script, runs ``MarkDuplicates.jar``
    on ``<inputpath><sample>/<bam>``, runs ``samtools index`` on the
    ``.mdup.bam`` output and finally moves the job script into the sample
    directory.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'sample', 'prefix',
            'inputpath', 'programpath', 'bam' and 'overwrite' are removed from
            it; the remaining entries are filtered through
            ``availParas['MarkDuplicates.jar']`` and passed to the jar.
        runmode: kept for interface parity with the other builders; no call
            in this function passes a ``create`` flag, so it has no effect.

    Returns:
        The ``jobFactory.jobManager`` the jobs were added to.
    """
    cmd, mem, time, samples, prefix = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    programpath = cmdGenerator.checkPath(cmdset.pop('programpath'))
    bam = configRobot.popParas(cmdset, 'bam')
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))

    # If a single sample arrives as a bare string, wrap it so the loop below
    # does not iterate over its characters.
    if not isinstance(samples, (list, tuple)):
        samples = [samples]

    # Java heap is the requested memory (e.g. '8G') minus one gigabyte.
    javacmd = 'java -Xmx%dg -jar' % (int(mem.replace('G', '')) - 1)
    mdupjar = 'MarkDuplicates.jar'
    idxcmd = 'samtools index'

    for sample in samples:
        jobname = prefix + '_' + sample
        sampledir = inputpath + sample

        paraset = copy.deepcopy(cmdset)
        # Values carry a leading '=' so they render as KEY=value.
        paraset['INPUT'] = '=%s/%s' % (sampledir, bam)
        paraset['OUTPUT'] = paraset['INPUT'].replace('.bam', '.mdup.bam')
        paraset['METRICS_FILE'] = '=%s/%s' % (sampledir,
                                              prefix + '_mdupmetrics.txt')
        paraset = configRobot.validParas(paraset, availParas[mdupjar])

        CMDs = [
            cmdGenerator.formatCmd('source ~/libraries/setup_seqtools'),
            cmdGenerator.formatCmd(javacmd, programpath + mdupjar, paraset),
            cmdGenerator.formatCmd(idxcmd, paraset['OUTPUT'].strip('=')),
            cmdGenerator.formatCmd(
                'mv ./%s%s %s' % (jobname, jobmanager.ext, sampledir)),
        ]
        jobmanager.createJob(jobname, CMDs, outpath=sampledir, outfn=jobname)
    return jobmanager
def HTSeqCount(cmdset, runmode='test'):
    """Create one HTSeq read-counting job per sample.

    Each job pipes ``samtools view -h`` of the sample's alignment into
    ``python -m HTSeq.scripts.count`` and redirects the result to a
    ``.count`` file under the output path.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'sample', 'prefix',
            'bam', 'outputpath', 'inputpath', 'GTF' and 'overwrite' are
            removed from it; what is left is passed to the HTSeq command.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output directories.

    Returns:
        The ``jobFactory.jobManager`` the jobs were added to.
    """
    createpath = runmode != 'test'
    cmd, mem, time, samples, prefix, bam = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix', 'bam'])
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'),
                                        create=createpath)
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    gtf = configRobot.popParas(cmdset, ['GTF'])
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    setuppathcmd = cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')

    if type(samples) not in (list, tuple):
        samples = [samples]

    htseq = 'python -m HTSeq.scripts.count -q '
    for sample in samples:
        paraset = copy.deepcopy(cmdset)
        jobfnprefix = prefix + '_' + sample

        if bam == '=sample':
            # The sample name itself is the alignment file.
            inputfile = sample
        else:
            inputfile = sample + '/' + bam
            # Counts land in a per-sample sub-directory of the output path.
            cmdGenerator.checkPath(outputpath + sample, create=createpath)
        outputfile = inputfile + '.count'

        samcmd = 'samtools view -h %s | ' % (inputpath + inputfile)
        countcmd = cmdGenerator.formatCmd(
            samcmd, htseq, paraset, '-', gtf,
            ' > %s' % (outputpath + outputfile))
        mvscriptcmd = cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobfnprefix, jobmanager.ext, outputpath))
        jobmanager.createJob(jobfnprefix,
                             [setuppathcmd, countcmd, mvscriptcmd],
                             outpath=outputpath, outfn=jobfnprefix)
    return jobmanager
def picardReorderSam(cmdset, runmode='test'):
    """Create one Picard ReorderSam job per sample.

    The input is ``<inputpath><sample>`` when 'bam' equals '=sample' and
    ``<inputpath><sample>/<bam>`` otherwise; the output is the same relative
    name under the output path with ``.bam`` removed and ``.reorder.bam``
    appended.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'sample', 'prefix',
            'inputpath', 'outputpath', 'programpath', 'bam', 'overwrite' and
            the optional '-Djava.io.tmpdir' are removed from it; the rest is
            filtered through ``availParas['ReorderSam.jar']``.
        runmode: kept for interface parity with the other builders; no call
            in this function passes a ``create`` flag, so it has no effect.

    Returns:
        The ``jobFactory.jobManager`` the jobs were added to.
    """
    cmd, mem, time, samples, prefix = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'))
    programpath = cmdGenerator.checkPath(cmdset.pop('programpath'))
    bam = configRobot.popParas(cmdset, 'bam')
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))

    # If a single sample arrives as a bare string, wrap it so the loop below
    # does not iterate over its characters.
    if not isinstance(samples, (list, tuple)):
        samples = [samples]

    javacmd = 'java'
    if '-Djava.io.tmpdir' in cmdset:
        # The checked value is appended directly to the option name.
        javacmd = 'java ' + '-Djava.io.tmpdir' + cmdGenerator.checkPath(
            cmdset.pop('-Djava.io.tmpdir'))
    # Java heap is the requested memory (e.g. '8G') minus two gigabytes.
    javacmd = javacmd + ' -Xmx%dg -jar' % (int(mem.replace('G', '')) - 2)
    reorder = programpath + 'ReorderSam.jar VALIDATION_STRINGENCY=LENIENT'

    for sample in samples:
        jobname = prefix + '_' + sample
        if bam == '=sample':
            inputfile = sample
        else:
            inputfile = sample + '/' + bam

        # reorder by chrm
        paraset = copy.deepcopy(cmdset)
        paraset['INPUT'] = '=%s' % (inputpath + inputfile)
        paraset['OUTPUT'] = '=%s.reorder.bam' % (
            outputpath + inputfile.replace('.bam', ''))
        paraset = configRobot.validParas(paraset, availParas['ReorderSam.jar'])

        CMDs = [
            cmdGenerator.formatCmd('source ~/libraries/setup_seqtools'),
            cmdGenerator.formatCmd(javacmd, reorder, paraset),
            cmdGenerator.formatCmd(
                'mv ./%s%s %s' % (jobname, jobmanager.ext, outputpath)),
        ]
        jobmanager.createJob(jobname, CMDs, outpath=outputpath, outfn=jobname)
    return jobmanager
def cuffmerge(cmdset, runmode='test'):
    """Create a single cuffmerge job covering all samples.

    The job echoes the per-sample GTF paths into ``<-o>.samples``, runs the
    configured command on that list file, moves the job script into the
    output directory and removes the list file again.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'sample', 'prefix',
            'gtf', 'inputpath', 'outputpath' and 'overwrite' are removed from
            it; the rest (which must include '-o') is filtered through
            ``availParas['cuffmerge']``.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output directories.

    Returns:
        The ``jobFactory.jobManager`` holding the job.
    """
    createpath = runmode != 'test'
    cmd, mem, time, samples, prefix, gtf = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix', 'gtf'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'),
                                        create=createpath)
    if not isinstance(samples, (list, tuple)):
        samples = [samples]

    # Double-quoted argument for echo; each path is followed by a literal
    # backslash-n sequence in the generated script.
    sampletext = '"' + ''.join(
        '%s%s/%s\\n' % (inputpath, sample, gtf) for sample in samples) + '"'

    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    jobname = prefix

    CMD = [cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')]
    paraset = copy.deepcopy(cmdset)
    # The configured '-o' is a name relative to the output path.
    paraset['-o'] = outputpath + paraset['-o']
    CMD.append(cmdGenerator.formatCmd(
        'echo', sampletext, '>', paraset['-o'] + '.samples'))
    paraset = configRobot.validParas(paraset, availParas['cuffmerge'])
    cmdGenerator.checkPath(paraset['-o'], create=createpath)
    CMD.append(cmdGenerator.formatCmd(
        cmd, paraset, paraset['-o'] + '.samples'))
    CMD.append(cmdGenerator.formatCmd(
        'mv ./%s%s %s' % (jobname, jobmanager.ext, paraset['-o'])))
    CMD.append(cmdGenerator.formatCmd('rm -f', paraset['-o'] + '.samples'))

    # Request a parallel environment when more than one thread is configured.
    # '-p' takes precedence; the long option is read under its own key (it
    # used to be looked up as '-p', which raised KeyError).
    sgeopt = []
    if '-p' in paraset:
        threads = paraset['-p']
    elif '--num-threads' in paraset:
        threads = paraset['--num-threads']
    else:
        threads = None
    if threads is not None and int(threads) > 1:
        sgeopt = ['-pe smp ' + threads]

    jobmanager.createJob(jobname, CMD, outpath=paraset['-o'], outfn=jobname,
                         sgeopt=sgeopt, trackcmd=False)
    return jobmanager
def DESeqPair(cmdset, runmode='test'):
    """Create one DESeq job per (group1[i], group2[i]) pair.

    For every pair the R template is filled in with the parameter dict and
    written to ``./<prefix>_<gr1>_<gr2>.R`` in the current directory (this
    happens in every run mode).  The job runs that script with ``Rscript``
    and then moves the R script and the job script into the output path.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'group1',
            'group2', 'template', 'meta' and 'overwrite' are consumed;
            'inputpath', 'outputpath', 'prefix' and 'meta' are written back
            in the form the template expects.  'countfnprefix' and
            'countfnsuffix' must be present; a value of two single quotes
            is turned into the empty string.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output directory.

    Returns:
        The ``jobFactory.jobManager`` the jobs were added to.
    """
    createpath = runmode != 'test'
    cmdset = configRobot.makeParasList(cmdset, ['meta', 'group1', 'group2'])
    cmd, mem, time, prefix = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'prefix'])
    group1, group2 = configRobot.popParas(cmdset, ['group1', 'group2'])

    # Read the template through a context manager so the handle is closed.
    with open(cmdset.pop('template')) as templatefile:
        template = templatefile.read()

    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    cmdset['inputpath'] = inputpath
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'),
                                        create=createpath)
    cmdset['outputpath'] = outputpath
    cmdset['prefix'] = prefix

    # Render 'meta' as an R character vector; a lone '' entry means empty.
    meta = configRobot.popParas(cmdset, ['meta'])
    if meta[0] == "''" and len(meta) == 1:
        cmdset['meta'] = 'c()'
    else:
        cmdset['meta'] = "c('" + "', '".join(meta) + "')"

    for key in ('countfnprefix', 'countfnsuffix'):
        if cmdset[key] == "''":
            cmdset[key] = ''

    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    setuppathcmd = cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')

    for i, gr1 in enumerate(group1):
        gr2 = group2[i]
        paraset = copy.deepcopy(cmdset)
        paraset['gr1'] = gr1
        paraset['gr2'] = gr2
        jobfnprefix = prefix + '_' + gr1 + '_' + gr2

        with open('./%s.R' % (jobfnprefix), 'w') as rscript:
            rscript.write(template % paraset)

        deseqcmd = cmdGenerator.formatCmd('Rscript', './%s.R' % (jobfnprefix))
        mvRscriptcmd = cmdGenerator.formatCmd(
            'mv ./%s.R %s' % (jobfnprefix, outputpath))
        mvscriptcmd = cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobfnprefix, jobmanager.ext, outputpath))
        jobmanager.createJob(
            jobfnprefix,
            [setuppathcmd, deseqcmd, mvRscriptcmd, mvscriptcmd],
            outpath=outputpath, outfn=jobfnprefix)
    return jobmanager
def tophat(cmdset, runmode='test'):
    """Create one TopHat alignment job per sample.

    Paired-end samples read ``<inputpath><sample>_1<readext>`` and
    ``..._2<readext>``; single-end samples read
    ``<inputpath><sample><readext>``.  Output goes to
    ``<outputpath><prefix>/<sample>``.

    Args:
        cmdset: configuration dict.  'cmd', 'mem', 'time', 'sample', 'prefix',
            'paired', 'readext', 'outputpath', 'genome', 'inputpath' and
            'overwrite' are removed from it; the rest is passed to the
            configured command.
        runmode: any value other than 'test' asks ``checkPath`` to create
            the output directories.

    Returns:
        The ``jobFactory.jobManager`` the jobs were added to.

    Raises:
        ValueError: if 'paired' is not one of 'paired', 'yes', 'single'
            or 'no'.
    """
    createpath = runmode != 'test'
    cmd, mem, time, samples, prefix = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix'])
    paired = configRobot.popParas(cmdset, ['paired'])
    readext, outpath, genome = configRobot.popParas(
        cmdset, ['readext', 'outputpath', 'genome'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    setuppathcmd = cmdGenerator.formatCmd(
        'source ~/libraries/setup_seqtools\necho $BOWTIE2_INDEXES')

    if not isinstance(samples, (list, tuple)):
        samples = [samples]

    cmdGenerator.checkPath(outpath + '%s/' % prefix, create=createpath)

    for sample in samples:
        if paired == 'paired' or paired == 'yes':
            # A list comprehension replaces the tuple-parameter lambda,
            # which is a syntax error outside Python 2.
            reads = [inputpath + sample + '_%d' % mate + readext
                     for mate in (1, 2)]
        elif paired == 'single' or paired == 'no':
            reads = inputpath + sample + readext
        else:
            # 'reads' used to be left undefined here, ending in a NameError.
            raise ValueError(
                "tophat: 'paired' must be 'paired', 'yes', 'single' or "
                "'no', got %r" % (paired,))

        paraset = copy.deepcopy(cmdset)
        paraset['-o'] = outpath + '%s/' % prefix + sample
        jobfnprefix = prefix + '_' + sample

        tophatcmd = cmdGenerator.formatCmd(cmd, paraset, genome, reads)
        mvscriptcmd = cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobfnprefix, jobmanager.ext, paraset['-o']))

        # multiple threads per job; a missing '-p' means no parallel
        # environment is requested
        if '-p' in paraset and int(paraset['-p']) > 1:
            sgeopt = ['-pe smp ' + paraset['-p']]
        else:
            sgeopt = []

        # need to create the output directory first, otherwise SGE complains
        # it cannot put the stdout in its path
        cmdGenerator.checkPath(paraset['-o'], create=createpath)
        jobmanager.createJob(jobfnprefix,
                             [setuppathcmd, tophatcmd, mvscriptcmd],
                             outpath=paraset['-o'], outfn=jobfnprefix,
                             sgeopt=sgeopt)
    return jobmanager
def cuffcompare(cmdset, runmode='test'):
    """Build a single cuffcompare job covering every sample.

    The job compares ``<inputpath><sample>/<gtf>`` for all samples, writes
    under ``outputpath`` using the '-o' value from ``cmdset`` as the file
    prefix, and finally moves the job script into ``outputpath``.

    Args:
        cmdset: parameter dictionary. Keys read here: 'cmd', 'mem', 'time',
            'sample', 'prefix', 'gtf', 'inputpath', 'outputpath', 'overwrite'
            and '-o'.
        runmode: 'test' passes ``create=False`` when checking the output
            path; any other value passes ``create=True``.

    Returns:
        The jobFactory.jobManager that the job was added to.
    """
    global availParas
    createpath = (runmode != 'test')

    cmd, mem, time, samples, prefix, gtf = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix', 'gtf'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'),
                                        create=createpath)

    # Wrap a lone sample so the join below always iterates over names.
    if type(samples) not in (list, tuple):
        samples = [samples]

    # Space-terminated list of the per-sample GTF files.
    sampletext = ''.join(
        '%s%s/%s ' % (inputpath, name, gtf) for name in samples)

    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    jobname = prefix

    options = copy.deepcopy(cmdset)
    options['-o'] = outputpath + options['-o']
    options = configRobot.validParas(options, availParas['cuffcompare'])

    steps = [
        cmdGenerator.formatCmd('source ~/libraries/setup_seqtools'),
        cmdGenerator.formatCmd(cmd, options, sampletext),
        cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobname, jobmanager.ext, outputpath)),
    ]
    jobmanager.createJob(jobname, steps, outpath=outputpath, outfn=jobname,
                         trackcmd=False)
    return jobmanager
def GATK_genotyper(cmdset, runmode='test'):
    """Build a single GATK UnifiedGenotyper job over all samples.

    Every sample contributes one ``-I <inputpath><sample>/<bam>`` input to
    the same command; the job script is moved to ``outputpath`` afterwards.

    Args:
        cmdset: parameter dictionary. Keys read here: 'cmd', 'mem', 'time',
            'sample', 'prefix', 'inputpath', 'outputpath', 'gatkpath', 'bam',
            'overwrite' and, optionally, '-Djava.io.tmpdir'.
        runmode: 'test' passes ``create=False`` when checking the output
            path; any other value passes ``create=True``.

    Returns:
        The jobFactory.jobManager that the job was added to.
    """
    createpath = (runmode != 'test')

    cmd, mem, time, samples, prefix = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'),
                                        create=createpath)
    gatkpath = cmdGenerator.checkPath(cmdset.pop('gatkpath'))
    bam = configRobot.popParas(cmdset, 'bam')
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))

    # Optional JVM temp dir; the value is appended directly to the flag.
    javacmd = 'java'
    if '-Djava.io.tmpdir' in cmdset:
        javacmd = 'java -Djava.io.tmpdir' + cmdGenerator.checkPath(
            cmdset.pop('-Djava.io.tmpdir'))
    # Java heap is the requested memory (e.g. '8G') minus 2 GB.
    javacmd += ' -Xmx%dg -jar' % (int(mem.replace('G', '')) - 2)

    gatk_call = gatkpath + 'GenomeAnalysisTK.jar ' + '-T UnifiedGenotyper '
    jobname = prefix

    steps = [cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')]

    options = copy.deepcopy(cmdset)
    # First BAM goes in as the '-I' value; the rest are chained as ' -I <bam>'.
    options['-I'] = inputpath + samples[0] + '/' + bam
    for idx in range(1, len(samples)):
        options['-I'] += ' -I ' + inputpath + samples[idx] + '/' + bam

    steps.append(cmdGenerator.formatCmd(javacmd, gatk_call, options))
    steps.append(cmdGenerator.formatCmd(
        'mv ./%s%s %s' % (jobname, jobmanager.ext, outputpath)))
    jobmanager.createJob(jobname, steps, outpath=outputpath, outfn=jobname)
    return jobmanager
def RNASeQC(cmdset, runmode='test'):
    """Build a single RNA-SeQC job covering all samples.

    The job writes a tab-separated sample sheet (``<prefix>.samples``) with
    ``echo``, runs the RNA-SeQC jar on it with output under
    ``<inputpath>RNA-SeQC/``, deletes the sample sheet and moves the job
    script into the output directory.

    Args:
        cmdset: parameter dictionary. Keys read here: 'cmd', 'mem', 'time',
            'sample', 'bam', 'prefix', 'inputpath', 'programpath',
            'overwrite' and, optionally, '-Djava.io.tmpdir'. '-s' and '-o'
            are set here; the remaining entries are passed to the jar.
        runmode: 'test' passes ``create=False`` when checking the output
            directory; any other value passes ``create=True``.

    Returns:
        The jobFactory.jobManager that the job was added to.
    """
    global availParas
    createpath = (runmode != 'test')

    cmd, mem, time = configRobot.popParas(cmdset, ['cmd', 'mem', 'time'])
    samples, bam, prefix = configRobot.popParas(
        cmdset, ['sample', 'bam', 'prefix'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    programpath = cmdGenerator.checkPath(cmdset.pop('programpath'))

    javacmd = 'java'
    if '-Djava.io.tmpdir' in cmdset:
        javacmd = 'java -Djava.io.tmpdir' + cmdGenerator.checkPath(
            cmdset.pop('-Djava.io.tmpdir'))
    javacmd += ' -Xmx%dg -jar' % (int(mem.replace('G', '')) - 2)

    # need to generate -s, -o
    # The sample sheet is emitted as one quoted string; '\\t' / '\\n' stay
    # escaped in the Python string so they appear as \t / \n in the command.
    rows = ['%s\\t%s\\t%s\\n' % (name, inputpath + name + '/' + bam, name)
            for name in samples]
    samplestr = '"Sample ID\\tBam File\\tNotes\\n' + ''.join(rows) + '"'

    samplefile = '%s.samples' % prefix
    cmdset['-s'] = samplefile
    cmdset['-o'] = inputpath + 'RNA-SeQC/'
    outdir = cmdset['-o']

    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))

    setupcmd = cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')
    createsamplefile = cmdGenerator.formatCmd(
        'echo', samplestr, '>', samplefile)
    removesamplefile = cmdGenerator.formatCmd('rm -f', samplefile)
    mvscriptcmd = cmdGenerator.formatCmd(
        'mv %s%s %s' % (prefix, jobmanager.ext, outdir))
    qccmd = cmdGenerator.formatCmd(javacmd, programpath + cmd, cmdset)

    cmdGenerator.checkPath(outdir, create=createpath)
    jobmanager.createJob(
        prefix,
        [setupcmd, createsamplefile, qccmd, removesamplefile, mvscriptcmd],
        outfn=prefix, outpath=outdir, trackcmd=False)
    return jobmanager
def countmismatches(cmdset, runmode='test'):
    """Build two mismatch-counting jobs (forward, reverse) per sample.

    Each job runs ``python countmismatches.py <bam> <out> <direction>`` and
    then moves its job script to ``outputpath``.

    NOTE: this function is defined more than once in this file; the last
    definition is the one in effect at import time.

    Args:
        cmdset: parameter dictionary. Keys read here: 'cmd', 'mem', 'time',
            'sample', 'prefix', 'bam', 'inputpath', 'outputpath',
            'overwrite'.
        runmode: accepted for interface parity with the other job builders;
            it has no effect here.

    Returns:
        The jobFactory.jobManager that the jobs were added to.
    """
    createpath = (runmode != 'test')  # computed as in the original; unused

    cmd, mem, time, samples, prefix, bam = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix', 'bam'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'))
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))

    # Sample-major order: forward then reverse for each sample.
    pairs = ((name, strand)
             for name in samples
             for strand in ('forward', 'reverse'))
    for name, strand in pairs:
        jobname = prefix + '_' + name + '_' + strand
        steps = [
            cmdGenerator.formatCmd('source ~/libraries/setup_seqtools'),
            cmdGenerator.formatCmd(
                'python', 'countmismatches.py',
                inputpath + name + '/' + bam,
                outputpath + name + '.%s.countmis' % strand,
                strand),
            cmdGenerator.formatCmd(
                'mv', '%s%s' % (jobname, jobmanager.ext), outputpath),
        ]
        jobmanager.createJob(jobname, steps, outpath=outputpath,
                             outfn=jobname, trackcmd=False)
    return jobmanager
def runBootstrap(cmdset, runmode='test'):
    """Build MATLAB bootstrap jobs for every dataset / usecn / combineSplit.

    For each combination, ``njob`` jobs are created; each job runs ``iter``
    MATLAB invocations of ``call`` (one per iteration index) and then moves
    its job script to ``outputpath``.

    Args:
        cmdset: parameter dictionary. Keys read here: 'cmd', 'call', 'mem',
            'time', 'prefix', 'dataset', 'pheno', 'njob', 'iter', 'usecn',
            'combineSplit', 'useDisease', 'outputpath', 'overwrite'.
            'dataset', 'njob', 'usecn' and 'combineSplit' may each be a
            single value or a list.
        runmode: 'test' passes ``create=False`` when checking the output
            path; any other value passes ``create=True``.

    Returns:
        The jobFactory.jobManager that the jobs were added to.
    """
    createpath = runmode != 'test'

    cmd, call, mem, time, prefix, datasets, phenos, njobs = \
        configRobot.popParas(
            cmdset,
            ['cmd', 'call', 'mem', 'time', 'prefix', 'dataset', 'pheno',
             'njob'])
    # 'iter' is bound to niter so the builtin iter() is not shadowed.
    niter, usecn, combineSplit, useDisease = configRobot.popParas(
        cmdset, ['iter', 'usecn', 'combineSplit', 'useDisease'])
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'))
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    cmdGenerator.checkPath(outputpath, create=createpath)

    # Normalise scalars to one-element lists.
    if not isinstance(datasets, list):
        datasets = [datasets]
    if not isinstance(njobs, list):
        njobs = [njobs]
    if not isinstance(usecn, list):
        usecn = [usecn]
    # Fixed: the original compared type(combineSplit) with itself, so a
    # scalar such as 'true' was never wrapped and was iterated per character.
    if not isinstance(combineSplit, list):
        combineSplit = [combineSplit]

    for di, dataset in enumerate(datasets):
        # A single 'njob' value applies to every dataset (same rule as in
        # testSim/runCV); otherwise there is one value per dataset.
        if len(njobs) == 1:
            numjobs = int(njobs[0])
        else:
            numjobs = int(njobs[di])

        for cnSet in usecn:
            cnStr = 'C1' if cnSet == 'true' else 'C0'
            for splitSet in combineSplit:
                splitStr = 'S1' if splitSet == 'true' else 'S0'
                fnhead = prefix + dataset + cnStr + splitStr
                for jobidx in range(1, numjobs + 1):
                    jobname = fnhead + 'J%02d' % (jobidx)
                    CMD = []
                    for iteridx in range(1, int(niter) + 1):
                        # '%' binds tighter than '+', so only the argument
                        # list literal is formatted.
                        functionCall = "addpath(\'./Gray/data\'); tic; " + call + "(%d, %d, %d, '%s', '%s', 'usecn', %s, 'combineSplit', %s, 'useDisease', %s); toc;" % (
                            jobidx, numjobs, iteridx, outputpath, fnhead,
                            cnSet, splitSet, useDisease)
                        CMD.append(cmdGenerator.formatCmd(
                            matlabcmd % (functionCall)))
                    CMD.append(cmdGenerator.formatCmd(
                        'mv ./%s%s %s' % (jobname, jobmanager.ext,
                                          outputpath)))
                    jobmanager.createJob(jobname, CMD, outpath=outputpath,
                                         outfn=jobname, trackcmd=False)
    return jobmanager
def countmismatches(cmdset, runmode='test'):
    """Build two mismatch-counting jobs (forward, reverse) per sample.

    Each job runs ``python countmismatches.py <bam> <out> <direction>`` and
    then moves its job script to ``outputpath``.

    NOTE: this function is defined more than once in this file; the last
    definition is the one in effect at import time.

    Args:
        cmdset: parameter dictionary. Keys read here: 'cmd', 'mem', 'time',
            'sample', 'prefix', 'bam', 'inputpath', 'outputpath',
            'overwrite'.
        runmode: accepted for interface parity with the other job builders;
            it has no effect here.

    Returns:
        The jobFactory.jobManager that the jobs were added to.
    """
    createpath = (runmode != 'test')  # computed as in the original; unused

    cmd, mem, time, samples, prefix, bam = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix', 'bam'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'))
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))

    # Sample-major order: forward then reverse for each sample.
    pairs = ((name, strand)
             for name in samples
             for strand in ('forward', 'reverse'))
    for name, strand in pairs:
        jobname = prefix + '_' + name + '_' + strand
        steps = [
            cmdGenerator.formatCmd('source ~/libraries/setup_seqtools'),
            cmdGenerator.formatCmd(
                'python', 'countmismatches.py',
                inputpath + name + '/' + bam,
                outputpath + name + '.%s.countmis' % strand,
                strand),
            cmdGenerator.formatCmd(
                'mv', '%s%s' % (jobname, jobmanager.ext), outputpath),
        ]
        jobmanager.createJob(jobname, steps, outpath=outputpath,
                             outfn=jobname, trackcmd=False)
    return jobmanager
def cuffdiff_v1(cmdset, runmode='test'):
    """Build a single cuffdiff (cufflinks v1) job over all samples.

    The job runs the hard-coded cufflinks_1 ``cuffdiff`` binary on the GTF
    and on ``<inputpath><sample>/<bam>`` for every sample, then moves the job
    script into the '--output-dir' directory.

    Args:
        cmdset: parameter dictionary. Keys read here: 'cmd', 'mem', 'time',
            'sample', 'prefix', 'gtf', 'bam', 'inputpath', 'overwrite',
            '--output-dir' and, optionally, '-p' or '--num-threads'.
        runmode: 'test' passes ``create=False`` when checking the output
            directory; any other value passes ``create=True``.

    Returns:
        The jobFactory.jobManager that the job was added to.
    """
    global availParas
    createpath = runmode != 'test'

    cmd, mem, time, samples, prefix, gtf, bam = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix', 'gtf', 'bam'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    if not isinstance(samples, (list, tuple)):
        samples = [samples]

    # Space-terminated list of the per-sample BAM files.
    sampletext = ''.join(
        '%s%s/%s ' % (inputpath, sample, bam) for sample in samples)

    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    jobname = prefix

    CMD = [cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')]
    paraset = copy.deepcopy(cmdset)
    paraset = configRobot.validParas(paraset, availParas['cuffdiff'])
    cmdGenerator.checkPath(paraset['--output-dir'], create=createpath)
    # NOTE: the binary path is hard-coded; 'cmd' from the config is not used.
    CMD.append(cmdGenerator.formatCmd(
        '/ifs/home/c2b2/dp_lab/bc2252/SeqTool/cufflinks_1/cuffdiff',
        paraset, gtf, sampletext))
    CMD.append(cmdGenerator.formatCmd(
        'mv ./%s%s %s' % (jobname, jobmanager.ext, paraset['--output-dir'])))

    # Request an smp slot per thread. Fixed: the '--num-threads' branch used
    # to read paraset['-p'], which does not exist in that branch (KeyError).
    if '-p' in paraset:
        threads = paraset['-p']
    elif '--num-threads' in paraset:
        threads = paraset['--num-threads']
    else:
        threads = None
    sgeopt = []
    if threads is not None and int(threads) > 1:
        sgeopt = ['-pe smp ' + threads]

    jobmanager.createJob(jobname, CMD, outpath=paraset['--output-dir'],
                         outfn=jobname, sgeopt=sgeopt, trackcmd=False)
    return jobmanager
def cufflinks(cmdset, runmode='test'):
    """Build one cufflinks job per sample.

    Each job runs ``cmd`` on ``<inputpath><sample>/<bam>`` with output in
    ``<outputpath><prefix>/<sample>`` and then moves its job script there.

    Args:
        cmdset: parameter dictionary. Keys read here: 'cmd', 'mem', 'time',
            'sample', 'prefix', 'bam', 'inputpath', 'outputpath', 'overwrite'
            and, optionally, '-p' or '--num-threads'.
        runmode: 'test' passes ``create=False`` to the output-path checks;
            any other value passes ``create=True``.

    Returns:
        The jobFactory.jobManager that the jobs were added to.
    """
    global availParas
    createpath = runmode != 'test'

    cmd, mem, time, samples, prefix, bam = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix', 'bam'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'),
                                        create=createpath)
    outputpath = cmdGenerator.checkPath(outputpath + '%s/' % prefix,
                                        create=createpath)
    if not isinstance(samples, (list, tuple)):
        samples = [samples]
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))

    for sample in samples:
        jobname = prefix + '_' + sample
        CMD = [cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')]

        paraset = copy.deepcopy(cmdset)
        paraset['-o'] = outputpath + sample
        paraset = configRobot.validParas(paraset, availParas['cufflinks'])
        cmdGenerator.checkPath(paraset['-o'], create=createpath)

        CMD.append(cmdGenerator.formatCmd(
            cmd, paraset, inputpath + '%s/' % sample + bam))
        CMD.append(cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobname, jobmanager.ext, paraset['-o'])))

        # Request an smp slot per thread. Fixed: the '--num-threads' branch
        # used to read paraset['-p'], which is absent there (KeyError).
        if '-p' in paraset:
            threads = paraset['-p']
        elif '--num-threads' in paraset:
            threads = paraset['--num-threads']
        else:
            threads = None
        sgeopt = []
        if threads is not None and int(threads) > 1:
            sgeopt = ['-pe smp ' + threads]

        jobmanager.createJob(jobname, CMD, outpath=paraset['-o'],
                             outfn=jobname, sgeopt=sgeopt)
    return jobmanager
def runJobs(cmdset, runmode='test'):
    """Build generic jobs by expanding the 'call' command(s) over a loop list.

    For every loop value the parameter set is copied, updated with the loop
    variables and passed through the configRobot evaluators; the resulting
    'call' commands are appended to the current job. When 'njob' is given,
    the loop values are spread over about that many jobs; otherwise each
    loop value becomes its own job.

    NOTE: this function is defined more than once in this file; the last
    definition is the one in effect at import time.

    Args:
        cmdset: parameter dictionary. Keys read here include 'mem', 'time',
            'overwrite', 'call', 'prefix', 'outputpath', 'trackcmd',
            'sgeopt', 'toShell', 'runThru' and, optionally, 'sampleList',
            'LoopOverLine', 'njob', 'randstr', 'logpath', 'tmppath'.
        runmode: accepted for interface parity with the other job builders;
            it has no effect here.

    Returns:
        The jobFactory.jobManager that the jobs were added to.
    """
    jobmanager = jobFactory.jobManager(mem=cmdset['mem'], time=cmdset['time'],
                                       overwrite=cmdset.pop('overwrite'))

    def submit(paraset, cmds):
        # Close a job: move its script and log to logpath, then register it.
        cmds.append(cmdGenerator.formatCmd(
            'mv {prefix}.job {logpath}'.format(**paraset)))
        cmds.append(cmdGenerator.formatCmd(
            'mv {prefix}.log {logpath}'.format(**paraset)))
        jobmanager.createJob(paraset['prefix'], cmds, outpath='./',
                             outfn=paraset['prefix'],
                             trackcmd=paraset['trackcmd'], sgeJob=True,
                             sgeopt=paraset['sgeopt'],
                             toShell=paraset['toShell'],
                             runThru=paraset['runThru'])

    if 'sampleList' in cmdset:
        cmdset['LoopOverSample'] = readSampleList(
            cmdset['sampleList'].format(**cmdset))
    if 'LoopOverLine' in cmdset:
        cmdset['LoopOverLine'] = readInputList(cmdset['LoopOverLine'])
    loopVarName, loopList, cmdset = configRobot.getLoopOverList(cmdset)

    if type(cmdset['call']) is not list:
        cmdset['call'] = [cmdset['call']]

    if 'njob' in cmdset:
        njob = int(cmdset['njob'])
        # Fixed: force true division. Under Python 2, len(...)/njob floored
        # before ceil() was applied, giving too small a chunk (0 when njob
        # exceeds the number of loop values).
        chunksize = int(math.ceil(len(loopList) / float(njob)))
    else:
        chunksize = 1

    # unpack call: each command gets its own key so the evaluators below
    # treat it like any other parameter value
    cmdset['call_idx'] = []
    for i, callcmd in enumerate(cmdset['call']):
        key = 'call_cmd_%d' % i
        cmdset[key] = callcmd
        cmdset['call_idx'].append(key)
    del cmdset['call']

    allowance = chunksize
    CMDs = []
    sampleParser = re.compile(r'Sample_([\w\-]+)')

    for loopValue in loopList:
        paraset = copy.deepcopy(cmdset)
        if 'randstr' in paraset:
            paraset['randstr'] = randstr(20)
        paraset.update(zip(loopVarName, loopValue))

        if 'LoopOverLine' in paraset:
            if type(paraset['LoopOverLine']) is not list:
                paraset['LoopOverLine'] = [paraset['LoopOverLine']]
            # Expose each field of the line as line0, line1, ...
            for fdIdx, field in enumerate(paraset['LoopOverLine']):
                paraset['line%d' % fdIdx] = field

        # Derive the sample name from a 'Sample_<name>' file path if needed.
        if 'sample' not in paraset:
            if 'LoopOverFile' in paraset:
                if 'Sample_' in paraset['LoopOverFile']:
                    paraset['sample'] = sampleParser.findall(
                        paraset['LoopOverFile'])[0]

        if 'logpath' not in paraset:
            paraset['logpath'] = paraset['outputpath']

        paraset = configRobot.evalListAccess(paraset)
        paraset = configRobot.evalRegExp(paraset)
        paraset = configRobot.translateAllValues(paraset)

        if 'tmppath' in paraset:
            CMDs.append(cmdGenerator.checkPathOnNode(paraset['tmppath']))
        CMDs.append(cmdGenerator.checkPathOnNode(paraset['outputpath']))
        CMDs.append(cmdGenerator.checkPathOnNode(paraset['logpath']))

        for callKey in cmdset['call_idx']:
            CMDs.append(cmdGenerator.formatCmd(paraset[callKey]))

        allowance -= 1
        if allowance == 0:
            # Chunk is full. The job is named after the last loop value.
            submit(paraset, CMDs)
            CMDs = []
            allowance = chunksize

    # Flush a final, partially filled chunk.
    if len(CMDs) > 0:
        submit(paraset, CMDs)
        CMDs = []
        allowance = chunksize
    return jobmanager
def picardQC(cmdset, runmode='test'):
    """Build one Picard QC job per sample.

    Each job runs CollectRnaSeqMetrics, CollectMultipleMetrics,
    EstimateLibraryComplexity and CollectGcBiasMetrics (in that order) on the
    sample's BAM, moves the job script to ``outputpath`` and removes the
    per-sample temporary directory.

    Args:
        cmdset: parameter dictionary. Keys read here: 'cmd', 'mem', 'time',
            'bam', 'prefix', 'inputpath', 'outputpath', 'programpath',
            'sample', 'overwrite', 'TMP_DIR'. A 'bam' value of '=sample'
            means the sample entry itself is the BAM path under inputpath.
        runmode: 'test' passes ``create=False`` to the path checks; any other
            value passes ``create=True``.

    Returns:
        The jobFactory.jobManager that the jobs were added to.
    """
    global availParas
    createpath = (runmode != 'test')

    cmd, mem, time, bam, prefix = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'bam', 'prefix'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'),
                                        create=createpath)
    programpath = cmdGenerator.checkPath(cmdset.pop('programpath'))
    samples = cmdset.pop('sample')
    javacmd = 'java -Xmx%dg -jar' % (int(mem.replace('G', '')) - 2)
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))

    # (jar, output tag) in execution order.
    metric_tags = (
        ('CollectRnaSeqMetrics.jar', 'RnaSeq'),
        ('CollectMultipleMetrics.jar', ''),
        ('EstimateLibraryComplexity.jar', 'Lib'),
        ('CollectGcBiasMetrics.jar', 'GC'),
    )

    for sample in samples:
        jobname = prefix + '_' + sample
        steps = [cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')]

        options = copy.deepcopy(cmdset)
        if bam == '=sample':
            options['INPUT'] = '=%s' % (inputpath + sample)
        else:
            options['INPUT'] = '=%s/%s' % (inputpath + sample, bam)

        # Per-sample scratch directory; values carry a leading '='.
        options['TMP_DIR'] = options['TMP_DIR'] + prefix + '_' + sample + '/'
        tmpdir = options['TMP_DIR'].strip('=')
        cmdGenerator.checkPath(tmpdir, create=createpath)

        for jar, tag in metric_tags:
            if 'MultipleMetrics' in jar:
                # CollectMultipleMetrics takes an output base name.
                options['OUTPUT'] = '=%s' % (outputpath + sample + tag)
            else:
                options['OUTPUT'] = '=%s.txt' % (
                    outputpath + sample + '.' + tag)
            options['CHART_OUTPUT'] = '%s' % (
                options['OUTPUT'].replace('.txt', '.pdf'))
            options['SUMMARY_OUTPUT'] = '%s' % (
                options['OUTPUT'].replace('.txt', '.summary.txt'))
            # filter out parameters that are not supported
            jar_options = configRobot.validParas(options, availParas[jar])
            steps.append(cmdGenerator.formatCmd(
                javacmd, programpath + jar, jar_options))

        steps.append(cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobname, jobmanager.ext, outputpath)))
        steps.append(cmdGenerator.formatCmd('rm -Rf', tmpdir))
        jobmanager.createJob(jobname, steps, outpath=outputpath,
                             outfn=jobname)
    return jobmanager
def testSim(cmdset, runmode='test'):
    """Build MATLAB simulation-test jobs over a grid of parameter settings.

    Every combination of dataset, cumulatecount, noiseidx, costCoef,
    priormethod, transresidual, btthres, adaptiveSigmoid and splitpriornorm
    is considered; several combinations are skipped (see the inline
    comments). For each remaining combination, ``njob`` jobs are created,
    each running one MATLAB call of ``call`` and then moving its job script
    to ``outputpath``.

    Parameters:
        cmdset: parameter dictionary; the keys listed in the popParas calls
            below, plus 'outputpath' and 'overwrite', are consumed.
        runmode: 'test' passes ``create=False`` when checking the output
            path; any other value passes ``create=True``.

    Returns:
        The jobFactory.jobManager that the jobs were added to.
    """
    if runmode == 'test':
        createpath = False
    else:
        createpath = True
    cmdset = configRobot.makeParasList(cmdset, ['njob', 'dataset', 'cumulatecount', 'costCoef', 'priormethod', 'transresidual', 'btthres', 'adaptiveSigmoid', 'splitpriornorm'])
    cmd, call, mem, time, prefix, njobs = configRobot.popParas(cmdset, ['cmd', 'call', 'mem', 'time', 'prefix', 'njob'])
    datasets, algo, noiseidx, cumulatecount, costCoef, priormethod, transresidual, btthres, adaptiveSigmoid, splitpriornorm = configRobot.popParas(cmdset, ['dataset', 'algo', 'noiseidx', 'cumulatecount', 'costCoef', 'priormethod','transresidual', 'btthres', 'adaptiveSigmoid', 'splitpriornorm'])
    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'))
    jobmanager = jobFactory.jobManager(mem=mem, time=time, overwrite=cmdset.pop('overwrite'))
    cmdGenerator.checkPath(outputpath, create=createpath)
    # NOTE(review): 'noiseidx' is not in the makeParasList call above, so it
    # is presumably already a list when it reaches product() -- confirm.
    iterlist = list(itertools.product( datasets, cumulatecount, noiseidx, costCoef, priormethod, transresidual, btthres, adaptiveSigmoid, splitpriornorm ) )
    for dataset, cumcount, nsidx, betacost, prmethod, transres, btthreshold, adaptSig, splitNorm in iterlist:
        # One 'njob' value applies to all datasets; otherwise one per dataset.
        if len(njobs) == 1:
            numjobs = int(njobs[0])
        else:
            numjobs = int(njobs[ datasets.index(dataset) ] )
        if cumcount == 'true':
            cumStr = 'Cum1'
        else:
            cumStr = 'Cum0'
        # Two-letter tag for the prior method; unknown methods abort the run.
        if prmethod == 'bootstrap':
            prStr = 'BT'
        elif prmethod == 'bootstraplinear':
            prStr = 'BL'
        elif prmethod == 'bootstrapexpected':
            prStr = 'BE'
        elif prmethod == 'bayes':
            prStr = 'BY'
        else:
            print 'unknown prior method:', prmethod
            exit(1)
        # transresidual is only combined with the 'bootstrap' prior; without
        # it, only btthres == '0.3' is kept.
        if transres == 'true':
            if prmethod != 'bootstrap':
                continue
            else:
                resStr = 'Resd1' + '_' + btthreshold
        else:
            resStr = 'Resd0'
            if btthreshold != '0.3':
                continue
            else:
                resStr = resStr + '_0.3'
        if adaptSig == 'true':
            sigStr = 'ADSig1'
        else:
            sigStr = 'ADSig0'
        if splitNorm == 'true':
            splitnormStr = 'SPN1'
        else:
            splitnormStr = 'SPN0'
        # splitpriornorm is only combined with the 'bootstrapexpected' prior.
        if splitNorm == 'true' and prmethod != 'bootstrapexpected':
            continue
        # Drop dots so the threshold can be part of a file/job name.
        resStr = resStr.replace('.','')
        Nstr = 'N%s'%(nsidx)
        #costStr = 'P' + betacost
        # Both tags are blanked here, so neither the cost coefficient nor the
        # Cum0/Cum1 tag computed above ends up in the job/file names.
        costStr = ''
        cumStr = ''
        for jobidx in range(1, numjobs+1):
            fnhead = prefix + dataset + cumStr + costStr + Nstr + prStr + resStr + sigStr + splitnormStr
            jobname = prefix + dataset + cumStr + costStr + Nstr + prStr + resStr + sigStr + splitnormStr + 'J%02d'%(jobidx)
            CMD = []
            functionCall = "tic; "
            # '%' binds tighter than '+': only the argument-list literal is
            # formatted, then 'call' is prepended.
            functionCall = functionCall + call + "(%d, %d, '%s', '%s', %s, '%s', '%s', 'cumulatecount', %s, 'costCoef', %s, 'priormethod', '%s', 'transresidual', %s, 'adaptiveSigmoid', %s, 'btthres', %s, 'splitpriornorm', %s); toc;"%(jobidx, numjobs, dataset, algo, nsidx, outputpath, fnhead, cumcount, betacost, prmethod, transres, adaptSig, btthreshold, splitNorm)
            # 'lasso' uses the matlabcmd2012 command template; everything
            # else uses matlabcmd.
            if algo == 'lasso':
                CMD.append( cmdGenerator.formatCmd( matlabcmd2012%(functionCall) ) )
            else:
                CMD.append( cmdGenerator.formatCmd( matlabcmd%(functionCall) ) )
            CMD.append( cmdGenerator.formatCmd('mv ./%s%s %s'%(jobname, jobmanager.ext, outputpath)) )
            jobmanager.createJob(jobname, CMD, outpath=outputpath, outfn=jobname, trackcmd=False)
    return jobmanager
def runCV(cmdset, runmode='test'):
    """Build MATLAB cross-validation jobs over a grid of settings.

    Every combination of dataset, pheno, usecn, combineSplit, useDisease,
    ylogtransform and CV run index is expanded into ``njob`` jobs; each job
    runs one MATLAB call of ``call`` and moves its job script to
    ``outputpath``. Combinations with pheno 'ACT' or 'GI50' and
    ylogtransform 'true' are skipped.

    Args:
        cmdset: parameter dictionary. Keys read here: 'cmd', 'call', 'mem',
            'time', 'prefix', 'njob', 'dataset', 'pheno', 'usecn',
            'useDisease', 'combineSplit', 'ylogtransform', 'trainmode',
            'outputpath', 'overwrite' and, optionally, 'useallexp',
            'samplinghead', 'runcv'.
        runmode: 'test' passes ``create=False`` when checking the output
            path; any other value passes ``create=True``.

    Returns:
        The jobFactory.jobManager that the jobs were added to.
    """
    createpath = (runmode != 'test')

    cmdset = configRobot.makeParasList(
        cmdset, ['njob', 'dataset', 'pheno', 'usecn', 'useDisease',
                 'combineSplit', 'ylogtransform'])
    cmd, call, mem, time, prefix, njobs = configRobot.popParas(
        cmdset, ['cmd', 'call', 'mem', 'time', 'prefix', 'njob'])
    datasets, phenos, usecn, useDisease, combineSplit = configRobot.popParas(
        cmdset, ['dataset', 'pheno', 'usecn', 'useDisease', 'combineSplit'])
    ylogtransform, trainmode = configRobot.popParas(
        cmdset, ['ylogtransform', 'trainmode'])

    # Optional settings (still popped from cmdset when present).
    useallexp = 'false'
    if 'useallexp' in cmdset:
        useallexp = configRobot.popParas(cmdset, ['useallexp'])
    samplinghead = 'Samplings'
    if 'samplinghead' in cmdset:
        samplinghead = configRobot.popParas(cmdset, ['samplinghead'])
    # 'runcv' = N gives run indices 1..N; without it a single run 0.
    runcv = [0]
    if 'runcv' in cmdset:
        runcv = range(1, int(configRobot.popParas(cmdset, ['runcv'])) + 1)

    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'))
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    cmdGenerator.checkPath(outputpath, create=createpath)

    grid = itertools.product(datasets, phenos, usecn, combineSplit,
                             useDisease, ylogtransform, runcv)
    for dataset, pheno, cnSet, splitSet, diseaseSet, ylog, cvidx in grid:
        if ylog == 'true' and pheno in ['ACT', 'GI50']:
            continue

        # One 'njob' value applies to all datasets; otherwise one per dataset.
        if len(njobs) == 1:
            numjobs = int(njobs[0])
        else:
            numjobs = int(njobs[datasets.index(dataset)])

        # Short tags that make up the job and file names.
        cnStr = 'C1' if cnSet == 'true' else 'C0'
        splitStr = 'S1' if splitSet == 'true' else 'S0'
        diseaseStr = 'D1' if diseaseSet == 'true' else 'D0'
        logStr = 'log' if ylog == 'true' else ''
        cvStr = '' if cvidx == 0 else '%02d' % (cvidx)
        if pheno == '[]':
            pheno = ''

        fnhead = prefix + cvStr + logStr + cnStr + splitStr + diseaseStr
        jobstem = (prefix + cvStr + logStr + dataset + pheno + cnStr +
                   splitStr + diseaseStr)
        # The 'joe' dataset lives under Gray/data, everything else under CCLE.
        if dataset.lower() == 'joe':
            preamble = "addpath(\'./Gray/data\'); tic; "
        else:
            preamble = "addpath(\'./CCLE/data\'); tic; "

        for jobidx in range(1, numjobs + 1):
            jobname = jobstem + 'J%02d' % (jobidx)
            arguments = "(%d, %d, '%s', '%s', '%s', '%s', 'usecn', %s, 'combineSplit', %s, 'useDisease', %s, 'ylogtransform', %s, 'trainmode', %s, 'sampling', '%s'); toc;" % (
                jobidx, numjobs, outputpath, fnhead, dataset, pheno, cnSet,
                splitSet, diseaseSet, ylog, trainmode, samplinghead + cvStr)
            functionCall = preamble + call + arguments
            steps = [
                cmdGenerator.formatCmd(matlabcmd2012 % (functionCall)),
                cmdGenerator.formatCmd(
                    'mv ./%s%s %s' % (jobname, jobmanager.ext, outputpath)),
            ]
            jobmanager.createJob(jobname, steps, outpath=outputpath,
                                 outfn=jobname, trackcmd=False)
    return jobmanager
def runJobs(cmdset, runmode='test'):
    """Build generic jobs by expanding the 'call' command(s) over a loop list.

    For every loop value the parameter set is copied, updated with the loop
    variables and passed through the configRobot evaluators; the resulting
    'call' commands are appended to the current job. When 'njob' is given,
    the loop values are spread over about that many jobs; otherwise each
    loop value becomes its own job.

    NOTE: this function is defined more than once in this file; the last
    definition is the one in effect at import time.

    Args:
        cmdset: parameter dictionary. Keys read here include 'mem', 'time',
            'overwrite', 'call', 'prefix', 'outputpath', 'trackcmd',
            'sgeopt', 'toShell', 'runThru' and, optionally, 'sampleList',
            'LoopOverLine', 'njob', 'randstr', 'logpath', 'tmppath'.
        runmode: accepted for interface parity with the other job builders;
            it has no effect here.

    Returns:
        The jobFactory.jobManager that the jobs were added to.
    """
    jobmanager = jobFactory.jobManager(mem=cmdset['mem'], time=cmdset['time'],
                                       overwrite=cmdset.pop('overwrite'))

    def submit(paraset, cmds):
        # Close a job: move its script and log to logpath, then register it.
        cmds.append(cmdGenerator.formatCmd(
            'mv {prefix}.job {logpath}'.format(**paraset)))
        cmds.append(cmdGenerator.formatCmd(
            'mv {prefix}.log {logpath}'.format(**paraset)))
        jobmanager.createJob(paraset['prefix'], cmds, outpath='./',
                             outfn=paraset['prefix'],
                             trackcmd=paraset['trackcmd'], sgeJob=True,
                             sgeopt=paraset['sgeopt'],
                             toShell=paraset['toShell'],
                             runThru=paraset['runThru'])

    if 'sampleList' in cmdset:
        cmdset['LoopOverSample'] = readSampleList(
            cmdset['sampleList'].format(**cmdset))
    if 'LoopOverLine' in cmdset:
        cmdset['LoopOverLine'] = readInputList(cmdset['LoopOverLine'])
    loopVarName, loopList, cmdset = configRobot.getLoopOverList(cmdset)

    if type(cmdset['call']) is not list:
        cmdset['call'] = [cmdset['call']]

    if 'njob' in cmdset:
        njob = int(cmdset['njob'])
        # Fixed: force true division. Under Python 2, len(...)/njob floored
        # before ceil() was applied, giving too small a chunk (0 when njob
        # exceeds the number of loop values).
        chunksize = int(math.ceil(len(loopList) / float(njob)))
    else:
        chunksize = 1

    # unpack call: each command gets its own key so the evaluators below
    # treat it like any other parameter value
    cmdset['call_idx'] = []
    for i, callcmd in enumerate(cmdset['call']):
        key = 'call_cmd_%d' % i
        cmdset[key] = callcmd
        cmdset['call_idx'].append(key)
    del cmdset['call']

    allowance = chunksize
    CMDs = []
    sampleParser = re.compile(r'Sample_([\w\-]+)')

    for loopValue in loopList:
        paraset = copy.deepcopy(cmdset)
        if 'randstr' in paraset:
            paraset['randstr'] = randstr(20)
        paraset.update(zip(loopVarName, loopValue))

        if 'LoopOverLine' in paraset:
            if type(paraset['LoopOverLine']) is not list:
                paraset['LoopOverLine'] = [paraset['LoopOverLine']]
            # Expose each field of the line as line0, line1, ...
            for fdIdx, field in enumerate(paraset['LoopOverLine']):
                paraset['line%d' % fdIdx] = field

        # Derive the sample name from a 'Sample_<name>' file path if needed.
        if 'sample' not in paraset:
            if 'LoopOverFile' in paraset:
                if 'Sample_' in paraset['LoopOverFile']:
                    paraset['sample'] = sampleParser.findall(
                        paraset['LoopOverFile'])[0]

        if 'logpath' not in paraset:
            paraset['logpath'] = paraset['outputpath']

        paraset = configRobot.evalListAccess(paraset)
        paraset = configRobot.evalRegExp(paraset)
        paraset = configRobot.translateAllValues(paraset)

        if 'tmppath' in paraset:
            CMDs.append(cmdGenerator.checkPathOnNode(paraset['tmppath']))
        CMDs.append(cmdGenerator.checkPathOnNode(paraset['outputpath']))
        CMDs.append(cmdGenerator.checkPathOnNode(paraset['logpath']))

        for callKey in cmdset['call_idx']:
            CMDs.append(cmdGenerator.formatCmd(paraset[callKey]))

        allowance -= 1
        if allowance == 0:
            # Chunk is full. The job is named after the last loop value.
            submit(paraset, CMDs)
            CMDs = []
            allowance = chunksize

    # Flush a final, partially filled chunk.
    if len(CMDs) > 0:
        submit(paraset, CMDs)
        CMDs = []
        allowance = chunksize
    return jobmanager
def preGATK(cmdset, runmode='test'):
    """Build one BAM pre-processing job per sample ahead of GATK calling.

    Each job chains: samtools filter -> Picard ReorderSam ->
    AddOrReplaceReadGroups -> MarkDuplicates (+ samtools index) ->
    GATK RealignerTargetCreator -> IndelRealigner. Intermediate files are
    removed with ``rm -f`` along the way and the job script is finally moved
    into the sample directory.

    NOTE: this function is defined more than once in this file; the last
    definition is the one in effect at import time.

    Args:
        cmdset: parameter dictionary. Keys read here: 'cmd', 'mem', 'time',
            'sample', 'prefix', 'inputpath', 'picardpath', 'gatkpath', 'bam',
            'overwrite', '-R', '-filterMBQ' and, optionally,
            '-Djava.io.tmpdir'.
        runmode: accepted for interface parity with the other job builders;
            it has no effect here.

    Returns:
        The jobFactory.jobManager that the jobs were added to.
    """
    createpath = (runmode != 'test')  # computed as in the original; unused

    cmd, mem, time, samples, prefix = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    picardpath = cmdGenerator.checkPath(cmdset.pop('picardpath'))
    gatkpath = cmdGenerator.checkPath(cmdset.pop('gatkpath'))
    bam = configRobot.popParas(cmdset, 'bam')
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))

    javacmd = 'java'
    if '-Djava.io.tmpdir' in cmdset:
        javacmd = 'java -Djava.io.tmpdir' + cmdGenerator.checkPath(
            cmdset.pop('-Djava.io.tmpdir'))
    javacmd += ' -Xmx%dg -jar' % (int(mem.replace('G', '')) - 2)

    # Command heads for each pipeline stage.
    samview = 'samtools view -b -h -F 264'
    reorder = picardpath + 'ReorderSam.jar VALIDATION_STRINGENCY=LENIENT'
    RG = picardpath + 'AddOrReplaceReadGroups.jar VALIDATION_STRINGENCY=LENIENT RGLB=dUTP RGPL=illumina RGPU=1'
    mdupjar = picardpath + 'MarkDuplicates.jar'
    GATK = gatkpath + 'GenomeAnalysisTK.jar '
    createTg = '-T RealignerTargetCreator '
    realign = '-T IndelRealigner '
    idxcmd = 'samtools index'
    clearup = 'rm -f '

    for sample in samples:
        sampledir = inputpath + sample
        jobname = prefix + '_' + sample
        CMDs = [cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')]

        # filter: '-R' and '-filterMBQ' are removed from the samtools options
        opts = copy.deepcopy(cmdset)
        opts['-o'] = '%s/%s.filter.bam' % (sampledir, bam.replace('.bam', ''))
        lastoutput = opts['-o']
        del opts['-R']
        del opts['-filterMBQ']
        CMDs.append(cmdGenerator.formatCmd(
            samview, opts, sampledir + '/' + bam))

        # reorder by chrm
        opts = copy.deepcopy(cmdset)
        opts['INPUT'] = '=%s' % lastoutput
        opts['OUTPUT'] = '=%s.reorder.bam' % (lastoutput.replace('.bam', ''))
        opts['REFERENCE'] = '=%s' % opts['-R']
        opts = configRobot.validParas(opts, availParas['ReorderSam.jar'])
        CMDs.append(cmdGenerator.formatCmd(javacmd, reorder, opts))
        CMDs.append(cmdGenerator.formatCmd(clearup, lastoutput))
        lastoutput = opts['OUTPUT'].strip('=')

        # add RG
        opts = copy.deepcopy(cmdset)
        opts['INPUT'] = '=%s' % lastoutput
        opts['OUTPUT'] = '=%s.addRG.bam' % (lastoutput.replace('.bam', ''))
        opts['RGSM'] = '=%s' % sample
        opts = configRobot.validParas(
            opts, availParas['AddOrReplaceReadGroups.jar'])
        CMDs.append(cmdGenerator.formatCmd(javacmd, RG, opts))
        CMDs.append(cmdGenerator.formatCmd(clearup, lastoutput))
        lastoutput = opts['OUTPUT'].strip('=')

        # mark duplicates (its input is kept; only the output is indexed)
        opts = copy.deepcopy(cmdset)
        opts['INPUT'] = '=%s' % lastoutput
        opts['OUTPUT'] = '=%s.mdup.bam' % (lastoutput.replace('.bam', ''))
        opts['METRICS_FILE'] = '=%s/%s' % (sampledir,
                                          prefix + '_mdupmetrics.txt')
        opts = configRobot.validParas(opts, availParas['MarkDuplicates.jar'])
        CMDs.append(cmdGenerator.formatCmd(javacmd, mdupjar, opts))
        CMDs.append(cmdGenerator.formatCmd(idxcmd, opts['OUTPUT'].strip('=')))
        lastoutput = opts['OUTPUT'].strip('=')

        # create intervals
        intervals = lastoutput.replace('.bam', '.intervals')
        opts = copy.deepcopy(cmdset)
        opts['-I'] = lastoutput
        opts['-o'] = intervals
        CMDs.append(cmdGenerator.formatCmd(javacmd, GATK + createTg, opts))

        # realign: same option dict, with the intervals as an extra input
        opts['-targetIntervals'] = opts['-o']
        opts['-o'] = lastoutput.replace('.bam', '.realign.bam')
        CMDs.append(cmdGenerator.formatCmd(javacmd, GATK + realign, opts))

        # clear up
        CMDs.append(cmdGenerator.formatCmd(clearup, lastoutput))
        CMDs.append(cmdGenerator.formatCmd(clearup, intervals))
        CMDs.append(cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobname, jobmanager.ext, sampledir)))
        jobmanager.createJob(jobname, CMDs, outpath=sampledir, outfn=jobname)
    return jobmanager
def preGATK(cmdset, runmode='test'):
    """Build one BAM pre-processing job per sample ahead of GATK calling.

    Each job chains: samtools filter -> Picard ReorderSam ->
    AddOrReplaceReadGroups -> MarkDuplicates (+ samtools index) ->
    GATK RealignerTargetCreator -> IndelRealigner. Intermediate files are
    removed with ``rm -f`` along the way and the job script is finally moved
    into the sample directory.

    NOTE: this function is defined more than once in this file; the last
    definition is the one in effect at import time.

    Args:
        cmdset: parameter dictionary. Keys read here: 'cmd', 'mem', 'time',
            'sample', 'prefix', 'inputpath', 'picardpath', 'gatkpath', 'bam',
            'overwrite', '-R', '-filterMBQ' and, optionally,
            '-Djava.io.tmpdir'.
        runmode: accepted for interface parity with the other job builders;
            it has no effect here.

    Returns:
        The jobFactory.jobManager that the jobs were added to.
    """
    createpath = (runmode != 'test')  # computed as in the original; unused

    cmd, mem, time, samples, prefix = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix'])
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    picardpath = cmdGenerator.checkPath(cmdset.pop('picardpath'))
    gatkpath = cmdGenerator.checkPath(cmdset.pop('gatkpath'))
    bam = configRobot.popParas(cmdset, 'bam')
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))

    javacmd = 'java'
    if '-Djava.io.tmpdir' in cmdset:
        javacmd = 'java -Djava.io.tmpdir' + cmdGenerator.checkPath(
            cmdset.pop('-Djava.io.tmpdir'))
    javacmd += ' -Xmx%dg -jar' % (int(mem.replace('G', '')) - 2)

    # Command heads for each pipeline stage.
    samview = 'samtools view -b -h -F 264'
    reorder = picardpath + 'ReorderSam.jar VALIDATION_STRINGENCY=LENIENT'
    RG = picardpath + 'AddOrReplaceReadGroups.jar VALIDATION_STRINGENCY=LENIENT RGLB=dUTP RGPL=illumina RGPU=1'
    mdupjar = picardpath + 'MarkDuplicates.jar'
    GATK = gatkpath + 'GenomeAnalysisTK.jar '
    createTg = '-T RealignerTargetCreator '
    realign = '-T IndelRealigner '
    idxcmd = 'samtools index'
    clearup = 'rm -f '

    for sample in samples:
        sampledir = inputpath + sample
        jobname = prefix + '_' + sample
        CMDs = [cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')]

        # filter: '-R' and '-filterMBQ' are removed from the samtools options
        opts = copy.deepcopy(cmdset)
        opts['-o'] = '%s/%s.filter.bam' % (sampledir, bam.replace('.bam', ''))
        lastoutput = opts['-o']
        del opts['-R']
        del opts['-filterMBQ']
        CMDs.append(cmdGenerator.formatCmd(
            samview, opts, sampledir + '/' + bam))

        # reorder by chrm
        opts = copy.deepcopy(cmdset)
        opts['INPUT'] = '=%s' % lastoutput
        opts['OUTPUT'] = '=%s.reorder.bam' % (lastoutput.replace('.bam', ''))
        opts['REFERENCE'] = '=%s' % opts['-R']
        opts = configRobot.validParas(opts, availParas['ReorderSam.jar'])
        CMDs.append(cmdGenerator.formatCmd(javacmd, reorder, opts))
        CMDs.append(cmdGenerator.formatCmd(clearup, lastoutput))
        lastoutput = opts['OUTPUT'].strip('=')

        # add RG
        opts = copy.deepcopy(cmdset)
        opts['INPUT'] = '=%s' % lastoutput
        opts['OUTPUT'] = '=%s.addRG.bam' % (lastoutput.replace('.bam', ''))
        opts['RGSM'] = '=%s' % sample
        opts = configRobot.validParas(
            opts, availParas['AddOrReplaceReadGroups.jar'])
        CMDs.append(cmdGenerator.formatCmd(javacmd, RG, opts))
        CMDs.append(cmdGenerator.formatCmd(clearup, lastoutput))
        lastoutput = opts['OUTPUT'].strip('=')

        # mark duplicates (its input is kept; only the output is indexed)
        opts = copy.deepcopy(cmdset)
        opts['INPUT'] = '=%s' % lastoutput
        opts['OUTPUT'] = '=%s.mdup.bam' % (lastoutput.replace('.bam', ''))
        opts['METRICS_FILE'] = '=%s/%s' % (sampledir,
                                          prefix + '_mdupmetrics.txt')
        opts = configRobot.validParas(opts, availParas['MarkDuplicates.jar'])
        CMDs.append(cmdGenerator.formatCmd(javacmd, mdupjar, opts))
        CMDs.append(cmdGenerator.formatCmd(idxcmd, opts['OUTPUT'].strip('=')))
        lastoutput = opts['OUTPUT'].strip('=')

        # create intervals
        intervals = lastoutput.replace('.bam', '.intervals')
        opts = copy.deepcopy(cmdset)
        opts['-I'] = lastoutput
        opts['-o'] = intervals
        CMDs.append(cmdGenerator.formatCmd(javacmd, GATK + createTg, opts))

        # realign: same option dict, with the intervals as an extra input
        opts['-targetIntervals'] = opts['-o']
        opts['-o'] = lastoutput.replace('.bam', '.realign.bam')
        CMDs.append(cmdGenerator.formatCmd(javacmd, GATK + realign, opts))

        # clear up
        CMDs.append(cmdGenerator.formatCmd(clearup, lastoutput))
        CMDs.append(cmdGenerator.formatCmd(clearup, intervals))
        CMDs.append(cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobname, jobmanager.ext, sampledir)))
        jobmanager.createJob(jobname, CMDs, outpath=sampledir, outfn=jobname)
    return jobmanager
def filetersingleton(cmdset, runmode='test'):
    """Create one reorder / add-read-group / filter / mark-duplicates job per sample.

    The command list of every job is, in order:

    1. ``source ~/libraries/setup_seqtools``
    2. ``ReorderSam.jar``:              ``<bam>`` -> ``<stem>.reorder.bam``
    3. ``AddOrReplaceReadGroups.jar``:  -> ``<stem>.reorder.addRG.bam``
    4. ``samtools view -b -h -F 8``:    -> ``<stem>.reorder.addRG.filter.bam``
    5. ``MarkDuplicates.jar``:          -> ``<stem>.reorder.addRG.filter.mdup.bam``
    6. ``samtools index`` on the file written by step 5
    7. ``rm -f`` of the step-2 output
    8. ``mv`` of the job script into the sample directory

    ``<stem>`` is ``<inputpath><sample>/`` followed by ``bam`` with ``.bam``
    removed.  All file names are derived from that one stem so that every
    step reads exactly what the previous step wrote.

    Args:
        cmdset: mapping of configuration values.  It is modified in place:
            'cmd', 'mem', 'time', 'sample', 'prefix', 'bam', 'inputpath',
            'programpath', 'overwrite' and, if present, 'TMP_DIR' are
            removed.  The remaining entries are copied into the option set
            of each tool.  A 'TMP_DIR' value is passed to steps 2 and 3 only.
        runmode: kept for signature compatibility with the other job
            builders in this file; this function does not use it.

    Returns:
        The jobFactory.jobManager instance the jobs were created on.
    """
    # NOTE(review): this file defines `filetersingleton` twice; the later
    # definition replaces this one when the module is imported.
    cmd, mem, time, samples, prefix, bam = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix', 'bam'])
    if 'TMP_DIR' in cmdset.keys():
        TMP_DIR = cmdset.pop('TMP_DIR')
    else:
        TMP_DIR = ''
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    programpath = cmdGenerator.checkPath(cmdset.pop('programpath'))
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    setuppathcmd = cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')
    javacmd = 'java -Xmx%dg -jar' % (int(mem.replace('G', '')) - 1)
    samview = 'samtools view -b -h -F 8'
    reorder = 'ReorderSam.jar VALIDATION_STRINGENCY=LENIENT'
    RG = 'AddOrReplaceReadGroups.jar VALIDATION_STRINGENCY=LENIENT RGLB=dUTP RGPL=illumina RGPU=1'
    mdup = 'MarkDuplicates.jar'

    # Same normalisation HTSeqCount applies: a lone sample that is not
    # already a list/tuple must not be iterated character by character.
    if not isinstance(samples, (list, tuple)):
        samples = [samples]

    for sample in samples:
        jobfnprefix = prefix + '_' + sample
        sampledir = inputpath + sample

        # One stem for every intermediate file of this sample.
        stem = '%s/%s' % (sampledir, bam.replace('.bam', ''))
        reorderbam = stem + '.reorder.bam'
        addrgbam = stem + '.reorder.addRG.bam'
        filterbam = stem + '.reorder.addRG.filter.bam'
        mdupbam = stem + '.reorder.addRG.filter.mdup.bam'

        # Only the reorder output is deleted at the end of the job.
        tmpbam = [reorderbam]
        CMDs = [setuppathcmd]

        # reorder
        paraset = copy.deepcopy(cmdset)
        if TMP_DIR != '':
            paraset['TMP_DIR'] = '%s' % TMP_DIR
        paraset['INPUT'] = '=%s/%s' % (sampledir, bam)
        paraset['OUTPUT'] = '=%s' % reorderbam
        CMDs.append(cmdGenerator.formatCmd(javacmd, programpath + reorder, paraset))

        # add read group; reuses the reorder option set (including TMP_DIR)
        paraset['INPUT'] = '=%s' % reorderbam
        paraset['OUTPUT'] = '=%s' % addrgbam
        paraset['RGSM'] = '=%s' % sample
        CMDs.append(cmdGenerator.formatCmd(javacmd, programpath + RG, paraset))

        # filter: reads the file the read-group step just wrote
        paraset = copy.deepcopy(cmdset)
        paraset['-o'] = filterbam
        CMDs.append(cmdGenerator.formatCmd(samview, paraset, addrgbam))

        # mark duplicates
        paraset = copy.deepcopy(cmdset)
        paraset['INPUT'] = '=%s' % filterbam
        paraset['OUTPUT'] = '=%s' % mdupbam
        CMDs.append(cmdGenerator.formatCmd(javacmd, programpath + mdup, paraset))

        # index the final BAM using its full path
        CMDs.append(cmdGenerator.formatCmd('samtools index', mdupbam))
        CMDs.append(cmdGenerator.formatCmd('rm -f', tmpbam))
        CMDs.append(cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobfnprefix, jobmanager.ext, sampledir)))
        jobmanager.createJob(jobfnprefix, CMDs, outpath=sampledir, outfn=jobfnprefix)
    return jobmanager
def filetersingleton(cmdset, runmode='test'):
    """Create one reorder / add-read-group / filter / mark-duplicates job per sample.

    The command list of every job is, in order:

    1. ``source ~/libraries/setup_seqtools``
    2. ``ReorderSam.jar``:              ``<bam>`` -> ``<stem>.reorder.bam``
    3. ``AddOrReplaceReadGroups.jar``:  -> ``<stem>.reorder.addRG.bam``
    4. ``samtools view -b -h -F 8``:    -> ``<stem>.reorder.addRG.filter.bam``
    5. ``MarkDuplicates.jar``:          -> ``<stem>.reorder.addRG.filter.mdup.bam``
    6. ``samtools index`` on the file written by step 5
    7. ``rm -f`` of the step-2 output
    8. ``mv`` of the job script into the sample directory

    ``<stem>`` is ``<inputpath><sample>/`` followed by ``bam`` with ``.bam``
    removed.  All file names are derived from that one stem so that every
    step reads exactly what the previous step wrote.

    Args:
        cmdset: mapping of configuration values.  It is modified in place:
            'cmd', 'mem', 'time', 'sample', 'prefix', 'bam', 'inputpath',
            'programpath', 'overwrite' and, if present, 'TMP_DIR' are
            removed.  The remaining entries are copied into the option set
            of each tool.  A 'TMP_DIR' value is passed to steps 2 and 3 only.
        runmode: kept for signature compatibility with the other job
            builders in this file; this function does not use it.

    Returns:
        The jobFactory.jobManager instance the jobs were created on.
    """
    # NOTE(review): an earlier definition of `filetersingleton` precedes
    # this one in the file; this later definition is the one in effect.
    cmd, mem, time, samples, prefix, bam = configRobot.popParas(
        cmdset, ['cmd', 'mem', 'time', 'sample', 'prefix', 'bam'])
    if 'TMP_DIR' in cmdset.keys():
        TMP_DIR = cmdset.pop('TMP_DIR')
    else:
        TMP_DIR = ''
    inputpath = cmdGenerator.checkPath(cmdset.pop('inputpath'))
    programpath = cmdGenerator.checkPath(cmdset.pop('programpath'))
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    setuppathcmd = cmdGenerator.formatCmd('source ~/libraries/setup_seqtools')
    javacmd = 'java -Xmx%dg -jar' % (int(mem.replace('G', '')) - 1)
    samview = 'samtools view -b -h -F 8'
    reorder = 'ReorderSam.jar VALIDATION_STRINGENCY=LENIENT'
    RG = 'AddOrReplaceReadGroups.jar VALIDATION_STRINGENCY=LENIENT RGLB=dUTP RGPL=illumina RGPU=1'
    mdup = 'MarkDuplicates.jar'

    # Same normalisation HTSeqCount applies: a lone sample that is not
    # already a list/tuple must not be iterated character by character.
    if not isinstance(samples, (list, tuple)):
        samples = [samples]

    for sample in samples:
        jobfnprefix = prefix + '_' + sample
        sampledir = inputpath + sample

        # One stem for every intermediate file of this sample.
        stem = '%s/%s' % (sampledir, bam.replace('.bam', ''))
        reorderbam = stem + '.reorder.bam'
        addrgbam = stem + '.reorder.addRG.bam'
        filterbam = stem + '.reorder.addRG.filter.bam'
        mdupbam = stem + '.reorder.addRG.filter.mdup.bam'

        # Only the reorder output is deleted at the end of the job.
        tmpbam = [reorderbam]
        CMDs = [setuppathcmd]

        # reorder
        paraset = copy.deepcopy(cmdset)
        if TMP_DIR != '':
            paraset['TMP_DIR'] = '%s' % TMP_DIR
        paraset['INPUT'] = '=%s/%s' % (sampledir, bam)
        paraset['OUTPUT'] = '=%s' % reorderbam
        CMDs.append(cmdGenerator.formatCmd(javacmd, programpath + reorder, paraset))

        # add read group; reuses the reorder option set (including TMP_DIR)
        paraset['INPUT'] = '=%s' % reorderbam
        paraset['OUTPUT'] = '=%s' % addrgbam
        paraset['RGSM'] = '=%s' % sample
        CMDs.append(cmdGenerator.formatCmd(javacmd, programpath + RG, paraset))

        # filter: reads the file the read-group step just wrote
        paraset = copy.deepcopy(cmdset)
        paraset['-o'] = filterbam
        CMDs.append(cmdGenerator.formatCmd(samview, paraset, addrgbam))

        # mark duplicates
        paraset = copy.deepcopy(cmdset)
        paraset['INPUT'] = '=%s' % filterbam
        paraset['OUTPUT'] = '=%s' % mdupbam
        CMDs.append(cmdGenerator.formatCmd(javacmd, programpath + mdup, paraset))

        # index the final BAM using its full path
        CMDs.append(cmdGenerator.formatCmd('samtools index', mdupbam))
        CMDs.append(cmdGenerator.formatCmd('rm -f', tmpbam))
        CMDs.append(cmdGenerator.formatCmd(
            'mv ./%s%s %s' % (jobfnprefix, jobmanager.ext, sampledir)))
        jobmanager.createJob(jobfnprefix, CMDs, outpath=sampledir, outfn=jobfnprefix)
    return jobmanager
def runBootstrapCCLE(cmdset, runmode='test'):
    """Create MATLAB bootstrap jobs for every combination of the configured settings.

    The Cartesian product of dataset, pheno, usecn, combineSplit,
    useDisease, priormethod, ylogtransform, useallexp, cumulatecount, the
    cross-validation index and predList is enumerated.  Combinations with
    pheno 'ACT' or 'GI50' and ylogtransform 'true' are skipped.  For each
    remaining combination, jobs 1..njob are created; each job holds one
    MATLAB command per iteration 1..iter whose ``<name>.finished`` marker is
    not already present in the output directory, followed by a ``mv`` of the
    job script into the output directory.  A job with no pending iteration
    is not created.

    Args:
        cmdset: mapping of configuration values; it is passed through
            configRobot.makeParasList and the keys read here are removed.
            Optional keys and the values used when they are absent:
            'trainmode' ('false'), 'cumulatecount' (['false']), 'runcv'
            (a single run with index 0), 'samplinghead' ('Samplings'),
            'lassoparahead' ('lassCvPara'), 'predList' (['cancerGenes']).
        runmode: 'test' leaves the output directory uncreated; any other
            value passes create=True to cmdGenerator.checkPath.

    Returns:
        The jobFactory.jobManager instance the jobs were created on.

    Raises:
        ValueError: a MATLAB command is needed for a dataset that is neither
            'joe' nor contains 'ccle' (case-insensitive), so no data path
            is known for it.
    """
    createpath = runmode != 'test'

    cmdset = configRobot.makeParasList(
        cmdset, ['njob', 'dataset', 'pheno', 'usecn', 'useDisease', 'combineSplit',
                 'priormethod', 'ylogtransform', 'useallexp', 'cumulatecount', 'predList'])
    cmd, call, mem, time, prefix, njobs = configRobot.popParas(
        cmdset, ['cmd', 'call', 'mem', 'time', 'prefix', 'njob'])
    niter, priormethod, algo, ylogtransform = configRobot.popParas(
        cmdset, ['iter', 'priormethod', 'algo', 'ylogtransform'])
    datasets, phenos, usecn, combineSplit, useDisease = configRobot.popParas(
        cmdset, ['dataset', 'pheno', 'usecn', 'combineSplit', 'useDisease'])
    useallexp = configRobot.popParas(cmdset, ['useallexp'])

    if 'trainmode' in cmdset.keys():
        trainmode = configRobot.popParas(cmdset, ['trainmode'])
    else:
        trainmode = 'false'
    if 'cumulatecount' in cmdset.keys():
        cumulatecount = configRobot.popParas(cmdset, ['cumulatecount'])
    else:
        # Without a fallback the product below hit an undefined name.
        # NOTE(review): 'false' mirrors the trainmode default and yields no
        # 'Cum' name suffix -- confirm it matches the MATLAB-side default.
        cumulatecount = ['false']
    if 'runcv' in cmdset.keys():
        runcv = range(1, int(configRobot.popParas(cmdset, ['runcv'])) + 1)
    else:
        runcv = [0]
    if 'samplinghead' in cmdset.keys():
        sampling = configRobot.popParas(cmdset, ['samplinghead'])
    else:
        sampling = 'Samplings'
    if 'lassoparahead' in cmdset.keys():
        lassopara = configRobot.popParas(cmdset, ['lassoparahead'])
    else:
        lassopara = 'lassCvPara'
    if 'predList' in cmdset.keys():
        predlist = configRobot.popParas(cmdset, ['predList'])
    else:
        predlist = ['cancerGenes']

    combos = itertools.product(datasets, phenos, usecn, combineSplit, useDisease,
                               priormethod, ylogtransform, useallexp, cumulatecount,
                               runcv, predlist)

    outputpath = cmdGenerator.checkPath(cmdset.pop('outputpath'))
    jobmanager = jobFactory.jobManager(mem=mem, time=time,
                                       overwrite=cmdset.pop('overwrite'))
    cmdGenerator.checkPath(outputpath, create=createpath)

    # Names of the runs that already left a *.finished marker, reduced to
    # the bare run name.  A set keeps the per-iteration lookup cheap.
    f = popen('ls %s*.finished' % (outputpath))
    try:
        listing = f.read().split()
    finally:
        f.close()
    finished = set(fn.replace(outputpath, '').replace('.finished', '')
                   for fn in listing)

    callfmt = ("(%d, %d, %d, '%s', '%s', '%s', '%s', 'usecn', %s, 'combineSplit', %s, "
               "'useDisease', %s, 'algo', '%s', 'priormethod', '%s', 'ylogtransform', %s, "
               "'trainmode', %s, 'useallexp', %s, 'cumulatecount', %s, 'sampling', '%s', "
               "'lassopara', '%s', 'predList', '%s'); toc;")

    for (dataset, pheno, cnSet, splitSet, diseaseSet, prior, ylog, useExp,
         cumcount, cvidx, plist) in combos:
        if pheno in ['ACT', 'GI50'] and ylog == 'true':
            continue

        # One njob value applies to all datasets; otherwise one per dataset.
        if len(njobs) == 1:
            numjobs = int(njobs[0])
        else:
            numjobs = int(njobs[datasets.index(dataset)])

        # Short tags that encode the settings in file and job names.
        cnStr = 'C1' if cnSet == 'true' else 'C0'
        splitStr = 'S1' if splitSet == 'true' else 'S0'
        diseaseStr = 'D1' if diseaseSet == 'true' else 'D0'
        priorStr = 'NRM' if prior == 'bootstrapnorm' else ''
        logStr = 'log' if ylog == 'true' else ''
        expStr = 'AE' if useExp == 'true' else ''
        cumStr = 'Cum' if cumcount == 'true' else ''
        cvStr = '' if cvidx == 0 else '%02d' % (cvidx)
        plStr = '' if plist == 'cancerGenes' else plist
        if pheno == '[]':
            pheno = ''

        namehead = prefix + cvStr + plStr + priorStr + logStr
        nametail = cnStr + splitStr + diseaseStr + expStr + cumStr
        fnhead = namehead + nametail

        for jobidx in range(1, numjobs + 1):
            jobname = namehead + dataset + pheno + nametail + 'J%02d' % (jobidx)
            CMD = []
            for iteridx in range(1, int(niter) + 1):
                if fnhead + dataset + '_J%02dI%02d' % (jobidx, iteridx) in finished:
                    continue

                if dataset.lower() == 'joe':
                    functionCall = "addpath('./Gray/data'); tic; "
                elif 'ccle' in dataset.lower():
                    functionCall = "addpath('./CCLE/data'); tic; "
                else:
                    # Fail loudly instead of reusing (or never defining) the
                    # command string left over from a previous iteration.
                    raise ValueError(
                        "runBootstrapCCLE: no data path known for dataset '%s'" % dataset)

                functionCall = functionCall + call + callfmt % (
                    jobidx, numjobs, iteridx, outputpath, fnhead, dataset, pheno,
                    cnSet, splitSet, diseaseSet, algo, prior, ylog, trainmode,
                    useExp, cumcount, sampling + cvStr, lassopara + cvStr, plist)

                # 'lasso' uses the matlabcmd2012 template; everything else
                # uses matlabcmd.
                if algo == 'lasso':
                    CMD.append(cmdGenerator.formatCmd(matlabcmd2012 % (functionCall)))
                else:
                    CMD.append(cmdGenerator.formatCmd(matlabcmd % (functionCall)))

            if CMD:
                CMD.append(cmdGenerator.formatCmd(
                    'mv ./%s%s %s' % (jobname, jobmanager.ext, outputpath)))
                jobmanager.createJob(jobname, CMD, outpath=outputpath,
                                     outfn=jobname, trackcmd=False)
    return jobmanager