# Imports assumed from the enclosing pipeline scripts; P, PARAMS and the
# Pipeline* helper modules are supplied by the surrounding CGAT-style
# pipeline infrastructure and are not imported here.
import os
import math

def idbaudInterleave(infile, outfile):
    '''Interleave paired reads into the single input file expected by IDBA-UD.'''
    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    outdir = os.getcwd() + "/idbaud_out.dir/{}".format(seqdat.cleanname)
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    statement = PipelineMetaAssemblyKit.IdbaudInterleave(seqdat, os.getcwd(), outfile)
    if statement is not None:
        P.run()
def runIdbaud(infile, outfile):
    job_memory = str(PARAMS["IDBAUD_clus_memory"]) + "G"
    job_threads = PARAMS["IDBAUD_clus_threads"]
    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    assembler = PipelineMetaAssemblyKit.Idbaud(seqdat, "idbaud_out.dir", PARAMS)
    statement = assembler.build()
    P.run()
def CountReads(infile, params):
    '''Count reads in the original file and, where filtering was enabled,
    in the rRNA- and host-filtered outputs; returns one tab-separated line.'''
    original = PipelineMetaAssemblyKit.SequencingData(infile)
    original.readCount()
    rrna = None
    genome = None
    rnadir = os.getcwd() + "/rrna_filter_out.dir/"
    gendir = os.getcwd() + "/genome_filter_out.dir/"
    if params["General_rrna_filter"] == "true":
        rrna = PipelineMetaAssemblyKit.SequencingData(
            rnadir + original.cleanname + "/other_" + original.filename)
        rrna.readCount()
    if params["General_host_filter"] == "true":
        genome = PipelineMetaAssemblyKit.SequencingData(
            gendir + original.cleanname + "/hostfiltered_" + original.filename)
        genome.readCount()
    ocount = original.readcount
    rcount = "NA" if rrna is None else rrna.readcount
    gcount = "NA" if genome is None else genome.readcount
    return "{}\t{}\t{}\t{}\n".format(original.cleanname, ocount, rcount, gcount)
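# A minimal sketch of how the per-sample lines from CountReads might be
# collected into one summary table. This task is hypothetical and not part
# of the pipeline above; the column header names are assumptions.
def summariseReadCounts(infiles, outfile):
    with open(outfile, "w") as outf:
        outf.write("sample\toriginal\trrna_filtered\thost_filtered\n")
        for f in infiles:
            outf.write(CountReads(f, PARAMS))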
def runMegahit(infile, outfile):
    job_memory = str(PARAMS["Megahit_clus_memory"]) + "G"
    job_threads = PARAMS["Megahit_clus_threads"]
    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    assembler = PipelineMetaAssemblyKit.Megahit(seqdat, "megahit_out.dir", PARAMS)
    statement = assembler.build()
    to_cluster = True
    P.run()
def pooledName(infiles, PARAMS):
    '''Reconstruct the name of the pooled output file from the first input,
    carrying over pairing, interleaving and compression.'''
    ftype = PipelineMetaAssemblyKit.SequencingData(infiles[0])
    pooledname = "pooled.dir/" + PARAMS["General_output_prefix"] + "." + ftype.fileformat
    if ftype.paired and not ftype.interleaved:
        pooledname += ".1"
    if ftype.compressed:
        pooledname += ".gz"
    return PipelineMetaAssemblyKit.SequencingData(pooledname)
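# Illustration with assumed values: for gzipped, paired, non-interleaved
# FASTQ inputs and General_output_prefix "pooled", the name resolves to
# "pooled.dir/pooled.fastq.1.gz"; single-end or interleaved inputs drop the
# ".1" mate suffix.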
def runMetaspades(infile, outfile):
    # per-thread memory: total memory divided by thread count, rounded up
    job_memory = str(int(math.ceil(
        int(PARAMS["Metaspades_memory"]) / int(PARAMS["Metaspades_threads"])))) + "G"
    job_threads = PARAMS["Metaspades_threads"]
    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    if seqdat.paired:
        assembler = PipelineMetaAssemblyKit.Metaspades(seqdat, "metaspades_out.dir", PARAMS)
        statement = assembler.build()
        P.run()
    else:
        print("cannot run metaspades on file {} as it requires paired-end data".format(
            seqdat.filename))
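# Worked example with assumed values: Metaspades_memory 100 and
# Metaspades_threads 16 give ceil(100 / 16) = 7G per thread, so the
# scheduler reserves 16 x 7G = 112G in total, at least the requested 100G.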
def detectOrfs(infile, outfile):
    statementlist = []
    # set job memory and threads
    job_memory = str(PARAMS["Prodigal_memory"]) + "G"
    job_threads = PARAMS["Prodigal_threads"]
    # command to generate index files
    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    # ensure input is FASTA
    if seqdat.paired:
        print("Prodigal requires single/merged (i.e. not paired-end) reads for ORF detection.")
    else:
        if seqdat.fileformat == "fastq":
            statementlist.append("reformat.sh in={} out={}".format(
                infile, "orfs.dir/" + seqdat.cleanname + ".fa"))
            infile = "orfs.dir/" + seqdat.cleanname + ".fa"
        # generate the call to prodigal
        statementlist.append(PipelineMetaAnnotate.runProdigal(infile, outfile, PARAMS))
        # remove the temp FASTA if created
        if seqdat.fileformat == "fastq":
            statementlist.append("rm {}".format("orfs.dir/" + seqdat.cleanname + ".fa"))
        statement = " && ".join(statementlist)
        P.run()
def checkFile(infile, outfile):
    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    outf = open(outfile, 'w')
    outf.write("name\t{}\nformat\t{}\ncompressed\t{}\npaired\t{}\ninterleaved\t{}\n".format(
        seqdat.filename, seqdat.fileformat, seqdat.compressed, seqdat.paired,
        seqdat.interleaved))
    seqdat.readCount()
    outf.write("read_count\t{}\n".format(seqdat.readcount))
    outf.close()
def filterMapping(infile, outfile):
    # use the original sequencing file to pull pairedness, file format and compression;
    # str.strip removes characters rather than a suffix, so trim ".mapped.bam" explicitly
    seqdat = PipelineMetaAssemblyKit.SequencingData(
        os.path.basename(infile)[:-len(".mapped.bam")])
    filterer = PipelineMetaFilter.FilterFromBam(infile, outfile, seqdat, PARAMS)
    statementlist = []
    statementlist.append(filterer.build())
    statement = " && ".join(statementlist)
    P.run()
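# Quick illustration of the pitfall fixed above: str.strip treats its
# argument as a character set and removes matching characters from both ends,
# not the literal suffix, e.g.
#   "sample.mapped.bam".strip(".mapped.bam")  ->  "sampl"
# because the trailing 'e' of "sample" is also in the character set.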
def poolReads(infiles, outfile):
    statementlist = []
    # get file type from first file
    ftype = PipelineMetaAssemblyKit.SequencingData(infiles[0])
    # generate output filename
    outname = "pooled.dir/" + PARAMS["General_output_prefix"] + "." + ftype.fileformat
    # pool the reads
    statementlist.append(PipelinePoolReads.poolReads(ftype, infiles, outname))
    # create the log to ensure the job isn't rerun
    statementlist.append('echo "Pooled {} files to {}" >> pooled.dir/pool.log'.format(
        len(infiles), outname))
    statement = " && ".join(statementlist)
    P.run()
def mapBowtie2(infile, outfile):
    job_threads = PARAMS["Bowtie_threads"]
    job_memory = str(PARAMS["Bowtie_memory"]) + "G"
    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    bowtie = PipelineMetaFilter.Bowtie2(seqdat, outfile, PARAMS)
    statementlist = []
    # remove all comments from read names in files
    # (trimming can add comments, leaving mismatched read names within pairs)
    statementlist.append(bowtie.cleanNames())
    # directory for output
    statementlist.append("mkdir -p {}".format(os.path.dirname(outfile)))
    # call to bowtie
    statementlist.append(bowtie.build())
    # convert SAM to BAM
    statementlist.append("samtools view -bS {} > {}".format(
        outfile.replace(".bam", ".sam"), outfile))
    # remove the SAM file
    statementlist.append("rm {}".format(outfile.replace(".bam", ".sam")))
    statement = " && ".join(statementlist)
    P.run()
def poolReads(ftype, infiles, outname):
    '''Build a shell statement that concatenates all input read files into
    one pooled file (or a pair of files for non-interleaved paired data).'''
    statementlist = []
    if ftype.paired and not ftype.interleaved:
        statementlist.append("touch {} && touch {}".format(outname + ".1", outname + ".2"))
    else:
        statementlist.append("touch {}".format(outname))
    # concatenate the reads as appropriate; zcat -f passes plain files through
    # unchanged, so compressed and uncompressed inputs are handled alike
    for i in infiles:
        curfile = PipelineMetaAssemblyKit.SequencingData(i)
        if ftype.paired and not ftype.interleaved:
            statementlist.append("zcat -f {} >> {} && zcat -f {} >> {}".format(
                i, outname + ".1", curfile.pairedname, outname + ".2"))
        else:
            statementlist.append("zcat -f {} >> {}".format(i, outname))
    # if the input was compressed, compress the output
    if ftype.compressed:
        if ftype.paired and not ftype.interleaved:
            statementlist.append("gzip {} && gzip {}".format(outname + ".1", outname + ".2"))
        else:
            statementlist.append("gzip {}".format(outname))
    return " && ".join(statementlist)
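# Illustrative output under assumed inputs: two gzipped single-end FASTQ
# files a.fastq.gz and b.fastq.gz pooled into pooled.dir/pooled.fastq yield
# roughly:
#   touch pooled.dir/pooled.fastq
#   && zcat -f a.fastq.gz >> pooled.dir/pooled.fastq
#   && zcat -f b.fastq.gz >> pooled.dir/pooled.fastq
#   && gzip pooled.dir/pooled.fastq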
def runSortMeRNA(infile, outfile):
    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    if PARAMS["General_rrna_filter"] == "true":
        sortmerna = PipelineMetaFilter.SortMeRNA(seqdat, outfile, PARAMS)
        if PARAMS["SortMeRNA_memory"] != "false":
            job_memory = str(PARAMS["SortMeRNA_memory"]) + "G"
        else:
            job_memory = "1G"
        job_threads = PARAMS["SortMeRNA_threads"]
        statement = sortmerna.build()
    else:
        # if skipping rRNA filtering, symlink the files and make the appropriate directory
        statementlist = ["rm -r ref_index.dir"]
        statementlist.append('mkdir -p rrna_filter_out.dir/{}'.format(seqdat.cleanname))
        statementlist.append('ln -s {} {}'.format(os.getcwd() + "/" + infile, outfile))
        if seqdat.paired and not seqdat.interleaved:
            statementlist.append('ln -s {} rrna_filter_out.dir/{}/other_{}'.format(
                os.getcwd() + "/" + seqdat.pairedname, seqdat.cleanname, seqdat.pairedname))
        statement = " && ".join(statementlist)
    P.run()
def summariseContigs(infile, outfile):
    # summarise each contigs file
    statement = PipelineMetaAssemblyKit.SummariseContigs(infile, outfile)
    P.run()
def cleanUp(infile, outfile):
    seqdat = PipelineMetaAssemblyKit.SequencingData(infile)
    statement = PipelineMetaFilter.CleanUp(seqdat, outfile, PARAMS)
    P.run()