def perform_qc(self,sra_object,out_dir="",out_suffix="_bbduk",objectid="NA"):
    """Run bbduk on fastq files specified by the sra_object

    Parameters
    ----------
    sra_object: SRA
        An SRA object whose fastq files will be used
    out_dir: str
        Path to output directory. When empty, sra_object.directory is used;
        otherwise the directory is created if it is missing.
    out_suffix: string
        Suffix for the output fastq file names
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.

    :return: Returns the path of fastq files after QC. tuple has one item for
             single end files and 2 for paired. A one-item tuple holding an
             empty string is returned on any failure.
    :rtype: tuple
    """
    #make out_dir
    if not out_dir:
        out_dir=sra_object.directory
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)

    if sra_object.layout=='PAIRED':
        fq1=sra_object.fastq_path
        fq2=sra_object.fastq2_path
        out_file1Path=os.path.join(out_dir,pu.get_file_basename(fq1)+out_suffix+".fastq")
        out_file2Path=os.path.join(out_dir,pu.get_file_basename(fq2)+out_suffix+".fastq")
        internal_kwargs={"in":fq1,"in2":fq2,"out":out_file1Path,"out2":out_file2Path}
        outputs=(out_file1Path,out_file2Path)
        #paired targets are handed to run() as a list
        target=[out_file1Path,out_file2Path]
    else:
        fq=sra_object.fastq_path
        out_filePath=os.path.join(out_dir,pu.get_file_basename(fq)+out_suffix+".fastq")
        internal_kwargs={"in":fq,"out":out_filePath}
        outputs=(out_filePath,)
        #a single target is handed to run() as a plain string
        target=out_filePath

    #run bbduk
    status=self.run(objectid=objectid,target=target,**internal_kwargs)
    if not status:
        #previously this path fell through and returned None
        return ("",)
    #the existence check is waived when the module-level _dryrun flag is set
    if not pu.check_files_exist(*outputs) and not _dryrun:
        return ("",)
    return outputs
def bamtofq(self, bam, oid, rm_bam=True):
    """Call self.run with fastq output options derived from a bam file.

    Parameters
    ----------
    bam: str
        Path to the input bam file; passed to run() as the 'filename' option
    oid: str
        Id used as objectid and as prefix of the <oid>_1.fastq and
        <oid>_2.fastq outputs, written next to the bam file
    rm_bam: bool
        Delete the bam file with pe.delete_file when run() succeeds

    :return: The status returned by self.run
    """
    bam_dir = pu.get_file_directory(bam)
    prefix = os.path.join(bam_dir, oid)

    #temp file lives under $LOCAL when that variable is set and non-empty, else under ./
    scratch_dir = os.environ.get('LOCAL') or './'
    scratch_file = os.path.join(scratch_dir, pu.get_file_basename(bam) + pu.get_timestamp())

    options = {
        'F': prefix + '_1.fastq',
        'F2': prefix + '_2.fastq',
        'S': os.path.join(bam_dir, 's.fq'),
        'O': os.path.join(bam_dir, 'o.fq'),
        'O2': os.path.join(bam_dir, 'o2.fq'),
        'T': scratch_file,
        'filename': bam,
    }

    #call run
    status = self.run(None, objectid=oid, **options)
    if status and rm_bam:
        pe.delete_file(bam)
    return status
def search_fastq(self, path):
    """Search .fastq files under a dir and record them on this object.

    One file sets layout "SINGLE" and localfastqPath; two files set layout
    "PAIRED" and localfastq1Path/localfastq2Path (in the order returned by
    pe.find_files). location and srr_accession are set in both cases.

    :return: True if one or two .fastq files were found, otherwise False
    :rtype: bool
    """
    #search files under the path
    hits = pe.find_files(path, "*.fastq")
    n_hits = len(hits)

    if n_hits < 1:
        return False
    if n_hits > 2:
        pu.print_boldred("Can not determine .fastq. Exiting...")
        return False

    if n_hits == 1:
        #case with single fastq
        self.localfastqPath = hits[0]
        pu.print_green("Found .fastq " + self.localfastqPath)
        self.layout = "SINGLE"
    else:
        #case with paired fastq
        self.localfastq1Path = hits[0]
        self.localfastq2Path = hits[1]
        pu.print_green("Found .fastq " + self.localfastq1Path + " " + self.localfastq2Path)
        self.layout = "PAIRED"

    self.location = path
    self.srr_accession = pu.get_file_basename(hits[0])
    return True
def search_sra(self, path):
    """Search .sra file under a dir and record it on this object.

    Sets srr_accession, localSRAFilePath, sraFileSize and layout. When several
    .sra files are found a warning is printed and the first entry is used.

    :return: True if a .sra file was found, otherwise False
    :rtype: bool
    """
    #search files under the path
    candidates = pe.find_files(path, "*.sra")
    if len(candidates) < 1:
        return False
    if len(candidates) > 1:
        pu.print_boldred(
            "Found multiple .sra files. Using the first entry...")

    chosen = candidates[0]
    self.srr_accession = pu.get_file_basename(chosen)
    self.localSRAFilePath = chosen
    self.sraFileSize = pu.get_file_size(self.localSRAFilePath)

    #test if file is paired or single end
    self.layout = "PAIRED" if pe.is_paired(self.localSRAFilePath) else "SINGLE"

    pu.print_green("Found .sra " + self.localSRAFilePath)
    return True
def createMikadoGTFlist(self, out_file, out_dir, searchPath, searchQuery="*.gtf", strand=False):
    """Create a file to be used by mikado configure

    Parameters
    ----------
    out_file: str
        Name of the list file without extension; ".txt" is appended
    out_dir: str
        Directory for the list file; created if it is missing
    searchPath: str
        Path handed to pe.find_files
    searchQuery: str
        Query handed to pe.find_files. Default: "*.gtf"
    strand: bool
        Value written, as a string, in the third column of every row.
        Default: False

    :return: Path to the written list file
    :rtype: string
    """
    files = pe.find_files(searchPath, searchQuery)

    #create out dir
    if not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    outFilePath = os.path.join(out_dir, out_file + ".txt")

    #one tab-separated row per file: path, basename, strand flag
    rows = []
    for gtf_path in files:
        label = pu.get_file_basename(gtf_path)
        #files for which no basename is obtained are skipped
        if label:
            rows.append("\t".join([gtf_path, label, str(strand)]))

    #context manager closes the handle even if the write raises
    with open(outFilePath, "w") as handle:
        handle.write("\n".join(rows))

    pu.print_green("Mikado list file written to:" + outFilePath)
    return outFilePath
def shell():
    """Parse the command line of the shell-script generator and run it.

    Arguments are read from sys.argv[2:]. The output name is the -o value, or
    the basename of the log file when -o is absent, with '.sh' appended. The
    parsed log file, output name, program filters and the -c value are passed
    to generateBashScript.
    """
    print("Generating bash script")
    parser = argparse.ArgumentParser(
        description='pyrpipe diagnostic utility\nGenerate shell script.',
        #usage previously named the 'report' subcommand by mistake
        usage='pyrpipe_diagnostic shell [<args>] <logfile>')
    parser.add_argument('-o', help='out file \ndefault: same as input logfile',action="store")
    parser.add_argument('-c',help='Dump command options [(a)ll,fa(i)l,(p)ass]\ndefault: a',default='a',action="store")
    parser.add_argument('-v',help='verbose',action="store_true")
    parser.add_argument('-f',help='Filter by programs. Provide a comma-separated list e.g., prefetch,STAR,bowtie2 \ndefault None')
    parser.add_argument('logfile', help='The log file generated by pyrpipe',action="store")
    args = parser.parse_args(sys.argv[2:])

    logFile=args.logfile
    if args.v:
        print("Generating report")

    #default output name is derived from the log file
    outFile=pu.get_file_basename(logFile) if args.o is None else args.o
    outFile+='.sh'

    #comma-separated program names; empty list means no filtering was requested
    filters=args.f.split(',') if args.f is not None else []

    generateBashScript(logFile,outFile,filters,args.c)
def checkEnvLog(logFile):
    """Return the path of the ENV log that belongs to logFile.

    The ENV log is expected in the same directory as logFile and is named
    <basename of logFile>ENV.log. If either file is missing a message is
    printed and the process exits with status 1.
    """
    log_dir=pu.get_file_directory(logFile)
    stem=pu.get_file_basename(logFile)
    envLog=os.path.join(log_dir,stem+"ENV.log")

    #both the log and its ENV log must exist
    if pu.check_files_exist(logFile,envLog):
        return envLog

    print("Please check missing log files. Exiting.")
    sys.exit(1)
def sortbam(bam, oid, threads=25, max_memory="100G"):
    """Sort a bam file with sambamba.

    Parameters
    ----------
    bam: str
        Path to the input bam file
    oid: str
        Id passed to pe.execute_command as objectid
    threads: int
        Value for sambamba's -t option. Default: 25
    max_memory: str
        Value for sambamba's -m option. Default: "100G"

    :return: Path to <basename>_sorted.bam next to the input, or an empty
             string if the command failed
    :rtype: string
    """
    outfile = pu.get_file_basename(bam) + "_sorted.bam"
    outpath = os.path.join(pu.get_file_directory(bam), outfile)

    #build the argument list directly; splitting a joined string would break
    #paths that contain whitespace
    cmd = ['sambamba', 'sort', '-t', str(threads), '-m', str(max_memory),
           '-o', outpath, bam]
    if not pe.execute_command(cmd, logs=True, objectid=oid):
        return ""
    return outpath
def perform_assembly(self, bam_file, out_dir=None, out_suffix="_stringtie", objectid="NA"):
    """Function to run stringtie using a bam file.

    Parameters
    ----------
    bam_file: string
        path to the bam file
    out_dir: string
        Path to the output directory. Defaults to the directory of bam_file;
        created if it is missing.
    out_suffix: string
        Suffix for the output gtf file
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.

    :return: Returns the path to output GTF file, or an empty string on failure
    :rtype: string
    """
    #create path to output file
    stem = pu.get_file_basename(bam_file)
    if not out_dir:
        out_dir = pu.get_file_directory(bam_file)
    if not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    out_gtf_file = os.path.join(out_dir, stem + out_suffix + ".gtf")

    #output file option plus the input bam as positional argument under '--'
    options = {"-o": out_gtf_file, "--": (bam_file, )}

    #call stringtie
    if not self.run(None, objectid=objectid, target=out_gtf_file, **options):
        return ""

    #the gtf must exist unless the module-level _dryrun flag is set
    if not pu.check_files_exist(out_gtf_file) and not _dryrun:
        return ""
    return out_gtf_file
def benchmark():
    """Parse the command line of the benchmark generator and run it.

    Arguments are read from sys.argv[2:]. The ENV log belonging to the log
    file is located with checkEnvLog. The output name is the -o value, or the
    basename of the log file when -o is absent, with '.' and the -e value
    appended. The temporary directory (-t, default <cwd>/tmp) is created if it
    is missing, then generateBenchmarkReport is called.
    """
    print("Generating benchmarks")
    parser = argparse.ArgumentParser(
        description='pyrpipe diagnostic utility\nGenerate benchmark report.',
        #usage previously named the 'report' subcommand by mistake
        usage='pyrpipe_diagnostic benchmark [<args>] <logfile>')
    parser.add_argument('-o', help='out file \ndefault: same as input logfile',action="store")
    parser.add_argument('-e', help='report output type: [MD,PDF,HTML] \ndefault: PDF',default='PDF',action="store")
    parser.add_argument('-v',help='verbose',action="store_true")
    parser.add_argument('-f',help='Filter by programs. Provide a comma-separated list e.g., prefetch,STAR,bowtie2 \ndefault None')
    parser.add_argument('-t',help='Temporary directory. \ndefault ./tmp',action="store")
    parser.add_argument('logfile', help='The log file generated by pyrpipe',action="store")
    args = parser.parse_args(sys.argv[2:])

    logFile=args.logfile
    envLog=checkEnvLog(logFile)
    if args.v:
        print("Generating benchmarks")

    #default output name is derived from the log file
    outFile=pu.get_file_basename(args.logfile) if args.o is None else args.o
    outFile+='.'+args.e

    #comma-separated program names; empty list means no filtering was requested
    filters=args.f.split(',') if args.f is not None else []

    #create temp dir
    tempDir=args.t if args.t is not None else os.path.join(os.getcwd(),"tmp")
    if not pu.check_paths_exist(tempDir):
        pu.mkdir(tempDir)

    generateBenchmarkReport(logFile,envLog,filters,tempDir,outFile=outFile,verbose=args.v)
def createMikadoGTFlist(self, out_file, out_dir, searchPath, searchQuery="*.gtf", strand=False):
    """Create a file to be used by mikado configure

    Parameters
    ----------
    out_file: str
        outfile name without extension; ".txt" is appended
    out_dir: str
        path to out_dir; created if it is missing
    searchPath: str
        Path where gtf/gff files will be searched (recursively)
    searchQuery: str
        Query to perform search. Default: "*.gtf"
    strand: bool
        Stranded flag, written as a string in the third column of every row.
        Default: False

    :return: Path to the written list file
    :rtype: string
    """
    files = pe.find_files(searchPath, searchQuery, recursive=True)

    #create out dir
    if not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    outFilePath = os.path.join(out_dir, out_file + ".txt")

    #one tab-separated row per file: path, basename, strand flag
    rows = []
    for gtf_path in files:
        label = pu.get_file_basename(gtf_path)
        #files for which no basename is obtained are skipped
        if label:
            rows.append("\t".join([gtf_path, label, str(strand)]))

    #context manager closes the handle even if the write raises
    with open(outFilePath, "w") as handle:
        handle.write("\n".join(rows))

    pu.print_green("Mikado list file written to:" + outFilePath)
    return outFilePath
def report():
    """Parse the command line of the report generator and write the report.

    Arguments are read from sys.argv[2:]. The ENV log belonging to the log
    file is located with checkEnvLog. The output name is the -o value, or the
    basename of the log file when -o is absent, with '.' and the -e value
    appended. For the extensions pdf, html and md an HTML report is generated
    and handed to the matching writer; any other extension prints an error.
    """
    parser = argparse.ArgumentParser(
        description='pyrpipe diagnostic utility\nGenerate analysis report.',
        usage='''pyrpipe_diagnostic report [<args>] <logfile> ''')
    parser.add_argument('-o', help='out file \ndefault: same as input logfile',action="store")
    parser.add_argument('-e', help='report output type: [md,pdf,html] \ndefault: pdf',default='pdf',action="store")
    parser.add_argument('-c',help='Report options [(f)ull,fa(i)l,(p)ass]\ndefault: f',default='f',action="store")
    parser.add_argument('-v',help='verbose',action="store_true")
    parser.add_argument('logfile', help='The log file generated by pyrpipe',action="store")
    args = parser.parse_args(sys.argv[2:])

    logFile=args.logfile
    envLog=checkEnvLog(logFile)
    if args.v:
        print("Generating report")

    #output name: explicit -o value or the log file basename, plus extension
    if args.o is None:
        outFile=pu.get_file_basename(args.logfile)
    else:
        outFile=args.o
    outFile+='.'+args.e

    extension=args.e
    if extension not in ('pdf','html','md'):
        pu.print_boldred("unknown extension:"+args.e+". Exiting")
        return

    #every supported format starts from the same HTML report
    htmlReport=generateHTMLReport('simpleDiv.html',logFile,envLog,coverage=args.c)
    if extension=='pdf':
        writeHtmlToPdf(htmlReport,outFile)
    elif extension=='html':
        writeHtml(htmlReport,outFile)
    else:
        writeHtmlToMarkdown(htmlReport,outFile)
def checkEnvLog(logFile):
    """
    Check log exist and return path to corresponding ENV log

    Parameters
    ----------
    logFile : str
        path to log file.

    Returns
    -------
    envLog : str
        Path to <basename of logFile>ENV.log in the directory of logFile.
        If logFile or the ENV log is missing, a message is printed and the
        process exits with status 1.
    """
    parent = pu.get_file_directory(logFile)
    stem = pu.get_file_basename(logFile)
    envLog = os.path.join(parent, stem + "ENV.log")

    #check all logs exist
    if pu.check_files_exist(logFile, envLog):
        return envLog

    print("Please check missing log files. Exiting.")
    sys.exit(1)
def sam_to_bam(self, sam_file, out_dir="", out_suffix="", delete_sam=False, verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Convert sam file to a bam file. Output bam file will have same name as input sam.

    Parameters
    ----------
    sam_file: str
        Path to the input sam file
    out_dir: str
        Path to output directory. Defaults to the directory of sam_file;
        created if it is missing.
    out_suffix: string
        Suffix for the output bam file
    delete_sam: bool
        delete the sam file after conversion
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.
    kwargs: dict
        Options to pass to samtools. The "--", "-o" and "-b" options set by
        this method take precedence over entries given here.

    :return: Returns the path to the bam file. Returns empty string if operation failed.
    :rtype: string
    """
    if not out_dir:
        out_dir = pu.get_file_directory(sam_file)
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)

    #output will be out_bam
    out_bam = os.path.join(out_dir, pu.get_file_basename(sam_file) + out_suffix + '.bam')

    #start from the caller's options, then force input, output and -b
    samtools_opts = dict(kwargs)
    samtools_opts.update({"--": (sam_file, ), "-o": out_bam, "-b": ""})

    ok = self.run_samtools("view", verbose=verbose, quiet=quiet, logs=logs, objectid=objectid, **samtools_opts)
    if not ok:
        print("Sam to bam failed for:" + sam_file)
        return ""

    #check if bam file exists
    if not pu.check_files_exist(out_bam):
        return ""

    #a failed delete is reported but does not fail the conversion
    if delete_sam and not pe.deleteFileFromDisk(sam_file):
        print("Error deleting sam file:" + sam_file)

    return out_bam
def perform_assembly(self, bam_file, out_dir="", out_suffix="_stringtie", overwrite=True, verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Function to run stringtie using a bam file.

    Parameters
    ----------
    bam_file: string
        path to the bam file
    out_dir: string
        Path to the output directory. Defaults to the directory of bam_file.
    out_suffix: string
        Suffix for the output gtf file
    overwrite: bool
        Overwrite if output file already exists. When False and the output
        file exists, its path is returned without running stringtie.
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.
    kwargs: dict
        Options to pass to stringtie. The "-o" and "--" options set by this
        method take precedence over entries given here.

    :return: Returns the path to output GTF file, or an empty string on failure
    :rtype: string
    """
    #create path to output file
    fname = pu.get_file_basename(bam_file)
    if not out_dir:
        out_dir = pu.get_file_directory(bam_file)
    out_gtf_file = os.path.join(out_dir, fname + out_suffix + ".gtf")

    #handle overwrite: reuse an existing output file
    if not overwrite and os.path.isfile(out_gtf_file):
        print("The file " + out_gtf_file + " already exists. Exiting..")
        return out_gtf_file

    #Add output file name and input bam
    new_opts = {"-o": out_gtf_file, "--": (bam_file, )}
    merged_opts = {**kwargs, **new_opts}

    #call stringtie
    status = self.run_stringtie(verbose=verbose, quiet=quiet, logs=logs, objectid=objectid, **merged_opts)

    #succeed only if the run succeeded and the gtf exists; every other path
    #returns "" (previously one failure path fell through and returned None)
    if status and pu.check_files_exist(out_gtf_file):
        return out_gtf_file
    return ""
def sort_bam(self, bam_file, out_dir="", out_suffix="", threads=None, delete_bam=False, verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Sorts an input bam file. Output file will end in _sorted.bam

    Parameters
    ----------
    bam_file: str
        Path to the input bam file
    out_dir: str
        Path to output directory. Defaults to the directory of bam_file;
        created if it is missing.
    out_suffix: str
        Output file suffix, placed before "_sorted.bam"
    threads: int
        Number of threads. Default: use self.threads
    delete_bam: bool
        Delete input bam_file after sorting
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.
    kwargs: dict
        Options to pass to samtools. These override the "--", "-o" and "-@"
        options set by this method.

    :return: Returns path to the sorted bam file. Returns empty string if operation failed.
    :rtype: string
    """
    if not out_dir:
        out_dir = pu.get_file_directory(bam_file)
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)

    fname = pu.get_file_basename(bam_file)
    #output will be outSortedbam_file
    outSortedbam_file = os.path.join(out_dir, fname + out_suffix + '_sorted.bam')

    #handle threads
    if not threads:
        threads = self.threads

    newOpts = {
        "--": (bam_file, ),
        "-o": outSortedbam_file,
        "-@": str(threads),
    }
    mergedOpts = {**newOpts, **kwargs}

    status = self.run_samtools("sort", verbose=verbose, quiet=quiet, logs=logs, objectid=objectid, **mergedOpts)
    if not status:
        print("Bam sort failed for:" + bam_file)
        return ""

    #check if bam file exists
    if not pu.check_files_exist(outSortedbam_file):
        return ""

    #a failed delete is reported but does not fail the sort
    if delete_bam and not pe.deleteFileFromDisk(bam_file):
        #message previously said "sam file" although a bam file is deleted here
        print("Error deleting bam file:" + bam_file)

    #return path to file
    return outSortedbam_file
def perform_qc(self,sra_object,out_dir="",out_suffix="_trimgalore",objectid="NA"):
    """Function to perform qc using trimgalore.
    The function perform_qc() is consistent for all QC classes.

    Parameters
    ----------
    sra_object: SRA
        An SRA object whose fastq files will be used
    out_dir: str
        Path to output directory. When empty, sra_object.directory is used;
        otherwise the directory is created if it is missing.
    out_suffix: string
        Suffix for the output fastq file names
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.

    :return: Returns the path of fastq files after QC. tuple has one item for
             single end files and two for paired. A one-item tuple holding an
             empty string is returned on any failure.
    :rtype: tuple
    """
    if not out_dir:
        out_dir=sra_object.directory
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)

    #get layout
    if sra_object.layout=='PAIRED':
        fq1=sra_object.fastq_path
        fq2=sra_object.fastq2_path
        internal_args=(fq1,fq2)
        internal_kwargs={"--paired":"","-o":out_dir}

        #running trim galore will create two files named <input>_val_1.fq and
        #<input>_val_2.fq; these are moved to the specified out files
        file1=os.path.join(out_dir,pu.get_file_basename(fq1)+"_val_1.fq")
        file2=os.path.join(out_dir,pu.get_file_basename(fq2)+"_val_2.fq")
        #targets
        out_file1=os.path.join(out_dir,pu.get_file_basename(fq1)+out_suffix+".fastq")
        out_file2=os.path.join(out_dir,pu.get_file_basename(fq2)+out_suffix+".fastq")

        #check if final files already exist (skipped when _force is set)
        if not _force and pu.check_files_exist(out_file1,out_file2):
            pu.print_green('Target files {}, {} already exist.'.format(out_file1,out_file2))
            return out_file1,out_file2

        #run trimgalore
        status=self.run(*internal_args,objectid=objectid,target=[file1,file2],**internal_kwargs)
        if not status:
            return ("",)
        #files are moved and checked only when _dryrun is not set
        if not _dryrun:
            pe.move_file(file1,out_file1,verbose=False)
            pe.move_file(file2,out_file2,verbose=False)
            if not pu.check_files_exist(out_file1,out_file2):
                #previously a bare "" was returned here instead of a tuple
                return ("",)
        return out_file1,out_file2

    fq=sra_object.fastq_path
    internal_args=(fq,)
    internal_kwargs={"-o":out_dir}

    #running trim galore will create one file named <input>_trimmed.fq;
    #it is moved to the specified out file
    trimmed_file=os.path.join(out_dir,pu.get_file_basename(fq)+"_trimmed.fq")
    #target
    out_file=os.path.join(out_dir,pu.get_file_basename(fq)+out_suffix+".fastq")

    #check if final file already exists (skipped when _force is set)
    if not _force and pu.check_files_exist(out_file):
        pu.print_green('Target files {} already exist.'.format(out_file))
        return (out_file,)

    #run trimgalore
    status=self.run(*internal_args,objectid=objectid,target=trimmed_file,**internal_kwargs)
    if not status:
        return ("",)
    if not _dryrun:
        pe.move_file(trimmed_file,out_file)
        if not pu.check_files_exist(out_file):
            #previously a bare "" was returned here instead of a tuple
            return ("",)
    return (out_file,)
def perform_qc(self, sra_object, out_dir="", out_suffix="_bbduk", overwrite=True, threads=None, max_memory=None, verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Run bbduk on fastq files specified by the sra_object

    Parameters
    ----------
    sra_object: SRA
        an SRA object
    out_dir: string
        Path to out dir. Default: sra_object.location. A given directory is
        created if it is missing.
    out_suffix: string
        Suffix for output file name
    overwrite: bool
        overwrite existing files (currently not used by this method)
    threads: int
        Num threads to use. Default: self.threads
    max_memory: float
        Max memory to use in GB. Default: self.max_memory
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.
    kwargs: dict
        options passed to bbduk; these override the options set by this method

    :return: Returns the path of fastq files after QC. tuple has one item for
             single end files and 2 for paired. A one-item tuple holding an
             empty string is returned on failure.
    :rtype: tuple
    """
    #make out_dir
    if not out_dir:
        out_dir = sra_object.location
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)

    if not threads:
        threads = self.threads
    if not max_memory:
        max_memory = self.max_memory
    #passed as a positional argument under "--", e.g. -Xmx8g
    memory_flag = "-Xmx" + str(max_memory) + "g"

    if sra_object.layout == 'PAIRED':
        fq1 = sra_object.localfastq1Path
        fq2 = sra_object.localfastq2Path
        out_file1Path = os.path.join(out_dir, pu.get_file_basename(fq1) + out_suffix + ".fastq")
        out_file2Path = os.path.join(out_dir, pu.get_file_basename(fq2) + out_suffix + ".fastq")
        outputs = (out_file1Path, out_file2Path)
        newOpts = {
            "in": fq1,
            "in2": fq2,
            "out": out_file1Path,
            "out2": out_file2Path,
            "--": (memory_flag, ),
            "threads": str(threads),
        }
    else:
        fq = sra_object.localfastqPath
        out_filePath = os.path.join(out_dir, pu.get_file_basename(fq) + out_suffix + ".fastq")
        outputs = (out_filePath, )
        newOpts = {
            "in": fq,
            "out": out_filePath,
            "--": (memory_flag, ),
            #the single-end branch previously dropped the threads option
            "threads": str(threads),
        }

    mergedOpts = {**newOpts, **kwargs}

    #run bbduk
    if self.run_bbduk(verbose=verbose, quiet=quiet, logs=logs, objectid=objectid, **mergedOpts):
        if pu.check_files_exist(*outputs):
            return outputs
    return ("", )
def perform_assembly(self,bam_file,out_dir="",out_suffix="_cufflinks",reference_gtf=None,threads=None,overwrite=True,verbose=False,quiet=False,logs=True,objectid="NA",**kwargs):
    """Function to run cufflinks with BAM file as input.

    Parameters
    ----------
    bam_file: string
        path to bam file
    out_dir: string
        output directory. Defaults to the directory of bam_file; a given
        directory is created if it is missing.
    out_suffix: string
        Suffix for the output gtf file
    reference_gtf: str
        Path to reference gtf, passed to cufflinks with -g
    threads: int
        Number of threads to use. Default: self.threads
    overwrite: bool
        Overwrite if output file already exists. When False and the output
        file exists, its path is returned without running cufflinks.
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
    kwargs: dict
        Options to pass to cufflinks; these override the options set by this method.

    :return: Returns the path to output GTF file, or an empty string on failure
    :rtype: string
    """
    #create path to output file
    fname=pu.get_file_basename(bam_file)
    if not out_dir:
        out_dir=pu.get_file_directory(bam_file)
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    out_gtf_file=os.path.join(out_dir,fname+out_suffix+".gtf")

    #handle overwrite: reuse an existing output file
    if not overwrite and os.path.isfile(out_gtf_file):
        print("The file "+out_gtf_file+" already exists. Exiting..")
        return out_gtf_file

    if not threads:
        threads=self.threads

    #Add output directory, input bam and threads
    new_opts={"-o":out_dir,"--":(bam_file,),"-p":str(threads)}

    #add ref gtf
    if reference_gtf:
        if not pu.check_files_exist(reference_gtf):
            pu.print_boldred("Error: Provided reference GTF {} doesn't exist. Exiting...".format(reference_gtf))
            return ""
        new_opts["-g"]=reference_gtf

    merged_opts={**new_opts,**kwargs}

    #call cufflinks
    status=self.run_cufflinks(verbose=verbose,quiet=quiet,logs=logs,objectid=objectid,**merged_opts)
    if not status:
        return ""

    #move out_dir/transcripts.gtf to outfile
    pe.move_file(os.path.join(out_dir,"transcripts.gtf"),out_gtf_file)

    #a missing gtf now yields "" (previously this path returned None)
    if pu.check_files_exist(out_gtf_file):
        return out_gtf_file
    return ""
def perform_qc(self, sra_object, out_dir="", out_suffix="_trimgalore", verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Function to perform qc using trimgalore.
    The function perform_qc() is consistent for all QC classes.

    Parameters
    ----------
    sra_object: SRA
        An SRA object whose fastq files will be used
    out_dir: str
        Path to output directory. When empty, sra_object.location is used;
        otherwise the directory is created if it is missing.
    out_suffix: string
        Suffix for the output fastq file names
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.
    kwargs: dict
        Options to pass to trimgalore. The options set by this method take
        precedence over entries given here.

    :return: Returns the path of fastq files after QC. tuple has one item for
             single end files and two for paired. A one-item tuple holding an
             empty string is returned when the output files are missing.
    :rtype: tuple
    """
    if not out_dir:
        out_dir = sra_object.location
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)

    #get layout; trim galore names its outputs <input>_val_1.fq/<input>_val_2.fq
    #for paired input and <input>_trimmed.fq for single input
    if sra_object.layout == 'PAIRED':
        reads = (sra_object.localfastq1Path, sra_object.localfastq2Path)
        produced_tags = ("_val_1.fq", "_val_2.fq")
        internal_opts = {"--paired": "", "--": reads, "-o": out_dir}
    else:
        reads = (sra_object.localfastqPath, )
        produced_tags = ("_trimmed.fq", )
        #giving input arguments as a tuple under "--"
        internal_opts = {"--": reads, "-o": out_dir}

    final_paths = tuple(
        os.path.join(out_dir, pu.get_file_basename(read) + out_suffix + ".fastq")
        for read in reads)

    #run trimgalore; its return value is not inspected, success is judged by
    #the presence of the renamed files below
    self.run_trimgalore(verbose=verbose, quiet=quiet, logs=logs, objectid=objectid, **{**kwargs, **internal_opts})

    #move the files written by trim galore to the specified out files
    for read, tag, final_path in zip(reads, produced_tags, final_paths):
        produced = os.path.join(out_dir, pu.get_file_basename(read) + tag)
        pe.move_file(produced, final_path)

    if pu.check_files_exist(*final_paths):
        return final_paths
    print("Trimgalore failed")
    return ("", )
def perform_qc(self, sra_object, out_dir="", out_suffix="_bbduk", overwrite=True, verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Run bbduk on fastq files specified by the sra_object

    Parameters
    ----------
    sra_object: SRA
        an SRA object
    out_dir: string
        Path to out dir. When empty, sra_object.location is used; otherwise
        the directory is created if it is missing.
    out_suffix: string
        Suffix for output file name
    overwrite: bool
        overwrite existing files (currently not used by this method)
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.
    kwargs: dict
        options passed to bbduk; the input/output options set by this method
        take precedence over entries given here

    :return: Returns the path of fastq files after QC. tuple has one item for
             single end files and 2 for paired. A one-item tuple holding an
             empty string is returned on failure.
    :rtype: tuple
    """
    #make out_dir
    if not out_dir:
        out_dir = sra_object.location
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)

    if sra_object.layout == 'PAIRED':
        read1 = sra_object.localfastq1Path
        read2 = sra_object.localfastq2Path
        cleaned1 = os.path.join(out_dir, pu.get_file_basename(read1) + out_suffix + ".fastq")
        cleaned2 = os.path.join(out_dir, pu.get_file_basename(read2) + out_suffix + ".fastq")
        results = (cleaned1, cleaned2)
        io_opts = {"in": read1, "in2": read2, "out": cleaned1, "out2": cleaned2}
    else:
        read = sra_object.localfastqPath
        cleaned = os.path.join(out_dir, pu.get_file_basename(read) + out_suffix + ".fastq")
        results = (cleaned, )
        io_opts = {"in": read, "out": cleaned}

    #caller options first, then the input/output options on top
    bbduk_opts = dict(kwargs)
    bbduk_opts.update(io_opts)

    #run bbduk; the outputs are only checked when the run reports success
    succeeded = self.run_bbduk(verbose=verbose, quiet=quiet, logs=logs, objectid=objectid, **bbduk_opts)
    if succeeded and pu.check_files_exist(*results):
        return results
    return ("", )
def stringtie_merge(self,*args,out_dir=None,out_suffix="_stringtieMerge",threads=None,overwrite=False,verbose=False,quiet=False,logs=True,objectid="NA",**kwargs):
    """Function to run stringtie merge.

    Parameters
    ----------
    args: tuple
        path to gtf files to merge
    out_dir: string
        Path to the output directory. Defaults to the directory of the first
        input gtf.
    out_suffix: string
        Suffix for output gtf file name; the name is built from the basename
        of the first input gtf
    threads: int
        Number of threads to use. Default: self.threads
    overwrite: bool
        Overwrite if output file already exists. When False and the output
        file exists, its path is returned without running stringtie.
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.
    kwargs: dict
        Options to pass to stringtie; these override the options set by this method.

    :return: Returns the path to the merged GTF file, or an empty string on failure
    :rtype: string
    """
    if len(args) < 1:
        print("ERROR: No input gtf for stringtie merge.")
        return ""

    #create path to output gtf file
    fname=pu.get_file_basename(args[0])
    if not out_dir:
        out_dir=pu.get_file_directory(args[0])
    out_gtf_file=os.path.join(out_dir,fname+out_suffix+".gtf")

    #handle overwrite: reuse an existing output file
    if not overwrite and os.path.isfile(out_gtf_file):
        print("The file "+out_gtf_file+" already exists. Exiting..")
        return out_gtf_file

    if not threads:
        threads=self.threads

    #Add merge flag, output file name, input gtfs and threads
    new_opts={"--merge":"","-o":out_gtf_file,"--":args,"-p":str(threads)}
    merged_opts={**new_opts,**kwargs}

    #call stringtie
    status=self.run_stringtie(verbose=verbose,quiet=quiet,logs=logs,objectid=objectid,**merged_opts)

    #succeed only if the run succeeded and the gtf exists; every other path
    #returns "" (previously one failure path fell through and returned None)
    if status and pu.check_files_exist(out_gtf_file):
        return out_gtf_file
    return ""
def perform_cleaning(self,sra_object,bbsplit_index,out_dir="",out_suffix="_bbsplit",objectid="NA",**kwargs):
    """
    Remove contaminated reads mapping to given reference using bbsplit

    Parameters
    ----------
    sra_object: SRA
        an SRA object
    bbsplit_index: string
        Path to bbsplit index or fasta file which will generate index
    out_dir: string
        Path to output dir. Default: sra_object.directory
    out_suffix: string
        Suffix for output file name
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
    kwargs: dict
        options passed to bbsplit; the options set by this method take
        precedence over entries given here

    :return: Returns the path of fastq files after QC. tuple has one item for single end files and 2 for paired.
             A one-item tuple holding an empty string is returned on failure.
    :rtype: tuple
    """
    #check index
    indexPath=""
    if not pu.check_paths_exist(bbsplit_index):
        #index folder doesn't exist
        #check if input is path to fasta
        if not pu.check_files_exist(bbsplit_index):
            print("Error: Please check bbsplit index")
            return ("",)
        #check if index folder "ref" exists in this directory
        indexPath=os.path.join(pu.get_file_directory(bbsplit_index),"ref")
        if pu.check_paths_exist(indexPath):
            print("Using bbsplit index: "+indexPath)
        else:
            #create new index
            print("Creating new index"+indexPath)
            newOpts={"ref_x":bbsplit_index,"path": pu.get_file_directory(bbsplit_index)}
            mergedOpts={**kwargs,**newOpts}
            #run bbsplit with the fasta as reference; the "ref" folder is expected to appear afterwards
            if not self.run_bbsplit(objectid=objectid,**mergedOpts):
                print("Error creating bbsplit index.")
                return ("",)
            if not pu.check_paths_exist(indexPath):
                print("Error creating bbsplit index.")
                return ("",)
    else:
        indexPath=bbsplit_index
    #indexPath point to the ref directory, go one directory higher
    #NOTE(review): os.path.dirname returns the path itself minus a trailing slash
    #when bbsplit_index ends with "/" - confirm callers pass it without one
    indexPath=os.path.dirname(indexPath)
    #make out_dir
    if not out_dir:
        out_dir=sra_object.directory
    else:
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
    if sra_object.layout=='PAIRED':
        fq1=sra_object.fastq_path
        fq2=sra_object.fastq2_path
        #append input and output options
        out_fileName1=pu.get_file_basename(fq1)+out_suffix+".fastq"
        out_fileName2=pu.get_file_basename(fq2)+out_suffix+".fastq"
        out_file1Path=os.path.join(out_dir,out_fileName1)
        out_file2Path=os.path.join(out_dir,out_fileName2)
        newOpts={"in1":fq1,"in2":fq2,"outu1":out_file1Path,"outu2":out_file2Path,"path":indexPath}
        mergedOpts={**kwargs,**newOpts}
        #run bbsplit; outputs are only checked when the run reports success
        if self.run_bbsplit(objectid=objectid,target=[out_file1Path,out_file2Path],**mergedOpts):
            if pu.check_files_exist(out_file1Path,out_file2Path):
                return(out_file1Path,out_file2Path)
        return("",)
    else:
        fq=sra_object.fastq_path
        #append input and output options
        out_fileName=pu.get_file_basename(fq)+out_suffix+".fastq"
        out_filePath=os.path.join(out_dir,out_fileName)
        newOpts={"in":fq,"outu":out_filePath,"path":indexPath}
        mergedOpts={**kwargs,**newOpts}
        #run bbsplit; output is only checked when the run reports success
        if self.run_bbsplit(objectid=objectid,target=out_filePath,**mergedOpts):
            if pu.check_files_exist(out_filePath):
                return(out_filePath,)
        return("",)
def perform_assembly(self,bam_file,out_dir=None,out_suffix="_stringtie",reference_gtf=None,threads=None,overwrite=False,verbose=False,quiet=False,logs=True,objectid="NA",**kwargs):
    """Function to run stringtie using a bam file.

    Parameters
    ----------
    bam_file: string
        path to the bam file
    out_dir: string
        Path to the output directory. Defaults to the directory of bam_file.
    out_suffix: string
        Suffix for the output gtf file
    reference_gtf: str
        Path to the reference gtf used as guide, passed to stringtie with -G
    threads: int
        Number of threads to use. Default: self.threads
    overwrite: bool
        Overwrite if output file already exists. When False and the output
        file exists, its path is returned without running stringtie.
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.
    kwargs: dict
        Options to pass to stringtie; these override the options set by this method.

    :return: Returns the path to output GTF file, or an empty string on failure
    :rtype: string
    """
    #create path to output file
    fname=pu.get_file_basename(bam_file)
    if not out_dir:
        out_dir=pu.get_file_directory(bam_file)
    out_gtf_file=os.path.join(out_dir,fname+out_suffix+".gtf")

    #handle overwrite: reuse an existing output file
    if not overwrite and os.path.isfile(out_gtf_file):
        print("The file "+out_gtf_file+" already exists. Exiting..")
        return out_gtf_file

    if not threads:
        threads=self.threads

    #Add output file name, input bam and threads
    new_opts={"-o":out_gtf_file,"--":(bam_file,),"-p":str(threads)}

    if reference_gtf:
        if not pu.check_files_exist(reference_gtf):
            pu.print_boldred("Error: Provided reference GTF {} doesn't exist. Exiting...".format(reference_gtf))
            return ""
        new_opts["-G"]=reference_gtf

    merged_opts={**new_opts,**kwargs}

    #call stringtie
    status=self.run_stringtie(verbose=verbose,quiet=quiet,logs=logs,objectid=objectid,**merged_opts)

    #succeed only if the run succeeded and the gtf exists; every other path
    #returns "" (previously one failure path fell through and returned None)
    if status and pu.check_files_exist(out_gtf_file):
        return out_gtf_file
    return ""
def perform_assembly(self, bam_file, out_dir=None, out_suffix="_cufflinks", objectid="NA"):
    """Function to run cufflinks with BAM file as input.

    Parameters
    ----------
    bam_file: string
        path to bam file
    out_dir: string
        output directory. Defaults to the directory of bam_file; a given
        directory is created if it is missing.
    out_suffix: string
        Suffix for the output gtf file
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.

    :return: Returns the path to output GTF file, or an empty string on failure
    :rtype: string
    """
    stem = pu.get_file_basename(bam_file)
    if not out_dir:
        out_dir = pu.get_file_directory(bam_file)
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)

    #output directory option plus the input bam as positional argument under '--'
    options = {"-o": out_dir, "--": (bam_file, )}

    #the run targets out_dir/transcripts.gtf, which is then renamed to the final gtf
    produced_gtf = os.path.join(out_dir, "transcripts.gtf")
    out_gtf_file = os.path.join(out_dir, stem + out_suffix + ".gtf")

    #if final file already exists (skipped when _force is set)
    if not _force and pu.check_files_exist(out_gtf_file):
        pu.print_green(
            'Target files {} already exist.'.format(out_gtf_file))
        return out_gtf_file

    #call cufflinks
    if not self.run(None, objectid=objectid, target=produced_gtf, **options):
        return ""

    #when _dryrun is set nothing is moved or checked
    if _dryrun:
        return out_gtf_file

    pe.move_file(produced_gtf, out_gtf_file)
    return out_gtf_file if pu.check_files_exist(out_gtf_file) else ""