def runtest():
    """Smoke-test SRA fastq retrieval for one paired-end and one single-end accession.

    Each accession is downloaded under ./pyrpipe_sratest, checked for fastq
    files, then cleaned up. Prints a pass banner only if both tests passed.
    """
    any_failed = False

    # paired end test
    failed = False
    sraob = sra.SRA('ERR726985', directory='./pyrpipe_sratest')
    if not sraob.fastq_exists():
        pu.print_boldred('Test failed')
        failed = True
    pu.print_notification('Cleaning up...')
    sraob.delete_fastq()
    os.rmdir(sraob.directory)
    if failed:
        pu.print_boldred('Paired end test failed')
    any_failed = any_failed or failed

    # single end test
    failed = False
    sraob = sra.SRA('SRR2134545', directory='./pyrpipe_sratest')
    if not sraob.fastq_exists():
        pu.print_boldred('Test failed')
        failed = True
    pu.print_notification('Cleaning up...')
    sraob.delete_fastq()
    os.rmdir(sraob.directory)
    if failed:
        pu.print_boldred('Single end test failed')
    any_failed = any_failed or failed

    # BUGFIX: previously `failed` was reset to False after each test, so the
    # "All Tests Passed" banner (and the final rmdir) ran unconditionally,
    # even when one or both tests had failed.
    if not any_failed:
        pu.print_green(
            '\n#####################All Tests Passed#####################\n')
        os.rmdir('./pyrpipe_sratest')
def search_sra(self, path):
    """Look for a .sra file under *path* and populate this object from it.

    Return True if a .sra file was found, otherwise False.
    """
    matches = pe.find_files(path, "*.sra")
    # nothing under this directory
    if not matches:
        return False
    # ambiguous case: warn and take the first match
    if len(matches) > 1:
        pu.print_boldred(
            "Found multiple .sra files. Using the first entry...")
    found = matches[0]
    #self.location=path
    # record accession, local path and size of the located file
    self.srr_accession = pu.get_file_basename(found)
    self.localSRAFilePath = found
    self.sraFileSize = pu.get_file_size(self.localSRAFilePath)
    # determine library layout by inspecting the sra file
    self.layout = "PAIRED" if pe.is_paired(self.localSRAFilePath) else "SINGLE"
    pu.print_green("Found .sra " + self.localSRAFilePath)
    return True
def search_fastq(self, path):
    """Look for .fastq files under *path* and populate this object from them.

    Return True if one (single end) or two (paired end) fastq files were
    found, otherwise False.
    """
    matches = pe.find_files(path, "*.fastq")
    if not matches:
        return False
    # more than two fastq files cannot be resolved to a layout
    if len(matches) > 2:
        pu.print_boldred("Can not determine .fastq. Exiting...")
        return False
    matches.sort()
    if len(matches) == 1:
        # single end layout
        self.localfastqPath = matches[0]
        pu.print_green("Found .fastq " + self.localfastqPath)
        self.layout = "SINGLE"
    if len(matches) == 2:
        # paired end layout; sorting puts _1 before _2
        self.localfastq1Path = matches[0]
        self.localfastq2Path = matches[1]
        pu.print_green("Found .fastq " + self.localfastq1Path + " " +
                       self.localfastq2Path)
        self.layout = "PAIRED"
    return True
def createMikadoGTFlist(self,
                        out_file,
                        out_dir,
                        searchPath,
                        searchQuery="*.gtf",
                        strand=False):
    """Create a list file to be used by mikado configure.

    out_file: str
        output file name (".txt" is appended)
    out_dir: str
        directory where the list file is written; created if missing
    searchPath: str
        path searched for gtf/gff files
    searchQuery: str
        glob used for the search. Default: "*.gtf"
    strand: bool
        stranded flag written for every entry. Default: False

    :return: path to the written list file
    :rtype: string
    """
    files = pe.find_files(searchPath, searchQuery)
    args = files
    #create out dir
    if not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    outFilePath = os.path.join(out_dir, out_file + ".txt")
    # one tab-separated line per gtf: <path> <basename> <strand flag>
    gtfs = []
    for l in args:
        thisName = pu.get_file_basename(l)
        if thisName:
            gtfs.append("\t".join([l, thisName, str(strand)]))
    # BUGFIX: use a context manager so the handle is closed even if the
    # write raises (previously open/write/close with no try/finally)
    with open(outFilePath, "w") as f:
        f.write("\n".join(gtfs))
    pu.print_green("Mikado list file written to:" + outFilePath)
    return outFilePath
def build_index(self,
                in_fasta,
                dbname,
                out_dir=None,
                threads=None,
                verbose=False,
                quiet=False,
                logs=True,
                objectid="NA",
                **kwargs):
    """Build a diamond index and store its path in self.

    Returns True on success (or when a usable index already exists),
    False otherwise.
    """
    # the input fasta is mandatory
    if not pu.check_files_exist(in_fasta):
        pu.print_boldred(
            "Input fasta: {} not found...\n diamond makedb failed".format(
                in_fasta))
        return False
    # default output location is the current working directory
    if not out_dir:
        out_dir = os.getcwd()
    if not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    index_path = os.path.join(out_dir, dbname)
    # set self.index first; check_index() presumably reads it — reuse if present
    self.index = index_path
    if self.check_index():
        pu.print_green("Diamond index: {} exists, using it...".format(
            self.index))
        self.index = index_path
        return True
    if not threads:
        threads = self.threads
    # makedb options; user-supplied kwargs take precedence
    opts = {
        "--in": in_fasta,
        "-d": index_path,
        "--threads": str(threads)
    }
    opts.update(kwargs)
    #call run_diamond
    ok = self.run_diamond("makedb",
                          verbose=verbose,
                          quiet=quiet,
                          logs=logs,
                          objectid=objectid,
                          **opts)
    if ok:
        self.index = index_path
        return True
    return False
def generateBenchmarkReport(logFile,
                            envLog,
                            filterList,
                            tempDir,
                            outFile="",
                            verbose=False):
    """Generate benchmark plots from pyrpipe command and environment logs.

    Ignores failed commands with exitcode != 0.
    NOTE(review): filterList, outFile and verbose are accepted for interface
    compatibility but are not used in this function body.
    """
    benchmark = bm.Benchmark(logFile, envLog, out_dir=tempDir)
    #generate per-object and per-program timing plots
    benchmark.plot_time_perobject()
    benchmark.plot_time_perprogram()
    pu.print_green("Benchmark report saved to:" + tempDir +
                   "/benchmark_reports")
def perform_quant(self,sra_object,out_suffix="",out_dir="",objectid="NA"):
    """Run kallisto quant.

    sra_object: SRA
        SRA object containing paths to fastq files
    out_suffix: str
        suffix for output file
    out_dir: str
        path to output directory
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.

    :return: Path to the renamed kallisto abundance file, or "" on failure
    :rtype: string
    """
    #default out_dir lives next to the sra data
    if not out_dir:
        out_dir=os.path.join(sra_object.directory,"kallisto_out")
    else:
        #create out_dir if not exists
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
    #select fastq inputs according to library layout
    if sra_object.layout == 'PAIRED':
        args=(sra_object.fastq_path,sra_object.fastq2_path)
        internal_kwargs={"-o":out_dir,"-i":self.index}
    else:
        #single end reads need kallisto's --single flag
        args=(sra_object.fastq_path,)
        internal_kwargs={"-o":out_dir,"--single":"","-i":self.index}
    #targets: kallisto writes abundance.tsv; renamed with out_suffix afterwards
    outfile=os.path.join(out_dir,"abundance.tsv")
    newfile=os.path.join(out_dir,"abundance"+out_suffix+".tsv")
    #check if final files already exist (_force is a module-level flag)
    if not _force and pu.check_files_exist(newfile):
        pu.print_green('Target files {} already exist.'.format(newfile))
        return newfile
    #call kallisto; target=outfile is what run() verifies after the command
    status=self.run(*args,subcommand='quant',objectid=sra_object.srr_accession,target=outfile,**internal_kwargs)
    if status:
        #rename the output file and return its path (skipped on dry run)
        if not _dryrun:
            pe.move_file(outfile,newfile)
            if not pu.check_files_exist(newfile):
                return ""
        return newfile
    return ""
def build_index(self,index_path,index_name,fasta,verbose=False,quiet=False,logs=True,objectid="NA",**kwargs):
    """Build a salmon index and store the path to the index in self.

    index_path: str
        path to the output directory
    index_name: str
        index name
    fasta: str
        path to the transcript fasta used to build the index
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.
    kwargs: dict
        Options to pass to salmon. This will override the existing options

    :return: status of salmon index
    :rtype: bool
    """
    #check input
    if not pu.check_files_exist(fasta):
        pu.print_boldred("{} does not exist. Exiting".format(fasta))
        return False
    #create out dir
    if not pu.check_paths_exist(index_path):
        if not pu.mkdir(index_path):
            # BUGFIX: message previously said "hisat2 index" (copy-paste
            # from the hisat2 module); this is the salmon indexer.
            print("ERROR in building salmon index. Failed to create index directory.")
            return False
    indexOut=os.path.join(index_path,index_name)
    newOpts={"-t":fasta,"-i":indexOut}
    #internal options override user kwargs
    mergedOpts={**kwargs,**newOpts}
    #call salmon
    status=self.run_salmon("index",verbose=verbose,quiet=quiet,logs=logs,objectid=objectid,**mergedOpts)
    if status:
        #check if the index directory was created
        #if check_files_exist(os.path.join(indexOut,"versionInfo.json")): #not sure if this is reliable
        if pu.check_paths_exist(indexOut):
            self.salmon_index=indexOut
            self.passedArgumentDict['-i']=self.salmon_index
            pu.print_green("salmon index is:"+self.salmon_index)
            return True
    pu.print_boldred("Failed to create salmon index")
    return False
def build_index(self,index_path,index_name,fasta,verbose=False,quiet=False,logs=True,objectid="NA",**kwargs): """Function to build kallisto index index_path: str path to the output directory index_name: str index name verbose: bool Print stdout and std error quiet: bool Print nothing logs: bool Log this command to pyrpipe logs objectid: str Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports. kwargs: dict Options to pass to kallisto. This will override the existing options in self.passed_args_dict (only replace existing arguments and not replace all the arguments). :return: Status of kallisto index :rtype: bool """ #check input if not pu.check_files_exist(fasta): pu.print_boldred("{} does not exist. Exiting".format(fasta)) return False #create out dir if not pu.check_paths_exist(index_path): if not pu.mkdir(index_path): print("ERROR in building kallisto index. Failed to create index directory.") return False indexOut=os.path.join(index_path,index_name) newOpts={"--":(fasta,),"-i":indexOut} mergedOpts={**kwargs,**newOpts} #call salmon status=self.run_kallisto("index",verbose=verbose,quiet=quiet,logs=logs,objectid=objectid,**mergedOpts) if status: #check if sam file is present in the location directory of sra_object if pu.check_files_exist(indexOut): self.kallisto_index=indexOut self.passedArgumentDict['-i']=self.kallisto_index pu.print_green("kallisto_index is:"+self.kallisto_index) return True else: pu.print_boldred("Failed to create kallisto index") return False
def perform_quant(self,sra_object,out_suffix="",out_dir="",objectid="NA"):
    """Run salmon quant.

    sra_object: SRA
        An SRA object with valid fastq files
    out_suffix: str
        suffix string for out file
    out_dir: str
        path to outdir
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.

    :return: Path to the renamed salmon quant file, or "" on failure
    :rtype: string
    """
    #default out_dir lives next to the sra data
    if not out_dir:
        out_dir=os.path.join(sra_object.directory,"salmon_out")
    else:
        #create out_dir if not exists
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
    #select fastq inputs according to library layout
    if sra_object.layout == 'PAIRED':
        internal_kwargs={"-o":out_dir,"-l":"A","-1":sra_object.fastq_path,"-2":sra_object.fastq2_path,"-i":self.index}
    else:
        internal_kwargs={"-o":out_dir,"-l":"A","-r":sra_object.fastq_path,"-i":self.index}
    #targets: salmon writes quant.sf; renamed with out_suffix afterwards
    outfile=os.path.join(out_dir,"quant.sf")
    newfile=os.path.join(out_dir,"quant"+out_suffix+".sf")
    #check if final files already exist (_force is a module-level flag)
    if not _force and pu.check_files_exist(newfile):
        pu.print_green('Target files {} already exist.'.format(newfile))
        return newfile
    # BUGFIX: target must be the file salmon actually writes (quant.sf);
    # it is renamed to newfile only after a successful run. Passing
    # target=newfile made run()'s post-command target verification fail.
    # This now matches the kallisto perform_quant implementation.
    status=self.run(None,subcommand='quant',objectid=sra_object.srr_accession,target=outfile,**internal_kwargs)
    if status:
        #rename the output file and return its path (skipped on dry run)
        if not _dryrun:
            pe.move_file(outfile,newfile)
            if not pu.check_files_exist(newfile):
                return ""
        return newfile
    return ""
def createMikadoGTFlist(self,
                        out_file,
                        out_dir,
                        searchPath,
                        searchQuery="*.gtf",
                        strand=False):
    """Create a file to be used by mikado configure.

    out_file: str
        outfile name (".txt" is appended)
    out_dir: str
        path to out_dir; created if missing
    searchPath: str
        Path where gtf/gff files will be searched (recursively)
    searchQuery: str
        Query to perform search. Default: "*.gtf"
    strand: bool
        Stranded flag: Default false

    :return: path to the written list file
    :rtype: string
    """
    files = pe.find_files(searchPath, searchQuery, recursive=True)
    args = files
    #create out dir
    if not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    outFilePath = os.path.join(out_dir, out_file + ".txt")
    # one tab-separated line per gtf: <path> <basename> <strand flag>
    gtfs = []
    for l in args:
        thisName = pu.get_file_basename(l)
        if thisName:
            gtfs.append("\t".join([l, thisName, str(strand)]))
    # BUGFIX: use a context manager so the handle is closed even if the
    # write raises (previously open/write/close with no try/finally)
    with open(outFilePath, "w") as f:
        f.write("\n".join(gtfs))
    pu.print_green("Mikado list file written to:" + outFilePath)
    return outFilePath
def search_fastq(self, path):
    """Search fastq files under *path* and record them on this object.

    Return True if usable fastq files were found, otherwise False.
    """
    #prefer conventionally named mates: <SRR>_1.fastq and <SRR>_2.fastq
    mate1 = os.path.join(path, self.srr_accession + '_1.fastq')
    mate2 = os.path.join(path, self.srr_accession + '_2.fastq')
    if pu.check_files_exist(mate1, mate2):
        self.fastq_path = mate1
        self.fastq2_path = mate2
        pu.print_green("Found .fastq " + self.fastq_path + " " +
                       self.fastq2_path)
        self.layout = "PAIRED"
        return True
    #then a single end file named <SRR>.fastq
    single = os.path.join(path, self.srr_accession + '.fastq')
    if pu.check_files_exist(single):
        self.fastq_path = single
        pu.print_green("Found .fastq " + self.fastq_path)
        self.layout = "SINGLE"
        return True
    #otherwise scan the directory for any fastq files
    candidates = pu.find_files(path, ".fastq$")
    # zero matches or more than two matches cannot be resolved
    if len(candidates) < 1 or len(candidates) > 2:
        return False
    candidates.sort()
    if len(candidates) == 1:
        # single end layout
        self.fastq_path = candidates[0]
        pu.print_green("Found .fastq " + self.fastq_path)
        self.layout = "SINGLE"
    else:
        # paired end layout; sorting puts mate 1 before mate 2
        self.fastq_path = candidates[0]
        self.fastq2_path = candidates[1]
        pu.print_green("Found .fastq " + self.fastq_path + " " +
                       self.fastq2_path)
        self.layout = "PAIRED"
    return True
def init_from_accession(self, srr_accession, location):
    """Create SRA object using the provided srr accession and a location
    to save the data.

    srr_accession: str
        a valid SRA run accession (e.g. SRR/ERR id); must not be None
    location: str
        parent directory for the data; defaults to the current working
        directory when None. The actual data dir is <location>/<accession>.

    Raises Exception when required sra-toolkit programs are missing or
    when no accession is given.
    """
    # both prefetch and fasterq-dump must be on PATH
    self.dep_list = ['prefetch', "fasterq-dump"]
    if not pe.check_dependencies(self.dep_list):
        raise Exception("ERROR: Please install missing programs.")
    if srr_accession is None:
        raise Exception("Please provide a valid accession")
    if location is None:
        location = os.getcwd()
    #pu.print_info("Creating SRA: "+srr_accession)
    self.srr_accession = srr_accession
    #create a dir named <srr_accession> and use as location
    self.location = os.path.join(location, self.srr_accession)
    #search for existing files in location
    #self.search_fastq(self.location)
    #scan path for sra
    #self.search_sra(self.location)
    #check SRA file; if already downloaded, record its path, size and layout
    if pu.check_files_exist(
            os.path.join(self.location, self.srr_accession + ".sra")):
        pu.print_green(self.srr_accession + ".sra exists.")
        self.localSRAFilePath = os.path.join(self.location,
                                             self.srr_accession + ".sra")
        self.sraFileSize = pu.get_file_size(self.localSRAFilePath)
        #test if file is paired or single end
        if pe.is_paired(self.localSRAFilePath):
            self.layout = "PAIRED"
        else:
            self.layout = "SINGLE"
    #check fastq file (sets fastq paths/layout when fastq already present)
    self.search_fastq(self.location)
def build_index(self, index_path, genome, objectid="NA"):
    """Build a bowtie2 index with given parameters and saves the new index
    to self.index.

    Parameters
    ----------
    index_path: string
        Path where the index will be created
    genome: string
        Path to the reference genome
    objectid : string
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.

    :return: Returns the status of bowtie2-build
    :rtype: bool
    """
    #check input references; reuse an existing index unless _force is set
    if not _force:
        if pu.check_bowtie2index(index_path):
            pu.print_green(
                "bowtie index {} already exists.".format(index_path))
            self.index = index_path
            return True
    #check input files
    if not (pu.check_files_exist(genome)):
        pu.print_boldred(
            "Please provide a valid input fasta file to build bowtie2 index"
        )
        # NOTE(review): message mentions "star build index" — looks like a
        # copy-paste from the STAR module; also the return below is
        # unreachable after the raise.
        raise ValueError("Please check input to star build index")
        return False
    # options accepted by bowtie2-build; used to filter kwargs
    bowtie2_build_args = [
        '-f', '-c', '--large-index', '--debug', '--sanitized', '--verbose',
        '-a', '--noauto', '-p', '--packed', '--bmax', '--bmaxdivn', '--dcv',
        '--nodc', '-r', '--noref', '-3', '--justref', '-o', '--offrate',
        '-t', '--ftabchars', '--threads', '--seed', '-q', '--quiet'
    ]
    #create the out dir
    indexdir = pu.get_file_directory(index_path)
    if not pu.check_paths_exist(indexdir):
        if not pu.mkdir(indexdir):
            raise OSError(
                "Error creating bowtie2 index. Failed to create index directory."
            )
            # NOTE(review): unreachable after the raise above
            return False
    # positional args: reference fasta and index basename
    args = (genome, index_path)
    internal_kwargs = {"--threads": self._threads}
    #read build parameters from the params dir, if a yaml file is present;
    #explicit internal_kwargs win over yaml values
    yamlfile = os.path.join(_params_dir, 'bowtie2_index.yaml')
    if pu.check_files_exist(yamlfile):
        yaml_params = pl.YAML_loader(yamlfile)
        yaml_kwargs = yaml_params.get_kwargs()
        internal_kwargs = {**yaml_kwargs, **internal_kwargs}
    #add positional args
    internal_kwargs['--'] = args
    bowtie2Build_Cmd = ['bowtie2-build']
    #add options
    bowtie2Build_Cmd.extend(
        pu.parse_unix_args(bowtie2_build_args, internal_kwargs))
    #start execution
    status = pe.execute_command(bowtie2Build_Cmd, objectid=objectid)
    if not status:
        pu.print_boldred("bowtie2-build failed")
        return False
    if status:
        if pu.check_bowtie2index(index_path) and not _dryrun:
            #update object's index
            self.index = index_path
            if self.check_index():
                return True
            else:
                raise OSError("Error building bowtie2 index")
    # reached on dry run or when the index check above was skipped
    return True
def run_fasterqdump(self, delete_sra=False, verbose=False, quiet=False, logs=True, **kwargs):
    """Execute fasterq-dump to convert .sra file to fastq files.
    The fastq files will be stored in the same directory as the sra file.
    All fastq files should be consistently named using the extension .fastq

    Parameters
    ----------
    delete_sra: bool
        delete sra file after completion
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    kwargs: dict
        A dict containing fasterq-dump arguments

    :return: Return status of the fasterq-dump command. True if successful
        download and False if failed.
    :rtype: bool

    Examples
    --------
    >>> object.run_fasterqdump()
    True
    """
    #check if fastq files exists already
    if self.fastqFilesExistsLocally():
        pu.print_green("Fastq files exist already")
        return True
    #first check is sra exists
    if not self.sraFileExistsLocally():
        pu.print_boldred(
            "Error executing fasterq-dump: .sra file not found. Please run download_sra()."
        )
        return False
    #else directly run fasterq-dump on accession ?
    # options accepted by fasterq-dump; used to filter kwargs
    fasterqdumpArgsList = [
        '-f', '-t', '-s', '-N', '-X', '-a', '-p', '-c', '-o', '-O', '-h',
        '-V', '-L', '-v', '-q', '-b', '-m', '-e', '-x', '-S', '-3', '-P',
        '-M', '-B', '--option-file', '--strict', '--table',
        '--include-technical', '--skip-technical', '--concatenate-reads'
    ]
    #ignore location and file name arguments if given; this object controls them
    if '-O' in kwargs:
        print("Ignoring -O flag." + " location is: " + self.location)
        #delete -O parameter
        del kwargs['-O']
    if '-o' in kwargs:
        print("Ignoring -o flag." + " File name is: " + self.srr_accession)
        #delete -o parameter
        del kwargs['-o']
    #build command
    fstrqd_Cmd = ['fasterq-dump']
    fstrqd_Cmd.extend(pu.parse_unix_args(fasterqdumpArgsList, kwargs))
    #add location
    fstrqd_Cmd.extend(['-O', self.location])
    #add output filename. output will be <srr_accession>.fastq or
    #<srr_accession>_1.fastq and <srr_accession>_2.fastq
    fstrqd_Cmd.extend(['-o', self.srr_accession + ".fastq"])
    fstrqd_Cmd.append(self.localSRAFilePath)
    #execute command
    cmdStatus = pe.execute_command(fstrqd_Cmd, objectid=self.srr_accession)
    if not cmdStatus:
        print("fasterqdump failed for:" + self.srr_accession)
        return False
    #check if fastq files are downloaded; expected names depend on layout
    if (self.layout == "SINGLE"):
        self.localfastqPath = os.path.join(self.location,
                                           self.srr_accession + ".fastq")
        if not pu.check_files_exist(self.localfastqPath):
            pu.print_boldred("Error running fasterq-dump file. File " +
                             self.localfastqPath + " does not exist!!!")
            return False
    else:
        self.localfastq1Path = os.path.join(
            self.location, self.srr_accession + "_1.fastq")
        self.localfastq2Path = os.path.join(
            self.location, self.srr_accession + "_2.fastq")
        if not pu.check_files_exist(self.localfastq1Path,
                                    self.localfastq2Path):
            pu.print_boldred("Error running fasterq-dump file. File " +
                             self.localfastq1Path + " does not exist!!!")
            return False
    #delete sra file if specified
    if delete_sra:
        self.delete_sra()
    return True
def download_sra(self, verbose=False, quiet=False, logs=True, **kwargs):
    """This function downloads .sra file from NCBI SRA servers using the
    prefetch command. NCBI sra-toolkit 2.9 or higher must be installed on
    the system in order to use prefetch. prefetch will create a folder with
    name same as <srr_accession> under the location (path) specified. The
    path of downloaded file is saved in the object as localSRAPath. This
    localSRAPath is then used by other functions to access the downloaded
    data. The **kwargs is for passing arguments to the prefetch command.

    Parameters
    ----------
    kwargs: dict
        dict containing additional prefetch arguments

    :return: Return status of the prefetch command. True if successful
        download and False if failed.
    :rtype: bool

    Examples
    --------
    >>> object.download_sra()
    True
    """
    #store path to the downloaded sra file
    self.localSRAFilePath = os.path.join(self.location,
                                         self.srr_accession + ".sra")
    #check if already exists; if so record size and layout and skip download
    if pu.check_files_exist(self.localSRAFilePath):
        pu.print_green("File already exists:" + self.localSRAFilePath)
        #save file .sra file size
        self.sraFileSize = pu.get_file_size(self.localSRAFilePath)
        #test if file is paired or single end
        if pe.is_paired(self.localSRAFilePath):
            self.layout = "PAIRED"
        else:
            self.layout = "SINGLE"
        return True
    pu.print_info("Downloading " + self.srr_accession + " ...")
    #scan for prefetch arguments; used to filter kwargs
    prefetchArgsList = [
        '-f', '-t', '-l', '-n', '-s', '-R', '-N', '-X', '-o', '-a',
        '--ascp-options', '-p', '--eliminate-quals', '-c', '-o', '-O',
        '-h', '-V', '-L', '-v', '-q'
    ]
    #ignore location and file name arguments if given; this object controls them
    if '-O' in kwargs:
        print("Ignoring -O flag." + " location is: " + self.location)
        #delete -O parameter
        del kwargs['-O']
    if '-o' in kwargs:
        print("Ignoring -o flag." + " File name is: " + self.srr_accession)
        #delete -o parameter
        del kwargs['-o']
    #build and run the prefetch command
    prefetch_Cmd = ['prefetch']
    prefetch_Cmd.extend(pu.parse_unix_args(prefetchArgsList, kwargs))
    prefetch_Cmd.extend(['-O', self.location])
    prefetch_Cmd.append(self.srr_accession)
    cmdStatus = pe.execute_command(prefetch_Cmd,
                                   objectid=self.srr_accession)
    if not cmdStatus:
        pu.print_boldred("prefetch failed for:" + self.srr_accession)
        return False
    #validate path exists
    if not pu.check_files_exist(self.localSRAFilePath):
        pu.print_boldred("Error downloading file. File " +
                         self.localSRAFilePath + " does not exist!!!")
        return False
    print("Downloaded file: " + self.localSRAFilePath +
          " {0} ".format(pu.get_file_size(self.localSRAFilePath)))
    #save file .sra file size
    self.sraFileSize = pu.get_file_size(self.localSRAFilePath)
    #test if file is paired or single end
    if pe.is_paired(self.localSRAFilePath):
        self.layout = "PAIRED"
    else:
        self.layout = "SINGLE"
    return True
def download_fastq(self, verbose=False, quiet=False, logs=True, procs=2, **kwargs):
    """Download fastq files using fasterq-dump.

    Runs fasterq-dump either on an already-downloaded .sra file (if one is
    present locally) or directly on the accession.

    Parameters
    ----------
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    procs: int
        number of threads passed to fasterq-dump via -e. Default: 2
    kwargs: dict
        additional fasterq-dump arguments

    :return: True on success, False on failure
    :rtype: bool
    """
    #check if fastq files exists already
    if self.fastqFilesExistsLocally():
        pu.print_green("Fastq files exist already")
        return True
    # options accepted by fasterq-dump; used to filter kwargs
    fasterqdumpArgsList = [
        '-f', '-t', '-s', '-N', '-X', '-a', '-p', '-c', '-o', '-O', '-h',
        '-V', '-L', '-v', '-q', '-b', '-m', '-x', '-S', '-3', '-P', '-M',
        '-B', '--option-file', '--strict', '--table',
        '--include-technical', '--skip-technical', '--concatenate-reads'
    ]
    fstrqd_Cmd = ['fasterq-dump']
    fstrqd_Cmd.extend(pu.parse_unix_args(fasterqdumpArgsList, kwargs))
    #add location
    fstrqd_Cmd.extend(['-O', self.location])
    #add output filename. output will be <srr_accession>.fastq or
    #<srr_accession>_1.fastq and <srr_accession>_2.fastq
    fstrqd_Cmd.extend(['-o', self.srr_accession + ".fastq"])
    fstrqd_Cmd.extend(['-e', str(procs)])
    # prefer the local .sra file; otherwise dump directly from the accession
    if self.sraFileExistsLocally():
        fstrqd_Cmd.append(self.localSRAFilePath)
    else:
        fstrqd_Cmd.append(self.srr_accession)
    #execute command
    cmdStatus = pe.execute_command(fstrqd_Cmd,
                                   objectid=self.srr_accession)
    if not cmdStatus:
        print("fasterqdump failed for:" + self.srr_accession)
        return False
    # layout may be unknown when dumping straight from the accession;
    # infer it from the number of fastq files produced
    if not hasattr(self, 'layout'):
        fq_files = pe.find_files(self.location,
                                 self.srr_accession + "*.fastq")
        if len(fq_files) == 1:
            self.layout = 'SINGLE'
        else:
            self.layout = 'PAIRED'
    #check if fastq files are downloaded; expected names depend on layout
    if (self.layout == "SINGLE"):
        self.localfastqPath = os.path.join(self.location,
                                           self.srr_accession + ".fastq")
        if not pu.check_files_exist(self.localfastqPath):
            pu.print_boldred("Error running fasterq-dump file. File " +
                             self.localfastqPath + " does not exist!!!")
            return False
    else:
        self.localfastq1Path = os.path.join(
            self.location, self.srr_accession + "_1.fastq")
        self.localfastq2Path = os.path.join(
            self.location, self.srr_accession + "_2.fastq")
        if not pu.check_files_exist(self.localfastq1Path,
                                    self.localfastq2Path):
            pu.print_boldred("Error running fasterq-dump file. File " +
                             self.localfastq1Path + " does not exist!!!")
            return False
    return True
def run(self, *args, subcommand=None, target=None, requires=None, objectid=None, verbose=None, logs=None, **kwargs):
    """Execute the runnable's command with target/requirement checking.

    Parameters
    ----------
    *args : Tuple
        Positional arguments passed to a command. This will completely
        REPLACE the existing self._args created during initialization of
        the runnable object.
    subcommand : String or List, optional
        DESCRIPTION. subcommand passed to the command. The default is None.
    target : Str or List of Str, optional
        DESCRIPTION. The expected output/target files produced by the run
        operation. False is returned if all target files are not found
        after the command. The default is None.
    requires : Str or List of Str, optional
        DESCRIPTION. Files required to start the run method. Exception is
        thrown if files are missing. The default is None.
    objectid : Str, optional
        DESCRIPTION. A unique id to identify the run operation in the logs.
        This is useful for benchmarks. The default is None.
    **kwargs : Keyword arguments
        DESCRIPTION. The options to be passed to the command. This will
        OVERRIDE ANY EXISTING options in the self._kwargs created during
        initialization of the runnable object.

    Raises
    ------
    TypeError
        If incorrect types are used for target and requires.
    FileNotFoundError
        Raises FileNotFoundError if any of the required files are missing.
    OSError
        Raises OSError if the command is incorrect or not present in path.
    ValueError
        Raises ValueError if args_style is something other than LINUX or JAVA.

    Returns
    -------
    bool
        Return the status of command as True or False. True implies command
        had 0 exit-code and all target files were found after the command
        finished.
    """
    #create target list
    target_list = []
    locks = []
    requires_list = []
    if target:
        if isinstance(target, str):
            target_list = [target]
        elif isinstance(target, list):
            target_list = target
        else:
            raise TypeError("target must be a string or a list object")
    #check for locks and remove previous locks and associated targets if exist
    for target in target_list:
        self.verify_integrity(target)
    #if target already present and not overwrite exists then return
    if not _force and target_list:
        if self.verify_target_list(target_list):
            pu.print_green('Target files {} already exist.'.format(
                ', '.join(target_list)))
            return True
    #check if all requirements are satisfied
    if requires:
        if isinstance(requires, str):
            requires_list = [requires]
        elif isinstance(requires, list):
            requires_list = requires
        else:
            raise TypeError("requires must be a string or a list object")
    #Raise exception if requirements not satisfied
    if requires_list:
        # BUGFIX: message previously read "fot found"
        if not self.verify_target_list(requires_list):
            pu.print_boldred('Required files {} not found.'.format(
                ', '.join(requires_list)))
            raise FileNotFoundError("FilesNotFound")
        #check if any required file had lock (lock => previous run died mid-write)
        for file in requires_list:
            if len(self.get_lock_files(file)):
                pu.print_boldred(
                    'Required file {} is corrupt. Please verify file is correct and remove any .Lock files'
                    .format(', '.join(requires_list)))
                raise FileNotFoundError("FilesNotFound")
    #override class kwargs by passed kwargs
    kwargs = {**self._kwargs, **kwargs}
    #if no args provided use constructor's args
    if not args:
        args = self._args
    #if args are not None, pass them as positional args under the '--' key
    if args and args[0]:
        kwargs['--'] = args
    #make a copy of self._command
    if not self._command:
        pu.print_boldred("Error: command can not be None or empty")
        raise OSError("CommandNotFoundException")
    cmd = []
    if isinstance(self._command, list):
        cmd = self._command.copy()
    elif isinstance(self._command, str):
        cmd = [self._command]
    #if subcommand supplied
    #get valid args
    valid_args_subcommand = self.get_valid_parameters(subcommand)
    if subcommand:
        if isinstance(subcommand, str):
            subcommand = [subcommand]
        #add to command
        cmd.extend(subcommand)
    #parse and add parameters
    if self._args_style == 'LINUX':
        cmd.extend(pu.parse_unix_args(valid_args_subcommand, kwargs))
    elif self._args_style == 'JAVA':
        cmd.extend(pu.parse_java_args(valid_args_subcommand, kwargs))
    else:
        pu.print_boldred("Unknown args style: {}".format(self._args_style))
        raise ValueError("Unknown args style")
    #create locks on target; locks indicate incomplete commands
    if not _dryrun:
        locks = self.create_lock(target_list, ' '.join(cmd))
    #execute command
    cmd_status = pe.execute_command(cmd,
                                    objectid=objectid,
                                    verbose=verbose,
                                    logs=logs)
    # if command finished remove locks
    self.remove_locks(locks)
    if not cmd_status:
        pu.print_boldred("{} failed: {}".format(self._command,
                                                " ".join(cmd)))
        #remove target files so a failed run leaves no partial outputs
        if not _dryrun and target_list:
            pu.print_boldred("Removing target files {}: ".format(
                ', '.join(target_list)))
            pe.delete_files(*target_list)
        return False
    if cmd_status and target_list and not _dryrun:
        return self.verify_target_list(target_list, verbose=True)
    #return status
    return cmd_status
def execute_command(cmd, verbose=False, quiet=False, logs=True, dryrun=False, objectid="NA", command_name=""):
    """Function to execute commands using popen. All commands executed by
    this function can be logged and saved to pyrpipe logs.

    Parameters
    ----------
    cmd: list
        command to execute via popen in a list
    verbose: bool
        Whether to print stdout and stderr. Default: False. All stdout and
        stderr will be saved to logs regardless of this flag.
    quiet: bool
        Absolutely no output on screen
    logs: bool
        Log the execution
    dryrun: bool
        If True, perform a dry run i.e. print commands to screen and log
        and exit
    objectid: string
        An id to be attached with the command. This is useful for storing
        logs for SRA objects where object id is the SRR id.
    command_name: string
        Name of command to be saved in log. If empty it is determined as
        the first element of the cmd list.

    :return: Return status. True if returncode is 0
    :rtype: bool
    """
    if not command_name:
        command_name = cmd[0]
    log_message = " ".join(cmd)
    #dryrun: print and exit
    if dryrun:
        pu.print_blue("$ " + log_message)
        #log
        #create a dict and dump as json
        logDict = {
            'cmd': log_message,
            'exitcode': "0",
            'runtime': "0",
            'starttime': "0",
            'stdout': "dryrun",
            'stderr': "",
            'objectid': objectid,
            'commandname': command_name
        }
        pyrpipeLoggerObject.cmd_logger.debug(json.dumps(logDict))
        return True
    if not quiet:
        pu.print_blue("$ " + log_message)
    time_start = time.time()
    starttime_str = time.strftime("%y-%m-%d %H:%M:%S",
                                  time.localtime(time.time()))
    try:
        #stderr is merged into stdout
        result = subprocess.Popen(cmd,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT)
        stdout, stderr = result.communicate()
        #convert to string
        if stdout:
            stdout = stdout.decode("utf-8")
        else:
            stdout = ""
        if stderr:
            stderr = stderr.decode("utf-8")
        else:
            stderr = ""
        timeDiff = round(time.time() - time_start)  #round to remove microsecond term
        if verbose:
            if stdout:
                pu.print_blue("STDOUT:\n" + stdout)
            if stderr:
                pu.print_boldred("STDERR:\n" + stderr)
        if not quiet:
            pu.print_green("Time taken:" + str(timedelta(seconds=timeDiff)))
        exitCode = result.returncode
        ##Add to logs
        if logs:
            ##get the program used and log its path (only once per program)
            if command_name not in pyrpipeLoggerObject.logged_programs:
                ##get which thisProgram
                #if subcommands are present use parent command
                parent_command = cmd[0]
                progDesc = {
                    'name': command_name,
                    'version': getProgramVersion(parent_command).strip(),
                    'path': getProgramPath(parent_command).strip()
                }
                pyrpipeLoggerObject.env_logger.debug(json.dumps(progDesc))
                pyrpipeLoggerObject.logged_programs.append(command_name)
            #create a dict and dump as json
            logDict = {
                'cmd': log_message,
                'exitcode': exitCode,
                'runtime': str(timedelta(seconds=timeDiff)),
                'starttime': str(starttime_str),
                'stdout': stdout,
                'stderr': stderr,
                'objectid': objectid,
                'commandname': command_name
            }
            pyrpipeLoggerObject.cmd_logger.debug(json.dumps(logDict))
        if exitCode == 0:
            return True
        else:
            #print the output
            print(
                "Following error occured executing above command (return code={}):"
                .format(str(exitCode)))
            print("STDOUT:\n" + stdout)
            print("STDERR:\n" + stderr)
            return False
    #handle exceptions
    except OSError as e:
        pu.print_boldred("OSError exception occured.\n" + str(e))
        #log error
        timeDiff = round(time.time() - time_start)
        logDict = {
            'cmd': log_message,
            'exitcode': '-1',
            'runtime': str(timedelta(seconds=timeDiff)),
            'starttime': str(starttime_str),
            'stdout': "",
            'stderr': "OSError exception occured.\n" + str(e),
            'objectid': objectid,
            'commandname': command_name
        }
        pyrpipeLoggerObject.cmd_logger.debug(json.dumps(logDict))
        return False
    except subprocess.CalledProcessError as e:
        pu.print_boldred("CalledProcessError exception occured.\n" + str(e))
        #log error
        timeDiff = round(time.time() - time_start)
        logDict = {
            'cmd': log_message,
            'exitcode': '-1',
            'runtime': str(timedelta(seconds=timeDiff)),
            'starttime': str(starttime_str),
            'stdout': "",
            'stderr': "CalledProcessError exception occured.\n" + str(e),
            'objectid': objectid,
            'commandname': command_name
        }
        pyrpipeLoggerObject.cmd_logger.debug(json.dumps(logDict))
        return False
    # NOTE(review): bare except also swallows SystemExit/KeyboardInterrupt;
    # consider `except Exception` — kept as-is to preserve behavior
    except:
        pu.print_boldred("Fatal error occured during execution.\n" +
                         str(sys.exc_info()[0]))
        #log error
        timeDiff = round(time.time() - time_start)
        logDict = {
            'cmd': log_message,
            'exitcode': '-1',
            'runtime': str(timedelta(seconds=timeDiff)),
            'starttime': str(starttime_str),
            'stdout': "",
            'stderr': str("Fatal error occured during execution.\n" +
                          str(sys.exc_info()[0])),
            'objectid': objectid,
            'commandname': command_name
        }
        pyrpipeLoggerObject.cmd_logger.debug(json.dumps(logDict))
        return False
def generate_summary(cmdLog, envLog, coverage='a'):
    """Print a summary of a pyrpipe run to stdout.

    Parses the command log and environment log produced by pyrpipe and
    prints start/end time, total runtime, number of commands executed,
    pass/fail counts, and the list of programs used.

    Parameters
    ----------
    cmdLog: string
        path to the pyrpipe command log file (one JSON record per line;
        lines starting with '#' are ignored)
    envLog: string
        path to the pyrpipe environment log file
    coverage: string
        type of report: full, summary, fail, pass (currently unused;
        kept for interface compatibility)
    """
    #parse env log: system info and the programs that were used
    sysInfo, progList = parseEnvLog(envLog)
    #start time comes from the env log; end time is derived below from
    #the last command's start time plus its runtime
    startTime = dt.datetime.strptime(sysInfo['now'], "%y-%m-%d %H:%M:%S")
    progNames = progList.keys()
    numPrograms = len(progNames)

    #scan the command log and tally pass/fail counts
    numCommands = 0
    failedCommands = 0
    passedCommands = 0
    with open(cmdLog) as f:
        data = f.read().splitlines()
    for l in data:
        if not l.startswith("#"):
            thisDict = json.loads(l)
            numCommands += 1
            if int(thisDict['exitcode']) == 0:
                passedCommands += 1
            else:
                failedCommands += 1

    #if nothing in logs exit
    if numCommands < 1:
        pu.print_message('\n=========Summary=========')
        pu.print_message('No commands were executed via pyrpipe')
        return

    #get start time and runtime of the last command and compute end time
    lastDict = json.loads(data[-1])
    lastST = dt.datetime.strptime(lastDict['starttime'], "%y-%m-%d %H:%M:%S")
    try:
        #runtime without a day component e.g. "1:02:03"
        lastruntime = dt.datetime.strptime(lastDict['runtime'], "%H:%M:%S")
        deltaTime = dt.timedelta(days=0,
                                 hours=lastruntime.hour,
                                 minutes=lastruntime.minute,
                                 seconds=lastruntime.second)
    except ValueError:
        #runtime with a day component e.g. "2 days, 1:02:03"
        timeString = lastDict['runtime'].split(",")
        days = int(timeString[0].split(" ")[0].strip())
        rest = timeString[1].strip()
        lastruntime = dt.datetime.strptime(rest, "%H:%M:%S")
        deltaTime = dt.timedelta(days=days,
                                 hours=lastruntime.hour,
                                 minutes=lastruntime.minute,
                                 seconds=lastruntime.second)
    endTime = lastST + deltaTime

    #print the summary
    pu.print_message('\n=========Summary=========')
    pu.print_message(
        'Time start: {} \nTime end: {} \nTotal runtime: {}'.format(
            str(startTime), str(endTime), str(endTime - startTime)))
    pu.print_message('Total commands run: {}'.format(numCommands))
    pu.print_green('Passed commands: {}'.format(passedCommands))
    pu.print_boldred('Failed commands: {}'.format(failedCommands))
    pu.print_message('Total unique commands/tools: {}'.format(numPrograms))
    pu.print_message('Command/tools list: {}'.format(",".join(progNames)))
def build_index(self, index_path, transcriptome, objectid="NA"):
    """Build a kallisto index and save its path to self.index.

    index_path: str
        path to the index file to create
    transcriptome: str
        Path to transcriptome fasta
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.

    :return: Status of kallisto index
    :rtype: bool

    :raises ValueError: if the transcriptome file does not exist
    :raises OSError: if the index directory can not be created, or the
        built index fails check_index()
    """
    #reuse an existing index unless _force is set
    if not _force and pu.check_files_exist(index_path):
        pu.print_green("Kallisto index {} already exists.".format(index_path))
        self.index = index_path
        return True

    #check input
    if not pu.check_files_exist(transcriptome):
        pu.print_boldred("{} does not exist. Exiting".format(transcriptome))
        raise ValueError("Please check input to kallisto index")

    #create the output directory if needed
    indexdir = pu.get_file_directory(index_path)
    if not pu.check_paths_exist(indexdir):
        if not pu.mkdir(indexdir):
            raise OSError("Error creating kallisto index. Failed to create index directory.")

    args = (transcriptome,)
    internal_kwargs = {"-i": index_path}
    #merge user-supplied yaml parameters; internal kwargs take precedence
    yamlfile = os.path.join(_params_dir, 'kallisto_index.yaml')
    if pu.check_files_exist(yamlfile):
        yaml_params = pl.YAML_loader(yamlfile)
        yaml_kwargs = yaml_params.get_kwargs()
        internal_kwargs = {**yaml_kwargs, **internal_kwargs}
    #positional args go under the special '--' key
    internal_kwargs['--'] = args

    validArgsIndex = valid_args._args_KALLISTO_INDEX
    kallisto_cmd = ['kallisto', 'index']
    kallisto_cmd.extend(pu.parse_unix_args(validArgsIndex, internal_kwargs))

    #call kallisto
    status = pe.execute_command(kallisto_cmd, objectid=objectid)
    if not status:
        #kallisto index command failed
        return False
    if _dryrun:
        #fix: original fell through to `return False` for a successful
        #dry run; report success like the other build_index methods
        return True
    if not pu.check_files_exist(index_path):
        #command reported success but the index file is missing
        return False
    #update object's index
    self.index = index_path
    if self.check_index():
        return True
    raise OSError("Error building kallisto index")
def perform_alignment(self, sra_object, out_suffix="_hisat2", out_dir="", objectid="NA"):
    """Align reads from an SRA object with hisat2 and return a sorted bam.

    Parameters
    ----------
    sra_object SRA object
        An object of type SRA. The path to fastq files will be
        obtained from this object.
    out_suffix: string
        Suffix for the output sam file
    out_dir: string
        Directory to save the results. Default value is sra_object.directory
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.

    :return: Returns the sorted bam file path after converting sam to bam and sorting it
    :rtype: string
    """
    #default output directory is the SRA object's directory
    if not out_dir:
        out_dir = sra_object.directory
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)

    #output paths share a common prefix built from the accession
    prefix = os.path.join(out_dir, sra_object.srr_accession + out_suffix)
    outSamFile = prefix + ".sam"
    outBamFile = prefix + "_sorted.bam"

    #skip work entirely if the final sorted bam is already present
    if not _force and pu.check_files_exist(outBamFile):
        pu.print_green('Target files {} already exist.'.format(outBamFile))
        return outBamFile

    #choose hisat2 input options according to the library layout
    if sra_object.layout == 'PAIRED':
        internal_kwargs = {
            "-1": sra_object.fastq_path,
            "-2": sra_object.fastq2_path,
            "-S": outSamFile
        }
    else:
        internal_kwargs = {"-U": sra_object.fastq_path, "-S": outSamFile}

    #run hisat2
    status = self.run(None,
                      objectid=sra_object.srr_accession,
                      target=outSamFile,
                      **internal_kwargs)
    if not status:
        return ""
    if not pu.check_files_exist(outSamFile) and not _dryrun:
        return ""
    #convert the sam to a sorted bam and return its path
    return tools.Samtools().sam_sorted_bam(outSamFile)
def build_index(self, index_path, transcriptome, objectid="NA"):
    """Build a salmon index and save its path to self.index.

    Parameters
    ----------
    index_path : str
        Path to the index directory to create.
    transcriptome : str
        Path to the transcriptome fasta.
    objectid : str, optional
        Provide an id to attach with this command e.g. the SRR accession.
        The default is "NA".

    Raises
    ------
    OSError
        If the index directory can not be created, or the built index
        fails check_index().

    Returns
    -------
    bool
        Status of salmon index.
    """
    #reuse an existing index unless _force is set
    if not _force and pu.check_salmonindex(index_path):
        pu.print_green("Salmon index {} already exists.".format(index_path))
        self.index = index_path
        return True

    #check input; note this returns False rather than raising (original behavior)
    if not pu.check_files_exist(transcriptome):
        pu.print_boldred("{} does not exist. Exiting".format(transcriptome))
        return False

    #create the output directory if needed
    indexdir = pu.get_file_directory(index_path)
    if not pu.check_paths_exist(indexdir):
        if not pu.mkdir(indexdir):
            raise OSError("Error creating salmon index. Failed to create index directory.")

    validArgsIndex = valid_args._args_SALMON_INDEX
    internal_kwargs = {"--threads": _threads, "-t": transcriptome, "-i": index_path}
    #merge user-supplied yaml parameters; internal kwargs take precedence
    yamlfile = os.path.join(_params_dir, 'salmon_index.yaml')
    if pu.check_files_exist(yamlfile):
        yaml_params = pl.YAML_loader(yamlfile)
        yaml_kwargs = yaml_params.get_kwargs()
        internal_kwargs = {**yaml_kwargs, **internal_kwargs}

    salmon_cmd = ['salmon', 'index']
    salmon_cmd.extend(pu.parse_unix_args(validArgsIndex, internal_kwargs))

    #call salmon
    status = pe.execute_command(salmon_cmd, objectid=objectid)
    if not status:
        #salmon index command failed
        return False
    if _dryrun:
        #fix: original fell through to `return False` for a successful
        #dry run; report success like the other build_index methods
        return True
    if not pu.check_salmonindex(index_path):
        #command reported success but the index is missing
        return False
    #update object's index
    self.index = index_path
    if self.check_index():
        return True
    raise OSError("Error building salmon index")
def download_fastq(self, *args, **kwargs):
    """Download fastq files for this accession using fasterq-dump.

    If fastq files already exist, nothing is downloaded. Otherwise
    fasterq-dump is run on the local .sra file when present, or on the
    accession directly. Extra *args/**kwargs are forwarded to
    fasterq-dump (internal options take precedence); when none are
    given, options are read from the fasterq-dump.yaml parameter file.
    On success the library layout and fastq path attributes are set and
    the .sra file is deleted.

    :return: True if fastq files are present/downloaded, False otherwise
    :rtype: bool
    """
    #check if fastq files exists already
    if self.fastq_exists():
        pu.print_green("Fastq files exist already")
        return True
    #internal_args are created by pyrpipe and will always replace external passed args
    #add the positional args: prefer the local .sra file over the accession
    if self.sra_exists():
        internal_args = (self.sra_path,)
    else:
        #fstrqd_Cmd.append(self.srr_accession)
        internal_args = (self.srr_accession,)
    #keyword args; boolean flags have empty values
    internal_kwargs = {'-O': self.directory,
                       '-o': self.srr_accession + ".fastq",
                       '-e': _threads,
                       '-f': ""
                       }
    #merge args, kwargs, internal_args, internal_kwargs
    #If args and kwargs are present
    if args or kwargs:
        #internal kwargs override user kwargs; positional args are
        #deduplicated (NOTE: set() does not preserve argument order)
        internal_kwargs = {**kwargs, **internal_kwargs}
        internal_args = tuple(set(args + internal_args))
        #append the args to the kwargs using special key '--'
        internal_kwargs['--'] = internal_args
    else:
        #check for yaml parameters
        filepath = os.path.join(_params_dir, 'fasterq-dump.yaml')
        yaml_params = pl.YAML_loader(filepath)
        yaml_kwargs = yaml_params.get_kwargs()
        #yaml_args=yaml_params.get_args()
        internal_kwargs = {**yaml_kwargs, **internal_kwargs}
        #internal_args=tuple(set(yaml_args+internal_args))
        internal_kwargs['--'] = internal_args
    params_list = pu.parse_unix_args(valid_args._args_FASTERQDUMP, internal_kwargs)
    fstrqd_Cmd = ['fasterq-dump']
    #add command and params
    fstrqd_Cmd.extend(params_list)
    #execute command
    cmdStatus = pe.execute_command(fstrqd_Cmd, objectid=self.srr_accession)
    if not cmdStatus:
        pu.print_boldred("fasterqdump failed for:" + self.srr_accession)
        return False

    #self.search_fastq(self.directory)
    #determine layout; assume PAIRED first and verify against files on disk
    self.layout = 'PAIRED'
    #check files with names <SRR>_1.fastq and <SRR>_2.fastq
    fq = os.path.join(self.directory, self.srr_accession + '_1.fastq')
    fq2 = os.path.join(self.directory, self.srr_accession + '_2.fastq')
    self.fastq_path = fq
    self.fastq2_path = fq2
    #if dry run, no files were produced; report success with assumed paths
    if _dryrun:
        return True
    if pu.check_files_exist(fq, fq2):
        #paired-end: both mate files exist
        self.fastq_path = fq
        self.fastq2_path = fq2
        self.layout = "PAIRED"
        #remove SRA file now that fastq extraction succeeded
        self.delete_sra()
        return True
    #check single end file
    fq = os.path.join(self.directory, self.srr_accession + '.fastq')
    if pu.check_files_exist(fq):
        self.fastq_path = fq
        self.layout = "SINGLE"
        #remove SRA file now that fastq extraction succeeded
        self.delete_sra()
        return True
    #no fastq produced
    return False
def perform_alignment(self, sra_object, out_suffix="_star", out_dir="", objectid="NA"):
    """Align reads from an SRA object with STAR and return the output bam.

    Parameters
    ----------
    sra_object SRA object
        An object of type SRA. The path to fastq files will be
        obtained from this object.
    out_suffix: string
        Suffix for the output sam file
    out_dir: string
        Directory to save the results. Default value is sra_object.directory
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.

    :return: Returns the path to output bam
    :rtype: string
    """
    if not out_dir:
        out_dir = sra_object.directory
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)

    #build --readFilesIn from the library layout
    if sra_object.layout == 'PAIRED':
        reads = sra_object.fastq_path + " " + sra_object.fastq2_path
    else:
        reads = sra_object.fastq_path
    internal_kwargs = {"--readFilesIn": reads,
                       "--outFileNamePrefix": out_dir + "/"}

    #default to coordinate-sorted BAM when the user gave no --outSAMtype
    #(note: this intentionally persists into self._kwargs)
    if '--outSAMtype' not in self._kwargs:
        self._kwargs['--outSAMtype'] = 'BAM SortedByCoordinate'
    #STAR writes Aligned.sortedByCoord.out.bam when sorting, else Aligned.out.bam
    if 'SortedByCoordinate' in self._kwargs['--outSAMtype']:
        bam = os.path.join(out_dir, 'Aligned.sortedByCoord.out.bam')
    else:
        bam = os.path.join(out_dir, 'Aligned.out.bam')
    #final name carries the accession-independent suffix
    finalbam = bam.split('.bam')[0] + out_suffix + '.bam'

    #skip work if the final bam is already present
    if not _force and pu.check_files_exist(finalbam):
        pu.print_green('Target files {} already exist.'.format(finalbam))
        return finalbam

    #call star
    status = self.run(None,
                      objectid=sra_object.srr_accession,
                      target=bam,
                      **internal_kwargs)
    if not status:
        return ""
    if not _dryrun:
        #rename STAR's default output to the suffixed name
        pe.move_file(bam, finalbam)
        if not pu.check_files_exist(finalbam):
            return ""
    return finalbam
def perform_assembly(self, bam_file, out_dir=None, out_suffix="_cufflinks", objectid="NA"):
    """Run cufflinks on a BAM file and return the resulting GTF path.

    Parameters
    ----------
    bam_file: string
        path to bam file
    out_dir:
        output directory
    out_suffix: string
        Suffix for the output gtf file
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.

    :return: Returns the path to output GTF file
    :rtype: string
    """
    fname = pu.get_file_basename(bam_file)
    #default output directory is the bam file's directory
    if not out_dir:
        out_dir = pu.get_file_directory(bam_file)
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)

    #cufflinks writes transcripts.gtf; it is renamed to the suffixed target
    outfile = os.path.join(out_dir, "transcripts.gtf")
    out_gtf_file = os.path.join(out_dir, fname + out_suffix + ".gtf")

    #skip work if the final gtf already exists
    if not _force and pu.check_files_exist(out_gtf_file):
        pu.print_green('Target files {} already exist.'.format(out_gtf_file))
        return out_gtf_file

    #input bam goes in as a positional arg via the special '--' key
    internal_kwargs = {"-o": out_dir, "--": (bam_file,)}

    #call cufflinks
    status = self.run(None, objectid=objectid, target=outfile, **internal_kwargs)
    if not status:
        return ""
    if not _dryrun:
        #rename cufflinks' default output to the suffixed name
        pe.move_file(outfile, out_gtf_file)
        if not pu.check_files_exist(out_gtf_file):
            return ""
    return out_gtf_file
def build_index(self, index_path, genome, objectid="NA"):
    """Build a STAR index with given parameters and saves the new index to self.index.

    Parameters
    ----------
    index_path: string
        Path where the index will be created
    genome: string
        Path to the reference genome
    objectid : string
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.

    :return: Returns the status of STAR-build index
    :rtype: bool

    :raises ValueError: if the genome fasta does not exist
    :raises OSError: if the index directory can not be created, or the
        built index fails check_index()
    """
    #reuse an existing index unless _force is set
    if not _force and pu.check_starindex(index_path):
        pu.print_green("STAR index {} already exists.".format(index_path))
        self.index = index_path
        return True

    #check input files
    if not pu.check_files_exist(genome):
        pu.print_boldred("Please provide a valid input fasta file to build STAR index")
        raise ValueError("Please check input to build star index")

    #create index path if doesnt exist
    if not pu.check_paths_exist(index_path):
        if not pu.mkdir(index_path):
            raise OSError("Error creating STAR index. Failed to create index directory.")

    internal_kwargs = {
        "--runMode": "genomeGenerate",
        "--genomeDir": index_path,
        "--genomeFastaFiles": genome,
        "--runThreadN": self._threads
    }
    #merge user-supplied yaml parameters; internal kwargs take precedence
    yamlfile = os.path.join(_params_dir, 'star_index.yaml')
    if pu.check_files_exist(yamlfile):
        yaml_params = pl.YAML_loader(yamlfile)
        yaml_kwargs = yaml_params.get_kwargs()
        internal_kwargs = {**yaml_kwargs, **internal_kwargs}

    starbuild_Cmd = ['STAR']
    starbuild_Cmd.extend(pu.parse_unix_args(valid_args._args_STAR, internal_kwargs))

    #execute command
    status = pe.execute_command(starbuild_Cmd, objectid=objectid)
    if not status:
        #fix: original fell through to `return True` here, reporting
        #success even when the STAR command failed
        return False
    if _dryrun:
        return True
    if not pu.check_paths_exist(index_path):
        #command reported success but the index directory is missing
        return False
    #update object's index
    self.index = index_path
    if self.check_index():
        return True
    raise OSError("Error building STAR index")
def perform_qc(self, sra_object, out_dir="", out_suffix="_trimgalore", objectid="NA"):
    """Function to perform qc using trimgalore.
    The function perform_qc() is consistent for all QC classess.

    Parameters
    ----------
    sra_object: SRA
        An SRA object whose fastq files will be used
    out_dir: str
        Path to output directory
    out_suffix: string
        Suffix for the output sam file
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.

    :return: Returns the path of fastq files after QC. tuple has one item
        for single end files and two for paired. On failure a tuple
        containing an empty string is returned.
    :rtype: tuple
    """
    if not out_dir:
        out_dir = sra_object.directory
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)

    #paired-end layout
    if sra_object.layout == 'PAIRED':
        fq1 = sra_object.fastq_path
        fq2 = sra_object.fastq2_path
        internal_args = (fq1, fq2)
        internal_kwargs = {"--paired": "", "-o": out_dir}
        #trim galore creates <input>_val_1.fq and <input>_val_2.fq;
        #these are moved to the suffixed target files below
        file1 = os.path.join(out_dir, pu.get_file_basename(fq1) + "_val_1.fq")
        file2 = os.path.join(out_dir, pu.get_file_basename(fq2) + "_val_2.fq")
        #targets
        out_file1 = os.path.join(out_dir, pu.get_file_basename(fq1) + out_suffix + ".fastq")
        out_file2 = os.path.join(out_dir, pu.get_file_basename(fq2) + out_suffix + ".fastq")
        #check if final files already exists
        if not _force and pu.check_files_exist(out_file1, out_file2):
            pu.print_green('Target files {}, {} already exist.'.format(out_file1, out_file2))
            return out_file1, out_file2
        #run trimgalore
        status = self.run(*internal_args, objectid=objectid,
                          target=[file1, file2], **internal_kwargs)
        if status:
            if not _dryrun:
                pe.move_file(file1, out_file1, verbose=False)
                pe.move_file(file2, out_file2, verbose=False)
                if not pu.check_files_exist(out_file1, out_file2):
                    #fix: original returned a bare "" here, breaking the
                    #documented tuple return type
                    return ("",)
            return out_file1, out_file2
        return ("",)

    #single-end layout
    fq = sra_object.fastq_path
    internal_args = (fq,)
    internal_kwargs = {"-o": out_dir}
    #trim galore creates <input>_trimmed.fq; moved to the target below
    file = os.path.join(out_dir, pu.get_file_basename(fq) + "_trimmed.fq")
    #target
    out_file = os.path.join(out_dir, pu.get_file_basename(fq) + out_suffix + ".fastq")
    #check if final files already exists
    if not _force and pu.check_files_exist(out_file):
        pu.print_green('Target files {} already exist.'.format(out_file))
        return (out_file,)
    #run trimgalore
    status = self.run(*internal_args, objectid=objectid, target=file, **internal_kwargs)
    if status:
        if not _dryrun:
            pe.move_file(file, out_file)
            if not pu.check_files_exist(out_file):
                #fix: original returned a bare "" here, breaking the
                #documented tuple return type
                return ("",)
        return (out_file,)
    return ("",)
def build_index(self, index_path, genome, objectid="NA"):
    """Build a hisat index with given parameters and saves the new index to self.index.

    Parameters
    ----------
    index_path: string
        Path where the index will be created
    genome: string
        Path to the reference genome
    objectid : string
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.

    :return: Returns the status of hisat2-build
    :rtype: bool

    :raises ValueError: if the genome fasta does not exist
    :raises OSError: if the index directory can not be created, or the
        built index fails check_index()
    """
    #reuse an existing index unless _force is set
    if not _force and pu.check_hisatindex(index_path):
        pu.print_green("Hisat2 index {} already exists.".format(index_path))
        self.index = os.path.join(index_path)
        return True

    #check input files
    if not pu.check_files_exist(genome):
        pu.print_boldred("Please provide a valid input fasta file to build Hisat2 index")
        raise ValueError("Please check input to hisat2 build index")

    #create the out dir
    indexdir = pu.get_file_directory(index_path)
    if not pu.check_paths_exist(indexdir):
        if not pu.mkdir(indexdir):
            raise OSError("Error creating hisat2 index. Failed to create index directory.")

    hisat2Buildvalid_args = valid_args._args_HISAT2BUILD
    args = (genome, index_path)
    internal_kwargs = {"-p": self._threads}
    #merge user-supplied yaml parameters; internal kwargs take precedence
    yamlfile = os.path.join(_params_dir, 'hisat2_index.yaml')
    if pu.check_files_exist(yamlfile):
        yaml_params = pl.YAML_loader(yamlfile)
        yaml_kwargs = yaml_params.get_kwargs()
        internal_kwargs = {**yaml_kwargs, **internal_kwargs}
    #positional args go under the special '--' key
    internal_kwargs['--'] = args

    hisat2Build_Cmd = ['hisat2-build']
    hisat2Build_Cmd.extend(pu.parse_unix_args(hisat2Buildvalid_args, internal_kwargs))

    #execute command
    status = pe.execute_command(hisat2Build_Cmd, objectid=objectid)
    if not status:
        #fix: original fell through to `return True` here, reporting
        #success even when hisat2-build failed
        return False
    if _dryrun:
        return True
    if not pu.check_hisatindex(index_path):
        #command reported success but the index files are missing
        return False
    #update object's index
    self.index = index_path
    if self.check_index():
        return True
    raise OSError("Error building Hisat2 index")
def build_index(self, index_path, *args, threads=None, overwrite=False,
                verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Build a star index with given parameters and saves the new index to self.star_index.

    Parameters
    ----------
    index_path: string
        Path where the index will be created
    args: tuple
        Path to reference input files
    threads: int
        Num threads to use
    overwrite: bool
        Overwrite if index already exists
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.
    kwargs: dict
        Parameters for the star command

    :return: Returns status of star command
    :rtype: bool

    :raises Exception: if input files do not exist or the index
        directory can not be created
    """
    #reuse an existing index unless overwrite is requested
    if not overwrite and pu.check_starindex(index_path):
        pu.print_green("STAR index already exists. Using it...")
        self.star_index = index_path
        return True

    #check input files
    if len(args) < 1:
        pu.print_boldred("Please provide input fasta file to build STAR index")
        return False
    if not pu.check_files_exist(*args):
        raise Exception("Please check input to star index")

    #create index path if doesnt exist
    if not pu.check_paths_exist(index_path):
        if not pu.mkdir(index_path):
            raise Exception("Error creating STAR index. Exiting.")

    if not threads:
        threads = self.threads

    #STAR genomeGenerate options; user kwargs may override
    newOpts = {
        "--runMode": "genomeGenerate",
        "--genomeDir": index_path,
        "--genomeFastaFiles": " ".join(args),
        "--runThreadN": str(threads)
    }
    mergedOpts = {**newOpts, **kwargs}

    starbuild_Cmd = ['STAR']
    starbuild_Cmd.extend(pu.parse_unix_args(None, mergedOpts))

    #execute command
    status = pe.execute_command(starbuild_Cmd,
                                verbose=verbose,
                                quiet=quiet,
                                logs=logs,
                                objectid=objectid)
    if status and pu.check_paths_exist(index_path):
        #update object's index
        self.star_index = index_path
        return self.check_index()
    #fix: original fell off the end and returned None when the command
    #failed; callers expect a bool
    return False