Example #1
0
    def createMikadoGTFlist(self,
                            out_file,
                            out_dir,
                            searchPath,
                            searchQuery="*.gtf",
                            strand=False):
        """Create a file to be used by mikado configure

        Each file returned by ``pe.find_files(searchPath, searchQuery)``
        contributes one tab-separated line holding the file path, its
        basename and ``str(strand)``. Files for which
        ``pu.get_file_basename`` returns an empty value are skipped.

        out_file: str
            Name of the list file without extension; ".txt" is appended
        out_dir: str
            Directory for the list file; created if it does not exist
        searchPath: str
            Path where the files will be searched
        searchQuery: str
            Query to perform search. Default: "*.gtf"
        strand: bool
            Value written in the third column. Default: False

        :return: Path to the written list file
        :rtype: string
        """

        files = pe.find_files(searchPath, searchQuery)

        #create out dir
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
        outFilePath = os.path.join(out_dir, out_file + ".txt")

        gtfs = []
        for path in files:
            thisName = pu.get_file_basename(path)
            if thisName:
                gtfs.append("\t".join([path, thisName, str(strand)]))

        #context manager closes the handle even if the write fails
        with open(outFilePath, "w") as handle:
            handle.write("\n".join(gtfs))

        pu.print_green("Mikado list file written to:" + outFilePath)
        return outFilePath
Example #2
0
    def create_lock(self, target_list, message):
        """
        Creates a temporary .Lock file associated with each target file and
        writes a message in it.

        The lock file is created in the directory of its target file (the
        directory is created if missing) and is named
        ``<target filename>_<random>.Lock``. Its content is the current
        timestamp, a tab, and ``message``.

        Parameters
        ----------
        target_list : List
            List of target files.
        message : Str
            Message to write in file.

        Returns
        -------
        templist : List
            A list of .Lock file names corresponding to the target files.

        """
        templist = []
        for f in target_list:
            temp_path = pu.get_file_directory(f)
            if not pu.check_paths_exist(temp_path):
                pu.mkdir(temp_path)
            prefix = pu.get_filename(f) + '_'
            #delete=False keeps the lock on disk once the handle is closed;
            #the with-block guarantees the message is flushed and the
            #descriptor released instead of leaking one handle per target
            with tempfile.NamedTemporaryFile(prefix=prefix,
                                             suffix='.Lock',
                                             dir=temp_path,
                                             delete=False) as temp:
                #TODO: dump command in lock
                timestamp = pu.get_timestamp()
                temp.write(str.encode(timestamp + '\t' + message))
                templist.append(temp.name)
        return templist
Example #3
0
def download_gtex_bams(manifest_file, outdir):
    """Download the BAM files listed in a manifest and sort them into
    per-sample sub-directories of ``outdir``.

    ``manifest_file`` is a JSON file holding a list of entries, each with
    a "file_name" key. The sample id is the part of the file name before
    '.Aligned'; the final location of a file is
    ``outdir/<sample id>/<file name>``.

    Files already present at their final location are first moved back to
    ``outdir``. After the download command has run, every listed file is
    moved from ``outdir`` into its sample directory.

    NOTE(review): the download command is built from ``profile``, ``m``,
    ``threads``, ``cwd`` and ``dtn_ssh``, none of which are defined in
    this function -- presumably module-level settings. In particular the
    manifest handed to gen3-client is ``m``, not ``manifest_file``;
    confirm this is intended.
    """
    #load list of bam files
    with open(manifest_file, 'r') as manifest_handle:
        manifest_entries = json.load(manifest_handle)

    #move already downloaded files back so they sit next to new downloads
    bam_names = []
    for entry in manifest_entries:
        bam_name = entry["file_name"]
        sample_id = bam_name.split('.Aligned')[0]
        sorted_path = os.path.join(outdir, sample_id, bam_name)
        if pu.check_files_exist(sorted_path):
            print("Outfile {} exists. Skipping...".format(sorted_path))
            os.rename(sorted_path, os.path.join(outdir, bam_name))
        bam_names.append(entry["file_name"])

    #run the download through ssh
    download_cmd = 'gen3-client download-multiple --profile={} --manifest={} --download-path={} --protocol=s3 --numparallel={} --skip-completed --no-prompt'.format(
        profile, m, outdir, threads)
    chdir_cmd = 'cd {}'.format(cwd)
    remote_cmd = dtn_ssh + " '{}; {}'".format(chdir_cmd, download_cmd)
    pe.get_shell_output(remote_cmd, verbose=True)

    #sort every file into its sample directory
    for bam_name in bam_names:
        downloaded = os.path.join(outdir, bam_name)
        sample_dir = os.path.join(outdir, bam_name.split('.Aligned')[0])
        pu.mkdir(sample_dir)
        os.rename(downloaded, os.path.join(sample_dir, bam_name))
Example #4
0
def generate_multiqc_from_log(logFile,
                              filterList,
                              tempDir,
                              outDir="",
                              coverage='a',
                              verbose=False,
                              cleanup=False):
    """Generate a MultiQC report from the stdout stored in a log file.

    The entries returned by ``getStdoutFromLog(logFile, filterList,
    coverage)`` are each written to ``tempDir/<key>.txt`` and MultiQC is
    run on ``tempDir``.

    Parameters
    ----------
    logFile : str
        Log file handed to ``getStdoutFromLog``.
    filterList : list
        Filter handed to ``getStdoutFromLog``.
    tempDir : str
        Directory receiving the .txt files read by MultiQC.
    outDir : str, optional
        MultiQC output directory. The default is "".
    coverage : char, optional
        Handed to ``getStdoutFromLog``. The default is 'a'.
    verbose : bool, optional
        Accepted but not used by this function. The default is False.
    cleanup : bool, optional
        Remove the written .txt files afterwards. The default is False.

    Returns
    -------
    None.
    """
    #dump stdout from logs to temp directory
    stdout = getStdoutFromLog(logFile, filterList, coverage)
    #create tmpdir
    pu.mkdir(tempDir)
    flist = []
    for name in stdout:
        tempFile = os.path.join(tempDir, name + ".txt")
        #context manager closes the handle even if the write fails
        with open(tempFile, "w") as handle:
            handle.write(stdout[name])
        flist.append(tempFile)

    #run multiqc
    #tempDir stores .txt files for MQC to read
    mc.run(analysis_dir=tempDir, outdir=outDir)

    #cleanup
    if cleanup:
        for written in flist:
            pu.print_blue("Removing {}".format(written))
            os.remove(written)
Example #5
0
    def build_index(self,
                    in_fasta,
                    dbname,
                    out_dir=None,
                    threads=None,
                    verbose=False,
                    quiet=False,
                    logs=True,
                    objectid="NA",
                    **kwargs):
        """Build a diamond index with ``makedb`` and remember it in self.index

        in_fasta: str
            Input fasta file; must exist
        dbname: str
            Index name, joined to out_dir to form the index path
        out_dir: str
            Output directory, created if missing. Default: current directory
        threads: int
            Threads to use. Default: self.threads
        verbose, quiet, logs, objectid:
            Passed through to self.run_diamond
        kwargs: dict
            Extra makedb options; these override "--in", "-d" and "--threads"

        :return: True if the index already existed or was built, else False
        :rtype: bool
        """

        #nothing to do without the input
        if not pu.check_files_exist(in_fasta):
            pu.print_boldred(
                "Input fasta: {} not found...\n diamond makedb failed".format(
                    in_fasta))
            return False

        #resolve and create the output directory
        out_dir = out_dir or os.getcwd()
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)

        #reuse an existing index; self.index is set first because
        #check_index is called without arguments
        target_index = os.path.join(out_dir, dbname)
        self.index = target_index
        if self.check_index():
            pu.print_green("Diamond index: {} exists, using it...".format(
                self.index))
            self.index = target_index
            return True

        #defaults first, caller supplied options win
        makedb_opts = {
            "--in": in_fasta,
            "-d": target_index,
            "--threads": str(threads or self.threads)
        }
        makedb_opts.update(kwargs)

        built = self.run_diamond("makedb",
                                 verbose=verbose,
                                 quiet=quiet,
                                 logs=logs,
                                 objectid=objectid,
                                 **makedb_opts)
        if not built:
            return False

        self.index = target_index
        return True
Example #6
0
 def perform_qc(self,sra_object,out_dir="",out_suffix="_bbduk",objectid="NA"):
     """Run bbduk on fastq files specified by the sra_object

     sra_object: SRA
         An SRA object whose fastq files will be used
     out_dir: str
         Path to output directory. Default: sra_object.directory
     out_suffix: string
         Suffix for the output fastq files
     objectid: str
         Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.

     :return: Returns the path of fastq files after QC. tuple has one item for single end files and 2 for paired. On failure the tuple is ("",).
     :rtype: tuple

     """
     #make out_dir
     if not out_dir:
         out_dir=sra_object.directory
     elif not pu.check_paths_exist(out_dir):
         pu.mkdir(out_dir)

     if sra_object.layout=='PAIRED':
         fq1=sra_object.fastq_path
         fq2=sra_object.fastq2_path
         out_file1Path=os.path.join(out_dir,pu.get_file_basename(fq1)+out_suffix+".fastq")
         out_file2Path=os.path.join(out_dir,pu.get_file_basename(fq2)+out_suffix+".fastq")
         internal_kwargs={"in":fq1,"in2":fq2,"out":out_file1Path,"out2":out_file2Path}
         out_files=(out_file1Path,out_file2Path)
         #paired runs hand their targets over as a list
         target=[out_file1Path,out_file2Path]
     else:
         fq=sra_object.fastq_path
         out_filePath=os.path.join(out_dir,pu.get_file_basename(fq)+out_suffix+".fastq")
         internal_kwargs={"in":fq,"out":out_filePath}
         out_files=(out_filePath,)
         target=out_filePath

     #run bbduk
     status=self.run(objectid=objectid,target=target,**internal_kwargs)

     #a failed run must not report output paths as if QC had succeeded
     if not status:
         return ("",)
     #outside a dry run the output files have to exist
     if not pu.check_files_exist(*out_files) and not _dryrun:
         return ("",)

     return out_files
Example #7
0
    def perform_alignment(self,
                          sra_object,
                          out_suffix="_bowtie2",
                          out_dir="",
                          objectid="NA"):
        """Align the fastq files of an SRA object and return a sorted bam.

        Parameters
        ----------

        sra_object SRA object
            Supplies layout, srr_accession, directory and the fastq paths.
        out_suffix: string
            Suffix appended to the accession to name the sam file
        out_dir: string
            Directory for the results, created if missing. Default value is sra_object.directory
        objectid: str
            Accepted for interface compatibility; the command is run with
            sra_object.srr_accession as its object id.
        :return: Result of Samtools().sam_sorted_bam on the sam file, or "" on failure
        :rtype: string
        """
        if not out_dir:
            out_dir = sra_object.directory
        elif not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)

        #sam file is named after the accession
        sam_path = os.path.join(
            out_dir, sra_object.srr_accession + out_suffix + ".sam")

        #input options depend on the layout
        if sra_object.layout == 'PAIRED':
            aligner_opts = {
                "-1": sra_object.fastq_path,
                "-2": sra_object.fastq2_path,
                "-S": sam_path
            }
        else:
            aligner_opts = {"-U": sra_object.fastq_path, "-S": sam_path}

        succeeded = self.run(None,
                             objectid=sra_object.srr_accession,
                             target=sam_path,
                             **aligner_opts)
        if not succeeded:
            return ""

        #the sam file may only be missing during a dry run
        if not (pu.check_files_exist(sam_path) or _dryrun):
            return ""

        #convert to sorted bam before returning
        return tools.Samtools().sam_sorted_bam(sam_path)
Example #8
0
    def run_transdecoder_predict(self,
                                 infasta,
                                 longorfs_dir,
                                 out_dir=None,
                                 verbose=False,
                                 quiet=False,
                                 logs=True,
                                 objectid="NA",
                                 **kwargs):
        """Run TransDecoder.Predict and optionally collect its output files.

        infasta: str
            Input fasta, passed as "-t"
        longorfs_dir: str
            Directory passed as "-O"
        out_dir: str
            When given, the .bed/.cds/.gff3/.pep files named
            "<infasta filename>.transdecoder.*" are moved there from the
            current directory. Default: current directory, nothing is moved
        verbose, quiet, logs, objectid:
            Passed through to self.run_transdecoder
        kwargs: dict
            Extra options; these override "-t" and "-O"

        Missing inputs are reported but do not stop the run.

        :return: out_dir on success, "" if the command failed
        :rtype: string
        """

        #report missing inputs
        if not pu.check_files_exist(infasta):
            pu.print_boldred("Please check input file:" + infasta)
        if not pu.check_paths_exist(longorfs_dir):
            pu.print_boldred("Path {} doesn't exist".format(longorfs_dir))

        #outputs are only relocated when the caller named a directory
        relocate = bool(out_dir)
        if not relocate:
            out_dir = os.getcwd()

        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)

        predict_opts = {"-t": infasta, "-O": longorfs_dir}
        predict_opts.update(kwargs)

        #execute Predict
        succeeded = self.run_transdecoder('TransDecoder.Predict',
                                          verbose=verbose,
                                          quiet=quiet,
                                          logs=logs,
                                          objectid=objectid,
                                          **predict_opts)
        if not succeeded:
            pu.print_boldred("Transdecoder failed")
            return ""

        #move output files to outdir
        if relocate:
            prefix = pu.get_filename(infasta) + ".transdecoder"
            for extension in (".bed", ".cds", ".gff3", ".pep"):
                produced = prefix + extension
                pe.move_file(produced, os.path.join(out_dir, produced),
                             verbose)
        return out_dir
Example #9
0
def generate_multiqc(directory,
                     tempDir,
                     outDir="",
                     coverage='a',
                     verbose=False,
                     cleanup=False):
    """
    Generate reports using multiqc

    All files under ``directory`` whose names end in "_pyrpipe.log" are
    searched recursively. The stdout entries extracted from each of them
    are written to ``tempDir`` as ``<key>_<id>.txt``, where the id is the
    last "_"-separated token of the log path before "_pyrpipe".

    Parameters
    ----------
    directory : str
        path to directory containing logs.
    tempDir : str
        temp dir, created if missing.
    outDir : str, optional
        output dir. The default is "", which is replaced by 'MultiQC_out'.
    coverage : char, optional
        commands to use in pyrpipe log: fa(i)led (p)assed or (a)ll. The default is 'a'.
    verbose : bool, optional
        accepted but not used by this function. The default is False.
    cleanup : bool, optional
        accepted but not used by this function. The default is False.

    Returns
    -------
    None.

    """
    #search all _pyrpipe.log files under directory; raw string keeps the
    #regex escape from being read as an (invalid) string escape
    log_files = pu.find_files(directory, r".*_pyrpipe\.log$", recursive=True)
    if not outDir:
        outDir = 'MultiQC_out'
    #create tempdir
    if not pu.check_paths_exist(tempDir):
        pu.mkdir(tempDir)
    #extract stdout from each file and save to temp
    for log_file in log_files:
        stdout = getStdoutFromLog(log_file, None, coverage)
        fid = log_file.split('_pyrpipe')[0].split('_')[-1]
        for name in stdout:
            tempFile = os.path.join(tempDir, name + "_" + fid + ".txt")
            #separate handle name, closed even if the write fails
            with open(tempFile, "w") as handle:
                handle.write(stdout[name])

    #run multiqc
    #NOTE(review): multiqc analyses `directory`, not `tempDir` where the
    #extracted stdout was just written -- confirm tempDir is expected to
    #live under directory
    mc.run(analysis_dir=directory, outdir=outDir)
Example #10
0
    def perform_assembly(self,
                         bam_file,
                         out_dir=None,
                         out_suffix="_stringtie",
                         objectid="NA"):
        """Run stringtie on a bam file and return the resulting gtf.

        Parameters
        ----------

        bam_file: string
            path to the bam file, passed as positional argument
        out_dir: string
            Output directory, created if missing. Default: directory of bam_file
        out_suffix: string
            Suffix appended to the bam basename to name the gtf file
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
        :return: Path to the output GTF file, or "" on failure
        :rtype: string
        """

        stem = pu.get_file_basename(bam_file)

        #default to the directory holding the bam
        out_dir = out_dir or pu.get_file_directory(bam_file)
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)

        gtf_path = os.path.join(out_dir, stem + out_suffix + ".gtf")

        #"-o" names the output, "--" carries the positional input bam
        stringtie_opts = {"-o": gtf_path, "--": (bam_file, )}

        #call stringtie
        succeeded = self.run(None,
                             objectid=objectid,
                             target=gtf_path,
                             **stringtie_opts)

        #the gtf may only be missing during a dry run
        if succeeded and (pu.check_files_exist(gtf_path) or _dryrun):
            return gtf_path

        return ""
Example #11
0
 def perform_quant(self,sra_object,out_suffix="",out_dir="",objectid="NA"):
     """Run kallisto quant on the fastq files of an SRA object

     sra_object: SRA
         Supplies layout, directory, srr_accession and the fastq paths
     out_suffix: str
         Inserted into the final file name: abundance<out_suffix>.tsv
     out_dir: str
         Output directory, created if missing when given.
         Default: <sra_object.directory>/kallisto_out
     objectid: str
         Accepted for interface compatibility; the command is run with
         sra_object.srr_accession as its object id.

     :return: Path to abundance<out_suffix>.tsv, or "" on failure
     :rtype: string
     """

     if not out_dir:
         out_dir=os.path.join(sra_object.directory,"kallisto_out")
     elif not pu.check_paths_exist(out_dir):
         #create out_dir if not exists
         pu.mkdir(out_dir)

     #single end runs need the extra --single switch
     if sra_object.layout == 'PAIRED':
         reads=(sra_object.fastq_path,sra_object.fastq2_path)
         quant_opts={"-o":out_dir,"-i":self.index}
     else:
         reads=(sra_object.fastq_path,)
         quant_opts={"-o":out_dir,"--single":"","-i":self.index}

     #kallisto writes abundance.tsv, which is renamed afterwards
     raw_tsv=os.path.join(out_dir,"abundance.tsv")
     final_tsv=os.path.join(out_dir,"abundance"+out_suffix+".tsv")

     #reuse an existing result unless forced
     if not _force and pu.check_files_exist(final_tsv):
         pu.print_green('Target files {} already exist.'.format(final_tsv))
         return final_tsv

     #call kallisto
     succeeded=self.run(*reads,subcommand='quant',objectid=sra_object.srr_accession,target=raw_tsv,**quant_opts)
     if not succeeded:
         return ""

     #rename the output; nothing is produced during a dry run
     if not _dryrun:
         pe.move_file(raw_tsv,final_tsv)
         if not pu.check_files_exist(final_tsv):
             return ""
     return final_tsv
Example #12
0
 def perform_quant(self,sra_object,out_suffix="",out_dir="",objectid="NA"):
     """Run salmon quant on the fastq files of an SRA object

     sra_object: SRA
         Supplies layout, directory, srr_accession and the fastq paths
     out_suffix: str
         Inserted into the final file name: quant<out_suffix>.sf
     out_dir: str
         Output directory, created if missing when given.
         Default: <sra_object.directory>/salmon_out
     objectid: str
         Accepted for interface compatibility; the command is run with
         sra_object.srr_accession as its object id.

     :return: Path to quant<out_suffix>.sf, or "" on failure
     :rtype: string
     """

     if not out_dir:
         out_dir=os.path.join(sra_object.directory,"salmon_out")
     elif not pu.check_paths_exist(out_dir):
         #create out_dir if not exists
         pu.mkdir(out_dir)

     #library type is always "A"; read options depend on the layout
     if sra_object.layout == 'PAIRED':
         quant_opts={"-o":out_dir,"-l":"A","-1":sra_object.fastq_path,"-2":sra_object.fastq2_path,"-i":self.index}
     else:
         quant_opts={"-o":out_dir,"-l":"A","-r":sra_object.fastq_path,"-i":self.index}

     #salmon writes quant.sf, which is renamed afterwards
     raw_sf=os.path.join(out_dir,"quant.sf")
     final_sf=os.path.join(out_dir,"quant"+out_suffix+".sf")

     #reuse an existing result unless forced
     if not _force and pu.check_files_exist(final_sf):
         pu.print_green('Target files {} already exist.'.format(final_sf))
         return final_sf

     #call salmon; the renamed file is declared as the target
     succeeded=self.run(None,subcommand='quant',objectid=sra_object.srr_accession,target=final_sf,**quant_opts)
     if not succeeded:
         return ""

     #rename the output; nothing is produced during a dry run
     if not _dryrun:
         pe.move_file(raw_sf,final_sf)
         if not pu.check_files_exist(final_sf):
             return ""
     return final_sf
Example #13
0
def multiqc():
    """Command line entry point: generate a MultiQC report from a log file.

    Arguments are parsed from ``sys.argv[2:]``, i.e. everything after the
    program name and the sub-command. Defaults: the output directory is
    the current directory, the temporary directory is ``./tmp`` (created
    if missing) and no program filter is applied.
    """
    print("Generating html report with multiqc")
    parser = argparse.ArgumentParser(
   
            description='pyrpipe diagnostic utility\nGenerate report with multiqc.',
            
            usage='''pyrpipe_diagnostic multiqc [<args>] <logfile>
                    
                    ''')    
    parser.add_argument('-o', help='out directory \ndefault: <./>',action="store")
    parser.add_argument('-c',help='Dump command options [(a)ll,fa(i)l,(p)ass]\ndefault: a',default='a',action="store")
    parser.add_argument('-v',help='verbose',action="store_true")
    parser.add_argument('-f',help='Filter by programs. Provide a comma-separated list e.g., prefetch,STAR,bowtie2 \ndefault None')
    parser.add_argument('-t',help='Temporary directory. \ndefault ./tmp',action="store")
    #-r is a boolean switch; its help used to advertise a copy-pasted "./tmp" default
    parser.add_argument('-r',help='Remove stdout files after processing. \ndefault False',action="store_true")
    parser.add_argument('logfile', help='The log file generated by pyrpipe',action="store")
    args = parser.parse_args(sys.argv[2:])

    logFile=args.logfile

    if args.v:
        print("Generating MultiQC report")

    #fall back to the current directory
    outDir=os.getcwd() if args.o is None else args.o

    #optional comma-separated program filter
    filters=[] if args.f is None else args.f.split(',')

    #temp dir defaults to ./tmp and is created when missing
    tempDir=args.t if args.t is not None else os.path.join(os.getcwd(),"tmp")
    if not pu.check_paths_exist(tempDir):
        pu.mkdir(tempDir)

    #run multiqc
    generateMultiqcReport(logFile,filters,tempDir,outDir=outDir,coverage=args.c,verbose=args.v,cleanup=args.r)
Example #14
0
def benchmark():
    """Command line entry point: generate a benchmark report from a log file.

    Arguments are parsed from ``sys.argv[2:]``, i.e. everything after the
    program name and the sub-command. The output file name defaults to
    the basename of the log file and always gets the report type (-e)
    appended as extension. The temporary directory defaults to ``./tmp``
    and is created if missing.
    """
    print("Generating benchmarks")
    parser = argparse.ArgumentParser(
   
            description='pyrpipe diagnostic utility\nGenerate benchmark report.',
            
            #usage names this sub-command (it previously showed "report")
            usage='''pyrpipe_diagnostic benchmark [<args>] <logfile>
                    
                    ''')    
    parser.add_argument('-o', help='out file \ndefault: same as input logfile',action="store")
    parser.add_argument('-e', help='report output type: [MD,PDF,HTML] \ndefault: PDF',default='PDF',action="store")
    parser.add_argument('-v',help='verbose',action="store_true")
    parser.add_argument('-f',help='Filter by programs. Provide a comma-separated list e.g., prefetch,STAR,bowtie2 \ndefault None')
    parser.add_argument('-t',help='Temporary directory. \ndefault ./tmp',action="store")
    parser.add_argument('logfile', help='The log file generated by pyrpipe',action="store")
    args = parser.parse_args(sys.argv[2:])

    logFile=args.logfile
    envLog=checkEnvLog(logFile)

    if args.v:
        print("Generating benchmarks")

    #output name: explicit -o or the log file basename, plus the report type
    outFile=pu.get_file_basename(args.logfile) if args.o is None else args.o
    outFile+='.'+args.e

    #optional comma-separated program filter
    filters=[] if args.f is None else args.f.split(',')

    #temp dir defaults to ./tmp and is created when missing
    tempDir=args.t if args.t is not None else os.path.join(os.getcwd(),"tmp")
    if not pu.check_paths_exist(tempDir):
        pu.mkdir(tempDir)

    generateBenchmarkReport(logFile,envLog,filters,tempDir,outFile=outFile,verbose=args.v)
Example #15
0
    def createMikadoGTFlist(self,
                            out_file,
                            out_dir,
                            searchPath,
                            searchQuery="*.gtf",
                            strand=False):
        """Create a file to be used by mikado configure

        Files are searched recursively under searchPath. Each match
        contributes one tab-separated line holding the file path, its
        basename and ``str(strand)``. Files for which
        ``pu.get_file_basename`` returns an empty value are skipped.

        out_file: str
            outfile name without extension; ".txt" is appended
        out_dir: str
            path to out_dir; created if it does not exist
        searchPath: str
            Path where gtf/gff files will be searched
        searchQuery: str
            Query to perform search. Default: "*.gtf"
        strand: bool
            Stranded flag: Default false

        :return: Path to the written list file
        :rtype: string
        """

        files = pe.find_files(searchPath, searchQuery, recursive=True)

        #create out dir
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
        outFilePath = os.path.join(out_dir, out_file + ".txt")

        gtfs = []
        for path in files:
            thisName = pu.get_file_basename(path)
            if thisName:
                gtfs.append("\t".join([path, thisName, str(strand)]))

        #context manager closes the handle even if the write fails
        with open(outFilePath, "w") as handle:
            handle.write("\n".join(gtfs))

        pu.print_green("Mikado list file written to:" + outFilePath)
        return outFilePath
Example #16
0
 def build_index(self,index_path,index_name,fasta,verbose=False,quiet=False,logs=True,objectid="NA",**kwargs):
     """
     build salmon index and store the path to index in self

     index_path: str
         path to the output directory, created if missing
     index_name: str
         index name
     fasta: str
         path to the input fasta file, passed as "-t"
     verbose: bool
         Print stdout and std error
     quiet: bool
         Print nothing
     logs: bool
         Log this command to pyrpipe logs
     objectid: str
         Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
     kwargs: dict
         Options to pass to salmon. "-t" and "-i" are always set by this function and can not be overridden

     :return: status of salmon index
     :rtype: bool
     """

     #check input
     if not pu.check_files_exist(fasta):
         pu.print_boldred("{} does not exist. Exiting".format(fasta))
         return False
     #create out dir
     if not pu.check_paths_exist(index_path):
         if not pu.mkdir(index_path):
             #message used to name hisat2, copied from another wrapper
             print("ERROR in building salmon index. Failed to create index directory.")
             return False
     indexOut=os.path.join(index_path,index_name)
     newOpts={"-t":fasta,"-i":indexOut}
     #newOpts come last so the input and index paths always win
     mergedOpts={**kwargs,**newOpts}

     #call salmon
     status=self.run_salmon("index",verbose=verbose,quiet=quiet,logs=logs,objectid=objectid,**mergedOpts)

     #the index is only accepted if its path exists after a successful run
     if status and pu.check_paths_exist(indexOut):
         self.salmon_index=indexOut
         self.passedArgumentDict['-i']=self.salmon_index
         pu.print_green("salmon index is:"+self.salmon_index)
         return True

     pu.print_boldred("Failed to create salmon index")
     return False
Example #17
0
 def build_index(self,index_path,index_name,fasta,verbose=False,quiet=False,logs=True,objectid="NA",**kwargs):
     """Function to  build kallisto index

     index_path: str
         path to the output directory, created if missing
     index_name: str
         index name
     fasta: str
         path to the input fasta file, passed as positional argument
     verbose: bool
         Print stdout and std error
     quiet: bool
         Print nothing
     logs: bool
         Log this command to pyrpipe logs
     objectid: str
         Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
     kwargs: dict
         Options to pass to kallisto. "--" and "-i" are always set by this function and can not be overridden

     :return: Status of kallisto index
     :rtype: bool
     """

     #check input
     if not pu.check_files_exist(fasta):
         pu.print_boldred("{} does not exist. Exiting".format(fasta))
         return False

     #create out dir
     if not pu.check_paths_exist(index_path):
         if not pu.mkdir(index_path):
             print("ERROR in building kallisto index. Failed to create index directory.")
             return False

     indexOut=os.path.join(index_path,index_name)
     newOpts={"--":(fasta,),"-i":indexOut}
     #newOpts come last so the input and index paths always win
     mergedOpts={**kwargs,**newOpts}

     #call kallisto
     status=self.run_kallisto("index",verbose=verbose,quiet=quiet,logs=logs,objectid=objectid,**mergedOpts)

     #the index is only accepted if the file exists after a successful run
     if status and pu.check_files_exist(indexOut):
         self.kallisto_index=indexOut
         self.passedArgumentDict['-i']=self.kallisto_index
         pu.print_green("kallisto_index is:"+self.kallisto_index)
         return True

     #covers both a failed command and a missing index file, so the
     #function never falls through and returns None
     pu.print_boldred("Failed to create kallisto index")
     return False
Example #18
0
 def __init__(self,log_file,env_log,out_dir=""):
     """Parse the logs and prepare the benchmark output directory.

     log_file: str
         Log file; must exist
     env_log: str
         Environment log file; must exist
     out_dir: str
         Parent of the 'benchmark_reports' directory. Default: current directory

     Raises Exception if an input file is missing or the output
     directory can not be created.
     """
     #both inputs are required
     if not pu.check_files_exist(log_file,env_log):
         raise Exception("Please check input for benchmark report. {} {}".format(log_file,env_log))

     report_parent=out_dir if out_dir else os.getcwd()

     self.log_file=log_file
     self.env_log=env_log
     self.runtimes_by_prog={}
     self.runtimes_by_object={}

     #parse before touching the output location
     pu.print_blue("parsing log...")
     self.parse_logs()
     pu.print_blue("done.")

     #reports go to <out_dir>/benchmark_reports
     self.benchmark_dir=os.path.join(report_parent,'benchmark_reports')
     if not pu.check_paths_exist(self.benchmark_dir) and not pu.mkdir(self.benchmark_dir):
         raise Exception("Error running benchmarks. Can not create output directory {}".format(self.benchmark_dir))
Example #19
0
 def perform_assembly(self,bam_file,out_dir="",out_suffix="_cufflinks",reference_gtf=None,threads=None,overwrite=True,verbose=False,quiet=False,logs=True,objectid="NA",**kwargs):
     """Function to run cufflinks with BAM file as input.

     Parameters
     ----------
     bam_file: string
         path to bam file
     out_dir:
         output directory, created if missing. Default: directory of bam_file
     out_suffix: string
         Suffix for the output gtf file
     reference_gtf: str
         Path to reference gtf, passed as "-g" when given
     threads: int
         Number of threads to use. Default: self.threads
     overwrite: bool
         Overwrite if output file already exists.
     verbose: bool
         Print stdout and std error
     quiet: bool
         Print nothing
     logs: bool
         Log this command to pyrpipe logs
     objectid: str
         Provide an id to attach with this command e.g. the SRR accession.
     kwargs: dict
         Options to pass to cufflinks. These override "-o", "--", "-p" and "-g"

     :return: Returns the path to output GTF file, or "" on any failure
     :rtype: string
     """

     #create path to output file
     fname=pu.get_file_basename(bam_file)
     if not out_dir:
         out_dir=pu.get_file_directory(bam_file)
     elif not pu.check_paths_exist(out_dir):
         pu.mkdir(out_dir)
     out_gtf_file=os.path.join(out_dir,fname+out_suffix+".gtf")

     #Handle overwrite: reuse an existing result when overwriting is disabled
     if not overwrite and os.path.isfile(out_gtf_file):
         print("The file "+out_gtf_file+" already exists. Exiting..")
         return out_gtf_file

     if not threads:
         threads=self.threads

     #Add output file name and input bam
     new_opts={"-o":out_dir,"--":(bam_file,),"-p":str(threads)}

     #add ref gtf
     if reference_gtf:
         if not pu.check_files_exist(reference_gtf):
             pu.print_boldred("Error: Provided reference GTF {} doesn't exist. Exiting...".format(reference_gtf))
             return ""

         new_opts["-g"]=reference_gtf

     merged_opts={**new_opts,**kwargs}

     #call cufflinks
     status=self.run_cufflinks(verbose=verbose,quiet=quiet,logs=logs,objectid=objectid,**merged_opts)
     if not status:
         return ""

     #move out_dir/transcripts.gtf to outfile
     pe.move_file(os.path.join(out_dir,"transcripts.gtf"),out_gtf_file)
     if pu.check_files_exist(out_gtf_file):
         return out_gtf_file

     #a missing gtf after a successful run is reported as "" like every
     #other failure, instead of falling through and returning None
     return ""
Example #20
0
    def runMikadoConfigure(self,
                           listFile,
                           genome,
                           mode,
                           scoring,
                           junctions,
                           out_file,
                           out_dir=None,
                           verbose=False,
                           quiet=False,
                           logs=True,
                           objectid="NA",
                           **kwargs):
        """Wrapper to run mikado configure
        Make sure the paths in list file are global.
        Parameters
        ----------
        listFile: str
            List file, passed as "--list"; must exist
        genome: str
            Reference file, passed as "--reference"; must exist
        mode: str
            Passed as "--mode"
        scoring: str
            Scoring file, passed as "--scoring"; must exist
        junctions: str
            Junctions file, passed as "--junctions"; must exist
        out_file: str
            Name of the configuration file without extension; ".yaml" is appended
        out_dir: str
            Output directory, created if missing. Default: the working
            directory at the time of the call
        verbose, quiet, logs, objectid:
            Passed through to self.runMikado
        kwargs: dict
            Extra options for mikado configure. Options set by this
            function can not be overridden

        :return: Path to the created configuration file, "" on failure
        :rtype: string
        """

        #resolve the default per call; a default of os.getcwd() in the
        #signature would be frozen when the function is defined
        if not out_dir:
            out_dir = os.getcwd()

        #check all file exists
        if not pu.check_files_exist(listFile, genome, junctions, scoring):
            print("Please check mikado input")
            return ""

        #create out dir
        if not pu.check_paths_exist(out_dir):
            if not pu.mkdir(out_dir):
                raise Exception("Exception in mikado configure.")

        outFilePath = os.path.join(out_dir, out_file + ".yaml")

        newOpts = {
            "--list": listFile,
            "--reference": genome,
            "--mode": mode,
            "--scoring": scoring,
            "--junctions": junctions,
            "--": (outFilePath, )
        }

        #merge with kwargs; newOpts come last so they always win
        mergedOpts = {**kwargs, **newOpts}

        status = self.runMikado("configure",
                                verbose=verbose,
                                quiet=quiet,
                                logs=logs,
                                objectid=objectid,
                                **mergedOpts)

        if not status:
            pu.print_boldred(
                "Mikado configure failed.\nPlease make sure the paths in list file are global."
            )
            return ""

        #check if the configuration file was written
        if not pu.check_files_exist(outFilePath):
            return ""

        return outFilePath
Example #21
0
    def perform_qc(self,
                   sra_object,
                   out_dir="",
                   out_suffix="_bbduk",
                   overwrite=True,
                   threads=None,
                   max_memory=None,
                   verbose=False,
                   quiet=False,
                   logs=True,
                   objectid="NA",
                   **kwargs):
        """Run bbduk on fastq files specified by the sra_object

        Parameters
        ----------

        sra_object: SRA
            an SRA object
        out_dir: string
            Path to out dir. Default: sra_object.location
        out_suffix: string
            Suffix for output file name
        overwrite: bool
            overwrite existing files. NOTE: accepted for interface
            compatibility; this method does not read it.
        threads: int
            Num threads to use. Default: self.threads
        max_memory: float
            Max memory to use in GB. Default: self.max_memory
        verbose: bool
            Print stdout and std error
        quiet: bool
            Print nothing
        logs: bool
            Log this command to pyrpipe logs
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.

        kwargs: dict
            options passed to bbduk. These override the options set by this function.

        :return: Returns the path of fastq files after QC. tuple has one item for single end files and 2 for paired. On failure a tuple holding one empty string is returned.
        :rtype: tuple

        """

        #make out_dir
        if not out_dir:
            out_dir = sra_object.location
        elif not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)

        if not threads:
            threads = self.threads
        if not max_memory:
            max_memory = self.max_memory

        #memory limit is handed over as a positional flag, e.g. -Xmx8g
        memory_flag = "-Xmx" + str(max_memory) + "g"

        if sra_object.layout == 'PAIRED':
            fq1 = sra_object.localfastq1Path
            fq2 = sra_object.localfastq2Path

            out_file1Path = os.path.join(
                out_dir,
                pu.get_file_basename(fq1) + out_suffix + ".fastq")
            out_file2Path = os.path.join(
                out_dir,
                pu.get_file_basename(fq2) + out_suffix + ".fastq")

            ioOpts = {
                "in": fq1,
                "in2": fq2,
                "out": out_file1Path,
                "out2": out_file2Path
            }
            outFiles = (out_file1Path, out_file2Path)
        else:
            fq = sra_object.localfastqPath
            out_filePath = os.path.join(
                out_dir,
                pu.get_file_basename(fq) + out_suffix + ".fastq")

            ioOpts = {"in": fq, "out": out_filePath}
            outFiles = (out_filePath, )

        #memory and threads apply to both layouts; previously the
        #single end branch dropped the threads option
        newOpts = {**ioOpts, "--": (memory_flag, ), "threads": str(threads)}
        mergedOpts = {**newOpts, **kwargs}

        #run bbduk
        if self.run_bbduk(verbose=verbose,
                          quiet=quiet,
                          logs=logs,
                          objectid=objectid,
                          **mergedOpts):
            if pu.check_files_exist(*outFiles):
                return outFiles
        return ("", )
Example #22
0
    def sam_to_bam(self,
                   sam_file,
                   out_dir="",
                   out_suffix="",
                   delete_sam=False,
                   verbose=False,
                   quiet=False,
                   logs=True,
                   objectid="NA",
                   **kwargs):
        """Convert a sam file to a bam file using "samtools view".
        The bam file is named after the input sam file plus out_suffix.

        sam_file: string
            Path to the input sam file
        out_dir: string
            Directory for the bam file. Default: directory of sam_file
        out_suffix: string
            Suffix for the output bam file
        delete_sam: bool
            delete the sam file after a successful conversion
        verbose: bool
            Print stdout and std error
        quiet: bool
            Print nothing
        logs: bool
            Log this command to pyrpipe logs
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
        kwargs: dict
            Options to pass to samtools view. Input, "-o" and "-b" set by this function take precedence.

        :return: Returns the path to the bam file. Returns empty string if operation failed.
        :rtype: string
        """
        if out_dir:
            if not pu.check_paths_exist(out_dir):
                pu.mkdir(out_dir)
        else:
            out_dir = pu.get_file_directory(sam_file)

        #bam file goes to out_dir/<sam basename><out_suffix>.bam
        bam_name = pu.get_file_basename(sam_file) + out_suffix + '.bam'
        out_bam = os.path.join(out_dir, bam_name)

        #user options first, then the fixed ones so they win on conflict
        samtools_opts = dict(kwargs)
        samtools_opts.update({"--": (sam_file, ), "-o": out_bam, "-b": ""})

        converted = self.run_samtools("view",
                                      verbose=verbose,
                                      quiet=quiet,
                                      logs=logs,
                                      objectid=objectid,
                                      **samtools_opts)

        if not converted:
            print("Sam to bam failed for:" + sam_file)
            return ""

        #the command succeeded but the bam must also be on disk
        if not pu.check_files_exist(out_bam):
            return ""

        #a failed delete is reported but does not fail the conversion
        if delete_sam and not pe.deleteFileFromDisk(sam_file):
            print("Error deleting sam file:" + sam_file)

        return out_bam
Example #23
0
    def merge_bam(self,
                  *args,
                  out_file="merged",
                  out_dir="",
                  delete_bams=False,
                  verbose=False,
                  quiet=False,
                  logs=True,
                  objectid="NA",
                  **kwargs):
        """Merge multiple bam files into a single file using "samtools merge"

        Parameters
        ----------
        args:tuple
            Paths to bam files to combine. At least two are required.
        out_file: string
            Output file name to save the results. .bam will be added at the end.
        out_dir: string
            Path where to save the merged bam file. Default path is the same as the first bam_file's
        delete_bams: bool
            Delete the input bam files after a successful merge
        verbose: bool
            Print stdout and std error
        quiet: bool
            Print nothing
        logs: bool
            Log this command to pyrpipe logs
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
        kwargs: dict
            Options to pass to samtools merge. The positional arguments set by this function take precedence.

        :return: Returns the path to the merged bam file. Returns empty string if operation failed.
        :rtype: string
        """

        if len(args) < 2:
            print("Please supply at least 2 files to merge")
            return ""

        if not out_dir:
            out_dir = pu.get_file_directory(args[0])
        elif not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)

        outMergedFile = os.path.join(out_dir, out_file + ".bam")

        #samtools merge takes the output file first, then the inputs
        newOpts = {"--": (outMergedFile, ) + args}

        mergedOpts = {**kwargs, **newOpts}

        status = self.run_samtools("merge",
                                   verbose=verbose,
                                   quiet=quiet,
                                   logs=logs,
                                   objectid=objectid,
                                   **mergedOpts)

        if not status:
            print("Bam merge failed for:" + outMergedFile)
            return ""

        #check if bam file exists
        if not pu.check_files_exist(outMergedFile):
            return ""

        #a failed delete is reported but does not fail the merge
        if delete_bams:
            for bam_file in args:
                if not pe.deleteFileFromDisk(bam_file):
                    print("Error deleting bam file:" + bam_file)

        return outMergedFile
Example #24
0
    def build_index(self,index_path,transcriptome,objectid="NA"):
        """Build a salmon index by running "salmon index".

        Parameters
        ----------
        index_path : str
            Path of the index to create.
        transcriptome : str
            Path to the transcriptome file passed to salmon as -t.
        objectid : str, optional
            Id attached to the executed command. The default is "NA".

        Raises
        ------
        OSError
            If the index directory can not be created or the salmon
            command reports failure.

        Returns
        -------
        bool
            True if a usable index exists: either one was already
            present and the module-level _force is not set, or it was
            built and self.check_index() succeeded. False if the
            transcriptome is missing or the built index could not be
            confirmed, which includes runs with _dryrun set.

        """

        #reuse an index that is already on disk unless a rebuild is forced
        if not _force and pu.check_salmonindex(index_path):
            pu.print_green("Salmon index {} already exists.".format(index_path))
            self.index=index_path
            return True

        #the transcriptome is required to build
        if not pu.check_files_exist(transcriptome):
            pu.print_boldred("{} does not exist. Exiting".format(transcriptome))
            return False

        #make sure the directory that will hold the index exists
        index_parent=pu.get_file_directory(index_path)
        if not pu.check_paths_exist(index_parent) and not pu.mkdir(index_parent):
            raise OSError("Error creating salmon index. Failed to create index directory.")

        allowed_args=valid_args._args_SALMON_INDEX

        build_opts={"--threads":_threads,"-t":transcriptome,"-i":index_path}
        #optional parameters from yaml; the options above win on conflict
        params_file=os.path.join(_params_dir,'salmon_index.yaml')
        if pu.check_files_exist(params_file):
            loaded=pl.YAML_loader(params_file)
            build_opts={**loaded.get_kwargs(),**build_opts}

        salmon_cmd=['salmon','index',*pu.parse_unix_args(allowed_args,build_opts)]

        #call salmon
        if not pe.execute_command(salmon_cmd,objectid=objectid):
            raise OSError("Error building salmon index")

        #on a dry run nothing is verified and False is returned
        if pu.check_salmonindex(index_path) and not _dryrun:
            #update object's index
            self.index=index_path
            if self.check_index():
                return True

        return False
Example #25
0
 def build_index(self,index_path,transcriptome,objectid="NA"):
     """Build a kallisto index by running "kallisto index".

     index_path: str
         path of the index to create
     transcriptome: str
         Path to transcriptome, passed as the positional argument
     objectid: str
         Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.

     Raises ValueError if the transcriptome does not exist and OSError
     if the index directory can not be created or the command fails.

     :return: True if the index already existed (and the module-level
         _force is not set) or was built and self.check_index()
         succeeded; False otherwise, which includes runs with _dryrun set
     :rtype: bool
     """
     #reuse an index that is already on disk unless a rebuild is forced
     if not _force and pu.check_files_exist(index_path):
         pu.print_green("Kallisto index {} already exists.".format(index_path))
         self.index=index_path
         return True

     #the transcriptome is required to build
     if not pu.check_files_exist(transcriptome):
         pu.print_boldred("{} does not exist. Exiting".format(transcriptome))
         raise ValueError("Please check input to kallisto index")

     #make sure the directory that will hold the index exists
     index_parent=pu.get_file_directory(index_path)
     if not pu.check_paths_exist(index_parent) and not pu.mkdir(index_parent):
         raise OSError("Error creating kallisto index. Failed to create index directory.")

     build_opts={"-i":index_path}
     #optional parameters from yaml; the options above win on conflict
     params_file=os.path.join(_params_dir,'kallisto_index.yaml')
     if pu.check_files_exist(params_file):
         loaded=pl.YAML_loader(params_file)
         build_opts={**loaded.get_kwargs(),**build_opts}

     #positional args go under the "--" key
     build_opts['--']=(transcriptome,)

     allowed_args=valid_args._args_KALLISTO_INDEX

     kallisto_cmd=['kallisto','index',*pu.parse_unix_args(allowed_args,build_opts)]

     #call kallisto
     if not pe.execute_command(kallisto_cmd,objectid=objectid):
         raise OSError("Error building kallisto index")

     #on a dry run nothing is verified and False is returned
     if pu.check_files_exist(index_path) and not _dryrun:
         #update object's index
         self.index=index_path
         if self.check_index():
             return True

     return False
Example #26
0
    def perform_qc(self,sra_object,out_dir="",out_suffix="_trimgalore",objectid="NA"):
        """Function to perform qc using trimgalore.
        The function perform_qc() is consistent for all QC classess.

        Parameters
        ----------

        sra_object: SRA
            An SRA object whose fastq files will be used
        out_dir: str
            Path to output directory. Default: sra_object.directory
        out_suffix: string
            Suffix for the output fastq files
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.

        :return: Returns the path of fastq files after QC. tuple has one item for single end files and two for paired. Every failure returns a tuple holding one empty string.
        :rtype: tuple
        """
        if not out_dir:
            out_dir=sra_object.directory
        elif not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)

        #get layout
        if sra_object.layout=='PAIRED':
            fq1=sra_object.fastq_path
            fq2=sra_object.fastq2_path
            internal_args=(fq1,fq2)
            internal_kwargs={"--paired":"","-o":out_dir}

            #running trim galore will create two files named
            #<input>_val_1.fq and <input>_val_2.fq;
            #these are moved to the target files below
            file1=os.path.join(out_dir,pu.get_file_basename(fq1)+"_val_1.fq")
            file2=os.path.join(out_dir,pu.get_file_basename(fq2)+"_val_2.fq")
            #targets
            out_file1=os.path.join(out_dir,pu.get_file_basename(fq1)+out_suffix+".fastq")
            out_file2=os.path.join(out_dir,pu.get_file_basename(fq2)+out_suffix+".fastq")

            #check if final files already exists
            if not _force and pu.check_files_exist(out_file1,out_file2):
                pu.print_green('Target files {}, {} already exist.'.format(out_file1,out_file2))
                return out_file1,out_file2

            #run trimgalore
            status=self.run(*internal_args,objectid=objectid,target=[file1,file2],**internal_kwargs)
            if not status:
                return ("",)

            #rename the trimmed files; on a dry run nothing is moved or checked
            if not _dryrun:
                pe.move_file(file1,out_file1,verbose=False)
                pe.move_file(file2,out_file2,verbose=False)
                if not pu.check_files_exist(out_file1,out_file2):
                    #same failure value as every other path (was a bare "")
                    return ("",)

            return out_file1,out_file2

        else:
            fq=sra_object.fastq_path
            internal_args=(fq,)
            internal_kwargs={"-o":out_dir}

            #running trim galore will create one file named <input>_trimmed.fq;
            #it is moved to the target file below
            trimmed_file=os.path.join(out_dir,pu.get_file_basename(fq)+"_trimmed.fq")
            #target
            out_file=os.path.join(out_dir, pu.get_file_basename(fq)+out_suffix+".fastq")
            #check if final files already exists
            if not _force and pu.check_files_exist(out_file):
                pu.print_green('Target files {} already exist.'.format(out_file))
                return (out_file,)

            #run trimgalore
            status=self.run(*internal_args,objectid=objectid,target=trimmed_file,**internal_kwargs)
            if not status:
                return ("",)

            #rename the trimmed file; on a dry run nothing is moved or checked
            if not _dryrun:
                pe.move_file(trimmed_file,out_file)
                if not pu.check_files_exist(out_file):
                    #same failure value as every other path (was a bare "")
                    return ("",)

            return (out_file,)
Example #27
0
    def perform_cleaning(self,sra_object,bbsplit_index,out_dir="",out_suffix="_bbsplit",objectid="NA",**kwargs):
        """
        Remove contaminated reads mapping to given reference using bbsplit

        Parameters
        ----------

        sra_object: SRA
            an SRA object
        bbsplit_index: string
            Path to an existing bbsplit index directory, or to a fasta
            file. For a fasta file the "ref" directory next to it is
            used as index and is created with bbsplit if missing.
        out_dir: string
            Path to output dir. Default: sra_object.directory
        out_suffix: string
            Suffix for output file name
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
        kwargs: dict
            options passed to bbsplit. Input, output and path options set by this function take precedence.

            :return: Returns the path of fastq files after cleaning. tuple has one item for single end files and 2 for paired. On failure a tuple holding one empty string is returned.
            :rtype: tuple
        """

        #locate, or build, the index directory
        if pu.check_paths_exist(bbsplit_index):
            ref_dir=bbsplit_index
        else:
            #not a directory: it must then be a fasta file
            if not pu.check_files_exist(bbsplit_index):
                print("Error: Please check bbsplit index")
                return ("",)
            #look for an index folder "ref" next to the fasta
            ref_dir=os.path.join(pu.get_file_directory(bbsplit_index),"ref")
            if pu.check_paths_exist(ref_dir):
                print("Using bbsplit index: "+ref_dir)
            else:
                print("Creating new index"+ref_dir)
                index_opts=dict(kwargs)
                index_opts.update({"ref_x":bbsplit_index,"path": pu.get_file_directory(bbsplit_index)})
                #run bbsplit to build the index, then confirm it is on disk
                built=self.run_bbsplit(objectid=objectid,**index_opts)
                if not built or not pu.check_paths_exist(ref_dir):
                    print("Error creating bbsplit index.")
                    return ("",)

        #ref_dir points to the ref directory; bbsplit's path option is its parent
        index_root=os.path.dirname(ref_dir)

        #make out_dir
        if out_dir:
            if not pu.check_paths_exist(out_dir):
                pu.mkdir(out_dir)
        else:
            out_dir=sra_object.directory

        #input and output options depend on the layout
        if sra_object.layout=='PAIRED':
            fq1=sra_object.fastq_path
            fq2=sra_object.fastq2_path
            clean1=os.path.join(out_dir,pu.get_file_basename(fq1)+out_suffix+".fastq")
            clean2=os.path.join(out_dir,pu.get_file_basename(fq2)+out_suffix+".fastq")
            io_opts={"in1":fq1,"in2":fq2,"outu1":clean1,"outu2":clean2,"path":index_root}
            run_target=[clean1,clean2]
            cleaned=(clean1,clean2)
        else:
            fq=sra_object.fastq_path
            clean=os.path.join(out_dir,pu.get_file_basename(fq)+out_suffix+".fastq")
            io_opts={"in":fq,"outu":clean,"path":index_root}
            run_target=clean
            cleaned=(clean,)

        bbsplit_opts={**kwargs,**io_opts}

        #run bbsplit and make sure the cleaned files were written
        if self.run_bbsplit(objectid=objectid,target=run_target,**bbsplit_opts):
            if pu.check_files_exist(*cleaned):
                return cleaned

        return("",)
Example #28
0
from pyrpipe import sra, mapping, assembly, qc, tools
from pyrpipe import pyrpipe_utils as pu
from pyrpipe import pyrpipe_engine as pe

# SRR accessions of the runs to download
maizeRun = [
    'SRR1573523', 'SRR999058', 'SRR520999', 'SRR1168424', 'SRR1621015',
    'SRR3084882', 'SRR1620828', 'SRR3053545', 'SRR1620949', 'SRR1620947'
]
# All downloads are kept under this directory; create it on first use
workingDir = "maize_out"
if not pu.check_paths_exist(workingDir):
    pu.mkdir(workingDir)

# Fetch and unpack the reference genome unless the fasta is already present
GENOME = workingDir + "/Zm-B73-REFERENCE-NAM-5.0.fa"
if not pu.check_files_exist(GENOME):
    print("Downloading genome fasta file")
    # The command is split on whitespace into an argument list below,
    # so the target path must not contain spaces
    wget = "wget https://download.maizegdb.org/Zm-B73-REFERENCE-NAM-5.0/Zm-B73-REFERENCE-NAM-5.0.fa.gz -q -O " + GENOME + ".gz"
    pe.execute_command(wget.split(), verbose=True, logs=False)
    pe.execute_command(['gunzip', GENOME + ".gz"], verbose=True, logs=False)

# SRA objects whose fastq download succeeded
sraObjects = []

for x in maizeRun:
    # presumably the second argument is the directory for the run's files -- verify against sra.SRA
    thisSraOb = sra.SRA(x, workingDir)
    # keep only the runs whose download reported success
    if thisSraOb.download_fastq():
        sraObjects.append(thisSraOb)
    else:
        print("Download failed:" + x)

print("Following runs downloaded:")
for ob in sraObjects:
    print(ob.srr_accession)
Example #29
0
    def perform_qc(self,
                   sra_object,
                   out_dir="",
                   out_suffix="_trimgalore",
                   verbose=False,
                   quiet=False,
                   logs=True,
                   objectid="NA",
                   **kwargs):
        """Function to perform qc using trimgalore.
        The function perform_qc() is consistent for all QC classess.

        Parameters
        ----------
        sra_object: SRA
            An SRA object whose fastq files will be used
        out_dir: string
            Path to output directory. Default: sra_object.location
        out_suffix: string
            Suffix for the output fastq files
        verbose: bool
            Print stdout and std error
        quiet: bool
            Print nothing
        logs: bool
            Log this command to pyrpipe logs
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
        kwargs: dict
            Options to pass to trimgalore. Input files, "-o" and "--paired" set by this function take precedence.

            :return: Returns the path of fastq files after QC. tuple has one item for single end files and two for paired. On failure a tuple holding one empty string is returned.
            :rtype: tuple
        """

        if out_dir:
            if not pu.check_paths_exist(out_dir):
                pu.mkdir(out_dir)
        else:
            out_dir = sra_object.location

        #inputs, and the suffix trim galore gives each of its output files
        if sra_object.layout == 'PAIRED':
            reads = (sra_object.localfastq1Path, sra_object.localfastq2Path)
            trimmed_suffixes = ("_val_1.fq", "_val_2.fq")
            layout_opts = {"--paired": "", "--": reads, "-o": out_dir}
        else:
            reads = (sra_object.localfastqPath, )
            trimmed_suffixes = ("_trimmed.fq", )
            #input arguments are given as a tuple under "--"
            layout_opts = {"--": reads, "-o": out_dir}

        #final file names: <input basename><out_suffix>.fastq in out_dir
        targets = tuple(
            os.path.join(out_dir,
                         pu.get_file_basename(fq) + out_suffix + ".fastq")
            for fq in reads)

        #run trimgalore; success is judged from the files checked below
        self.run_trimgalore(verbose=verbose,
                            quiet=quiet,
                            logs=logs,
                            objectid=objectid,
                            **{**kwargs, **layout_opts})

        #trim galore writes <input>_val_1.fq/<input>_val_2.fq (paired) or
        #<input>_trimmed.fq (single); move them to the target names
        produced = [
            os.path.join(out_dir, pu.get_file_basename(fq) + suffix)
            for fq, suffix in zip(reads, trimmed_suffixes)
        ]
        for src, dst in zip(produced, targets):
            pe.move_file(src, dst)

        if not pu.check_files_exist(*targets):
            print("Trimgalore failed")
            return ("", )
        return targets
Example #30
0
    def perform_qc(self,
                   sra_object,
                   out_dir="",
                   out_suffix="_bbduk",
                   overwrite=True,
                   verbose=False,
                   quiet=False,
                   logs=True,
                   objectid="NA",
                   **kwargs):
        """Run bbduk on fastq files specified by the sra_object

        Parameters
        ----------
        sra_object: SRA
            an SRA object
        out_dir: string
            Path to out dir. Default: sra_object.location
        out_suffix: string
            Suffix for output file name
        overwrite: bool
            overwrite existing files. NOTE: this method does not read it.
        verbose (bool): Print stdout and std error
        quiet (bool): Print nothing
        logs (bool): Log this command to pyrpipe logs
        objectid (str): Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.

        kwargs: dict
            options passed to bbduk. The in/out options set by this function take precedence.

        Returns
        tuple
            Returns the path of fastq files after QC. tuple has one item for single end files and 2 for paired. On failure a tuple holding one empty string is returned.
        """

        #make out_dir
        if out_dir:
            if not pu.check_paths_exist(out_dir):
                pu.mkdir(out_dir)
        else:
            out_dir = sra_object.location

        #in/out options and expected result depend on the layout
        if sra_object.layout == 'PAIRED':
            fq1 = sra_object.localfastq1Path
            fq2 = sra_object.localfastq2Path
            name1 = pu.get_file_basename(fq1) + out_suffix + ".fastq"
            name2 = pu.get_file_basename(fq2) + out_suffix + ".fastq"
            cleaned = (os.path.join(out_dir, name1),
                       os.path.join(out_dir, name2))
            io_opts = {
                "in": fq1,
                "in2": fq2,
                "out": cleaned[0],
                "out2": cleaned[1]
            }
        else:
            fq = sra_object.localfastqPath
            name = pu.get_file_basename(fq) + out_suffix + ".fastq"
            cleaned = (os.path.join(out_dir, name), )
            io_opts = {"in": fq, "out": cleaned[0]}

        #user options first, then the in/out options so they win on conflict
        bbduk_opts = dict(kwargs)
        bbduk_opts.update(io_opts)

        #run bbduk; report success only if the output files are on disk
        ran = self.run_bbduk(verbose=verbose,
                             quiet=quiet,
                             logs=logs,
                             objectid=objectid,
                             **bbduk_opts)
        if ran and pu.check_files_exist(*cleaned):
            return cleaned
        return ("", )