Esempio n. 1
0
 def perform_qc(self,sra_object,out_dir="",out_suffix="_bbduk",objectid="NA"):
     """Run bbduk on the fastq files referenced by an SRA object.

     sra_object: SRA
         An SRA object whose fastq files will be used
     out_dir: str
         Path to output directory. When empty, sra_object.directory is used;
         otherwise the directory is created if it does not exist.
     out_suffix: string
         Suffix appended to the input basename to name the output fastq file(s)
     objectid: str
         Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.

     :return: Paths of the fastq files after QC: one item for single end, two for paired.
              Returns ("",) if the command failed or the outputs are missing
              (the existence check is skipped when the module-level _dryrun flag is set).
     :rtype: tuple
     """
     #make out_dir
     if not out_dir:
         out_dir=sra_object.directory
     elif not pu.check_paths_exist(out_dir):
         pu.mkdir(out_dir)

     if sra_object.layout=='PAIRED':
         fq1=sra_object.fastq_path
         fq2=sra_object.fastq2_path
         out_file1Path=os.path.join(out_dir,pu.get_file_basename(fq1)+out_suffix+".fastq")
         out_file2Path=os.path.join(out_dir,pu.get_file_basename(fq2)+out_suffix+".fastq")
         internal_kwargs={"in":fq1,"in2":fq2,"out":out_file1Path,"out2":out_file2Path}

         #run bbduk
         status=self.run(objectid=objectid,target=[out_file1Path,out_file2Path],**internal_kwargs)

         #a failed run must not be reported as success by handing back the target paths
         if not status:
             return ("",)
         if not pu.check_files_exist(out_file1Path,out_file2Path) and not _dryrun:
             return ("",)
         return (out_file1Path,out_file2Path)

     fq=sra_object.fastq_path
     out_filePath=os.path.join(out_dir,pu.get_file_basename(fq)+out_suffix+".fastq")
     internal_kwargs={"in":fq,"out":out_filePath}

     #run bbduk
     status=self.run(objectid=objectid,target=out_filePath,**internal_kwargs)

     #same failure contract as the paired branch
     if not status:
         return ("",)
     if not pu.check_files_exist(out_filePath) and not _dryrun:
         return ("",)
     return (out_filePath,)
Esempio n. 2
0
    def bamtofq(self, bam, oid, rm_bam=True):
        """Run the wrapped command on a bam file to produce fastq files next to it.

        bam: str
            Path to the input bam file; all outputs are placed in its directory
        oid: str
            Used as the prefix of the fastq outputs (<oid>_1.fastq, <oid>_2.fastq)
            and passed to self.run as objectid
        rm_bam: bool
            Delete the input bam when self.run reports success

        :return: The status returned by self.run
        """
        bam_dir = pu.get_file_directory(bam)
        fastq_prefix = os.path.join(bam_dir, oid)

        #temporary file goes under $LOCAL when it is set, otherwise the current directory
        scratch_dir = os.environ.get('LOCAL') or './'
        scratch_file = os.path.join(
            scratch_dir, pu.get_file_basename(bam) + pu.get_timestamp())

        options = {
            'F': fastq_prefix + '_1.fastq',
            'F2': fastq_prefix + '_2.fastq',
            'S': os.path.join(bam_dir, 's.fq'),
            'O': os.path.join(bam_dir, 'o.fq'),
            'O2': os.path.join(bam_dir, 'o2.fq'),
            'T': scratch_file,
            'filename': bam,
        }

        status = self.run(None, objectid=oid, **options)
        if status and rm_bam:
            pe.delete_file(bam)
        return status
Esempio n. 3
0
    def search_fastq(self, path):
        """Search for .fastq files under a directory and populate this object.

        Exactly one file is treated as SINGLE layout and exactly two as PAIRED.
        On success sets layout, location, srr_accession and the local fastq
        path attribute(s).

        path: str
            Directory to search

        :return: True if one or two .fastq files were found, otherwise False
        :rtype: bool
        """
        #sort so that mate 1 / mate 2 assignment does not depend on the order
        #in which find_files happens to list the files
        fq_files = sorted(pe.find_files(path, "*.fastq"))
        n_found = len(fq_files)

        if n_found < 1:
            return False

        if n_found > 2:
            pu.print_boldred("Can not determine .fastq. Exiting...")
            return False

        if n_found == 1:
            #case with single fastq
            self.localfastqPath = fq_files[0]
            pu.print_green("Found .fastq " + self.localfastqPath)
            self.layout = "SINGLE"
        else:
            #case with paired fastq
            self.localfastq1Path, self.localfastq2Path = fq_files
            pu.print_green("Found .fastq " + self.localfastq1Path + " " +
                           self.localfastq2Path)
            self.layout = "PAIRED"

        self.location = path
        self.srr_accession = pu.get_file_basename(fq_files[0])
        return True
Esempio n. 4
0
    def search_sra(self, path):
        """Look for a .sra file under a directory and record it on this object.

        When several .sra files are found a warning is printed and the first
        entry is used. On success sets srr_accession, localSRAFilePath,
        sraFileSize and layout.

        path: str
            Directory to search

        :return: True if a .sra file was found, otherwise False
        :rtype: bool
        """
        candidates = pe.find_files(path, "*.sra")

        if len(candidates) == 0:
            return False

        if len(candidates) > 1:
            pu.print_boldred(
                "Found multiple .sra files. Using the first entry...")

        chosen = candidates[0]
        self.srr_accession = pu.get_file_basename(chosen)
        self.localSRAFilePath = chosen
        self.sraFileSize = pu.get_file_size(self.localSRAFilePath)

        #layout is decided by inspecting the file itself
        self.layout = "PAIRED" if pe.is_paired(self.localSRAFilePath) else "SINGLE"

        pu.print_green("Found .sra " + self.localSRAFilePath)
        return True
Esempio n. 5
0
    def createMikadoGTFlist(self,
                            out_file,
                            out_dir,
                            searchPath,
                            searchQuery="*.gtf",
                            strand=False):
        """Create a file to be used by mikado configure.

        Each line of the written file is tab separated:
        <path>, <basename of path>, <strand flag>.

        out_file: str
            Output file name; ".txt" is appended
        out_dir: str
            Output directory, created if it does not exist
        searchPath: str
            Path where files will be searched
        searchQuery: str
            Query to perform search. Default: "*.gtf"
        strand: bool
            Stranded flag written for every entry. Default: False

        :return: Path to the written list file
        :rtype: string
        """
        files = pe.find_files(searchPath, searchQuery)

        #create out dir
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
        outFilePath = os.path.join(out_dir, out_file + ".txt")

        gtfs = []
        for gtf_path in files:
            label = pu.get_file_basename(gtf_path)
            #entries without a usable basename are skipped
            if label:
                gtfs.append("\t".join([gtf_path, label, str(strand)]))

        #context manager guarantees the handle is closed even if the write fails
        with open(outFilePath, "w") as list_file:
            list_file.write("\n".join(gtfs))

        pu.print_green("Mikado list file written to:" + outFilePath)
        return outFilePath
Esempio n. 6
0
def shell():
    """Command-line entry: generate a bash script from a pyrpipe log file.

    Arguments are parsed from sys.argv[2:]. The output name is the -o value
    (or the basename of the log file when -o is absent) with '.sh' appended.
    The actual work is delegated to generateBashScript.
    """
    print("Generating bash script")

    usage_text = ('pyrpipe_diagnostic report [<args>] <logfile>\n'
                  '                    \n'
                  '                    ')
    parser = argparse.ArgumentParser(
        description='pyrpipe diagnostic utility\nGenerate shell script.',
        usage=usage_text)
    parser.add_argument('-o',
                        help='out file \ndefault: same as input logfile',
                        action="store")
    parser.add_argument('-c',
                        help='Dump command options [(a)ll,fa(i)l,(p)ass]\ndefault: a',
                        default='a',
                        action="store")
    parser.add_argument('-v', help='verbose', action="store_true")
    parser.add_argument('-f',
                        help='Filter by programs. Provide a comma-separated list e.g., prefetch,STAR,bowtie2 \ndefault None')
    parser.add_argument('logfile',
                        help='The log file generated by pyrpipe',
                        action="store")
    opts = parser.parse_args(sys.argv[2:])

    if opts.v:
        print("Generating report")

    #output name defaults to the log file's basename
    out_stem = pu.get_file_basename(opts.logfile) if opts.o is None else opts.o
    program_filters = [] if opts.f is None else opts.f.split(',')

    generateBashScript(opts.logfile, out_stem + '.sh', program_filters, opts.c)
Esempio n. 7
0
def checkEnvLog(logFile):
    """Return the path of the ENV log that accompanies logFile.

    The ENV log is expected in the same directory, named
    <basename of logFile>ENV.log. If either file is missing a message is
    printed and the process exits with status 1.
    """
    log_dir = pu.get_file_directory(logFile)
    env_name = pu.get_file_basename(logFile) + "ENV.log"
    envLog = os.path.join(log_dir, env_name)

    if pu.check_files_exist(logFile, envLog):
        return envLog

    print("Please check missing log files. Exiting.")
    sys.exit(1)
Esempio n. 8
0
def sortbam(bam, oid):
    """Sort a bam file with sambamba (25 threads, 100G memory limit).

    The output is written next to the input as <basename>_sorted.bam.

    bam: str
        Path to the input bam file
    oid: str
        Object id passed to pe.execute_command

    :return: Path to the sorted bam, or "" if the command failed
    :rtype: string
    """
    outfile = pu.get_file_basename(bam) + "_sorted.bam"
    outdir = pu.get_file_directory(bam)
    outpath = os.path.join(outdir, outfile)

    #build argv directly: splitting a joined string would break paths
    #that contain whitespace into several arguments
    cmd = ['sambamba', 'sort', '-t', '25', '-m', '100G', '-o', outpath, bam]

    if not pe.execute_command(cmd, logs=True, objectid=oid):
        return ""
    return outpath
Esempio n. 9
0
    def perform_assembly(self,
                         bam_file,
                         out_dir=None,
                         out_suffix="_stringtie",
                         objectid="NA"):
        """Run stringtie on a bam file and return the resulting GTF path.

        Parameters
        ----------

        bam_file: string
            Path to the input bam file
        out_dir: string
            Output directory. Defaults to the directory of bam_file; created if missing.
        out_suffix: string
            Suffix appended to the bam basename to name the output gtf file
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.

        :return: Path to the output GTF file, or "" if the run failed or the
                 file is missing (the existence check is skipped when the
                 module-level _dryrun flag is set)
        :rtype: string
        """
        fname = pu.get_file_basename(bam_file)

        out_dir = out_dir or pu.get_file_directory(bam_file)
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)

        out_gtf_file = os.path.join(out_dir, fname + out_suffix + ".gtf")

        #output option plus the positional bam under the '--' key
        options = {"-o": out_gtf_file, "--": (bam_file, )}

        #call stringtie
        status = self.run(None,
                          objectid=objectid,
                          target=out_gtf_file,
                          **options)

        if not status:
            return ""

        #confirm the gtf was actually produced
        if not pu.check_files_exist(out_gtf_file) and not _dryrun:
            return ""
        return out_gtf_file
Esempio n. 10
0
def benchmark():
    """Command-line entry: generate a benchmark report from a pyrpipe log file.

    Arguments are parsed from sys.argv[2:]. The matching ENV log is resolved
    through checkEnvLog, a temporary directory is created when needed
    (default: ./tmp under the current working directory) and the work is
    delegated to generateBenchmarkReport.
    """
    print("Generating benchmarks")

    usage_text = ('pyrpipe_diagnostic report [<args>] <logfile>\n'
                  '                    \n'
                  '                    ')
    parser = argparse.ArgumentParser(
        description='pyrpipe diagnostic utility\nGenerate benchmark report.',
        usage=usage_text)
    parser.add_argument('-o',
                        help='out file \ndefault: same as input logfile',
                        action="store")
    parser.add_argument('-e',
                        help='report output type: [MD,PDF,HTML] \ndefault: PDF',
                        default='PDF',
                        action="store")
    parser.add_argument('-v', help='verbose', action="store_true")
    parser.add_argument('-f',
                        help='Filter by programs. Provide a comma-separated list e.g., prefetch,STAR,bowtie2 \ndefault None')
    parser.add_argument('-t',
                        help='Temporary directory. \ndefault ./tmp',
                        action="store")
    parser.add_argument('logfile',
                        help='The log file generated by pyrpipe',
                        action="store")
    opts = parser.parse_args(sys.argv[2:])

    log_path = opts.logfile
    env_log = checkEnvLog(log_path)

    if opts.v:
        print("Generating benchmarks")

    #output name: -o value or the log basename, plus the requested extension
    out_stem = pu.get_file_basename(opts.logfile) if opts.o is None else opts.o
    out_path = out_stem + '.' + opts.e

    program_filters = [] if opts.f is None else opts.f.split(',')

    #temporary working directory
    tmp_dir = opts.t if opts.t is not None else os.path.join(os.getcwd(), "tmp")
    if not pu.check_paths_exist(tmp_dir):
        pu.mkdir(tmp_dir)

    generateBenchmarkReport(log_path,
                            env_log,
                            program_filters,
                            tmp_dir,
                            outFile=out_path,
                            verbose=opts.v)
Esempio n. 11
0
    def createMikadoGTFlist(self,
                            out_file,
                            out_dir,
                            searchPath,
                            searchQuery="*.gtf",
                            strand=False):
        """Create a file to be used by mikado configure.

        Each line of the written file is tab separated:
        <path>, <basename of path>, <strand flag>.

        out_file: str
            outfile name; ".txt" is appended
        out_dir: str
            path to out_dir, created if it does not exist
        searchPath: str
            Path where gtf/gff files will be searched (recursively)
        searchQuery: str
            Query to perform search. Default: "*.gtf"
        strand: bool
            Stranded flag: Default false

        :return: Path to the written list file
        :rtype: string
        """
        files = pe.find_files(searchPath, searchQuery, recursive=True)

        #create out dir
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
        outFilePath = os.path.join(out_dir, out_file + ".txt")

        gtfs = []
        for gtf_path in files:
            label = pu.get_file_basename(gtf_path)
            #entries without a usable basename are skipped
            if label:
                gtfs.append("\t".join([gtf_path, label, str(strand)]))

        #context manager guarantees the handle is closed even if the write fails
        with open(outFilePath, "w") as list_file:
            list_file.write("\n".join(gtfs))

        pu.print_green("Mikado list file written to:" + outFilePath)
        return outFilePath
Esempio n. 12
0
def report():
    """Command-line entry: generate an analysis report from a pyrpipe log file.

    Arguments are parsed from sys.argv[2:]. An HTML report is built with
    generateHTMLReport and then written as pdf, html or markdown depending
    on -e. Any other -e value only prints an error message.
    """
    usage_text = ('pyrpipe_diagnostic report [<args>] <logfile>\n'
                  '                    \n'
                  '                    ')
    parser = argparse.ArgumentParser(
        description='pyrpipe diagnostic utility\nGenerate analysis report.',
        usage=usage_text)
    parser.add_argument('-o',
                        help='out file \ndefault: same as input logfile',
                        action="store")
    parser.add_argument('-e',
                        help='report output type: [md,pdf,html] \ndefault: pdf',
                        default='pdf',
                        action="store")
    parser.add_argument('-c',
                        help='Report options [(f)ull,fa(i)l,(p)ass]\ndefault: f',
                        default='f',
                        action="store")
    parser.add_argument('-v', help='verbose', action="store_true")
    parser.add_argument('logfile',
                        help='The log file generated by pyrpipe',
                        action="store")
    opts = parser.parse_args(sys.argv[2:])

    log_path = opts.logfile
    env_log = checkEnvLog(log_path)

    if opts.v:
        print("Generating report")

    #output name: -o value or the log basename, plus the requested extension
    out_stem = pu.get_file_basename(opts.logfile) if opts.o is None else opts.o
    out_path = out_stem + '.' + opts.e

    if opts.e not in ['pdf', 'html', 'md']:
        pu.print_boldred("unknown extension:" + opts.e + ". Exiting")
        return

    #every supported format starts from the same HTML rendering
    html_report = generateHTMLReport('simpleDiv.html',
                                     log_path,
                                     env_log,
                                     coverage=opts.c)
    if opts.e == 'pdf':
        writeHtmlToPdf(html_report, out_path)
    elif opts.e == 'html':
        writeHtml(html_report, out_path)
    elif opts.e == 'md':
        writeHtmlToMarkdown(html_report, out_path)
Esempio n. 13
0
def checkEnvLog(logFile):
    """
    Check that a log file and its ENV log exist and return the ENV log path.

    The ENV log is looked up in the same directory as logFile under the
    name <basename of logFile>ENV.log. If either file is missing a message
    is printed and the process exits with status 1.

    Parameters
    ----------
    logFile : str
        path to log file.

    Returns
    -------
    envLog : str
        path to the corresponding ENV log.

    """
    parent_dir = pu.get_file_directory(logFile)
    stem = pu.get_file_basename(logFile)
    envLog = os.path.join(parent_dir, stem + "ENV.log")

    both_present = pu.check_files_exist(logFile, envLog)
    if not both_present:
        print("Please check missing log files. Exiting.")
        sys.exit(1)

    return envLog
Esempio n. 14
0
    def sam_to_bam(self,
                   sam_file,
                   out_dir="",
                   out_suffix="",
                   delete_sam=False,
                   verbose=False,
                   quiet=False,
                   logs=True,
                   objectid="NA",
                   **kwargs):
        """Convert a sam file to a bam file with "samtools view".
        The output bam is named <basename of sam_file><out_suffix>.bam.

        sam_file: string
            Path to the input sam file
        out_dir: string
            Output directory. Defaults to the directory of sam_file; created if missing.
        out_suffix: string
            Suffix for the output bam file
        delete_sam: bool
            delete the sam file after conversion
        verbose: bool
            Print stdout and std error
        quiet: bool
            Print nothing
        logs: bool
            Log this command to pyrpipe logs
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
        kwargs: dict
            Extra options passed to run_samtools. The "--", "-o" and "-b"
            entries set by this method take precedence over same-named kwargs.

        :return: Returns the path to the bam file. Returns empty string if operation failed.
        :rtype: string
        """
        if not out_dir:
            out_dir = pu.get_file_directory(sam_file)
        elif not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)

        #output will be out_bam
        out_bam = os.path.join(
            out_dir, pu.get_file_basename(sam_file) + out_suffix + '.bam')

        #start from the caller's options, then force input/output/-b
        samtools_opts = dict(kwargs)
        samtools_opts.update({"--": (sam_file, ), "-o": out_bam, "-b": ""})

        converted = self.run_samtools("view",
                                      verbose=verbose,
                                      quiet=quiet,
                                      logs=logs,
                                      objectid=objectid,
                                      **samtools_opts)

        if not converted:
            print("Sam to bam failed for:" + sam_file)
            return ""

        #the command reported success; make sure the bam is really there
        if not pu.check_files_exist(out_bam):
            return ""

        #a failed delete is reported but does not fail the conversion
        if delete_sam and not pe.deleteFileFromDisk(sam_file):
            print("Error deleting sam file:" + sam_file)

        return out_bam
Esempio n. 15
0
    def perform_assembly(self,
                         bam_file,
                         out_dir="",
                         out_suffix="_stringtie",
                         overwrite=True,
                         verbose=False,
                         quiet=False,
                         logs=True,
                         objectid="NA",
                         **kwargs):
        """Function to run stringtie using a bam file.

        Parameters
        ----------

        bam_file: string
            path to the bam file
        out_dir: string
            Output directory. Defaults to the directory of bam_file.
        out_suffix: string
            Suffix for the output gtf file
        overwrite: bool
            Overwrite if output file already exists. When False and the
            output already exists, it is returned without running stringtie.
        verbose: bool
            Print stdout and std error
        quiet: bool
            Print nothing
        logs: bool
            Log this command to pyrpipe logs
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
        kwargs: dict
            Extra options passed to run_stringtie. The "-o" and "--" entries
            set by this method take precedence over same-named kwargs.

        :return: Returns the path to output GTF file, or "" if the run failed
                 or the GTF file was not produced
        :rtype: string
        """

        #create path to output file
        fname = pu.get_file_basename(bam_file)

        if not out_dir:
            out_dir = pu.get_file_directory(bam_file)

        out_gtf_file = os.path.join(out_dir, fname + out_suffix + ".gtf")

        #handle overwrite: reuse an existing output when asked not to overwrite
        if not overwrite and os.path.isfile(out_gtf_file):
            print("The file " + out_gtf_file +
                  " already exists. Exiting..")
            return out_gtf_file

        #Add output file name and input bam
        new_opts = {"-o": out_gtf_file, "--": (bam_file, )}
        merged_opts = {**kwargs, **new_opts}

        #call stringtie
        status = self.run_stringtie(verbose=verbose,
                                    quiet=quiet,
                                    logs=logs,
                                    objectid=objectid,
                                    **merged_opts)

        #success requires both a good status and the gtf on disk
        if status and pu.check_files_exist(out_gtf_file):
            return out_gtf_file

        #always return a string: previously a successful status with a
        #missing gtf fell through and returned None
        return ""
Esempio n. 16
0
    def sort_bam(self,
                 bam_file,
                 out_dir="",
                 out_suffix="",
                 threads=None,
                 delete_bam=False,
                 verbose=False,
                 quiet=False,
                 logs=True,
                 objectid="NA",
                 **kwargs):
        """Sort a bam file with "samtools sort".
        The output is named <basename of bam_file><out_suffix>_sorted.bam.

        bam_file: str
            Path to the input bam file
        out_dir: str
            Path to output directory. Defaults to the directory of bam_file; created if missing.
        out_suffix: str
            Output file suffix
        threads: int
            Number of threads. Default: self.threads.
        delete_bam: bool
            Delete input bam_file
        verbose: bool
            Print stdout and std error
        quiet: bool
            Print nothing
        logs: bool
            Log this command to pyrpipe logs
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
        kwargs: dict
            Options to pass to samtools. These override the "--", "-o" and
            "-@" options set by this method.

        :return: Returns path to the sorted bam file. Returns empty string if operation failed.
        :rtype: string

        """
        if not out_dir:
            out_dir = pu.get_file_directory(bam_file)
        elif not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)

        sorted_bam = os.path.join(
            out_dir,
            pu.get_file_basename(bam_file) + out_suffix + '_sorted.bam')

        #fall back to the instance-wide thread count
        threads = threads or self.threads

        #defaults first; caller-supplied options win
        samtools_opts = {
            "--": (bam_file, ),
            "-o": sorted_bam,
            "-@": str(threads),
        }
        samtools_opts.update(kwargs)

        sorted_ok = self.run_samtools("sort",
                                      verbose=verbose,
                                      quiet=quiet,
                                      logs=logs,
                                      objectid=objectid,
                                      **samtools_opts)

        if not sorted_ok:
            print("Bam sort failed for:" + bam_file)
            return ""

        #the command reported success; make sure the sorted bam is really there
        if not pu.check_files_exist(sorted_bam):
            return ""

        #a failed delete is reported but does not fail the sort
        if delete_bam and not pe.deleteFileFromDisk(bam_file):
            print("Error deleting sam file:" + bam_file)

        return sorted_bam
Esempio n. 17
0
    def perform_qc(self,sra_object,out_dir="",out_suffix="_trimgalore",objectid="NA"):
        """Function to perform qc using trimgalore.
        The function perform_qc() is consistent for all QC classess.

        Parameters
        ----------

        sra_object: SRA
            An SRA object whose fastq files will be used
        out_dir: str
            Path to output directory. Defaults to sra_object.directory; created if missing.
        out_suffix: string
            Suffix appended to the input basename to name the output fastq file(s)
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.

        :return: Returns the path of fastq files after QC. tuple has one item for single end files and two for paired.
                 On any failure the tuple ("",) is returned.
        :rtype: tuple
        """
        if not out_dir:
            out_dir=sra_object.directory
        elif not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)

        #get layout
        if sra_object.layout=='PAIRED':
            fq1=sra_object.fastq_path
            fq2=sra_object.fastq2_path
            internal_args=(fq1,fq2)
            internal_kwargs={"--paired":"","-o":out_dir}

            #trim galore creates <input>_val_1.fq and <input>_val_2.fq;
            #these are moved to the final out files after the run
            file1=os.path.join(out_dir,pu.get_file_basename(fq1)+"_val_1.fq")
            file2=os.path.join(out_dir,pu.get_file_basename(fq2)+"_val_2.fq")
            #targets
            out_file1=os.path.join(out_dir,pu.get_file_basename(fq1)+out_suffix+".fastq")
            out_file2=os.path.join(out_dir,pu.get_file_basename(fq2)+out_suffix+".fastq")

            #check if final files already exists
            if not _force and pu.check_files_exist(out_file1,out_file2):
                pu.print_green('Target files {}, {} already exist.'.format(out_file1,out_file2))
                return out_file1,out_file2

            #run trimgalore
            status=self.run(*internal_args,objectid=objectid,target=[file1,file2],**internal_kwargs)
            if not status:
                return ("",)

            #rename the trimgalore outputs; nothing to move in a dry run
            if not _dryrun:
                pe.move_file(file1,out_file1,verbose=False)
                pe.move_file(file2,out_file2,verbose=False)
                if not pu.check_files_exist(out_file1,out_file2):
                    #keep the documented tuple return type on failure
                    return ("",)

            return out_file1,out_file2

        fq=sra_object.fastq_path
        internal_args=(fq,)
        internal_kwargs={"-o":out_dir}

        #trim galore creates one file named <input>_trimmed.fq;
        #it is moved to the final out file after the run
        trimmed_file=os.path.join(out_dir,pu.get_file_basename(fq)+"_trimmed.fq")
        #target
        out_file=os.path.join(out_dir, pu.get_file_basename(fq)+out_suffix+".fastq")

        #check if final files already exists
        if not _force and pu.check_files_exist(out_file):
            pu.print_green('Target files {} already exist.'.format(out_file))
            return (out_file,)

        #run trimgalore
        status=self.run(*internal_args,objectid=objectid,target=trimmed_file,**internal_kwargs)
        if not status:
            return ("",)

        #rename the trimgalore output; nothing to move in a dry run
        if not _dryrun:
            pe.move_file(trimmed_file,out_file)
            if not pu.check_files_exist(out_file):
                #keep the documented tuple return type on failure
                return ("",)

        return (out_file,)
Esempio n. 18
0
    def perform_qc(self,
                   sra_object,
                   out_dir="",
                   out_suffix="_bbduk",
                   overwrite=True,
                   threads=None,
                   max_memory=None,
                   verbose=False,
                   quiet=False,
                   logs=True,
                   objectid="NA",
                   **kwargs):
        """Run bbduk on fastq files specified by the sra_object

        Parameters
        ----------

        sra_object: SRA
            an SRA object
        out_dir: string
            Path to out dir. Default: sra_object.location
        out_suffix: string
            Suffix for output file name
        overwrite: bool
            overwrite existing files (accepted for interface compatibility;
            not referenced by this method)
        threads: int
            Num threads to use. Default: self.threads
        max_memory: float
            Max memory to use in GB. Default: self.max_memory
        verbose: bool
            Print stdout and std error
        quiet: bool
            Print nothing
        logs: bool
            Log this command to pyrpipe logs
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.

        kwargs: dict
            options passed to bbduk; these override the options set by this method

        :return: Returns the path of fastq files after QC. tuple has one item for single end files and 2 for paired.
                 Returns ("",) if bbduk failed or the outputs are missing.
        :rtype: tuple

        """

        #make out_dir
        if not out_dir:
            out_dir = sra_object.location
        elif not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)

        if not threads:
            threads = self.threads
        if not max_memory:
            max_memory = self.max_memory

        #memory limit passed as a positional flag, e.g. -Xmx8g
        memory_flag = "-Xmx" + str(max_memory) + "g"

        if sra_object.layout == 'PAIRED':
            fq1 = sra_object.localfastq1Path
            fq2 = sra_object.localfastq2Path

            out_file1Path = os.path.join(
                out_dir, pu.get_file_basename(fq1) + out_suffix + ".fastq")
            out_file2Path = os.path.join(
                out_dir, pu.get_file_basename(fq2) + out_suffix + ".fastq")

            newOpts = {
                "in": fq1,
                "in2": fq2,
                "out": out_file1Path,
                "out2": out_file2Path,
                "--": (memory_flag, ),
                "threads": str(threads)
            }
            expected_outputs = (out_file1Path, out_file2Path)
        else:
            fq = sra_object.localfastqPath
            out_filePath = os.path.join(
                out_dir, pu.get_file_basename(fq) + out_suffix + ".fastq")

            #threads was previously dropped for single-end runs; pass it
            #just like the paired branch does
            newOpts = {
                "in": fq,
                "out": out_filePath,
                "--": (memory_flag, ),
                "threads": str(threads)
            }
            expected_outputs = (out_filePath, )

        mergedOpts = {**newOpts, **kwargs}

        #run bbduk
        if self.run_bbduk(verbose=verbose,
                          quiet=quiet,
                          logs=logs,
                          objectid=objectid,
                          **mergedOpts):
            if pu.check_files_exist(*expected_outputs):
                return expected_outputs
        return ("", )
Esempio n. 19
0
 def perform_assembly(self,bam_file,out_dir="",out_suffix="_cufflinks",reference_gtf=None,threads=None,overwrite=True,verbose=False,quiet=False,logs=True,objectid="NA",**kwargs):
     """Function to run cufflinks with BAM file as input.

     Parameters
     ----------
     bam_file: string
         path to bam file
     out_dir: string
         output directory. Defaults to the directory of bam_file; created if missing.
     out_suffix: string
         Suffix for the output gtf file
     reference_gtf: str
         Path to reference gtf; passed to cufflinks as -g when provided
     threads: int
         Number of threads to use. Default: self.threads
     overwrite: bool
         Overwrite if output file already exists. When False and the
         output already exists, it is returned without running cufflinks.
     verbose: bool
         Print stdout and std error
     quiet: bool
         Print nothing
     logs: bool
         Log this command to pyrpipe logs
     objectid: str
         Provide an id to attach with this command e.g. the SRR accession.
     kwargs: dict
         Options to pass to cufflinks; these override the options set by this method.

     :return: Returns the path to output GTF file, or "" if the reference
              GTF is missing, the run failed or the GTF file was not produced
     :rtype: string
     """

     #create path to output file
     fname=pu.get_file_basename(bam_file)
     if not out_dir:
         out_dir=pu.get_file_directory(bam_file)
     elif not pu.check_paths_exist(out_dir):
         pu.mkdir(out_dir)
     out_gtf_file=os.path.join(out_dir,fname+out_suffix+".gtf")

     #handle overwrite: reuse an existing output when asked not to overwrite
     if not overwrite and os.path.isfile(out_gtf_file):
         print("The file "+out_gtf_file+" already exists. Exiting..")
         return out_gtf_file

     if not threads:
         threads=self.threads

     #Add output dir, input bam and thread count
     new_opts={"-o":out_dir,"--":(bam_file,),"-p":str(threads)}

     #add ref gtf
     if reference_gtf:
         if not pu.check_files_exist(reference_gtf):
             pu.print_boldred("Error: Provided reference GTF {} doesn't exist. Exiting...".format(reference_gtf))
             return ""
         new_opts["-g"]=reference_gtf

     merged_opts={**new_opts,**kwargs}

     #call cufflinks
     status=self.run_cufflinks(verbose=verbose,quiet=quiet,logs=logs,objectid=objectid,**merged_opts)

     if status:
         #move out_dir/transcripts.gtf to the suffixed output file
         pe.move_file(os.path.join(out_dir,"transcripts.gtf"),out_gtf_file)
         if pu.check_files_exist(out_gtf_file):
             return out_gtf_file

     #always return a string: previously a successful status with a
     #missing gtf fell through and returned None
     return ""
Esempio n. 20
0
    def perform_qc(self,
                   sra_object,
                   out_dir="",
                   out_suffix="_trimgalore",
                   verbose=False,
                   quiet=False,
                   logs=True,
                   objectid="NA",
                   **kwargs):
        """Trim the fastq files of an SRA object with trimgalore.

        The signature of perform_qc() is shared by all QC classes.

        Parameters
        ----------
        sra_object: SRA
            An SRA object whose fastq files will be used
        out_dir: string
            Output directory. Created if missing; defaults to
            ``sra_object.location`` when empty.
        out_suffix: string
            Suffix appended to the basename of each output fastq file
        verbose: bool
            Print stdout and std error
        quiet: bool
            Print nothing
        logs: bool
            Log this command to pyrpipe logs
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
        kwargs: dict
            Options to pass to trimgalore. The input/output options built
            here take precedence over same-named entries in kwargs.

            :return: Paths of the fastq files after QC: one item for single end, two for paired. ("",) on failure.
            :rtype: tuple
        """

        if out_dir:
            if not pu.check_paths_exist(out_dir):
                pu.mkdir(out_dir)
        else:
            out_dir = sra_object.location

        if sra_object.layout != 'PAIRED':
            # single end: trimgalore leaves <input>_trimmed.fq in out_dir
            reads = sra_object.localfastqPath
            stem = pu.get_file_basename(reads)
            cleaned = os.path.join(out_dir, stem + out_suffix + ".fastq")

            # positional input is handed over as a tuple under "--"
            trim_opts = dict(kwargs)
            trim_opts.update({"--": (reads, ), "-o": out_dir})
            self.run_trimgalore(verbose=verbose,
                                quiet=quiet,
                                logs=logs,
                                objectid=objectid,
                                **trim_opts)

            pe.move_file(os.path.join(out_dir, stem + "_trimmed.fq"), cleaned)

            if pu.check_files_exist(cleaned):
                return (cleaned, )
            print("Trimgalore failed")
            return ("", )

        # paired end: trimgalore leaves <input>_val_1.fq and <input>_val_2.fq
        mates = (sra_object.localfastq1Path, sra_object.localfastq2Path)
        stems = [pu.get_file_basename(mate) for mate in mates]
        cleaned = [
            os.path.join(out_dir, stem + out_suffix + ".fastq")
            for stem in stems
        ]

        trim_opts = dict(kwargs)
        trim_opts.update({"--paired": "", "--": mates, "-o": out_dir})
        self.run_trimgalore(verbose=verbose,
                            quiet=quiet,
                            logs=logs,
                            objectid=objectid,
                            **trim_opts)

        # rename trimgalore's outputs to the requested file names
        for stem, tag, target in zip(stems, ("_val_1.fq", "_val_2.fq"),
                                     cleaned):
            pe.move_file(os.path.join(out_dir, stem + tag), target)

        if pu.check_files_exist(cleaned[0], cleaned[1]):
            return cleaned[0], cleaned[1]
        print("Trimgalore failed")
        return ("", )
Esempio n. 21
0
    def perform_qc(self,
                   sra_object,
                   out_dir="",
                   out_suffix="_bbduk",
                   overwrite=True,
                   verbose=False,
                   quiet=False,
                   logs=True,
                   objectid="NA",
                   **kwargs):
        """Run bbduk on the fastq files of an SRA object.

        Parameters
        ----------
        sra_object: SRA
            An SRA object whose fastq files will be used
        out_dir: string
            Output directory. Created if missing; defaults to
            ``sra_object.location`` when empty.
        out_suffix: string
            Suffix appended to the basename of each output fastq file
        overwrite: bool
            Accepted for interface compatibility; not read by this method.
        verbose: bool
            Print stdout and std error
        quiet: bool
            Print nothing
        logs: bool
            Log this command to pyrpipe logs
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
        kwargs: dict
            Options passed to bbduk. The in/out options built here take
            precedence over same-named entries in kwargs.

        Returns
        -------
        tuple
            Paths of the fastq files after QC: one item for single end files
            and 2 for paired. ("",) if bbduk fails or the outputs are missing.
        """

        # resolve (and create if needed) the output directory
        if out_dir:
            if not pu.check_paths_exist(out_dir):
                pu.mkdir(out_dir)
        else:
            out_dir = sra_object.location

        failure = ("", )

        if sra_object.layout != 'PAIRED':
            reads = sra_object.localfastqPath
            cleaned = os.path.join(
                out_dir,
                pu.get_file_basename(reads) + out_suffix + ".fastq")

            bbduk_opts = dict(kwargs)
            bbduk_opts.update({"in": reads, "out": cleaned})

            ran = self.run_bbduk(verbose=verbose,
                                 quiet=quiet,
                                 logs=logs,
                                 objectid=objectid,
                                 **bbduk_opts)
            if ran and pu.check_files_exist(cleaned):
                return (cleaned, )
            return failure

        mate1 = sra_object.localfastq1Path
        mate2 = sra_object.localfastq2Path
        cleaned1 = os.path.join(
            out_dir,
            pu.get_file_basename(mate1) + out_suffix + ".fastq")
        cleaned2 = os.path.join(
            out_dir,
            pu.get_file_basename(mate2) + out_suffix + ".fastq")

        bbduk_opts = dict(kwargs)
        bbduk_opts.update({
            "in": mate1,
            "in2": mate2,
            "out": cleaned1,
            "out2": cleaned2,
        })

        ran = self.run_bbduk(verbose=verbose,
                             quiet=quiet,
                             logs=logs,
                             objectid=objectid,
                             **bbduk_opts)
        if ran and pu.check_files_exist(cleaned1, cleaned2):
            return (cleaned1, cleaned2)
        return failure
Esempio n. 22
0
 def stringtie_merge(self,*args,out_dir=None,out_suffix="_stringtieMerge",threads=None,overwrite=False,verbose=False,quiet=False,logs=True,objectid="NA",**kwargs):
     """Merge several GTF files with ``stringtie --merge``.

     The merged file is written to
     ``<out_dir>/<basename of first gtf><out_suffix>.gtf``.

     Parameters
     ----------
     args: tuple
         Paths to the gtf files to merge. At least one is required.
     out_dir: string
         Output directory. Defaults to the directory of the first gtf file.
     out_suffix: string
         Suffix for output gtf file name
     threads: int
         Number of threads to use. Falls back to ``self.threads`` when not given.
     overwrite: bool
         If False and the output gtf already exists, skip the run and return
         the existing file.
     verbose: bool
         Print stdout and std error
     quiet: bool
         Print nothing
     logs: bool
         Log this command to pyrpipe logs
     objectid: str
         Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
     kwargs: dict
         Options to pass to stringtie. These take precedence over the options
         generated by this function.
     :return: Path to the merged GTF file, or an empty string on any failure
     :rtype: string
     """

     if not args:
         print("ERROR: No input gtf for stringtie merge.")
         return ""

     #name the merged gtf after the first input file
     fname=pu.get_file_basename(args[0])

     if not out_dir:
         out_dir=pu.get_file_directory(args[0])

     out_gtf_file=os.path.join(out_dir,fname+out_suffix+".gtf")

     #honour overwrite=False: reuse an already existing result
     if not overwrite and os.path.isfile(out_gtf_file):
         print("The file "+out_gtf_file+" already exists. Exiting..")
         return out_gtf_file

     if not threads:
         threads=self.threads

     #merge flag, output file name, input gtf files (positional) and threads
     new_opts={"--merge":"","-o":out_gtf_file,"--":args,"-p":str(threads)}

     #user supplied options win over the generated ones
     merged_opts={**new_opts,**kwargs}

     #call stringtie
     status=self.run_stringtie(verbose=verbose,quiet=quiet,logs=logs,objectid=objectid,**merged_opts)

     #only report the merged gtf if the run succeeded and the file is there
     if status and pu.check_files_exist(out_gtf_file):
         return out_gtf_file

     #always return the documented failure value; previously a successful
     #run with a missing output fell through and returned None
     return ""
Esempio n. 23
0
    def perform_cleaning(self,sra_object,bbsplit_index,out_dir="",out_suffix="_bbsplit",objectid="NA",**kwargs):
        """
        Remove reads mapping to a given reference using bbsplit and keep the unmapped reads

        Parameters
        ----------

        sra_object: SRA
            an SRA object
        bbsplit_index: string
            Path to an existing bbsplit index directory, or to a fasta file.
            For a fasta file, a "ref" directory next to it is reused if
            present, otherwise bbsplit is run to build the index.
        out_dir: string
            Path to output dir. Default: sra_object.directory
        out_suffix: string
            Suffix for output file name
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
        kwargs: dict
            options passed to bbsplit. The options built here take precedence
            over same-named entries in kwargs.

            :return: Returns the path of fastq files after cleaning. tuple has one item for single end files and 2 for paired; ("",) on failure.
            :rtype: tuple
        """

        failed=("",)

        #resolve the index directory
        if pu.check_paths_exist(bbsplit_index):
            #an existing directory was supplied; use it as is
            index_dir=bbsplit_index
        else:
            #otherwise the argument must be an existing fasta file
            if not pu.check_files_exist(bbsplit_index):
                print("Error: Please check bbsplit index")
                return failed
            #look for an index folder "ref" alongside the fasta
            index_dir=os.path.join(pu.get_file_directory(bbsplit_index),"ref")
            if pu.check_paths_exist(index_dir):
                print("Using bbsplit index: "+index_dir)
            else:
                #no index yet: build one with bbsplit
                print("Creating new index"+index_dir)
                build_opts=dict(kwargs)
                build_opts.update({"ref_x":bbsplit_index,"path": pu.get_file_directory(bbsplit_index)})
                built=self.run_bbsplit(objectid=objectid,**build_opts)
                if not built or not pu.check_paths_exist(index_dir):
                    print("Error creating bbsplit index.")
                    return failed

        #index_dir points to the ref directory; bbsplit's path option gets its parent
        index_parent=os.path.dirname(index_dir)

        #resolve (and create if needed) the output directory
        if out_dir:
            if not pu.check_paths_exist(out_dir):
                pu.mkdir(out_dir)
        else:
            out_dir=sra_object.directory

        if sra_object.layout!='PAIRED':
            reads=sra_object.fastq_path
            cleaned=os.path.join(out_dir,pu.get_file_basename(reads)+out_suffix+".fastq")

            split_opts=dict(kwargs)
            split_opts.update({"in":reads,"outu":cleaned,"path":index_parent})

            #run bbsplit; the result is written through "outu"
            ran=self.run_bbsplit(objectid=objectid,target=cleaned,**split_opts)
            if ran and pu.check_files_exist(cleaned):
                return (cleaned,)
            return failed

        mate1=sra_object.fastq_path
        mate2=sra_object.fastq2_path
        cleaned1=os.path.join(out_dir,pu.get_file_basename(mate1)+out_suffix+".fastq")
        cleaned2=os.path.join(out_dir,pu.get_file_basename(mate2)+out_suffix+".fastq")

        split_opts=dict(kwargs)
        split_opts.update({"in1":mate1,"in2":mate2,"outu1":cleaned1,"outu2":cleaned2,"path":index_parent})

        #run bbsplit on both mates
        ran=self.run_bbsplit(objectid=objectid,target=[cleaned1,cleaned2],**split_opts)
        if ran and pu.check_files_exist(cleaned1,cleaned2):
            return (cleaned1,cleaned2)
        return failed
Esempio n. 24
0
 def perform_assembly(self,bam_file,out_dir=None,out_suffix="_stringtie",reference_gtf=None,threads=None,overwrite=False,verbose=False,quiet=False,logs=True,objectid="NA",**kwargs):
     """Run stringtie on a bam file and return the path of the resulting GTF.

     The GTF is written to ``<out_dir>/<bam basename><out_suffix>.gtf``.

     Parameters
     ----------
     bam_file: string
         path to the bam file
     out_dir: string
         Output directory. Defaults to the directory of ``bam_file``.
     out_suffix: string
         Suffix for the output gtf file
     reference_gtf: str
         Path to the reference gtf used as guide, passed with ``-G``
     threads: int
         Number of threads to use. Falls back to ``self.threads`` when not given.
     overwrite: bool
         If False and the output gtf already exists, skip the run and return
         the existing file.
     verbose: bool
         Print stdout and std error
     quiet: bool
         Print nothing
     logs: bool
         Log this command to pyrpipe logs
     objectid: str
         Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
     kwargs: dict
         Options to pass to stringtie. These take precedence over the options
         generated by this function.
     :return: Path to the output GTF file, or an empty string on any failure
     :rtype: string
     """

     #create path to output file
     fname=pu.get_file_basename(bam_file)

     if not out_dir:
         out_dir=pu.get_file_directory(bam_file)

     out_gtf_file=os.path.join(out_dir,fname+out_suffix+".gtf")

     #honour overwrite=False: reuse an already existing result
     if not overwrite and os.path.isfile(out_gtf_file):
         print("The file "+out_gtf_file+" already exists. Exiting..")
         return out_gtf_file

     if not threads:
         threads=self.threads

     #output file name, input bam (positional, under "--") and thread count
     new_opts={"-o":out_gtf_file,"--":(bam_file,),"-p":str(threads)}

     #add the reference gtf as guide
     if reference_gtf:
         if not pu.check_files_exist(reference_gtf):
             pu.print_boldred("Error: Provided reference GTF {} doesn't exist. Exiting...".format(reference_gtf))
             return ""
         new_opts["-G"]=reference_gtf

     #user supplied options win over the generated ones
     merged_opts={**new_opts,**kwargs}

     #call stringtie
     status=self.run_stringtie(verbose=verbose,quiet=quiet,logs=logs,objectid=objectid,**merged_opts)

     #only report the gtf if the run succeeded and the file is there
     if status and pu.check_files_exist(out_gtf_file):
         return out_gtf_file

     #always return the documented failure value; previously a successful
     #run with a missing output fell through and returned None
     return ""
Esempio n. 25
0
    def perform_assembly(self,
                         bam_file,
                         out_dir=None,
                         out_suffix="_cufflinks",
                         objectid="NA"):
        """Assemble transcripts from a BAM file with cufflinks.

        Parameters
        ----------
        bam_file: string
            path to bam file
        out_dir: string
            Output directory. Created if missing; defaults to the directory
            of ``bam_file``.
        out_suffix: string
            Suffix for the output gtf file
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession.

        :return: Path to the output GTF file, or an empty string on failure
        :rtype: string
        """

        stem = pu.get_file_basename(bam_file)

        # resolve (and create if needed) the output directory
        if out_dir:
            if not pu.check_paths_exist(out_dir):
                pu.mkdir(out_dir)
        else:
            out_dir = pu.get_file_directory(bam_file)

        # the run's target is transcripts.gtf in out_dir; the final result
        # is that file renamed after the input bam
        raw_gtf = os.path.join(out_dir, "transcripts.gtf")
        final_gtf = os.path.join(out_dir, stem + out_suffix + ".gtf")

        # reuse an existing result unless _force is set
        # NOTE(review): _force and _dryrun are defined outside this block
        if not _force:
            if pu.check_files_exist(final_gtf):
                pu.print_green(
                    'Target files {} already exist.'.format(final_gtf))
                return final_gtf

        # output directory plus the positional input under "--"
        cufflinks_opts = {"-o": out_dir, "--": (bam_file, )}

        succeeded = self.run(None,
                             objectid=objectid,
                             target=raw_gtf,
                             **cufflinks_opts)
        if not succeeded:
            return ""

        # when _dryrun is set, skip the move and existence check
        if _dryrun:
            return final_gtf

        pe.move_file(raw_gtf, final_gtf)
        if pu.check_files_exist(final_gtf):
            return final_gtf
        return ""