def checkNeedUpdateByDate(subjectSequencesFilename, dbName):
    """Check to see if the BLAST database needs to be updated

    The decision is made by comparing the date returned by getFastaDate
    for the subject sequences file with the date returned by
    getLocalDBDate for the BLAST database.

    Args:
        subjectSequencesFilename: The name of the subject sequences file
        dbName: The name of the BLAST database
    Returns:
        Bool value indicating if the BLAST DB needs to be updated

    """

    # Initialize our logger
    logger = log.setupLogger("checkNeedUpdateByDate")

    # Get the date that the FASTA file was last updated
    fastaFileDate = getFastaDate(subjectSequencesFilename)

    # Get the date that the database was updated last
    localDatabaseDate = getLocalDBDate(dbName)

    # NOTE(review): the name of the logging call was not recoverable for
    # the two messages below; logger.info is assumed since both are
    # purely informational
    # If the FASTA file is newer than the BLAST database, an update is needed
    if fastaFileDate > localDatabaseDate:
        logger.info("An update is needed to the BLAST database and will be "
                    "created now. This will take time so please be patient.")
        return True

    # Otherwise the BLAST database is at least as new as the FASTA file,
    # so no update is needed
    logger.info("No update is needed to the BLAST database. Proceeding "
                "with the BLAST.")
    return False

def getLocalDBDate(dbName):
    """Get the date that the blast DB was last modified

    The date is read from the modification time of the '<dbName>.nhr'
    file of the database.

    Args:
        dbName: The name of the blast DB
    Returns:
        A datetime.datetime of the last modification of '<dbName>.nhr',
        or datetime.datetime(1, 1, 1) if that file does not exist so
        that any real date compares as newer

    """

    # Initialize our logger
    logger = log.setupLogger("getLocalDBDate")

    nhrFilename = '%s.nhr' % dbName

    # Only check the modification date if the file even exists
    if os.path.isfile(nhrFilename):
        # fromtimestamp converts the mtime (seconds since the epoch) into
        # a naive local datetime object
        return datetime.datetime.fromtimestamp(os.path.getmtime(nhrFilename))

    # File doesn't exist so return year 1 for comparison sake.
    # Note: datetime.MINYEAR threw error hence the hardcoded date
    # NOTE(review): the name of the logging call was not recoverable;
    # logger.info is assumed
    logger.info("There is no file with name %s. Creating BLAST database."
                % dbName)
    return datetime.datetime(1, 1, 1)
# Example #3
    def readFastq(self):
        """Read a library in FASTQ format into memory as a dictionary.
        The key will be the sequence, and the value will be a two element
        list holding the count of times in which the tag was seen and a
        hits counter that is left at 0 here. Each distinct sequence is
        also written once, as it is first seen, to self.fastaFilename

        Returns:
            Dictionary of the library file

        """

        # Initialize our logger
        logger = log.setupLogger("readFastq")

        # Start timer for function
        funcStart = time.time()

        # Create an empty dictionary to store the full library
        libDict = {}

        # Initialize a counter to simply name the reads in the temp fasta file
        readCount = 1

        # Open the file and loop through line by line to store the library into
        # a dictionary. Each tag will be a key and the number of times seen
        # will be the value
        with open(self.filename) as f, open(self.fastaFilename, "w") as g:
            for count, line in enumerate(f):
                # FASTQ records span 4 lines and the sequence is the 2nd
                # line of each record, so skip every other line
                if count % 4 != 1:
                    continue

                tag = line.rstrip()

                # If the tag already exists in the libDict, increment
                # the counter of the sequence and retain the empty hits
                # variable as 0
                if tag in libDict:
                    libDict[tag] = [libDict[tag][0] + 1, 0]

                # Otherwise this is the first time the tag is seen, so
                # start its count at 1 and record it in the FASTA file
                else:
                    libDict[tag] = [1, 0]

                    # Write the sequence to a unique reads FASTA file
                    g.write(">s_%s\n%s\n" % (readCount, tag))
                    readCount += 1

        # Stop timer for function
        funcEnd = time.time()

        # Calculate the execution time and print it to the user
        # NOTE(review): the name of the logging call was not recoverable;
        # logger.info is assumed
        execTime = round(funcEnd - funcStart, 2)
        logger.info("Time to read library %s: %s seconds" %
                    (self.filename, execTime))

        return libDict
def drawPrecursor(precursorSeq, mirName, mirSeq, starSeq, outputFolder,
                  perlPath, RNAFoldPath, RNAPlotPath, ps2pdfwrPath):
    """Using RNAFold, draw the miRNA and the miRNA* on the precursor

    The miRNA and miRNA* sequences are written (with T replaced by U) to
    a temp file in <outputFolder>/images, the perl drawPrecursor program
    is run on it, and the resulting PDF is moved to
    <outputFolder>/images/<mirName>_precursor.pdf. The temp file and the
    <mirName>_RNAplot_out folder are deleted afterwards.

    Args:
        precursorSeq: The sequence of the miRNA precursor
        mirName: The name of the candidate miRNA. Use this instead of
            precursorName because it's possible for more than one candidate
            miRNA to come from the same precursor, so this should be unique
        mirSeq: The candidate miRNA sequence
        starSeq: The sequence of the miRNA* on this duplex
        outputFolder: The name of the output folder
        perlPath: The path of perl on this system
        RNAFoldPath: The path of RNAFold on this system
        RNAPlotPath: The path of RNAPlot on this system
        ps2pdfwrPath: The path of ps2pdfwr on this system

    """
    import subprocess

    # Initialize our logger
    logger = log.setupLogger("drawPrecursor")

    # Write the duplex to the temp file. The with block guarantees the
    # file is flushed and closed before the perl program reads it
    tempFilename = "%s/images/%s.temp" % (outputFolder, mirName)
    with open(tempFilename, "w") as mir_out:
        mir_out.write('%s\n%s' %
                      (mirSeq.replace("T", "U"), starSeq.replace("T", "U")))

    # Call the perl drawPrecursor program on our data
    # NOTE(review): "drawPrecursor/" looks like a script path whose file
    # name was lost; confirm the actual script name before relying on this.
    # subprocess.call is assumed since the result is used as a return code
    returnCode = subprocess.call([
        perlPath, "drawPrecursor/", RNAFoldPath, RNAPlotPath,
        ps2pdfwrPath, mirName, precursorSeq, tempFilename])

    if returnCode:
        logger.error("Something went wrong when running drawPrecursor. "
            "Command was\nperl drawPrecursor/ %s %s %s %s "
            "%s %s" % (RNAFoldPath, RNAPlotPath, ps2pdfwrPath, mirName,
            precursorSeq, tempFilename))
        # NOTE(review): stopping the run here is assumed; without the
        # plot there is nothing for the rename below to move
        raise SystemExit(1)

    # Rename the file from the default drawPrecursor Structure_plot file
    # name to the mirName_precursor
    os.rename("%s_RNAplot_out/%s_Structure_plot.pdf" % (mirName, mirName),
              "%s/images/%s_precursor.pdf" % (outputFolder, mirName))

    # Delete the temp files that were used to create the image. Failures
    # here are only logged because the image has already been produced
    try:
        os.remove(tempFilename)
    except OSError:
        logger.error("Failed to delete %s" % tempFilename)

    try:
        shutil.rmtree("%s_RNAplot_out" % mirName)
    except OSError:
        logger.error("Failed to delete RNAplot folder %s_RNAplot_out" %
                     mirName)
# Example #5
    def identifyFileType(self):
        """Investigate the first few lines of the library filename to
        determine its format so that the proper file parser function
        is used. Only the first 4 lines of self.filename are read. Lines
        missing from a file shorter than 4 lines are treated as empty
        lines, and an empty file is reported as unrecognized

        Returns:
            A simple string of fasta, fastq, or tagCount

        Raises:
            SystemExit: If the file is not recognized as any of the
                supported formats

        """
        import itertools
        import re

        # Initialize our logger
        logger = log.setupLogger("identifyFileType")

        # NOTE(review): the matching call was not recoverable; re.search
        # is assumed, which is equivalent to re.match for these anchored
        # patterns
        strictSeq = re.compile("^[ACGT]*$")
        looseSeq = re.compile("^[ACGTN]*$")

        libType = ""

        # Read at most 4 lines rather than loading the whole library
        with open(self.filename) as f:
            lines = list(itertools.islice(f, 4))

        if lines:
            # Pad short files so that the 2nd and 4th lines can always be
            # inspected without an IndexError
            lines += [""] * (4 - len(lines))
            secondLine = lines[1].rstrip().upper()
            fourthLine = lines[3].rstrip().upper()

            # If the first character of the first line of the file is a >, and
            # both the 2nd and 4th lines are nucleotide sequences, then we
            # should be able to call this file fasta
            if (lines[0].startswith(">")
                    and strictSeq.search(secondLine)
                    and strictSeq.search(fourthLine)):
                libType = "fasta"

            # If the second line of the file is a nucleotide sequence, but
            # the 4th is not, then it should be a fastq file
            elif (looseSeq.search(secondLine)
                  and not looseSeq.search(fourthLine)):
                libType = "fastq"

            # If splitting the first line on a tab results in the first
            # index being just a nucleotide sequence, then the input file
            # is a tag count file
            elif looseSeq.search(lines[0].split("\t")[0].upper()):
                libType = "tagCount"

        # If none of the previous tests passed, report an error to
        # the user and kill the run
        if not libType:
            # NOTE(review): the format argument was not recoverable;
            # self.filename is assumed as it is the file that was inspected
            logger.error("The data in %s was not recognized as a fasta, "
                "fastq, or tag count file. Please check the file to "
                "ensure that it is one of the recognized file types." %
                self.filename)
            raise SystemExit(1)

        return libType
# Example #6
    def readTagCount(self):
        """Read a library in tag count format into memory. Additionally,
        as each sequence is read, we will write this to a FASTA output
        file for bowtie

        Each non-blank line of self.filename is expected to hold a
        sequence and its abundance separated by a tab.

        Returns:
            Dictionary of library read in. Each key is a sequence and each
            value is a two element list of the abundance and a hits
            counter that is left at 0 here

        """

        # Initialize our logger
        logger = log.setupLogger("readTagCount")

        # Start timer for function
        funcStart = time.time()

        # Create an empty dictionary to store the full library
        libDict = {}

        # Initialize a counter to simply name the reads in the temp fasta file
        readCount = 1

        # Open the file and loop through line by line to store the library into
        # a dictionary. Each tag will be a key and the abundance will be the
        # value
        with open(self.filename) as f, open(self.fastaFilename, "w") as g:
            for line in f:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no tag, so skip them rather than crash
                if not line.strip():
                    continue

                # Store the sequence and count into variables, then add them
                # to the dictionary. There should not be any duplicate
                # sequences in this format; if one does occur, the later
                # abundance replaces the earlier one
                fields = line.split("\t")
                tag = fields[0]
                count = int(fields[1].strip())
                libDict[tag] = [count, 0]

                # Write the sequence to a unique reads FASTA file
                g.write(">s_%s\n%s\n" % (readCount, tag))
                readCount += 1

        # Stop timer for function
        funcEnd = time.time()

        # Calculate the execution time and print it to the user
        # NOTE(review): the name of the logging call was not recoverable;
        # logger.info is assumed
        execTime = round(funcEnd - funcStart, 2)
        logger.info("Time to read library %s: %s seconds" %
                    (self.filename, execTime))

        return libDict
# Example #7
def checkNeedUpdate(version):
    """Check if the version file from miRBase exists in our miRBase folder.

    Args:
        version: Version of miRBase to be queried. Generally should
            be "CURRENT"
    Returns:
        True if we need to update our miRBase folder and false if no update
        is required. No update is required only when
        miRBase/miRBasePlantMirnas.fa exists and a release file listed by
        the server (name starting with 0_THIS_IS_RELEASE) also exists in
        the local miRBase folder
    Raises:
        SystemExit: If the requested version folder cannot be entered on
            the server

    """

    # Initialize our logger
    logger = log.setupLogger("checkNeedUpdate")

    # NOTE(review): the host literal was empty in this file, which never
    # opens a connection; "mirbase.org" is assumed from the pub/mirbase
    # path -- confirm before use
    # The with block closes the connection once the listing is retrieved
    with ftplib.FTP("mirbase.org") as ftp:
        ftp.login("anonymous", "")

        try:
            ftp.cwd("pub/mirbase/%s/" % version)
        except ftplib.all_errors:
            logger.error("Input version does not appear to exist in miRBase. "
                "Check version number in ini file and try again")
            # NOTE(review): stopping the run is assumed from the message
            raise SystemExit(1)

        # Pull the list of files from miRBase
        filenamesList = ftp.nlst()

    # First, check if the miRBase plant file even exists yet. If not,
    # don't bother going any further. We need to update
    if not os.path.exists("miRBase/miRBasePlantMirnas.fa"):
        return True

    # If the version file exists exactly as it was found in miRBase,
    # return false as we do not need to update our miRBase files
    for filename in filenamesList:
        if (filename.startswith("0_THIS_IS_RELEASE")
                and os.path.exists("miRBase/%s" % filename)):
            return False

    return True
# Example #8
def setupMiRBase(organism, version):
    """Download all plant miRNAs from the provided version of miRBase to
    generate the subject files for BLAST when attempting to annotate
    our candidate miRNAs

    Args:
        organism: The three letter identifier of the organism being studied
        version: Version of miRBase to be downloaded. Generally should
            be "CURRENT"
    Returns:
        The dictionary produced by parsePrecursorGFF for
        miRBase/<organism>.gff3, or an empty dictionary if that GFF file
        does not exist

    """
    import shutil

    # Initialize our logger
    logger = log.setupLogger("setupMirBase")

    updateStatus = checkNeedUpdate(version)
    gffFilename = "miRBase/%s.gff3" % organism
    mirBaseDict = {}

    # Check if the miRBase files need to be updated
    if updateStatus:
        # NOTE(review): the name of the logging call was not recoverable;
        # logger.info is assumed
        logger.info("Downloading the relevant miRBase files")

        # If there are still files in the miRBase folder but it needs to
        # be updated, then clear the contents prior to populating it again
        # NOTE(review): the body of this step was missing; removing every
        # entry of the folder is reconstructed from the comment above
        for entry in os.listdir("miRBase"):
            entryPath = os.path.join("miRBase", entry)
            if os.path.isdir(entryPath) and not os.path.islink(entryPath):
                shutil.rmtree(entryPath)
            else:
                os.remove(entryPath)

        # Download the organism file and mature miRNA file
        # NOTE(review): this call was missing; downloadOrganismsAndMirnas
        # is assumed since it is the function that fetches those two files
        downloadOrganismsAndMirnas(version)

        # Find all the plant species that are in miRBase and then download
        # their GFF files IF they exist
        plantList = findPlantSpeciesFromOrganisms()
        downloadPlantSpecies(version, plantList)

    # We can only perform identity searches, with positional information, if
    # the GFF file exists. So, first check if the GFF file actually exists,
    # then read the file into a dictionary with parsePrecursorGFF
    if os.path.isfile(gffFilename):
        mirBaseDict = parsePrecursorGFF(gffFilename)

    return mirBaseDict
# Example #9
    def buildBowtieIndex(self, bowtieBuildPath):
        """Code to create a bowtie index for the inverted repeats file.

        The index is only built when self.checkBowtieNeedsUpdate reports
        that it is needed.

        Args:
            bowtieBuildPath: Path of bowtie-build
        Returns:
            Path of bowtie index
        Raises:
            SystemExit: If bowtie-build returns a nonzero exit code

        """
        import subprocess

        # Initialize our logger
        logger = log.setupLogger("buildBowtieIndex")

        # Set the index filename. Remove any file extension and folders
        # from the filename path to ensure the index file is written
        # to the correct folder that is hardcoded here
        filenameStripped = os.path.splitext(self.filename.split('/')[-1])[0]
        indexFilename = "genome/bowtieIndex/%s" % filenameStripped

        if self.checkBowtieNeedsUpdate(indexFilename):
            # NOTE(review): the name of the logging call was not
            # recoverable; logger.info is assumed
            logger.info("Building a bowtie index for %s" % (self.filename))

            # NOTE(review): the call and its redirection were not
            # recoverable; subprocess.call with stdout sent to the log
            # file is assumed because the log file is opened for writing
            with open("genome/bowtieIndex/%s_bowtiebuild.log" %
                      filenameStripped, 'w') as logFile:
                returnCode = subprocess.call(
                    [bowtieBuildPath, self.filename, indexFilename],
                    stdout=logFile)

            if returnCode:
                logger.error("Something went wrong when running bowtie-build. "
                    "Command was\n%s %s %s" % (bowtieBuildPath, self.filename,
                    indexFilename))
                # NOTE(review): stopping the run is assumed; returning the
                # path of an index that failed to build would mislead
                # the caller
                raise SystemExit(1)

        return indexFilename
# Example #10
    def buildFastaIndex(self, samtoolsBuildPath):
        """Code to create a fasta index

        The index is only built when self.checkFastaIndexNeedsUpdate
        reports that it is needed.

        Args:
            samtoolsBuildPath: Path of samtools
        Raises:
            SystemExit: If samtools faidx returns a nonzero exit code

        """
        import subprocess

        # Initialize our logger
        logger = log.setupLogger("buildsamtoolsIndex")

        if self.checkFastaIndexNeedsUpdate():
            # NOTE(review): the name of the logging call was not
            # recoverable; logger.info is assumed
            logger.info("Building a fasta index for %s" % (self.filename))

            # NOTE(review): subprocess.call is assumed since the result
            # is used as a return code
            returnCode = subprocess.call(
                [samtoolsBuildPath, "faidx", self.filename])

            if returnCode:
                # The message previously contained "%faidx", which is read
                # as a float conversion and raised a TypeError instead of
                # reporting the failed command
                logger.error("Something went wrong when building fasta index. "
                    "Command was\n%s faidx %s" % (samtoolsBuildPath,
                    self.filename))
                # NOTE(review): stopping the run is assumed
                raise SystemExit(1)
# Example #11
    def combineIRTempFiles(self, IRFastaFilenamesList,
                           IRAlignmentFilenamesList, runEInvertedFlag):
        """This function combines the temporary einverted files into one
        file for final analysis. However, if the user has opted to not run
        einverted due to a previous run already existing for this genome,
        this function will bypass the merging steps and only add the IR
        data to the IR dictionary

        Alignment files are parsed in groups of 5 lines, tracked by a
        running line counter modulo 5: position 1 holds the chromosome
        name, position 2 the 5' arm, position 3 the alignment indicators
        and position 4 the 3' arm. Each alignment is stored twice in
        self.IRDictByChr, once for the 'w' strand and once for the 'c'
        strand, as the tuple (start5, end5, start3, end3, loop, strand,
        hairpin5, alignmentIndicators, hairpin3).

        Args:
            IRFastaFilenamesList: List of the inverted repeat FASTA files
            IRAlignmentFilenamesList: List of inverted repeat alignment files
            runEInvertedFlag: True if einverted was run, in which case the
                temp files are merged into self.IRFastaFilename and
                self.IRAlignmentFilename and then deleted
        Returns:
            The number of entries added to self.IRDictByChr

        """
        import os

        # Initialize our logger
        logger = log.setupLogger("combineIRTempFiles")

        # If einverted was run, combine the temp FASTA files
        if runEInvertedFlag:
            ## Combine inverted repeats sequences FASTA file
            with open(self.IRFastaFilename, 'w') as fasta_out:
                # Loop through all FASTA files and merge into one file
                for filename in IRFastaFilenamesList:
                    with open(filename) as fastaFile:
                        fasta_out.writelines(fastaFile)

        # Set a counter to process each inverted repeat by line number.
        # It is intentionally not reset between alignment files
        counter = 0
        IRCounter = 0

        # If einverted was run, open the output alignment file to write
        # the results to
        align_out = None
        if runEInvertedFlag:
            ## Combine inverted repeats alignments results
            align_out = open(self.IRAlignmentFilename, 'w')

        try:
            # Loop through all alignment files and merge into one file
            for filename in IRAlignmentFilenamesList:
                with open(filename) as alignmentFile:
                    # Lines of the alignment currently being read. They
                    # are written to the merged file once it is complete
                    toWriteList = []

                    for line in alignmentFile:
                        toWriteList.append(line)

                        # Split the entire line on spaces for parsing
                        # after removing surrounding whitespace
                        parsedLine = line.strip().split(' ')
                        position = counter % 5

                        # Header line of the alignment. Only the chromosome
                        # name is needed from it. The score, match and gap
                        # fields of this line were parsed previously but
                        # were never used, so they are no longer extracted
                        # Note that if einverted changes the output format,
                        # these lines here can fail and would need to be
                        # readjusted
                        if position == 1:
                            chrName = parsedLine[0].split(':')[0]

                        # The 5' repeat start and end coordinates
                        elif position == 2:
                            start5 = int(parsedLine[0])
                            hairpin5 = parsedLine[1].upper()
                            end5 = int(parsedLine[2])

                        # The alignment between the two strands
                        elif position == 3:
                            alignmentIndicators = line.strip()

                        # The 3' repeat coordinates, which are listed end
                        # first. Also, calculate the loop length
                        elif position == 4:
                            start3 = int(parsedLine[2])
                            hairpin3 = parsedLine[1].upper()
                            end3 = int(parsedLine[0])
                            loop = start3 - end5 - 1

                            # Get the index of the chromosome to add the
                            # inverted repeat to
                            index = self.chrDict[chrName]

                            # Add the inverted repeat to the appropriate
                            # list within IRDictByChr, once per strand
                            # NOTE(review): the last two tuple elements
                            # were not recoverable here; they are assumed
                            # to be alignmentIndicators and hairpin3
                            for strand in ('w', 'c'):
                                IRName = "precursor-%s" % IRCounter
                                self.IRDictByChr[index][IRName] = (
                                    start5, end5, start3, end3, loop,
                                    strand, hairpin5, alignmentIndicators,
                                    hairpin3)
                                IRCounter += 1

                            if align_out is not None:
                                align_out.writelines(toWriteList)
                            toWriteList = []

                        # Increment the counter
                        counter += 1

        # Always close the merged alignment file, even if parsing fails
        finally:
            if align_out is not None:
                align_out.close()

        # If einverted was run and the temp files were merged, delete the
        # temp files
        if runEInvertedFlag:
            # NOTE(review): the name of the logging call was not
            # recoverable; logger.info is assumed
            logger.info("Combined files '%s and %s'\nDeleting temp files" %
                (self.IRAlignmentFilename, self.IRFastaFilename))

            # Combine the inverted repeats FASTA and alignment filenames
            # lists to delete all of these temp files
            garbage = IRFastaFilenamesList + IRAlignmentFilenamesList
            for toDelete in garbage:
                if os.path.exists(toDelete):
                    os.remove(toDelete)

        return IRCounter
# Example #12
def _downloadAndUnzip(ftp, logger, gzName, unzippedPath):
    """Download gzName from the current FTP folder into the miRBase
    folder, unzip it to unzippedPath, and delete the compressed copy"""
    import gzip
    import os

    gzPath = "miRBase/%s" % gzName

    # Try to download the file. The with block closes the local file
    # whether or not the transfer succeeds
    try:
        with open(gzPath, "wb") as gzFile:
            ftp.retrbinary("RETR %s" % gzName, gzFile.write)

    # If there is no file
    except ftplib.all_errors as e:
        logger.error("Error while downloading %s...\n%s\nTry "
            "changing the version of miRBase you are trying to download" %
            (gzName, e))
        # NOTE(review): stopping the run is assumed; there is nothing
        # valid to unzip after a failed download
        raise SystemExit(1)

    # Use gzip to unzip the file
    with gzip.open(gzPath, "rb") as f_gz:
        with open(unzippedPath, "wb") as f_unzip:
            shutil.copyfileobj(f_gz, f_unzip)

    # Remove the compressed file as we have already unzipped it
    os.remove(gzPath)


def downloadOrganismsAndMirnas(version):
    """Get the organisms.txt.gz and mature.fa.gz files from the current
    version of miRBase, unzip the files, and save them to the miRBase
    folder as organisms.tsv and mature.fa. The release file of the
    version (name starting with 0_THIS_IS_RELEASE) is saved there as well

    Args:
        version: Version of miRBase to be downloaded. Generally should
            be "CURRENT"
    Raises:
        SystemExit: If either of the two files cannot be downloaded

    """

    # Initialize our logger
    logger = log.setupLogger("downloadOrganismsAndMirnas")

    # NOTE(review): the host literal was empty in this file, which never
    # opens a connection; "mirbase.org" is assumed from the pub/mirbase
    # path -- confirm before use
    # The with block closes the connection when the downloads are done
    with ftplib.FTP("mirbase.org") as ftp:
        ftp.login("anonymous", "")
        ftp.cwd("pub/mirbase/%s/" % version)

        # Before we actually get the organism file, we will actually need to
        # download the version file to our directory to prevent updates in
        # successive runs
        for filename in ftp.nlst():
            if filename.startswith("0_THIS_IS_RELEASE"):
                with open("miRBase/%s" % filename, "wb") as versionFile:
                    ftp.retrbinary("RETR %s" % filename, versionFile.write)

        # Download and unzip the organisms file and the mature miRNA file
        _downloadAndUnzip(ftp, logger, "organisms.txt.gz",
                          "miRBase/organisms.tsv")
        _downloadAndUnzip(ftp, logger, "mature.fa.gz", "miRBase/mature.fa")
# Example #13
def housekeeping(genomeFilename, libFilenamesString, libFolder,
                 libFilenamesList, bowtiePath, bowtieBuildPath, einvertedPath,
                 blastnPath, makeblastdbPath, perlPath, RNAFoldPath,
                 RNAPlotPath, samtoolsPath, ps2pdfwrPath, outputFolder,
                 organism, version):
    """Perform various housekeeping functions including the checks that all
    external program dependencies exist, that files being referenced and
    folders that will be written to exist and are created. Additionally,
    it will also call setupMiRBase to download the current
    version of miRBase if needed to prepare for the annotation of our
    candidate miRNAs

    Args:
        genomeFilename: The path of the genome file
        libFilenamesString: The raw text of the library files that the
            user would have supplied in the config file
        libFolder: The folder of the library files if the user chose
            to supply that instead of individual library paths
        libFilenamesList: The list of library paths that have already
            been parsed either from libFilenamesString
        bowtiePath: The path of bowtie on the system
        bowtieBuildPath: The path of bowtie-build on the system
        einvertedPath: The path of einverted on the system
        blastnPath: The path of blastn on the system
        makeblastdbPath: The path of makeblastdb on the system
        perlPath: The path of perl on the system
        RNAFoldPath: The path of RNAFold on the system
        RNAPlotPath: The path of RNAPlot on the system
        samtoolsPath: The path of samtools on the system
        ps2pdfwrPath: The path of ps2pdfwr on the system
        outputFolder: The config entry for the output folder. Can be blank
        organism: The three letter identifier of the organism being studied
        version: The version of miRBase to be queried
    Returns:
        A tuple of the dictionary returned by setupMiRBase and the name of
        the output folder that is to be used for this run
    Raises:
        SystemExit: If any of the validation checks fails

    """
    import time

    # Initialize our logger
    logger = log.setupLogger("housekeeping")

    def abort(message):
        """Report a fatal configuration problem and stop the run"""
        # NOTE(review): stopping the run after each error is assumed from
        # the messages ("Correct before running again") and from the
        # comments that say execution must not be allowed to continue
        logger.error(message)
        raise SystemExit(1)

    # Make sure the genome file exists as defined
    if not os.path.isfile(genomeFilename):
        abort("%s could not be found! Please check that the "
              "file path was input correctly" % genomeFilename)

    # Do not allow execution if both libFilenamesString and libFolder
    # are defined to anything other than empty strings
    if libFilenamesString and libFolder:
        abort("You specified both libFolder and libNamesList, but "
              "only one can exist. Delete one and try running again")

    # Loop through all libraries in libFilenamesList and confirm that they
    # exist before running
    for libName in libFilenamesList:
        if not os.path.isfile(libName):
            abort("%s could not be found! Please check that the "
                  "file path was input correctly" % libName)

    if len(libFilenamesList) == 1:
        logger.warning("Only one library was provided. While miRador "
            "can run with this, miRador will not\noutput any miRNAs that are "
            "predicted outside of any known families as we require\n"
            "identification in multiple libraries for novel annotation.\nIf "
            "this organism does not exist yet in miRBase, then no miRNAs "
            "will be predicted.\nPausing execution for 20 seconds if you "
            "want to stop this run and add libraries. (Use ctrl+c to stop)\n")
        # NOTE(review): the pause itself was missing; a 20 second sleep
        # is reconstructed from the warning text
        time.sleep(20)

    # Confirm that every external program can be found. The samtools
    # entry previously reported RNAPlotPath in its message
    dependencies = (
        ("bowtie", bowtiePath),
        ("bowtie-build", bowtieBuildPath),
        ("einverted", einvertedPath),
        ("perl", perlPath),
        ("blastn", blastnPath),
        ("makeblastdb", makeblastdbPath),
        ("RNAFold", RNAFoldPath),
        ("RNAPlot", RNAPlotPath),
        ("samtools", samtoolsPath),
        ("ps2pdfwr", ps2pdfwrPath),
    )
    for programName, programPath in dependencies:
        if not shutil.which(programPath):
            abort("%s could not be found at the provided path: %s\n"
                  "Correct before running again" % (programName, programPath))

    ### Create the necessary folders if they don't already exist
    # NOTE(review): the bodies of these folder checks were missing;
    # creating the folder that each check names is assumed
    for folder in ("genome", "invertedRepeats", "miRBase"):
        os.makedirs(folder, exist_ok=True)

    # If the user has filled the outputFolder option, check to see if it
    # has results from an older run and then delete them
    if outputFolder:
        # Confirm that the output folder's name is not the same as
        # libFolder. This will ensure nothing of importance is
        # accidentally deleted
        if outputFolder == libFolder:
            abort("outputFolder and libFolder cannot be the same "
                  "folder. Please rename outputFolder and run again")

        # Delete the libs and images folders if they exist already
        for subfolder in ("libs", "images"):
            subfolderPath = "%s/%s" % (outputFolder, subfolder)
            if os.path.isdir(subfolderPath):
                shutil.rmtree(subfolderPath)

        # Delete the various output files if they exist already
        # Deprecated
        oldOutputs = ("blastResults.txt", "finalAnnotatedCandidates.csv",
                      "finalAnnotatedCandidates.fa",
                      "preAnnotatedCandidates.csv",
                      "preAnnotatedCandidates.fa")
        for oldOutput in oldOutputs:
            oldOutputPath = "%s/%s" % (outputFolder, oldOutput)
            if os.path.isfile(oldOutputPath):
                os.remove(oldOutputPath)

    # Create a path for an output folder if it does not exist already
    # (Almost certainly should not as it would require the same run second)
    else:
        # NOTE(review): the default folder name was not recoverable; a
        # name with one second resolution is reconstructed from the
        # comment above -- confirm the expected naming scheme
        outputFolder = time.strftime("output_%Y-%m-%d_%H-%M-%S")

    # Create the output folder and the working folders of this run
    os.makedirs(outputFolder, exist_ok=True)
    os.makedirs("%s/libs" % outputFolder, exist_ok=True)
    os.makedirs("%s/images" % outputFolder, exist_ok=True)
    os.makedirs("miRadorTempFolder/bowtieOutput", exist_ok=True)

    mirBaseDict = setupMiRBase.setupMiRBase(organism, version)

    return (mirBaseDict, outputFolder)
# Example #14
    def mapper(self, indexFilename, bowtiePath, nthreads):
        """Map small RNAs to the provided index file

        The reads in self.fastaFilename are mapped with bowtie and the
        alignments are written to self.mapFilename.

        Args:
            indexFilename: Path and name of the index for the genome.
            bowtiePath: The path of bowtie
            nthreads: The number of threads to use with bowtie
        Returns:
            Filename of the log file that bowtie's stderr was written to
        Raises:
            SystemExit: If bowtie returns a nonzero exit code

        """
        import subprocess

        # Initialize our logger
        logger = log.setupLogger("mapper")

        # NOTE(review): a statement building the map filename from the
        # stripped index name appears to be missing here;
        # self.mapFilename is assumed to be set before this point
        logFilename = "%s_bowtie.log" % os.path.splitext(self.mapFilename)[0]

        # NOTE(review): this message was logged separately for tagCount
        # and other library types with arguments that were not
        # recoverable; the reads file passed to bowtie is reported instead
        logger.info("Mapping small RNAs to the genome files for %s" %
                    self.fastaFilename)

        with open(logFilename, "w") as logFile:
            # Run bowtie with the following options:
            # -a to report all valid alignments as we want multihits
            # -m 50 to suppress all alignments with more than 50 matches
            # to the genome. We expect few multi-matches to the genome
            # --best and --strata ensures only the best alignments are reported
            # and so that less optimum but passable alignments do not appear
            # -v 0 Allow no mismatch
            # --sam-nohead removes the header from the SAM file. This is useful
            # because we have to merge the fragment alignments for parallel
            # runs
            # --no-unal suppresses sequences with no alignment. This helps to
            # keep the map file manageable and filter out these sequences
            # earlier for efficiency
            ### Note that the output of bowtie is sent to stderr, which is
            ### why this log file goes there
            # Options and their values are passed as separate arguments,
            # and nthreads is converted to a string since every argument
            # of the command must be a string
            returnCode = subprocess.call([
                bowtiePath, indexFilename, "-f", self.fastaFilename, "-a",
                "-m", "50", "--best", "--strata", "-v", "0", "-S",
                self.mapFilename, "-p", str(nthreads), "--sam-nohead",
                "--no-unal"], stderr=logFile)

        # If there is a return code, report an error to the user and exit
        if returnCode:
            logger.error("Something went wrong when running bowtie. "
                "Command was\n%s %s -f %s -a -m 50 --best --strata "
                "-v 0 -S %s -p %s --sam-nohead --no-unal" %
                (bowtiePath, indexFilename, self.fastaFilename,
                self.mapFilename, nthreads))
            raise SystemExit(1)

        return logFilename
def _locateTagOnArm(sequence, arm, tagLength, precursorName, logger):
    """Find the local start and end positions of a tag on one IR arm.

    Args:
        sequence: The tag sequence, already oriented to match the arm
        arm: The aligned sequence of one arm of the inverted repeat
        tagLength: Length of the (ungapped) tag
        precursorName: Name of the precursor, only used in error reports
        logger: Logger that receives a report when the sequence returned
            by findSequenceInIR does not match the tag once gaps are
            removed
    Returns:
        Tuple of the local start and local end position of the tag on
        the arm

    """

    # Remember the ungapped tag so that the result can be sanity checked
    ungappedSequence = sequence

    # If we are unable to find the sequence verbatim in the IR arm
    # (presumably because the alignment has gaps within the tag), we
    # need findSequenceInIR to give us the aligned sequence along with
    # its start and end positions
    if (sequence not in arm):
        sequence, localStart, localEnd = findSequenceInIR(sequence, arm,
            tagLength)

    # If the sequence can be found, the local positions follow directly
    # from its location on the arm
    else:
        localStart = arm.find(sequence)
        localEnd = localStart + tagLength - 1

    # Check to confirm that the sequence with gaps is the same sequence
    # as before.
    # NOTE(review): the visible code only logs this condition and keeps
    # going; confirm whether the run is meant to abort here
    if (ungappedSequence != sequence.replace("-", "")):
        logger.error("findSequenceInIR messed up for %s. "\
            "Contact Reza to debug" % ungappedSequence)
        # The values must be passed as arguments to a format string,
        # otherwise logging fails to render the record
        logger.error("%s %s %s %s %s", precursorName, ungappedSequence,
            sequence, localStart, localEnd)

    return (localStart, localEnd)


def filterPrecursors(mappedTagsToPrecursors, IRDict, libDict, overhang):
    """This function will perform the sRNA mapping and abundance filters.
    It will first try to find a miRNA and miRNA* pair by identifying
    tags that map to opposite sides of the precursor. A pair is kept
    when the two tags are offset by exactly the requested overhang, the
    arm alignment between them is good enough, neither tag is less
    abundant than one of its 1-nt variants, and the pair accounts for
    at least 75% of the reads mapping to the precursor

    Args:
        mappedTagsToPrecursors: Dictionary of tag information mapping
            to the precursor, identified by the precursor name
        IRDict: Dictionary of the inverted repeats in one chromosome
        libDict: The entire library dictionary to be queried for abundances
        overhang: Specific length of overhang that a duplex must have
    Returns:
        Dictionary of all precursors and the miRNA:miRNA* duplexes within
        that pass all filters. Each value is itself a dictionary keyed
        by the candidate miRNA sequence

    """

    # Initialize our logger
    logger = log.setupLogger("filterPrecursors")

    # Build the complement table once rather than once per tag
    complementTable = str.maketrans("ACGT", "TGCA")

    # Initialize a dictionary to store our final candidates that pass
    # all filters for this library
    finalCandidates = {}

    # Begin to loop though all of the candidate precursors for the
    # various filters
    for precursorName, mappedTagsTuple in mappedTagsToPrecursors.items():
        precursor = IRDict[precursorName]

        # Store the elements of the precursor that are needed here.
        # Indices 0-3 hold the arm coordinates and index 7 holds the
        # alignment indicators, none of which this function uses
        strand = precursor[5]
        arm5 = precursor[6]
        arm3 = precursor[8]

        # Store the various elements of the mapped tags tuple
        mappedTagsDict5 = mappedTagsTuple[0]
        mappedTagsDict3 = mappedTagsTuple[1]
        totalAbun5 = mappedTagsTuple[2]
        totalAbun3 = mappedTagsTuple[3]
        loopAbun = mappedTagsTuple[4]

        # Begin a series of loops to identify if there are any tags on
        # the 5' and 3' strands that overlap within a short, user defined
        # overhang
        for candidate5Pos, mappedTagList5 in mappedTagsDict5.items():
            for mapped5Tag in mappedTagList5:
                tag5Length = len(mapped5Tag[0])
                tag5Abun = mapped5Tag[1]

                # If the length of the tag is not between 20 and 24,
                # just move to the next tag. We do this here because
                # we have to store tags that are 1 nt variants of
                # candidate miRNA or miRNA* sequences
                if (tag5Length < 20 or tag5Length > 24):
                    continue

                # If the strand is w, the sequence will require no
                # modifications
                if (strand == "w"):
                    sequence5 = mapped5Tag[0]

                # If the strand is c, we need to reverse complement
                # the mapped sequence so that we can find it on the
                # IR arm
                else:
                    sequence5 = mapped5Tag[0].translate(
                        complementTable)[::-1]

                local5Start, local5End = _locateTagOnArm(sequence5, arm5,
                    tag5Length, precursorName, logger)

                # Loop through all mapped tags in the 3' dictionary to
                # identify any candidate miRNA:miRNA* pairs with the
                # current 5' mapped tag
                for candidate3Pos, mappedTagList3 in mappedTagsDict3.items():
                    for mapped3Tag in mappedTagList3:
                        tag3Length = len(mapped3Tag[0])
                        tag3Abun = mapped3Tag[1]

                        # Same length restriction as for the 5' tag
                        if (tag3Length < 20 or tag3Length > 24):
                            continue

                        # If the strand is w, the sequence needs to be
                        # reversed because it is on the 3' arm of the IR
                        if (strand == "w"):
                            sequence3 = mapped3Tag[0][::-1]

                        # If the strand is c, the sequence needs to be
                        # complemented (but not reversed) because it is
                        # on the 3' arm of the IR, but the reverse strand
                        # of the genome
                        else:
                            sequence3 = mapped3Tag[0].translate(
                                complementTable)

                        local3Start, local3End = _locateTagOnArm(
                            sequence3, arm3, tag3Length, precursorName,
                            logger)

                        # Only tags that are offset from each other by
                        # exactly the overhang on both ends form a
                        # candidate duplex
                        isDuplex = (
                            (strand == "c"
                             and local3Start - local5Start == overhang
                             and local3End - local5End == overhang)
                            or (strand == "w"
                                and local5End - local3End == overhang
                                and local5Start - local3Start == overhang))
                        if (not isDuplex):
                            continue

                        # Because we can have overhangs, the alignment
                        # should start and end at the positions just
                        # prior to the overhang
                        alignStart = max(local5Start, local3Start)
                        alignEnd = min(local5End, local3End)

                        # Get the einverted alignment for the two
                        # sequences
                        matchCount, mismatchCount, wobbleCount,\
                            gapCount = getAlignment(arm5, arm3,
                            alignStart, alignEnd)

                        # Only proceed if the alignment meets our filter
                        # specifications
                        if (not (gapCount + mismatchCount +
                                (wobbleCount * .5) <= 5 and gapCount <= 3)):
                            continue

                        # Get the hits information for the 5' and 3'
                        # tags from libDict
                        hits5 = libDict[mapped5Tag[0]][1]
                        hits3 = libDict[mapped3Tag[0]][1]

                        variant5Abun = tag5Abun
                        variant3Abun = tag3Abun

                        # Get the abundance of the 1-nt variants of both
                        # the 5' and 3' tags
                        variant5AbunList = getVariantAbundance(
                            mappedTagsDict5, mapped5Tag[0],
                            candidate5Pos, strand)
                        variant3AbunList = getVariantAbundance(
                            mappedTagsDict3, mapped3Tag[0],
                            candidate3Pos, strand)

                        # A tag that is less abundant than one of its
                        # own variants is not considered further
                        if (tag5Abun < max(variant5AbunList)
                                or tag3Abun < max(variant3AbunList)):
                            continue

                        if (variant5Abun == -1 or variant3Abun == -1):
                            continue

                        variant5Abun += sum(variant5AbunList)
                        variant3Abun += sum(variant3AbunList)

                        # Get the proportion of reads coming from the
                        # miRNA duplex compared to the rest of the reads
                        # mapping to the precursor
                        proportion = (variant5Abun + variant3Abun) /\
                            (totalAbun5 + totalAbun3 + loopAbun)

                        # If the two tags do not make up at least 75% of
                        # the read abundance in the entire precursor,
                        # neither of them is recorded
                        if (proportion < .75):
                            continue

                        # The 5' mapping tag will be kept as a candidate
                        # miRNA if it has at least an abundance of 3 RPM.
                        # NOTE(review): the final tuple element and the
                        # nested dictionary were restored from a
                        # truncated listing; confirm against the
                        # consumers of finalCandidates
                        if (tag5Abun >= 3):
                            duplex = ("5p", mapped3Tag[0],
                                      candidate5Pos, candidate3Pos,
                                      tag5Abun, hits5, tag3Abun, hits3,
                                      matchCount, mismatchCount,
                                      wobbleCount, gapCount,
                                      variant5Abun, variant3Abun,
                                      totalAbun5, totalAbun3, loopAbun,
                                      proportion)

                            # Add the precursor name as a key to
                            # finalCandidates if it does not yet exist
                            finalCandidates.setdefault(precursorName, {})[
                                mapped5Tag[0]] = duplex

                        # The 3' mapping tag will be kept as a candidate
                        # miRNA if it has an abundance of at least 3 RPM
                        if (tag3Abun >= 3):
                            duplex = ("3p", mapped5Tag[0],
                                      candidate3Pos, candidate5Pos,
                                      tag3Abun, hits3, tag5Abun, hits5,
                                      matchCount, mismatchCount,
                                      wobbleCount, gapCount,
                                      variant3Abun, variant5Abun,
                                      totalAbun3, totalAbun5, loopAbun,
                                      proportion)

                            finalCandidates.setdefault(precursorName, {})[
                                mapped3Tag[0]] = duplex

    return (finalCandidates)
Exemple #16
def runEinverted(einvertedPath, chrFilename, match, mismatch, gap, threshold,
        maxRepLen):
    """Function to run einverted for a single chromosome

    Args:
        einvertedPath: The path to the einverted executable
        chrFilename: The path to the individual chromosome that will
            be run through einverted
        match: Score to pass to einverted for a match
        mismatch: Penalty score to pass to einverted for a mismatch
        gap: Score to pass to einverted for a gap
        threshold: Minimum total score an inverted repeat must have
            for einverted to record it
        maxRepLen: Maximum length an inverted repeat can have
    Returns:
        The name of the output FASTA file that einverted created, and
        the name of the alignment output file that einverted created.

    """

    # Imported locally so that this function does not depend on the
    # module's import block
    import subprocess

    # Initialize our logger
    logger = log.setupLogger("runEinverted")

    # Names of temporary output files to store results prior to merging.
    # They are named after the chromosome file, without folder or
    # extension.
    # NOTE(review): the filename expression was restored from a truncated
    # listing; confirm it matches the names the merge step expects
    chrName = os.path.splitext(os.path.basename(chrFilename))[0]
    outputFastaFilename = "invertedRepeats/%s.fa.temp" % chrName
    outputAlignmentFilename = "invertedRepeats/%s.alignment.temp" % chrName

    # Call einverted utilizing this current sequence with the user
    # defined arguments from the config file.
    command = [
        einvertedPath, "-sequence", chrFilename, "-gap",
        str(gap), "-threshold",
        str(threshold), "-match",
        str(match), "-mismatch",
        str(mismatch), "-maxrepeat",
        str(maxRepLen), "-outfile", outputAlignmentFilename, "-outseq",
        outputFastaFilename
    ]

    # Suppress the output of einverted because we do not really need to
    # know it is running for each proc. subprocess.DEVNULL is used so
    # that no file handle has to be opened and closed by hand
    returnCode = subprocess.call(command, stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT)

    # If a return code of anything but 0 is returned, it means there
    # was a problem and it should be investigated. Temp files will
    # remain from the run to assist in the debugging process
    if (returnCode != 0):
        logger.error("Something went wrong when running einverted. Command "\
            "was\n%s" % " ".join(command))

    return (outputFastaFilename, outputAlignmentFilename)
Exemple #17
def main():
    # Start-up sequence visible in this excerpt: parse the options, take the
    # PID lock, build the global VirtWho instance and its configurations,
    # then install signal handlers inside the lock/daemon context.
    # NOTE(review): the excerpt is not valid Python as it stands; `try:` and
    # `else:` lines, some call prefixes and the body of the final loop appear
    # to be missing. Each spot is flagged below -- restore them from the
    # upstream virt-who source rather than by guesswork.

    # NOTE(review): a `try:` line appears to be missing before this statement
        logger, options = parseOptions()
    except OptionError as e:
        # Python 2 print-to-stderr syntax
        print >> sys.stderr, str(e)
        exit(1, status="virt-who can't be started: %s" % str(e))

    # Refuse to start when the PID lock is already held
    lock = PIDLock(PIDFILE)
    if lock.is_locked():
        msg = "virt-who seems to be already running. If not, remove %s" % PIDFILE
        print >> sys.stderr, msg
        exit(1, status=msg)

    # Lower the module-level retry interval when the configured interval is
    # shorter than it
    global RetryInterval
    if options.interval < RetryInterval:
        RetryInterval = options.interval

    global virtWho
    # NOTE(review): a `try:` line appears to be missing before this statement
        virtWho = VirtWho(logger, options)
    except (InvalidKeyFile, InvalidPasswordFormat) as e:
        exit(1, "virt-who can't be started: %s" % str(e))

    # A virt type given in the options yields a Config built from the options
    # NOTE(review): the config is checked but never registered in what is
    # visible here -- a statement may be missing after checkOptions
    if options.virtType is not None:
        config = Config("env/cmdline", options.virtType, **options.__dict__)
        config.checkOptions(options.smType, logger)
    for conffile in options.configs:
        # NOTE(review): the `try:` and the statement that reads `conffile`
        # appear to be missing before this handler
        except Exception as e:
            logger.error('Config file "%s" skipped because of an error: %s' %
                         (conffile, str(e)))
    if len(virtWho.configManager.configs) == 0:
        # NOTE(review): the second comment line below has a string literal
        # fused onto it; presumably it was a separate logging call
        # In order to keep compatibility with older releases of virt-who,
        # fallback to using libvirt as default virt backend"No configurations found, using libvirt as backend")
        virtWho.configManager.addConfig(Config("env/cmdline", "libvirt"))

    for config in virtWho.configManager.configs:
        # NOTE(review): the operand of `is None`, the call these message
        # strings belong to, the `else:` between them and the first tuple
        # element are all missing
        if is None:
                'Using commandline or sysconfig configuration ("%s" mode)',
  'Using configuration "%s" ("%s" mode)' %
                        (, config.type))

    # Choose the context manager to run under: a daemon context when
    # running in the background, otherwise the PID lock itself
    # NOTE(review): an `else:` appears to be missing before the second
    # assignment, which otherwise always overrides the first
    if options.background:
        locker = lambda: daemon.DaemonContext(pidfile=lock)
        locker = lambda: lock

    with locker():
        # SIGHUP is routed to `reload`, SIGTERM to `atexit_fn`
        signal.signal(signal.SIGHUP, reload)
        signal.signal(signal.SIGTERM, atexit_fn)

        virtWho.logger = logger = log.getLogger(options, queue=True)

        sd_notify("READY=1\nMAINPID=%d" % os.getpid())
        # NOTE(review): the body of this loop (a `try:` block and the
        # ReloadRequest handler body) is missing from the excerpt
        while True:
            except ReloadRequest:
Exemple #18
def main():
    # Start-up sequence visible in this excerpt: parse the options, take the
    # PID lock, build the global VirtWho instance and its configurations,
    # then install signal handlers inside the lock/daemon context.
    # NOTE(review): the excerpt is not valid Python as it stands; `try:` and
    # `else:` lines, some call prefixes and the body of the final loop appear
    # to be missing. Each spot is flagged below -- restore them from the
    # upstream virt-who source rather than by guesswork.

    # NOTE(review): a `try:` line appears to be missing before this statement
        logger, options = parseOptions()
    except OptionError as e:
        # Python 2 print-to-stderr syntax
        print >>sys.stderr, str(e)
        exit(1, status="virt-who can't be started: %s" % str(e))

    # Refuse to start when the PID lock is already held
    lock = PIDLock(PIDFILE)
    if lock.is_locked():
        msg = "virt-who seems to be already running. If not, remove %s" % PIDFILE
        print >>sys.stderr, msg
        exit(1, status=msg)

    global virtWho
    # NOTE(review): a `try:` line appears to be missing before this statement
        virtWho = VirtWho(logger, options)
    except (InvalidKeyFile, InvalidPasswordFormat) as e:
        exit(1, "virt-who can't be started: %s" % str(e))

    # A virt type given in the options yields a Config built from the
    # manager defaults and the options
    # NOTE(review): the config is built but never registered in what is
    # visible here -- a statement may be missing after it
    if options.virtType is not None:
        config = Config("env/cmdline", options.virtType, virtWho.configManager._defaults, **options)
    for conffile in options.configs:
        # NOTE(review): the `try:` and the statement that reads `conffile`
        # appear to be missing before this handler
        except Exception as e:
            logger.error('Config file "%s" skipped because of an error: %s' % (conffile, str(e)))
    if len(virtWho.configManager.configs) == 0:
        # NOTE(review): the second comment line below has a string literal
        # fused onto it; presumably it was a separate logging call
        # In order to keep compatibility with older releases of virt-who,
        # fallback to using libvirt as default virt backend"No configurations found, using libvirt as backend")
        virtWho.configManager.addConfig(Config("env/cmdline", "libvirt"))

    for config in virtWho.configManager.configs:
        # NOTE(review): the operand of `is None`, the call these message
        # strings belong to, the `else:` between them and the first tuple
        # element are all missing
        if is None:
  'Using commandline or sysconfig configuration ("%s" mode)', config.type)
  'Using configuration "%s" ("%s" mode)' % (, config.type))

    # Choose the context manager to run under: a daemon context when
    # running in the background, otherwise the PID lock itself
    # NOTE(review): an `else:` appears to be missing before the second
    # assignment, which otherwise always overrides the first
    if options.background:
        locker = lambda: daemon.DaemonContext(pidfile=lock)
        locker = lambda: lock

    with locker():
        # SIGHUP is routed to `reload`, SIGTERM to `atexit_fn`
        signal.signal(signal.SIGHUP, reload)
        signal.signal(signal.SIGTERM, atexit_fn)

        virtWho.logger = logger = log.getLogger(name='main', config=None, queue=True)

        sd_notify("READY=1\nMAINPID=%d" % os.getpid())
        # NOTE(review): the body of this loop (a `try:` block and the
        # ReloadRequest handler body) is missing from the excerpt
        while True:
            except ReloadRequest:
def annotateIdenticalCandidates(similarityDict, mirBaseDict, identicalList,
                                header, line, outputFolder):
    """Helper function to annotate the candidate miRNAs that have identical
    sequences to ones that have already been identified.

    Args:
        similarityDict: Dictionary that is indexed by the candidate's
            name to obtain the known miRNAs it is identical to
        mirBaseDict: Dictionary of miRBase miRNAs and their coordinates for
            this organism, if available (will be an empty dictionary if not)
        identicalList: The list of miRBase miRNAs with the same sequence
            as the candidate miRNA
        header: Header line of the precursor file
        line: The full line from the pre-annotated file that will be
            modified to provide the new annotation
        outputFolder: Name of the folder where the results will be written to
    Returns:
        The updated line with the proper annotation. The line is modified
        in place and returned for convenience

    """

    # Initialize our logger
    logger = log.setupLogger("annotateIdenticalCandidates")

    # Only the columns that are needed for the positional comparison are
    # looked up in the header
    mirNameIndex = header.index("miR Name")
    chrNameIndex = header.index("Chr")
    strandIndex = header.index("Strand")
    positionIndex = header.index("miR Position")

    annotatedFlag = False
    mirName = line[mirNameIndex]
    chrName = line[chrNameIndex]
    strand = line[strandIndex]
    position = line[positionIndex]

    # Remove "chr" if it exists in the chromosome name
    if ("chr" in chrName.lower()):
        chrName = chrName.lower().replace("chr", "")

    # If mirBaseDict is populated, that means that we have positional
    # information for this organism and thus can generate the most accurate
    # annotations for this organism
    if (mirBaseDict):
        # Loop through all identical miRNA sequences
        for identicalMirna in identicalList:
            # It turns out that there can be annotated miRNAs in miRBase
            # that do not exist in the gff file, so do a check to ensure
            # that the identical miRNA exists in mirBaseDict prior to
            # entering this loop
            if (identicalMirna not in mirBaseDict):
                continue

            # Loop through all coordinates that this specific miRNA
            # can be found
            for coordinates in mirBaseDict[identicalMirna]:
                mirBaseChr = coordinates[0]
                # Remove "chr" if it exists in the chromosome name
                if ("chr" in mirBaseChr.lower()):
                    mirBaseChr = mirBaseChr.lower().replace("chr", "")

                mirBaseStrand = coordinates[1]
                mirBasePosition = coordinates[2]

                # Need to convert strand from +/- to w/c
                if (mirBaseStrand == "+"):
                    mirBaseStrand = "w"
                elif (mirBaseStrand == "-"):
                    mirBaseStrand = "c"
                # If the strand is not + or -, something is wrong with this
                # miRBase entry. We will not exit the run, but we will report
                # the issue to the user and continue to the next entry.
                # NOTE(review): `organism` is not a parameter of this
                # function; presumably it is defined at module level --
                # confirm
                else:
                    logger.info("Unrecognized strand of miRBase entry. We "\
                        "will skip this entry, but please check with the "\
                        "miRBase file %s.gff3 and miRNA name %s" % (organism,
                        identicalMirna))
                    continue

                # If the coordinates of the candidate miRNA meet the
                # coordinates of the miRBase miRNA, then this candidate miRNA
                # is this miRBase miRNA and we will change the annotation to
                # represent that
                if (chrName == mirBaseChr and strand == mirBaseStrand
                        and position == mirBasePosition):
                    line[mirNameIndex] = identicalMirna
                    annotatedFlag = True

                    # If the miRNA is known, update the image filename within
                    # the image folder with its miRBase annotated name. But
                    # if the name with the candidate sequence does not exist,
                    # that suggests that it has already been updated in a
                    # previous run and thus we do not need to rename the file
                    candidateImage = "%s/images/%s_precursor.pdf" % \
                        (outputFolder, mirName)
                    if (os.path.isfile(candidateImage)):
                        os.rename(candidateImage,
                            "%s/images/%s_precursor.pdf" %
                            (outputFolder, identicalMirna))

    # If we did not find an annotated miRNA at this same position, we will
    # annotate it in the final file as being identical to the following known
    # miRNAs, but not at the same position
    if (not annotatedFlag):
        # NOTE(review): the end of this message and the append of the
        # names below were restored from a truncated listing; confirm the
        # exact column text expected by downstream readers
        line.append("Identical to the following known miRNAs at different "\
            "positions:")

        # List all known miRBase miRNAs matching this read, each one
        # followed by a single space
        toAdd = "".join("%s " % identicalMirna
            for identicalMirna in similarityDict[mirName])
        line.append(toAdd)

    return (line)
Exemple #20
def miRador():
    """Parse configuration file and make necessary calls to the various
    helper functions to perform the entire miRNA prediction of the user
    provided input files. This function primarily serves as a wrapper
    to those other functions in other files


    # Initialize our logger
    logger = log.setupLogger("miRador")

    progStart = time.time()

    ######################## Parse Config File ###############################
    configFilename = sys.argv[1]
    config = configparser.ConfigParser()

    # Get the preprocessing arguments
    #runPreprocessFlag = config.get("Preprocess", "runPreprocessFlag")

    # Get the genome file name
    genomeFilename = config.get("Genome", "genomeFilename")

    # Get the einverted arguments
    runEInvertedFlag = config.getint("EInverted", "runEInvertedFlag",
        fallback = 1)
    einvertedPresets = (config.get("EInverted", "einvertedPresets",
        fallback = "medium"))

    # If einvertedPresets is set, set the einverted parameters to
    # appropriate levels for prediction of inverted repeats
    if(einvertedPresets.lower() == "medium"):
        match = 3
        mismatch = -4
        gap = 6
        threshold = 45
        maxRepLen = 300

    elif(einvertedPresets.lower() == "low"):
        match = 3
        mismatch = -4
        gap = 6
        threshold = 40
        maxRepLen = 300

    elif(einvertedPresets.lower() == "high"):
        match = 3
        mismatch = -5
        gap = 7
        threshold = 50
        maxRepLen = 300

    # Get the advanced einverted arguments from the config file which
    # will override the presets if these are set
    advancedMatch = config.get("Advanced", "match")
    advancedMismatch = config.get("Advanced", "mismatch")
    advancedGap = config.get("Advanced", "gap")
    advancedThreshold = config.get("Advanced", "threshold")
    advancedMaxRepLen = config.get("Advanced", "maxRepLen")

    # If the advanced settings are set, override whatever has been set in them
        match = int(advancedMatch)
        mismatch = int(advancedMismatch)
        gap = int(advancedGap)
        threshold = int(advancedThreshold)
        maxRepLen = int(advancedMaxRepLen)

    # Get the Libraries arguments and parse the libraries into a list
    # of strings. User can input a list of files or just a directory
    # holding all of the tag count files
    libFilenamesList = []
    libFilenamesString = config.get("Libraries", "libFilenamesList",
        fallback = "")
    libFolder = config.get("Libraries", "libFolder", fallback = "")

    # If individual libraries were given, split the string on commas and
    # store them in libFilenamesList 
        libFilenamesList = libFilenamesString.split(",")

    # If libFolder was specified, loop through the files in the folder
    # and add all files to libFilenamesList
        for file in os.listdir(libFolder):
            libFilenamesList.append("%s/%s" % (libFolder,

    # Do a check to confirm the user did not enter the same library
    # multiple times in libFilenamesList. First, we don't want to
    # process a library twice, but we also want to make sure the user
    # also did not intend to place another library in and accidentally
    # just pasted the path to another twice
    numLibs = len(libFilenamesList)
    if(numLibs != len(set(libFilenamesList))):
        logger.error("It appears that a library was input more than once. "
            "Please check your libraries again, remove any duplicate "\
            "entries, and ensure all libraries you intend to process are "

    # Grab the information for the BLAST variables
    organism = config.get("miRBase", "organism").lower()
    version = config.get("miRBase", "version", fallback = "CURRENT")

    cleanupFlag = config.getint("General", "cleanupFlag", fallback = 1)
    parallel = config.getint("General", "parallel")
    nthreads = config.get("General", "nthreads")
    blastnPath = os.path.expanduser(config.get("General", "blastnPath"))
    bowtiePath = os.path.expanduser(config.get("General", "bowtiePath"))
    bowtieBuildPath = os.path.expanduser(config.get("General",
    einvertedPath = os.path.expanduser(config.get("General", "einvertedPath"))
    makeblastdbPath = os.path.expanduser(config.get("General",
    perlPath = os.path.expanduser(config.get("General", "perlPath"))
    RNAFoldPath = os.path.expanduser(config.get("General", "RNAFoldPath"))
    RNAPlotPath = os.path.expanduser(config.get("General", "RNAPlotPath"))
    samtoolsPath = os.path.expanduser(config.get("General", "samtoolsPath"))
    ps2pdfwrPath = os.path.expanduser(config.get("General", "ps2pdfwrPath"))
    outputFolder = config.get("General", "outputFolder", fallback = "")

    # Required overhang between top and bottom strands of miRNA duplex
    # Hardcoded to 2 here, but in such a way that could technically allow
    # modifications
    overhang = 2

    # Perform various housekeeping functions including the checks that all
    # external program dependencies exist, that files being referenced and
    # folders that will be written to exist and are created.
    mirBaseDict, outputFolder = housekeeping.housekeeping(genomeFilename,
        libFilenamesString, libFolder, libFilenamesList, bowtiePath,
        bowtieBuildPath, einvertedPath, blastnPath, makeblastdbPath,
        perlPath, RNAFoldPath, RNAPlotPath, samtoolsPath,
        ps2pdfwrPath, outputFolder, organism, version)

    # Set the number of cores, if parallel is on
        nproc = int(round(int(multiprocessing.cpu_count()*.5),1))

    # Create genome object
    GenomeClass = genome.Genome(genomeFilename, bowtieBuildPath, samtoolsPath)


    ############### Find inverted repeats in genome file #####################

    # Run EInverted if the flag is set
        # Create an empty list for both the inverted repeat FASTA files
        # and alignment files
        IRFastaFilenamesList = []
        IRAlignmentFilenamesList = []

        # If parallel is set, run einverted using the parallel version
  "Running einverted in parallel")

            if(len(GenomeClass.chrFilenamesList) < nproc):
                pool = multiprocessing.Pool(len(GenomeClass.chrFilenamesList))
                pool = multiprocessing.Pool(nproc)

            res = pool.starmap_async(genome.runEinverted,
                zip(repeat(einvertedPath), GenomeClass.chrFilenamesList,
                repeat(match), repeat(mismatch), repeat(gap),
                repeat(threshold), repeat(maxRepLen)))

            results = res.get()


            # Loop through the results and add the inverted repeat filenames
            # to their respective lists
            for result in results:

  "Running einverted sequentially")

            # Loop through each chromosome and run einverted on each, one at 
            # a time
            for chrFilename in GenomeClass.chrFilenamesList:
                IRName, IRSeq = genome.runEinverted(einvertedPath, 
                    chrFilename, match, mismatch, gap, threshold, maxRepLen)


    # If einverted was not run, set the temp file lists to be just the list
    # of the merged final file so that we can create the IRDictByChr using the
    # previously merged file
        IRFastaFilenamesList = [GenomeClass.IRFastaFilename]
        IRAlignmentFilenamesList = [GenomeClass.IRAlignmentFilename]

    # Combine the inverted repeat temp files of the einverted runs
    # into one file
        IRAlignmentFilenamesList, runEInvertedFlag)


    ######################## Map small RNAs to genome #######################

    filteredPrecursorsDict = {}
    candidatesByLibDict = {}

    # Populate candidatesByLibDict chromosomes with empty dictionaries
    for chrName, chrIndex in GenomeClass.chrDict.items():
        candidatesByLibDict[chrName] = {}

    # Initialize libCounter to help inform users how far along the run is
    libCounter = 1

    for libraryFilename in libFilenamesList:
        libNameNoFolders = os.path.splitext(os.path.basename(
            libraryFilename))[0]"Beginning to process %s, library %s of %s." % (
            libraryFilename, libCounter, len(libFilenamesList)))
        Lib = library.Library(libraryFilename, GenomeClass.chrDict)

        filteredPrecursorsDict[libNameNoFolders] = {}

        for chrName in sorted(GenomeClass.chrDict.keys()):
            filteredPrecursorsDict[libNameNoFolders][chrName] = {}

        # Map small RNAs to the genome"Running bowtie on %s" % Lib.filename)
        funcStart = time.time()

        logFilename = Lib.mapper(GenomeClass.indexFilename, bowtiePath,

        funcEnd = time.time()
        execTime = round(funcEnd - funcStart, 2)"Runtime of bowtie for %s: %s seconds" % \
            (Lib.mapFilename, execTime))"Creating the mapped list for %s" % Lib.filename)
        funcStart = time.time()

        # Create a dictionary with the sequence of all tags that
        # map to a position on every chromosome

        # Normalize the reads in libDict

        funcEnd = time.time()
        execTime = round(funcEnd - funcStart, 2)"Time to create the mappedList: %s seconds" % (execTime))

        ################# Map small RNAs to inverted repeats ##################

        #######################################################################"Mapping sRNAs to the inverted repeats")

        funcStart = time.time()

        mappedTagsToPrecursors = []

        # Parallelization of this module has been removed as the overhead of
        # transferring mappedList to each proc is quite significant while
        # its runtime on one proc is extremely quick
#        if(parallel):
#            # Run mapSRNAsToIRs in parallel
#            pool = multiprocessing.Pool(nproc)
#            res = pool.starmap_async(mapSRNAsToIRs.mapSRNAsToIRs,
#                zip(GenomeClass.IRDictByChr, Lib.mappedList,
#                repeat(Lib.libDict)))
#            mappedTagsToPrecursors = res.get()
#            pool.close()

        # Map the sRNAs for this library to the inverted repeats predicted
        # for this genome.
        # Note that the format of this where we run one chromosome at a
        # time is a holdover from the parallelization effort that was made
        # for this function. While this for loop can now be moved into the
        # function, I am keeping it outside quite simply because the tabbing
        # within this function became quite deep and I'd rather avoid going
        # another level deeper
        for i in range(len(GenomeClass.chrDict)):
                GenomeClass.IRDictByChr[i], Lib.mappedList[i], Lib.libDict))

        funcEnd = time.time()
        execTime = round(funcEnd - funcStart, 2)"Time to map sRNAs to inverted repeats: %s seconds" \
            % (execTime))"Writing precursors to a file")

        # Create a file for all precursors to be written to that have at least
        # one sRNA that maps to both strands
        unfilteredFilename = "%s/libs/%s_all_precursors.txt" % (outputFolder,

            GenomeClass.chrDict, GenomeClass.IRDictByChr,


        ################### Filter precursor candidates #######################

        #######################################################################"Filtering candidate precursors")

        funcStart = time.time()

        # Parallelization of this module has been removed as the overhead of
        # transferring mappedList to each proc is quite significant while
        # its runtime on one proc is extremely quick
#        if(parallel):
#            pool = multiprocessing.Pool(nproc)
#            res = pool.starmap_async(filterPrecursors.filterPrecursors,
#                zip(mappedTagsToPrecursors, GenomeClass.IRDictByChr,
#                repeat(overhang)))
#            results = res.get()
#            pool.close()
#            for chrName in sorted(GenomeClass.chrDict.keys()):
#                chrIndex = GenomeClass.chrDict[chrName]
#                # Get the index of chrDict[chrName]
#                filteredPrecursorsDict[libNameNoFolders][chrName] = \
#                    results[chrIndex][1]

        # Filter the precursors, one chromosome at a time
        # Note that the format of this where we run one chromosome at a
        # time is a holdover from the parallelization effort that was made
        # for this function. While this for loop can now be moved into the
        # function, I am keeping it outside quite simply because the tabbing
        # within this function became quite deep and I'd rather avoid going
        # another level deeper
        for chrName in sorted(GenomeClass.chrDict.keys()):
            # Get the index of each chromosome that will be processed
            # sequentially
            chrIndex = GenomeClass.chrDict[chrName]
            precursorList = mappedTagsToPrecursors[chrIndex]
            IRDict = GenomeClass.IRDictByChr[chrIndex]

            filteredPrecursorsDict[libNameNoFolders][chrName] = \
                filterPrecursors.filterPrecursors(precursorList, IRDict,
                Lib.libDict, overhang)

        funcEnd = time.time()
        execTime = round(funcEnd - funcStart, 2)"Time to filter inverted repeats: %s seconds" % \

        # Prior to writing this library's results, add its miRNAs and
        # corresponding precursors to a dictionary

        # Loop through each chromosome of the final candidates dictionary
        # for this library
        for chrName, subFilteredPrecursorsDict in \

            # Loop through each duplex in the precursor and add it to the 
            # dictionary tracking which library it has been found in
            for precursorName, duplexDict in subFilteredPrecursorsDict.items():
                if(precursorName not in candidatesByLibDict[chrName]):
                    candidatesByLibDict[chrName][precursorName] = {} 

                for mirCandidate in duplexDict.keys():
                    if(mirCandidate not in candidatesByLibDict[chrName]\
                            [mirCandidate] = []


        # Create a file for all precursors that have been identified as having
        # a valid miRNA:miRNA* duplex to be written to
        filteredFilename = "%s/libs/%s_candidate_precursors.txt" % (
            outputFolder, libNameNoFolders)

        funcStart = time.time()
            GenomeClass.chrDict, GenomeClass.IRDictByChr,

        # Increment the library counter
        libCounter += 1

    filterPrecursors.writeCandidates(outputFolder, candidatesByLibDict,
        filteredPrecursorsDict, GenomeClass.IRDictByChr, libFilenamesList,
        GenomeClass.chrDict, genomeFilename)


    ################### Annotate candidate miRNAs ############################

    ##########################################################################"Annotating candidate miRNAs")

    funcStart = time.time()

    subjectSequencesFilename = "miRBase/miRBasePlantMirnas.fa"
    queryMirnasFilename = "%s/preAnnotatedCandidates.fa" % outputFolder
    dbFilename = "miRBase/miRBasePlantMirnas.db"

    # Create a list of candidate miRNAs and mirBase miRNAs
    querySeqsList = annotateCandidates.createListForAlign(queryMirnasFilename)
    subjectSeqsList = annotateCandidates.createListForAlign(

    similarityDict = {}

    if(parallel):"Running sequence alignment in parallel")

        pool = multiprocessing.Pool(nproc)

        res = pool.starmap_async(annotateCandidates.pairwiseAlignmentParallel,
            zip(querySeqsList, repeat(subjectSeqsList), repeat(organism)))

        results = res.get()


        for result in results:

        similarityDict = annotateCandidates.pairwiseAlignment(querySeqsList,
            subjectSeqsList, organism)

    # Properly annotate the candidate miRNAs with the data in similarityDict
    classificationCountsList = annotateCandidates.annotateCandidates(
        outputFolder, similarityDict, organism, mirBaseDict,
        GenomeClass.IRDictByChr, numLibs, GenomeClass.chrDict,
        GenomeClass.chrFilenamesList, perlPath, RNAFoldPath, RNAPlotPath,

    # Delete the single chromosome files used by einverted and the
    # draw functions to clean up temp file
    for chrFilename in GenomeClass.chrFilenamesList:

    funcEnd = time.time()
    execTime = round(funcEnd - funcStart, 2)"Time to annotate candidate miRNAs: %s seconds" % (execTime))

    progEnd = time.time()
    execTime = round(progEnd - progStart, 2)"Total runtime was %s seconds" % execTime)


    # Write a summary file with details of the analysis
    createSummary(classificationCountsList, outputFolder, execTime)