def getLocalDBDate(dbName):
    """Return the last modification time of a local BLAST database.

    The modification time of ``<dbName>.nhr`` is used as the date of the
    database.

    Args:
        dbName: The name (path prefix) of the BLAST DB

    Returns:
        A datetime.datetime holding the modification time of
        ``<dbName>.nhr``. If that file does not exist,
        datetime.datetime(1, 1, 1) is returned so that any real date
        compares as newer

    """

    # Initialize our logger
    logger = log.setupLogger("getLocalDBDate")

    headerFilename = '%s.nhr' % dbName

    # Both branches return, so the logger is closed in a finally clause.
    # A closeLogger call placed after the returns would never run
    try:
        # Only check the modification date if the file even exists
        if(os.path.isfile(headerFilename)):
            return(datetime.datetime.fromtimestamp(
                os.path.getmtime(headerFilename)))

        # File doesn't exist so return year 1 for comparison sake.
        # Note: datetime.MINYEAR threw error hence the hardcoded date
        logger.info("There is no file with name %s. Creating BLAST database."\
            % dbName)
        return(datetime.datetime(1, 1, 1))

    finally:
        log.closeLogger(logger)
def checkNeedUpdateByDate(subjectSequencesFilename, dbName):
    """Check to see if the BLAST database needs to be updated

    The database needs an update when the subject FASTA file is newer
    than the local BLAST database.

    Args:
        subjectSequencesFilename: The name of the subject sequences file
        dbName: The name of the blast DB, passed to getLocalDBDate

    Returns:
        True if the BLAST DB needs to be (re)built, False otherwise

    """

    # Initialize our logger
    logger = log.setupLogger("checkNeedUpdateByDate")

    # Both branches return, so the logger is closed in a finally clause.
    # A closeLogger call placed after the returns would never run
    try:
        # Get the date that the FASTA file was last updated
        fastaFileDate = getFastaDate(subjectSequencesFilename)

        # Get the date that the database was updated last
        localDatabaseDate = getLocalDBDate(dbName)

        # If the FASTA file is newer than the BLAST database, an update
        # is needed
        if(fastaFileDate > localDatabaseDate):
            logger.info("An update is needed to the BLAST database and will be "\
                "created now. This will take time so please be patient.")
            return(True)

        # Otherwise the BLAST database is already up to date
        logger.info("No update is needed to the BLAST database. Proceeding "\
            "with the BLAST.")
        return(False)

    finally:
        log.closeLogger(logger)
def readFastq(self):
    """Load a FASTQ library into memory and write its unique reads as FASTA.

    Only the sequence line of each four-line FASTQ record is used. Every
    distinct sequence is stored once in the returned dictionary and, the
    first time it is seen, written to self.fastaFilename under a
    sequential name (s_1, s_2, ...)

    Return:
        Dictionary keyed by read sequence. Each value is a two element
        list of [number of times the sequence was seen, 0]

    """

    # Initialize our logger
    logger = log.setupLogger("readFastq")

    # Start timer for function
    startTime = time.time()

    # Sequence -> [count, 0] for every distinct read in the library
    libDict = {}

    # Sequential number used to name each unique read in the FASTA file
    nextReadNumber = 1

    with open(self.filename) as fastqFile, \
            open(self.fastaFilename, "w") as fastaFile:
        for lineIndex, line in enumerate(fastqFile):
            # The sequence is the second line of every four-line record,
            # so all other lines are skipped
            if(lineIndex % 4 != 1):
                continue

            tag = line.rstrip()
            entry = libDict.get(tag)

            if(entry is None):
                # First time this sequence is seen: start its count and
                # write it to the unique reads FASTA file
                libDict[tag] = [1, 0]
                fastaFile.write(">s_%s\n%s\n" % (nextReadNumber, tag))
                nextReadNumber += 1
            else:
                # Seen before: increment the count and keep the second
                # element at 0
                libDict[tag] = [entry[0] + 1, 0]

    # Calculate the execution time and report it to the user
    elapsed = round(time.time() - startTime, 2)
    logger.info("Time to read library %s: %s seconds" % (self.filename,
        elapsed))

    log.closeLogger(logger)

    return(libDict)
def drawPrecursor(precursorSeq, mirName, mirSeq, starSeq, outputFolder,
        perlPath, RNAFoldPath, RNAPlotPath, ps2pdfwrPath):
    """Using RNAFold, draw the miRNA and the miRNA* on the precursor

    The miRNA and miRNA* sequences (with T replaced by U) are written to
    a temp file that is handed to drawPrecursor/drawPrecursor.pl. The
    resulting PDF is moved to <outputFolder>/images/<mirName>_precursor.pdf
    and the temp file and RNAplot output folder are then removed.

    Args:
        precursorSeq: The sequence of the miRNA precursor
        mirName: The name of the candidate miRNA. Use this instead of
            precursorName because it's possible for more than one
            candidate miRNA to come from the same precursor, so this
            should be unique
        mirSeq: The candidate miRNA sequence
        starSeq: The sequence of the miRNA* on this duplex
        outputFolder: The name of the output folder
        perlPath: The path of perl on this system
        RNAFoldPath: The path of RNAFold on this system
        RNAPlotPath: The path of RNAPlot on this system
        ps2pdfwrPath: The path of ps2pdfwr on this system

    Raises:
        SystemExit: If the perl drawPrecursor program returns a non-zero
            exit status

    """

    # Initialize our logger
    logger = log.setupLogger("drawPrecursor")

    tempFilename = "%s/images/%s.temp" % (outputFolder, mirName)

    # Use a context manager so the temp file is flushed and closed before
    # the perl script reads it, even if the write fails
    with open(tempFilename, "w") as mirOut:
        mirOut.write('%s\n%s' % (mirSeq.replace("T", "U"),
            starSeq.replace("T", "U")))

    # Call the perl drawPrecursor program on our data
    returnCode = subprocess.call([
        perlPath, "drawPrecursor/drawPrecursor.pl", RNAFoldPath,
        RNAPlotPath, ps2pdfwrPath, mirName, precursorSeq, tempFilename
    ])

    if(returnCode):
        logger.error("Something went wrong when running drawPrecursor. " \
            "Command was\nperl drawPrecursor/drawPrecursor.pl %s %s %s %s "\
            "%s %s" % (RNAFoldPath, RNAPlotPath, ps2pdfwrPath, mirName,
            precursorSeq, tempFilename))
        # Close the logger before exiting so it is released on the error
        # path as well
        log.closeLogger(logger)
        sys.exit()

    # Rename the file from the default drawPrecursor Structure_plot file
    # name to the mirName_precursor
    os.rename("%s_RNAplot_out/%s_Structure_plot.pdf" % (mirName, mirName),
        "%s/images/%s_precursor.pdf" % (outputFolder, mirName))

    # Delete the temp files that were needed to create the image. A failed
    # cleanup is only reported; it does not stop the run
    try:
        os.remove(tempFilename)
    except OSError:
        logger.error("Failed to delete %s" % tempFilename)

    try:
        shutil.rmtree("%s_RNAplot_out" % mirName)
    except OSError:
        logger.error("Failed to delete RNAplot folder %s_RNAplot_out" %\
            mirName)

    log.closeLogger(logger)
def identifyFileType(self):
    """Investigate the first few lines of the library file to determine
    its format so that the proper file parser function is used

    Only the first four lines of self.filename are read. Lines that do
    not exist (files shorter than four lines) are treated as empty
    strings. An empty file is reported as unrecognized.

    Returns:
        A simple string of fasta, fastq, or tagCount

    Raises:
        SystemExit: If the file is not recognized as any of the supported
            formats

    """

    # Initialize our logger
    logger = log.setupLogger("identifyFileType")

    libType = ""

    # Read just the first four lines rather than the whole library.
    # readline() returns "" at end of file, so short files simply yield
    # empty strings instead of raising an IndexError later on
    with open(self.filename) as f:
        firstLine, secondLine, _, fourthLine = [f.readline()
            for _ in range(4)]

    # If the first character of the first line of the file is a >, and
    # both the 2nd and 4th lines are nucleotide sequences, then we
    # should be able to call this file fasta
    if(firstLine.startswith(">") and
            re.search("^[ACGT]*$", secondLine.rstrip().upper()) and
            re.search("^[ACGT]*$", fourthLine.rstrip().upper())):
        libType = "fasta"

    # If the second line of the file is a nucleotide sequence, but
    # the 4th is not, then it should be a fastq file
    elif(re.search("^[ACGTN]*$", secondLine.rstrip().upper()) and not
            re.search("^[ACGTN]*$", fourthLine.rstrip().upper())):
        libType = "fastq"

    # If splitting the first line on a tab results in the first
    # index being just a nucleotide sequence, then the input file
    # is a tag count file. An empty first line means the file itself is
    # empty, which is not a valid tag count file
    elif(firstLine and
            re.search("^[ACGTN]*$", firstLine.split("\t")[0].upper())):
        libType = "tagCount"

    # If none of the previous tests passed, report an error to
    # the user and kill the run
    else:
        logger.info("The data in %s was not recognized as a fasta, "\
            "fastq, or tag count file. Please check the file to "\
            "ensure that it is one of the recognized file types." %\
            self.filename)
        log.closeLogger(logger)
        sys.exit()

    log.closeLogger(logger)

    return(libType)
def readTagCount(self):
    """Load a tag count library into memory and write its reads as FASTA.

    Each line of self.filename is split on tabs: the first field is the
    sequence and the second is its abundance. As each sequence is read it
    is also written to self.fastaFilename under a sequential name
    (s_1, s_2, ...) for bowtie

    Return:
        Dictionary keyed by sequence. Each value is a two element list of
        [abundance, 0]

    """

    # Initialize our logger
    logger = log.setupLogger("readTagCount")

    # Start timer for function
    startTime = time.time()

    # Sequence -> [abundance, 0] for every line in the library
    libDict = {}

    with open(self.filename) as tagCountFile, \
            open(self.fastaFilename, "w") as fastaFile:
        # The sequential read number is only used to name the reads in
        # the FASTA file
        for readNumber, line in enumerate(tagCountFile, start=1):
            fields = line.split("\t")
            tag = fields[0]
            abundance = int(fields[1].strip())

            # A sequence that appears again overwrites the earlier entry
            libDict[tag] = [abundance, 0]

            # Write the sequence to the reads FASTA file
            fastaFile.write(">s_%s\n%s\n" % (readNumber, tag))

    # Calculate the execution time and report it to the user
    elapsed = round(time.time() - startTime, 2)
    logger.info("Time to read library %s: %s seconds" % (self.filename,
        elapsed))

    log.closeLogger(logger)

    return(libDict)
def checkNeedUpdate(version):
    """Check if the version file from miRBase exists in our miRBase folder.

    The file listing of the requested miRBase version is pulled over FTP.
    No update is needed only when miRBase/miRBasePlantMirnas.fa exists
    locally AND a remote file starting with 0_THIS_IS_RELEASE also exists
    under the same name in the local miRBase folder.

    Args:
        version: Version of miRBase to be queried. Generally should be
            "CURRENT"

    Returns:
        True if we need to update our miRBase folder and false if no
        update is required

    Raises:
        SystemExit: If the requested version folder cannot be entered on
            the miRBase FTP server

    """

    # Initialize our logger
    logger = log.setupLogger("checkNeedUpdate")

    # The finally clauses guarantee that the FTP connection and the logger
    # are released on every exit path, including sys.exit()
    try:
        ftp = ftplib.FTP("mirbase.org")

        try:
            ftp.login("anonymous", "")

            try:
                ftp.cwd("pub/mirbase/%s/" % version)

            # Only FTP/network failures mean the version is missing. A
            # bare except would also swallow things like KeyboardInterrupt
            except ftplib.all_errors:
                logger.error("Input version does not appear to exist in miRBase. " \
                    "Check version number in ini file and try again")
                sys.exit()

            # Pull the list of files from miRBase
            filenamesList = ftp.nlst()

        finally:
            ftp.close()

        # First, check if the miRBase plant file even exists yet. If not,
        # don't bother going any further. We need to update
        if(not os.path.exists("miRBase/miRBasePlantMirnas.fa")):
            return(True)

        # Loop through the files in this version of miRBase to find the
        # version file
        for filename in filenamesList:
            # If the version file exists exactly as it was found in
            # miRBase, return false as we do not need to update our
            # miRBase files
            if(filename.startswith("0_THIS_IS_RELEASE") and
                    os.path.exists("miRBase/%s" % filename)):
                return(False)

        return(True)

    finally:
        log.closeLogger(logger)
def setupMiRBase(organism, version):
    """Make sure the local miRBase folder is current and load the organism's
    precursor annotations

    When checkNeedUpdate reports that an update is required, the miRBase
    folder is emptied (if it holds anything) and repopulated: the
    organism and mature miRNA files are downloaded, the plant species are
    identified and downloaded, and the plant FASTA file is created.

    Args:
        organism: The three letter identifier of the organism being studied
        version: Version of miRBase to be downloaded. Generally should be
            "CURRENT"

    Returns:
        The result of parsePrecursorGFF on miRBase/<organism>.gff3 if that
        file exists, otherwise an empty dictionary

    """

    # Initialize our logger
    logger = log.setupLogger("setupMirBase")

    needsDownload = checkNeedUpdate(version)
    gffPath = "miRBase/%s.gff3" % organism

    # Refresh the local copy of miRBase only when it is out of date
    if(needsDownload):
        logger.info("Downloading the relevant miRBase files")

        # Stale files are removed by recreating the folder from scratch
        if(os.listdir("miRBase")):
            shutil.rmtree("miRBase")
            os.mkdir("miRBase")

        # Download the organism file and mature miRNA file
        downloadOrganismsAndMirnas(version)

        # Find all the plant species that are in miRBase, download them,
        # and build the FASTA file from them
        plantSpecies = findPlantSpeciesFromOrganisms()
        downloadPlantSpecies(version, plantSpecies)
        createFastaFile(plantSpecies)

    # Identity searches with positional information are only possible if
    # the GFF file exists, so it is only parsed when it is present
    mirBaseDict = parsePrecursorGFF(gffPath) if os.path.isfile(gffPath) \
        else {}

    log.closeLogger(logger)

    return(mirBaseDict)
def buildBowtieIndex(self, bowtieBuildPath):
    """Build a bowtie index for self.filename when one is needed

    The index is always placed in genome/bowtieIndex and named after the
    input file with its folders and extension removed. bowtie-build is
    only run when self.checkBowtieNeedsUpdate says so; its standard
    output is captured in <name>_bowtiebuild.log in the same folder.

    Args:
        bowtieBuildPath: Path of bowtie-build

    Returns:
        Path of bowtie index

    """

    # Initialize our logger
    logger = log.setupLogger("buildBowtieIndex")

    # Strip the folders, then the extension, so the index lands in the
    # hardcoded index folder regardless of where the input file lives
    baseName = self.filename.rsplit('/', 1)[-1]
    indexStem = os.path.splitext(baseName)[0]
    indexFilename = "genome/bowtieIndex/%s" % (indexStem)

    # Nothing to build if the existing index is still current
    if(not self.checkBowtieNeedsUpdate(indexFilename)):
        log.closeLogger(logger)
        return(indexFilename)

    logger.info("Building a bowtie index for %s" % (self.filename))

    buildLogFilename = "genome/bowtieIndex/%s_bowtiebuild.log" % indexStem
    with open(buildLogFilename, 'w') as buildLog:
        exitStatus = subprocess.call(
            [bowtieBuildPath, self.filename, indexFilename],
            stdout=buildLog)

    # A non-zero exit status means bowtie-build failed, so stop the run
    if(exitStatus):
        logger.info("Something went wrong when running bowtie-build. "\
            "Command was\n%s %s %s" % (bowtieBuildPath, self.filename,
            indexFilename))
        sys.exit()

    log.closeLogger(logger)

    return(indexFilename)
def buildFastaIndex(self, samtoolsBuildPath):
    """Code to create a fasta index

    Runs ``samtools faidx`` on self.filename, but only when
    self.checkFastaIndexNeedsUpdate reports that the index is missing or
    out of date.

    Args:
        samtoolsBuildPath: Path of samtools

    Raises:
        SystemExit: If samtools returns a non-zero exit status

    """

    # Initialize our logger
    logger = log.setupLogger("buildsamtoolsIndex")

    if(self.checkFastaIndexNeedsUpdate()):
        logger.info("Building a fasta index for %s" % (self.filename))

        returnCode = subprocess.call(
            [samtoolsBuildPath, "faidx", self.filename])

        if(returnCode):
            # The command is "<samtools> faidx <file>". The message used
            # to contain a stray "%f" conversion which made this line
            # raise a TypeError instead of reporting the failure
            logger.info("Something went wrong when building fasta index. "\
                "Command was\n%s faidx %s" % (samtoolsBuildPath,
                self.filename))
            log.closeLogger(logger)
            sys.exit()

    log.closeLogger(logger)
def main():
    ''' Main routine

    Parses the command line, locates the instrument file, prepares the
    McStas object and then either runs a single simulation or a scan over
    the requested parameter intervals. Afterwards a copy of the instrument
    file is placed in the output directory and, if requested, the plotter
    is started on it.

    Raises OptionValueError when the scan options are inconsistent
    (--list together with --numpoints, --list without lists, lists of
    unequal length, or a scan without a usable number of points).
    '''
    setupLogger()

    # Add options
    usage = ('usage: %prog [-cpnN] Instr [-sndftgahi] '
             'params={val|min,max|min,guess,max}...')
    parser = OptionParser(
        usage, version=mccode_config.configuration['MCCODE_VERSION'])

    add_mcrun_options(parser)
    add_mcstas_options(parser)

    # Parse options
    (options, args) = parser.parse_args()

    # Write user config file and exit
    if options.write_user_config:
        mccode_config.save_user_config()
        sys.exit()

    # Extract instrument and parameters
    if not args:
        print(parser.get_usage())
        parser.exit()

    # Set path of instrument-file after locating it
    options.instr = find_instr_file(args[0])

    if options.param:
        # load params from file; the context manager closes the handle
        import re
        with open(options.param) as param_file:
            text = param_file.read()
        raw_params = re.findall(r'[\w0-9]+=[^=\s]+', text)
    else:
        raw_params = args[1:]

    # Clean out quotes (perl mcgui requires this step). A list is stored
    # rather than a lazy map object so the parameters can be iterated
    # more than once.
    options.params = [clean_quotes(param) for param in raw_params]

    # On windows, ensure that backslashes in the filename are escaped
    if sys.platform == "win32":
        options.instr = options.instr.replace("\\", "\\\\")

    # Fill out extra information
    expand_options(options)

    if options.verbose:
        setLogLevel(DEBUG)

    # Inform user of what is happening
    # TODO: More info?
    LOG.info('Using directory: "%s"' % options.dir)
    # The third alternative used to compare the options object itself to
    # ".\\", which can never match; the directory is what is meant.
    if options.dir in (".", "./", ".\\"):
        LOG.warning('Existing files in "%s" will be overwritten!'
                    % options.dir)
        LOG.warning(' - and datafiles catenated...')
        options.dir = ''

    # Run McStas
    mcstas = McStas(options.instr)
    mcstas.prepare(options)

    (fixed_params, intervals) = get_parameters(options)

    # Indicate end of setup / start of computations
    LOG.info('===')

    if options.info:
        print('info!')
        mcstas.run(override_mpi=False)
        sys.exit()

    # Set fixed parameters
    for key, value in fixed_params.items():
        mcstas.set_parameter(key, value)

    # Check for linear scanning
    interval_points = None

    # Can't both do list and interval scanning
    if options.list and options.numpoints:
        raise OptionValueError('--numpoints cannot be used with --list')

    if options.list:
        if len(intervals) == 0:
            raise OptionValueError(
                '--list was chosen but no lists was presented.')
        pointlist = list(intervals.values())
        points = len(pointlist[0])
        # Every scanned variable must supply the same number of points
        if not all(len(i) == points for i in intervals.values()):
            raise OptionValueError(
                'All variables much have an equal amount of points.')
        interval_points = LinearInterval.from_list(
            points, intervals)

    scan = options.multi or options.numpoints
    if ((options.numpoints is not None and options.numpoints < 2)
            or (scan and options.numpoints is None)):
        raise OptionValueError(
            ('Cannot scan variable(s) %s using only one data point. '
             'Please use -N to specify the number of points.') % \
            ', '.join(intervals.keys()))
        # NOTE(review): the check below follows the raise above and is
        # therefore unreachable. It is kept in place because it requires
        # every interval to have exactly two entries, which the --list
        # intervals handled above need not have -- confirm the intended
        # placement before enabling it.
        # Check that input is valid decimals
        if not all(map(lambda i:
                       len(i) == 2 and
                       all(map(is_decimal, i)), intervals.values())):
            raise OptionValueError('Could not parse intervals -- result: %s'
                                   % str(intervals))

    if options.multi is not None:
        interval_points = MultiInterval.from_range(
            options.numpoints, intervals)

    elif options.numpoints is not None:
        interval_points = LinearInterval.from_range(
            options.numpoints, intervals)

    # Parameters for linear scanning present
    if interval_points:
        scanner = Scanner(mcstas, intervals)
        scanner.set_points(interval_points)
        if options.dir != '':
            mkdir(options.dir)
        scanner.run()
    else:
        # Only run a simulation if we have a nonzero ncount
        if options.ncount != 0.0:
            mcstas.run()

    if isdir(options.dir):
        LOG.info('Placing instr file copy %s in dataset %s',
                 options.instr, options.dir)
        copyfile(options.instr, join(options.dir, basename(options.instr)))

    if options.autoplot is not None:
        autoplotter = mccode_config.configuration['MCPLOT']
        # apply selected autoplotter, if used
        if options.autoplotter is not None:
            autoplotter = options.autoplotter
        if isdir(options.dir):
            # Report the plotter that is actually started, which may be
            # the user's override rather than the configured default
            LOG.info('Running plotter %s on dataset %s',
                     autoplotter, options.dir)
            Process(autoplotter).run([options.dir])
def miRador():
    """Parse configuration file and make necessary calls to the various
    helper functions to perform the entire miRNA prediction of the
    user provided input files. This function primarily serves as a wrapper
    to those other functions in other files

    The path of the configuration (ini) file is read from sys.argv[1].
    The run then proceeds in four stages: predict inverted repeats in the
    genome with einverted, map every small RNA library to the genome and
    to the inverted repeats, filter the resulting precursors, and finally
    annotate the candidate miRNAs against miRBase.

    Raises:
        SystemExit: If no readable configuration file is given, if the
            einverted parameters cannot be fully determined from the
            preset and the advanced settings, or if a library is listed
            more than once

    """

    # Initialize our logger
    logger = log.setupLogger("miRador")

    progStart = time.time()

    ######################## Parse Config File ###############################
    if(len(sys.argv) < 2):
        logger.error("No configuration file was provided. Please provide "\
            "the path of the ini file as the first argument.")
        log.closeLogger(logger)
        sys.exit()

    configFilename = sys.argv[1]
    config = configparser.ConfigParser()

    # ConfigParser.read silently skips files it cannot open and returns
    # the list of files it did parse, so an empty list means the config
    # file could not be read
    if(not config.read(configFilename)):
        logger.error("The configuration file %s could not be read." %
            configFilename)
        log.closeLogger(logger)
        sys.exit()

    # Get the genome file name
    genomeFilename = config.get("Genome", "genomeFilename")

    # Get the einverted arguments
    runEInvertedFlag = config.getint("EInverted", "runEInvertedFlag",
        fallback = 1)
    einvertedPresets = config.get("EInverted", "einvertedPresets",
        fallback = "medium")

    # einverted parameters for each preset level used for the prediction
    # of inverted repeats
    einvertedParamNames = ("match", "mismatch", "gap", "threshold",
        "maxRepLen")
    presetParameters = {
        "low": {"match": 3, "mismatch": -4, "gap": 6, "threshold": 40,
            "maxRepLen": 300},
        "medium": {"match": 3, "mismatch": -4, "gap": 6, "threshold": 45,
            "maxRepLen": 300},
        "high": {"match": 3, "mismatch": -5, "gap": 7, "threshold": 50,
            "maxRepLen": 300},
    }

    # Start from the preset (nothing if the preset name is not known) and
    # let any advanced setting that is set override the preset value
    einvertedParameters = dict(presetParameters.get(
        einvertedPresets.lower(), {}))
    for paramName in einvertedParamNames:
        advancedValue = config.get("Advanced", paramName, fallback = "")
        if(advancedValue):
            einvertedParameters[paramName] = int(advancedValue)

    # Without a known preset, every parameter has to come from the
    # advanced settings. Stop here with a clear message instead of failing
    # later on an undefined variable
    missingParameters = [paramName for paramName in einvertedParamNames
        if paramName not in einvertedParameters]
    if(missingParameters):
        logger.error("The einverted preset '%s' is not one of low, medium, "\
            "or high and the following einverted parameters were not set "\
            "in the Advanced section: %s" % (einvertedPresets,
            ", ".join(missingParameters)))
        log.closeLogger(logger)
        sys.exit()

    match = einvertedParameters["match"]
    mismatch = einvertedParameters["mismatch"]
    gap = einvertedParameters["gap"]
    threshold = einvertedParameters["threshold"]
    maxRepLen = einvertedParameters["maxRepLen"]

    # Get the Libraries arguments and parse the libraries into a list
    # of strings. User can input a list of files or just a directory
    # holding all of the tag count files
    libFilenamesList = []
    libFilenamesString = config.get("Libraries", "libFilenamesList",
        fallback = "")
    libFolder = config.get("Libraries", "libFolder", fallback = "")

    # If individual libraries were given, split the string on commas and
    # store them in libFilenamesList
    if(libFilenamesString):
        libFilenamesList = libFilenamesString.split(",")

    # If libFolder was specified, loop through the files in the folder
    # and add all files to libFilenamesList
    if(libFolder):
        for libFilename in os.listdir(libFolder):
            libFilenamesList.append("%s/%s" % (libFolder, libFilename))

    # Do a check to confirm the user did not enter the same library
    # multiple times in libFilenamesList. First, we don't want to
    # process a library twice, but we also want to make sure the user
    # also did not intend to place another library in and accidentally
    # just pasted the path to another twice
    numLibs = len(libFilenamesList)
    if(numLibs != len(set(libFilenamesList))):
        logger.error("It appears that a library was input more than once. "
            "Please check your libraries again, remove any duplicate "\
            "entries, and ensure all libraries you intend to process are "
            "present.")
        sys.exit()

    # Grab the information for the BLAST variables
    organism = config.get("miRBase", "organism").lower()
    version = config.get("miRBase", "version", fallback = "CURRENT")

    # cleanupFlag is not used below, but it is still read so that a
    # non-integer value in the config file is reported
    cleanupFlag = config.getint("General", "cleanupFlag", fallback = 1)
    parallel = config.getint("General", "parallel")
    nthreads = config.get("General", "nthreads")
    blastnPath = os.path.expanduser(config.get("General", "blastnPath"))
    bowtiePath = os.path.expanduser(config.get("General", "bowtiePath"))
    bowtieBuildPath = os.path.expanduser(config.get("General",
        "bowtieBuildPath"))
    einvertedPath = os.path.expanduser(config.get("General",
        "einvertedPath"))
    makeblastdbPath = os.path.expanduser(config.get("General",
        "makeblastdbPath"))
    perlPath = os.path.expanduser(config.get("General", "perlPath"))
    RNAFoldPath = os.path.expanduser(config.get("General", "RNAFoldPath"))
    RNAPlotPath = os.path.expanduser(config.get("General", "RNAPlotPath"))
    samtoolsPath = os.path.expanduser(config.get("General", "samtoolsPath"))
    ps2pdfwrPath = os.path.expanduser(config.get("General", "ps2pdfwrPath"))
    outputFolder = config.get("General", "outputFolder", fallback = "")

    # Required overhang between top and bottom strands of miRNA duplex
    # Hardcoded to 2 here, but in such a way that could technically allow
    # modifications
    overhang = 2

    # Perform various housekeeping functions including the checks that all
    # external program dependencies exist, that files being referenced and
    # folders that will be written to exist and are created.
    mirBaseDict, outputFolder = housekeeping.housekeeping(genomeFilename,
        libFilenamesString, libFolder, libFilenamesList, bowtiePath,
        bowtieBuildPath, einvertedPath, blastnPath, makeblastdbPath,
        perlPath, RNAFoldPath, RNAPlotPath, samtoolsPath, ps2pdfwrPath,
        outputFolder, organism, version)

    # Set the number of cores, if parallel is on. Half of the available
    # cores are used, but never fewer than one as a pool cannot be
    # created with zero processes
    if(parallel):
        nproc = max(1, multiprocessing.cpu_count() // 2)

    # Create genome object
    GenomeClass = genome.Genome(genomeFilename, bowtieBuildPath,
        samtoolsPath)

    ##########################################################################
    ############### Find inverted repeats in genome file #####################
    ##########################################################################

    # Run EInverted if the flag is set
    if(runEInvertedFlag):
        # Create an empty list for both the inverted repeat FASTA files
        # and alignment files
        IRFastaFilenamesList = []
        IRAlignmentFilenamesList = []

        # If parallel is set, run einverted using the parallel version
        if(parallel):
            logger.info("Running einverted in parallel")

            # There is no use in starting more processes than there are
            # chromosomes to process
            poolSize = max(1, min(nproc,
                len(GenomeClass.chrFilenamesList)))
            pool = multiprocessing.Pool(poolSize)

            try:
                res = pool.starmap_async(genome.runEinverted,
                    zip(repeat(einvertedPath), GenomeClass.chrFilenamesList,
                    repeat(match), repeat(mismatch), repeat(gap),
                    repeat(threshold), repeat(maxRepLen)))

                results = res.get()

            # Release the worker processes even if a worker failed
            finally:
                pool.close()
                pool.join()

            # Loop through the results and add the inverted repeat filenames
            # to their respective lists
            for result in results:
                IRFastaFilenamesList.append(result[0])
                IRAlignmentFilenamesList.append(result[1])

        else:
            logger.info("Running einverted sequentially")

            # Loop through each chromosome and run einverted on each, one at
            # a time
            for chrFilename in GenomeClass.chrFilenamesList:
                IRFastaName, IRAlignmentName = genome.runEinverted(
                    einvertedPath, chrFilename, match, mismatch, gap,
                    threshold, maxRepLen)

                IRFastaFilenamesList.append(IRFastaName)
                IRAlignmentFilenamesList.append(IRAlignmentName)

    # If einverted was not run, set the temp file lists to be just the list
    # of the merged final file so that we can create the IRDictByChr using the
    # previously merged file
    else:
        IRFastaFilenamesList = [GenomeClass.IRFastaFilename]
        IRAlignmentFilenamesList = [GenomeClass.IRAlignmentFilename]

    # Combine the inverted repeat temp files of the einverted runs
    # into one file
    GenomeClass.combineIRTempFiles(IRFastaFilenamesList,
        IRAlignmentFilenamesList, runEInvertedFlag)

    #########################################################################
    ######################## Map small RNAs to genome #######################
    #########################################################################

    filteredPrecursorsDict = {}
    candidatesByLibDict = {}

    # Populate candidatesByLibDict chromosomes with empty dictionaries
    for chrName in GenomeClass.chrDict:
        candidatesByLibDict[chrName] = {}

    # libCounter helps inform users how far along the run is
    for libCounter, libraryFilename in enumerate(libFilenamesList, start=1):
        libNameNoFolders = os.path.splitext(os.path.basename(
            libraryFilename))[0]

        logger.info("Beginning to process %s, library %s of %s." % (
            libraryFilename, libCounter, len(libFilenamesList)))

        Lib = library.Library(libraryFilename, GenomeClass.chrDict)

        filteredPrecursorsDict[libNameNoFolders] = {}
        for chrName in sorted(GenomeClass.chrDict.keys()):
            filteredPrecursorsDict[libNameNoFolders][chrName] = {}

        # Map small RNAs to the genome
        logger.info("Running bowtie on %s" % Lib.filename)
        funcStart = time.time()

        logFilename = Lib.mapper(GenomeClass.indexFilename, bowtiePath,
            nthreads)

        funcEnd = time.time()
        execTime = round(funcEnd - funcStart, 2)
        logger.info("Runtime of bowtie for %s: %s seconds" % \
            (Lib.mapFilename, execTime))

        logger.info("Creating the mapped list for %s" % Lib.filename)
        funcStart = time.time()

        # Create a dictionary with the sequence of all tags that
        # map to a position on every chromosome
        Lib.createMappedList(GenomeClass.chrDict)

        # Normalize the reads in libDict
        Lib.normalizeReads(logFilename)

        funcEnd = time.time()
        execTime = round(funcEnd - funcStart, 2)
        logger.info("Time to create the mappedList: %s seconds" % (execTime))

        #######################################################################
        ################# Map small RNAs to inverted repeats ##################
        #######################################################################

        logger.info("Mapping sRNAs to the inverted repeats")

        funcStart = time.time()

        mappedTagsToPrecursors = []

        # Parallelization of this module has been removed as the overhead of
        # transferring mappedList to each proc is quite significant while
        # its runtime on one proc is extremely quick

        # Map the sRNAs for this library to the inverted repeats predicted
        # for this genome.
        # Note that the format of this where we run one chromosome at a
        # time is a holdover from the parallelization effort that was made
        # for this function. While this for loop can now be moved into the
        # function, I am keeping it outside quite simply because the tabbing
        # within this function became quite deep and I'd rather avoid going
        # another level deeper
        for chrIndex in range(len(GenomeClass.chrDict)):
            mappedTagsToPrecursors.append(mapSRNAsToIRs.mapSRNAsToIRs(
                GenomeClass.IRDictByChr[chrIndex], Lib.mappedList[chrIndex],
                Lib.libDict))

        funcEnd = time.time()
        execTime = round(funcEnd - funcStart, 2)
        logger.info("Time to map sRNAs to inverted repeats: %s seconds" \
            % (execTime))

        logger.info("Writing precursors to a file")

        # Create a file for all precursors to be written to that have at least
        # one sRNA that maps to both strands
        unfilteredFilename = "%s/libs/%s_all_precursors.txt" % (outputFolder,
            libNameNoFolders)

        mapSRNAsToIRs.writeUnfilteredPrecursors(unfilteredFilename,
            GenomeClass.chrDict, GenomeClass.IRDictByChr,
            mappedTagsToPrecursors)

        #######################################################################
        ################### Filter precursor candidates #######################
        #######################################################################

        logger.info("Filtering candidate precursors")

        funcStart = time.time()

        # Parallelization of this module has been removed as the overhead of
        # transferring mappedList to each proc is quite significant while
        # its runtime on one proc is extremely quick

        # Filter the precursors, one chromosome at a time
        # Note that the format of this where we run one chromosome at a
        # time is a holdover from the parallelization effort that was made
        # for this function. While this for loop can now be moved into the
        # function, I am keeping it outside quite simply because the tabbing
        # within this function became quite deep and I'd rather avoid going
        # another level deeper
        for chrName in sorted(GenomeClass.chrDict.keys()):
            # Get the index of each chromosome that will be processed
            # sequentially
            chrIndex = GenomeClass.chrDict[chrName]

            precursorList = mappedTagsToPrecursors[chrIndex]
            IRDict = GenomeClass.IRDictByChr[chrIndex]

            filteredPrecursorsDict[libNameNoFolders][chrName] = \
                filterPrecursors.filterPrecursors(precursorList, IRDict,
                Lib.libDict, overhang)

        funcEnd = time.time()
        execTime = round(funcEnd - funcStart, 2)
        logger.info("Time to filter inverted repeats: %s seconds" % \
            (execTime))

        ###
        # Prior to writing this library's results, add its miRNAs and
        # corresponding precursors to a dictionary

        # Loop through each chromosome of the final candidates dictionary
        # for this library
        for chrName, subFilteredPrecursorsDict in \
                filteredPrecursorsDict[libNameNoFolders].items():
            # Loop through each duplex in the precursor and add it to the
            # dictionary tracking which library it has been found in
            for precursorName, duplexDict in \
                    subFilteredPrecursorsDict.items():
                # The precursor gets an entry even if it has no duplexes
                precursorCandidates = candidatesByLibDict[chrName].setdefault(
                    precursorName, {})

                for mirCandidate in duplexDict:
                    precursorCandidates.setdefault(mirCandidate, []).append(
                        libNameNoFolders)

        # Create a file for all precursors that have been identified as having
        # a valid miRNA:miRNA* duplex to be written to
        filteredFilename = "%s/libs/%s_candidate_precursors.txt" % (
            outputFolder, libNameNoFolders)

        filterPrecursors.writeFilteredPrecursors(filteredFilename,
            GenomeClass.chrDict, GenomeClass.IRDictByChr,
            filteredPrecursorsDict[libNameNoFolders])

    filterPrecursors.writeCandidates(outputFolder, candidatesByLibDict,
        filteredPrecursorsDict, GenomeClass.IRDictByChr, libFilenamesList,
        GenomeClass.chrDict, genomeFilename)

    ##########################################################################
    ################### Annotate candidate miRNAs ############################
    ##########################################################################

    logger.info("Annotating candidate miRNAs")

    funcStart = time.time()

    subjectSequencesFilename = "miRBase/miRBasePlantMirnas.fa"
    queryMirnasFilename = "%s/preAnnotatedCandidates.fa" % outputFolder

    # Create a list of candidate miRNAs and mirBase miRNAs
    querySeqsList = annotateCandidates.createListForAlign(queryMirnasFilename)
    subjectSeqsList = annotateCandidates.createListForAlign(
        subjectSequencesFilename)

    similarityDict = {}

    if(parallel):
        logger.info("Running sequence alignment in parallel")

        pool = multiprocessing.Pool(nproc)

        try:
            res = pool.starmap_async(
                annotateCandidates.pairwiseAlignmentParallel,
                zip(querySeqsList, repeat(subjectSeqsList),
                repeat(organism)))

            results = res.get()

        # Release the worker processes even if a worker failed
        finally:
            pool.close()
            pool.join()

        for result in results:
            similarityDict.update(result)

    else:
        similarityDict = annotateCandidates.pairwiseAlignment(querySeqsList,
            subjectSeqsList, organism)

    # Properly annotate the candidate miRNAs with the data in similarityDict
    classificationCountsList = annotateCandidates.annotateCandidates(
        outputFolder, similarityDict, organism, mirBaseDict,
        GenomeClass.IRDictByChr, numLibs, GenomeClass.chrDict,
        GenomeClass.chrFilenamesList, perlPath, RNAFoldPath, RNAPlotPath,
        ps2pdfwrPath)

    # Delete the single chromosome files used by einverted and the
    # draw functions to clean up temp file
    for chrFilename in GenomeClass.chrFilenamesList:
        os.remove(chrFilename)

    funcEnd = time.time()
    execTime = round(funcEnd - funcStart, 2)
    logger.info("Time to annotate candidate miRNAs: %s seconds" % (execTime))

    progEnd = time.time()
    execTime = round(progEnd - progStart, 2)

    logger.info("Total runtime was %s seconds" % execTime)
    log.closeLogger(logger)

    # Write a summary file with details of the analysis
    createSummary(classificationCountsList, outputFolder, execTime)
def downloadOrganismsAndMirnas(version):
    """Download organisms.txt.gz and mature.fa.gz from miRBase and unzip
    them into the miRBase folder as organisms.tsv and mature.fa. The
    release marker file (0_THIS_IS_RELEASE*) of the requested version is
    saved alongside them.

    Args:
        version: Version of miRBase to be downloaded. Generally should be
            "CURRENT"

    Raises:
        SystemExit: If either of the two archives cannot be downloaded

    """

    # Initialize our logger
    logger = log.setupLogger("downloadOrganismsAndMirnas")

    ftp = ftplib.FTP("mirbase.org")

    # The try/finally guarantees that the FTP connection and the logger are
    # released even when a failed download ends the run through sys.exit()
    try:
        ftp.login("anonymous", "")
        ftp.cwd("pub/mirbase/%s/" % version)

        # Before we actually get the organism file, we need to download the
        # version file to our directory to prevent updates in successive
        # runs
        for filename in ftp.nlst():
            if (filename.startswith("0_THIS_IS_RELEASE")):
                # Use a context manager so the file is flushed and closed
                # as soon as the transfer finishes
                with open("miRBase/%s" % filename, "wb") as releaseFile:
                    ftp.retrbinary("RETR %s" % filename, releaseFile.write)

        # Each archive is downloaded, unzipped under its final name, and
        # then deleted, in this order
        archivesToFetch = (
            ("organisms.txt.gz", "organisms.tsv"),
            ("mature.fa.gz", "mature.fa"),
        )

        for gzName, unzippedName in archivesToFetch:
            gzPath = "miRBase/%s" % gzName

            # Try to download the archive
            try:
                with open(gzPath, "wb") as gzFile:
                    ftp.retrbinary("RETR %s" % gzName, gzFile.write)

            # If there is no file (or the transfer failed)
            except ftplib.all_errors as e:
                logger.error("Error while downloading %s...\n%s\nTry "
                    "changing the version of miRBase you are trying to "
                    "download" % (gzName, e))
                sys.exit()

            # Use gzip to unzip the archive under its final name
            with gzip.open(gzPath, "rb") as f_gz:
                with open("miRBase/%s" % unzippedName, "wb") as f_unzip:
                    shutil.copyfileobj(f_gz, f_unzip)

            # Remove the archive as we have already unzipped it
            os.remove(gzPath)

    finally:
        ftp.close()
        log.closeLogger(logger)
"""
Run the SQL configured for each database in config.py and write the
results to the corresponding files.
Then send the results by e-mail.
"""

import logging

from report import Report
from config import settings


def main():
    """Set up logging, run the report queries and send the results."""
    # The project logger is configured first so that every later message
    # ends up in the configured log file
    import log
    log.setupLogger(level='info', filename=settings.LOG_FILE, filemode='a')

    runLogger = logging.getLogger(__name__)
    runLogger.info('Starting...')

    currentReport = Report()

    # Run the SQL queries; the data is written to
    # ~/var/data/report/<date>/<trade_name>
    currentReport.execute()

    # Send the results of this query run
    currentReport.send()

    runLogger.info('Finished')


if __name__ == '__main__':
    main()
def mapper(self, indexFilename, bowtiePath, nthreads):
    """Map small RNAs to the provided index file with bowtie. The
    alignments are written to self.mapFilename and everything bowtie
    prints to stderr is captured in a log file next to it

    Args:
        indexFilename: Path and name of the index for the genome.
        bowtiePath: The path of bowtie
        nthreads: The number of threads to use with bowtie. May be given
            as an int or a string

    Returns:
        Filename of the bowtie log file (self.mapFilename with its
        extension replaced by "_bowtie.log")

    Raises:
        SystemExit: If bowtie finishes with a non-zero return code

    """

    # Initialize our logger
    logger = log.setupLogger("mapper")

    # The log file sits next to the map file and shares its name, minus
    # the extension
    logFilename = "%s_bowtie.log" % os.path.splitext(self.mapFilename)[0]

    if (self.libType == "tagCount"):
        logger.info("Mapping small RNAs to the genome files for %s" %
            (self.fastaFilename))
    else:
        logger.info("Mapping small RNAs to the genome files for %s" %
            (self.filename))

    # Run bowtie with the following options:
    # -a to report all valid alignments as we want multihits
    # -m 50 to suppress all alignments with more than 50 matches
    # to the genome. We expect few multi-matches to the genome
    # --best and --strata ensures only the best alignments are reported
    # and so that less optimum but passable alignments do not appear
    # -v 0 Allow no mismatch
    # --sam-nohead removes the header from the SAM file. This is useful
    # because we have to merge the fragment alignments for parallel
    # runs
    # --no-unal suppresses sequences with no alignment. This helps to
    # keep the map file manageable and filter out these sequences
    # earlier for efficiency
    # nthreads is converted to a string because subprocess only accepts
    # string arguments
    bowtieCommand = [
        bowtiePath, indexFilename, "-f", self.fastaFilename, "-a",
        "-m 50", "--best", "--strata", "-v 0", "-S", self.mapFilename,
        "-p", str(nthreads), "--sam-nohead", "--no-unal"
    ]

    ### Note that the output of bowtie is sent to stderr, which is why
    ### the log file is attached there
    with open(logFilename, "w") as logFile:
        returnCode = subprocess.call(bowtieCommand, stderr=logFile)

    # If there is a return code, report an error to the user and exit
    if (returnCode):
        logger.error("Something went wrong when running bowtie. "
            "Command was\n%s %s -f %s -a -m 50 --best --strata "
            "-v 0 -S %s -p %s --sam-nohead --no-unal" %
            (bowtiePath, indexFilename, self.fastaFilename,
            self.mapFilename, nthreads))
        sys.exit()

    log.closeLogger(logger)

    return (logFilename)
def housekeeping(genomeFilename, libFilenamesString, libFolder,
        libFilenamesList, bowtiePath, bowtieBuildPath, einvertedPath,
        blastnPath, makeblastdbPath, perlPath, RNAFoldPath, RNAPlotPath,
        samtoolsPath, ps2pdfwrPath, outputFolder, organism, version):
    """Perform various housekeeping functions including the checks that all
    external program dependencies exist, that files being referenced
    exist, and that the folders that will be written to are created.
    Finally, setupMiRBase.setupMiRBase is called with the organism and
    miRBase version to prepare for the annotation of our candidate miRNAs

    Args:
        genomeFilename: The path of the genome file
        libFilenamesString: The raw text of the library files that the
            user would have supplied in the config file
        libFolder: The folder of the library files if the user chose to
            supply that instead of individual library paths
        libFilenamesList: The list of library paths that have already
            been parsed
        bowtiePath: The path of bowtie on the system
        bowtieBuildPath: The path of bowtie-build on the system
        einvertedPath: The path of einverted on the system
        blastnPath: The path of blastn on the system
        makeblastdbPath: The path of makeblastdb on the system
        perlPath: The path of perl on the system
        RNAFoldPath: The path of RNAFold on the system
        RNAPlotPath: The path of RNAPlot on the system
        samtoolsPath: The path of samtools on the system
        ps2pdfwrPath: The path of ps2pdfwr on the system
        outputFolder: The config entry for the output folder. Can be blank,
            in which case a time-stamped folder name is generated
        organism: The three letter identifier of the organism being studied
        version: The version of miRBase to be queried

    Returns:
        Tuple of the value returned by setupMiRBase.setupMiRBase and the
        name of the output folder that was actually used

    Raises:
        SystemExit: If an input file or external program is missing, if
            both libFilenamesString and libFolder are set, or if
            outputFolder equals libFolder

    """

    # Initialize our logger
    logger = log.setupLogger("housekeeping")

    # Make sure the genome file exists as defined
    if (not os.path.isfile(genomeFilename)):
        logger.error("%s could not be found! Please check that the "
            "file path was input correctly" % genomeFilename)
        sys.exit()

    # Do not allow execution if both libFilenamesString and libFolder
    # are defined to anything other than empty strings
    if (libFilenamesString and libFolder):
        logger.error("You specified both libFolder and libNamesList, but "
            "only one can exist. Delete one and try running again")
        sys.exit()

    # Loop through all libraries in libFilenamesList and confirm that they
    # exist before running
    for libName in libFilenamesList:
        if (not os.path.isfile(libName)):
            logger.error("%s could not be found! Please check that the "
                "file path was input correctly" % libName)
            sys.exit()

    if (len(libFilenamesList) == 1):
        logger.warning("Only one library was provided. While miRador "
            "can run with this, miRador will not\noutput any miRNAs that are "
            "predicted outside of any known families as we require\n"
            "identification in multiple libraries for novel annotation.\nIf "
            "this organism does not exist yet in miRBase, then no miRNAs "
            "will be predicted.\nPausing execution for 20 seconds if you "
            "want to stop this run and add libraries. (Use ctrl+c to stop)\n")
        time.sleep(20)

    # Every external program is checked the same way, in this order. Each
    # entry pairs the name shown to the user with the configured path, so
    # the error message always reports the path that was actually tested
    requiredPrograms = (
        ("bowtie", bowtiePath),
        ("bowtie-build", bowtieBuildPath),
        ("einverted", einvertedPath),
        ("perl", perlPath),
        ("blastn", blastnPath),
        ("makeblastdb", makeblastdbPath),
        ("RNAFold", RNAFoldPath),
        ("RNAPlot", RNAPlotPath),
        ("samtools", samtoolsPath),
        ("ps2pdfwr", ps2pdfwrPath),
    )

    for programName, programPath in requiredPrograms:
        if (not shutil.which(programPath)):
            logger.error("%s could not be found at the provided path: %s\n"
                "Correct before running again" % (programName, programPath))
            sys.exit()

    ### Create the necessary folders if they don't already exist
    # Folders for the genome, the inverted repeats, and miRBase
    for folderName in ("genome", "invertedRepeats", "miRBase"):
        os.makedirs(folderName, exist_ok=True)

    # If the user has filled the outputFolder option, check to see if it
    # has results from an older run and then delete them
    if (outputFolder):
        # Confirm that the output folder's name is not the same as
        # libFolder. This will ensure nothing of importance is
        # accidentally deleted
        if (outputFolder == libFolder):
            logger.error("outputFolder and libFolder cannot be the same "
                "folder. Please rename outputFolder and run again")
            sys.exit()

        # Delete the libs and images folders of an older run if they
        # exist already. They are recreated empty further below
        for staleFolder in ("libs", "images"):
            stalePath = "%s/%s" % (outputFolder, staleFolder)
            if (os.path.isdir(stalePath)):
                shutil.rmtree(stalePath)

    # Create a name for an output folder if none was provided
    # (Almost certainly should not exist yet as it would require a run in
    # the same second)
    else:
        outputFolder = datetime.datetime.now().strftime(
            'output_%Y-%m-%d_%H-%M-%S')

    # Create the output folder, its sub folders, and the temp folders if
    # they do not exist yet
    for folderName in (outputFolder, "%s/libs" % outputFolder,
            "%s/images" % outputFolder, "miRadorTempFolder",
            "miRadorTempFolder/bowtieOutput"):
        os.makedirs(folderName, exist_ok=True)

    mirBaseDict = setupMiRBase.setupMiRBase(organism, version)

    log.closeLogger(logger)

    return (mirBaseDict, outputFolder)
def runEinverted(einvertedPath, chrFilename, match, mismatch, gap, threshold,
        maxRepLen):
    """Function to run einverted for a single chromosome

    Args:
        einvertedPath: The path of einverted on the system
        chrFilename: The path to the individual chromosome that will be
            run through einverted
        match: Score to pass to einverted for a match
        mismatch: Penalty score to pass to einverted for a mismatch
        gap: Score to pass to einverted for a gap
        threshold: Minimum total score an inverted repeat must have for
            einverted to record it
        maxRepLen: Maximum length an inverted repeat can have

    Returns:
        The name of the output FASTA file that einverted created, and
        the name of the alignment output file that einverted created.

    Raises:
        SystemExit: If einverted finishes with a non-zero return code

    """

    # Initialize our logger
    logger = log.setupLogger("runEinverted")

    # Names of temporary output files to store results prior to merging.
    # Both are named after the chromosome file, minus folders and extension
    chrBasename = os.path.splitext(os.path.basename(chrFilename))[0]
    outputFastaFilename = "invertedRepeats/%s.fa.temp" % chrBasename
    outputAlignmentFilename = "invertedRepeats/%s.alignment.temp" % \
        chrBasename

    # Call einverted utilizing this current sequence with the user
    # defined arguments from the config file. The output of einverted is
    # sent to the null device because we do not really need to know it is
    # running for each proc. The with block makes sure the handle is
    # closed again, also when the run is aborted below
    with open(os.devnull, 'w') as FNULL:
        returnCode = subprocess.call([
            einvertedPath, "-sequence", chrFilename, "-gap", str(gap),
            "-threshold", str(threshold), "-match", str(match),
            "-mismatch", str(mismatch), "-maxrepeat", str(maxRepLen),
            "-outfile", outputAlignmentFilename,
            "-outseq", outputFastaFilename
        ], stdout=FNULL, stderr=subprocess.STDOUT)

    # If a return code of anything but 0 is returned, it means there
    # was a problem and it should be investigated. Temp files will
    # remain from the run to assist in the debugging process
    if (returnCode != 0):
        logger.error("Something went wrong when running einverted. Command "
            "was\n%s -sequence %s -gap %s -threshold %s -match %s -mismatch "
            "%s -maxrepeat %s -outfile %s -outseq %s" % (einvertedPath,
            chrFilename, gap, threshold, match, mismatch, maxRepLen,
            outputAlignmentFilename, outputFastaFilename))
        sys.exit()

    log.closeLogger(logger)

    return (outputFastaFilename, outputAlignmentFilename)
def combineIRTempFiles(self, IRFastaFilenamesList, IRAlignmentFilenamesList,
        runEInvertedFlag):
    """This function combines the temporary einverted files into one file
    for final analysis. However, if the user has opted to not run
    einverted due to a previous run already existing for this genome,
    this function will bypass the merging steps and only add the IR data
    to the IR dictionary

    Args:
        IRFastaFilenamesList: List of the inverted repeat FASTA files
        IRAlignmentFilenamesList: List of inverted repeat alignment files
        runEInvertedFlag: True if einverted was run. Only then are the
            files merged into self.IRFastaFilename and
            self.IRAlignmentFilename and the input files deleted

    Returns:
        The number of entries added to self.IRDictByChr. Every inverted
        repeat is stored twice, once for the w and once for the c strand

    """

    # Initialize our logger
    logger = log.setupLogger("combineIRTempFiles")

    # If einverted was run, combine the temp FASTA files
    if (runEInvertedFlag):
        ## Combine inverted repeats sequences FASTA file
        with open(self.IRFastaFilename, 'w') as fasta_out:
            # Loop through all FASTA files and merge into one file
            for filename in IRFastaFilenamesList:
                with open(filename) as fastaFile:
                    fasta_out.writelines(fastaFile)

    # Set a counter to process each inverted repeat by line number. It is
    # deliberately not reset between files
    counter = 0
    IRCounter = 0

    # If einverted was run, open the output alignment file to write
    # the results to
    align_out = None
    if (runEInvertedFlag):
        ## Combine inverted repeats alignments results
        align_out = open(self.IRAlignmentFilename, 'w')

    # The try/finally closes the merged alignment file even when one of
    # the temp files cannot be parsed
    try:
        # Loop through all alignment files and merge into one file
        for filename in IRAlignmentFilenamesList:
            with open(filename) as alignmentFile:
                # Loop through the alignment files to add them to the
                # merged file and add them to IRDictByChr
                toWriteList = []

                # Parse each line of the alignment file. Alignments come
                # in sets of 5 lines, so process 5 lines before storing
                # into IRDict
                for line in alignmentFile:
                    toWriteList.append(line)

                    # Split the entire line on spaces for parsing, after
                    # removing the surrounding whitespace
                    parsedLine = line.strip().split(' ')

                    # If the current line counter % 5 is 1, there will be
                    # a lot of useful information here. Begin to parse
                    # this data into specific variables
                    # Note that if einverted changes the output format,
                    # these lines here can fail and would need to be
                    # readjusted
                    if (counter % 5 == 1):
                        chrName = parsedLine[0].split(':')[0]
                        score = int(parsedLine[2].split(':')[0])
                        matches, totalBases = map(int,
                            parsedLine[3].split('/'))
                        percMatch = round(float(matches) / totalBases, 3)

                        # The position of the gap count within the split
                        # line depends on the match proportion
                        if (percMatch == 1):
                            gaps = int(parsedLine[6])
                        elif (percMatch < .1):
                            gaps = int(parsedLine[8])
                        else:
                            gaps = int(parsedLine[7])

                    # If the current line counter % 5 is 2, the 5'
                    # repeat start and end coordinates will be contained
                    # within this line
                    elif (counter % 5 == 2):
                        start5 = int(parsedLine[0])
                        hairpin5 = parsedLine[1].upper()
                        end5 = int(parsedLine[2])

                    # The alignment between the two strands is given in
                    # the 3rd line of the alignment.
                    elif (counter % 5 == 3):
                        alignmentIndicators = line.strip()

                    # If the current line counter % 5 is 4, the 3'
                    # repeat start and end coordinates will be contained
                    # within this line. Also, calculate the loop length
                    elif (counter % 5 == 4):
                        start3 = int(parsedLine[2])
                        hairpin3 = parsedLine[1].upper()
                        end3 = int(parsedLine[0])
                        loop = int(start3) - int(end5) - 1

                        # Get the index of the chromosome to add the
                        # inverted repeat to
                        index = self.chrDict[chrName]

                        # Add the inverted repeat to the appropriate
                        # list within IRDictByChr, first for the w strand
                        # and then for the c strand
                        IRName = "precursor-%s" % IRCounter
                        self.IRDictByChr[index][IRName] = (start5, end5,
                            start3, end3, loop, 'w', hairpin5,
                            alignmentIndicators, hairpin3)
                        IRCounter += 1

                        IRName = "precursor-%s" % IRCounter
                        self.IRDictByChr[index][IRName] = (start5, end5,
                            start3, end3, loop, 'c', hairpin5,
                            alignmentIndicators, hairpin3)

                        # Copy the five lines of this alignment into the
                        # merged file
                        if (runEInvertedFlag):
                            for entry in toWriteList:
                                align_out.write(entry)

                        toWriteList = []
                        IRCounter += 1

                    # Increment the counter
                    counter += 1

    finally:
        if (align_out is not None):
            align_out.close()

    # If einverted was run and the temp files were merged, delete the
    # temp files
    if (runEInvertedFlag):
        # Delete individual inverted files and fasta files
        logger.info("Combined files '%s and %s'\nDeleting temp files" %
            (self.IRAlignmentFilename, self.IRFastaFilename))

        # Combine the inverted repeats FASTA and alignment filenames
        # lists to delete all of these temp files
        garbage = IRFastaFilenamesList + IRAlignmentFilenamesList

        for toDelete in garbage:
            if os.path.exists(toDelete):
                os.remove(toDelete)

    log.closeLogger(logger)

    return (IRCounter)
def main():
    ''' Main routine: parse the command line, prepare the instrument and
    run either a single simulation or a parameter scan.

    Raises OptionValueError when the scan options are inconsistent. '''
    setupLogger()

    # Add options
    usage = ('usage: %prog [-cpnN] Instr [-sndftgahi] '
             'params={val|min,max|min,guess,max}...')
    parser = OptionParser(usage, version=config.VERSION)

    add_mcrun_options(parser)
    add_mcstas_options(parser)

    # Parse options
    (options, args) = parser.parse_args()

    # Extract instrument and parameters
    if len(args) == 0:
        print(parser.get_usage())
        parser.exit()

    # Set path of instrument-file after locating it
    options.instr = find_instr_file(args[0])

    # Clean out quotes (perl mcgui requires this step)
    options.params = [clean_quotes(arg) for arg in args[1:]]

    # Fill out extra information
    expand_options(options)

    if options.verbose:
        setLogLevel(DEBUG)

    # Inform user of what is happening
    # TODO: More info?
    LOG.info('Using directory: "%s"' % options.dir)
    # All three spellings of "the current directory" are compared against
    # options.dir (the Windows spelling used to be compared against the
    # whole options object and therefore never matched)
    if options.dir in (".", "./", ".\\"):
        LOG.warning('Existing files in "%s" will be overwritten!'
                    % options.dir)
        options.dir = ''

    # Run McStas
    mcstas = McStas(options.instr)
    mcstas.prepare(options)

    (fixed_params, intervals) = get_parameters(options)

    # Indicate end of setup / start of computations
    LOG.info('===')

    if options.info:
        print('info!')
        mcstas.run(override_mpi=False)
        exit()

    # Set fixed parameters
    for key, value in fixed_params.items():
        mcstas.set_parameter(key, value)

    # Check for linear scanning
    interval_points = None

    # Can't both do list and interval scanning
    if options.list and options.numpoints:
        raise OptionValueError('--numpoints cannot be used with --list')

    if options.list:
        if len(intervals) == 0:
            raise OptionValueError(
                '--list was chosen but no lists was presented.')
        # Every variable must list as many points as the first one
        interval_values = list(intervals.values())
        points = len(interval_values[0])
        if not all(len(values) == points for values in interval_values):
            raise OptionValueError(
                'All variables much have an equal amount of points.')
        interval_points = LinearInterval.from_list(
            points, intervals)

    scan = options.multi or options.numpoints
    if ((options.numpoints is not None and options.numpoints < 2)
            or (scan and options.numpoints is None)):
        raise OptionValueError(
            ('Cannot scan variable(s) %s using only one data point. '
             'Please use -N to specify the number of points.') %
            ', '.join(intervals.keys()))
        # NOTE(review): as laid out here this validation follows the raise
        # and never runs. It is kept for reference only, because enabling
        # it would reject every interval that does not hold exactly two
        # values -- confirm the intended placement.
        # Check that input is valid decimals
        if not all(map(lambda i:
                       len(i) == 2 and all(map(is_decimal, i)),
                       intervals.values())):
            raise OptionValueError('Could not parse intervals -- result: %s'
                                   % str(intervals))

    if options.multi is not None:
        interval_points = MultiInterval.from_range(
            options.numpoints, intervals)

    elif options.numpoints is not None:
        interval_points = LinearInterval.from_range(
            options.numpoints, intervals)

    # Parameters for linear scanning present
    if interval_points:
        scanner = Scanner(mcstas, intervals)
        scanner.set_points(interval_points)
        mkdir(options.dir)
        scanner.run()
    else:
        # Only run a simulation if we have a nonzero ncount
        if not options.ncount == 0.0:
            mcstas.run()
def annotateIdenticalCandidates(similarityDict, mirBaseDict, identicalList,
        header, line, outputFolder):
    """Helper function to annotate a candidate miRNA whose sequence is
    identical to one or more miRNAs that have already been identified.

    Args:
        similarityDict: Dictionary that is queried with the candidate's
            name for the known miRNAs to list when no positional match
            is found
        mirBaseDict: Dictionary of miRBase miRNAs and their coordinates
            for this organism, if available (will be an empty dictionary
            if not)
        identicalList: The list of miRBase miRNAs with the same sequence
            as the candidate miRNA
        header: Header line of the precursor file
        line: The full line from the pre-annotated file that will be
            modified to provide the new annotation
        outputFolder: Name of the folder where the results will be written
            to

    Returns:
        The same line, modified in place with the proper annotation

    """

    # Initialize our logger
    logger = log.setupLogger("annotateIdenticalCandidates")

    # Find the columns of interest in the header, then pull the matching
    # values out of the line
    columnNames = ("miR Name", "Chr", "Strand", "miR Position",
        "miR Sequence", "Star Sequence")
    columnIndices = [header.index(columnName) for columnName in columnNames]
    mirNameIndex = columnIndices[0]
    mirName, chrName, strand, position, mirSeq, starSeq = [
        line[columnIndex] for columnIndex in columnIndices]

    # miRBase reports the strand as +/- while the candidates use w/c
    strandCodes = {"+": "w", "-": "c"}

    foundAtSamePosition = False

    # Remove "chr" if it exists in the chromosome name
    loweredChrName = chrName.lower()
    if ("chr" in loweredChrName):
        chrName = loweredChrName.replace("chr", "")

    # Only when mirBaseDict is populated do we have positional information
    # for this organism and can thus generate the most accurate
    # annotations. Otherwise there is nothing to loop through
    mirnasToCheck = identicalList if mirBaseDict else []

    for identicalMirna in mirnasToCheck:
        # It turns out that there can be annotated miRNAs in miRBase
        # that do not exist in the gff file, so skip any identical miRNA
        # that is missing from mirBaseDict
        if (identicalMirna not in mirBaseDict):
            continue

        # Loop through all coordinates that this specific miRNA can be
        # found at
        for coordinates in mirBaseDict[identicalMirna]:
            mirBaseChr = coordinates[0]

            # Remove "chr" if it exists in the chromosome name
            loweredMirBaseChr = mirBaseChr.lower()
            if ("chr" in loweredMirBaseChr):
                mirBaseChr = loweredMirBaseChr.replace("chr", "")

            mirBaseStrand = coordinates[1]
            mirBasePosition = coordinates[2]

            # If the strand is not + or -, something is wrong with this
            # miRBase entry. We will not exit the run, but we will report
            # the issue to the user and continue to the next entry
            if (mirBaseStrand not in strandCodes):
                # NOTE(review): organism is neither a parameter nor a
                # local of this function; unless it resolves at module
                # level this call raises NameError -- confirm
                logger.info("Unrecognized strand of miRBase entry. We "
                    "will skip this entry, but please check with the "
                    "miRBase file %s.gff3 and miRNA name %s" % (organism,
                    identicalMirna))
                continue

            mirBaseStrand = strandCodes[mirBaseStrand]

            # Nothing to do unless the coordinates of the candidate miRNA
            # meet the coordinates of the miRBase miRNA
            samePosition = (chrName == mirBaseChr and
                strand == mirBaseStrand and position == mirBasePosition)
            if (not samePosition):
                continue

            # This candidate miRNA is this miRBase miRNA, so change the
            # annotation to represent that
            line[mirNameIndex] = identicalMirna
            line.append("Known")
            foundAtSamePosition = True

            # Update the image filename within the images folder with the
            # miRBase annotated name. If the file named after the
            # candidate does not exist, that suggests that it has already
            # been updated in a previous run and thus we do not need to
            # rename the file
            candidateImage = "%s/images/%s_precursor.pdf" % \
                (outputFolder, mirName)
            if (os.path.isfile(candidateImage)):
                os.rename(candidateImage, "%s/images/%s_precursor.pdf" %
                    (outputFolder, identicalMirna))

    # If we did not find an annotated miRNA at this same position, we will
    # annotate it in the final file as being identical to the following
    # known miRNAs, but not at the same position
    if (not foundAtSamePosition):
        line.append("Identical to the following known miRNAs at different "
            "positions:")

        # List every known miRBase miRNA matching this read, each one
        # followed by a space
        line.append("".join("%s " % identicalMirna
            for identicalMirna in similarityDict[mirName]))

    log.closeLogger(logger)

    return (line)
def filterPrecursors(mappedTagsToPrecursors, IRDict, libDict, overhang):
    """This function will perform the sRNA mapping and abundance filters.
    It will try to find miRNA and miRNA* pairs by identifying tags that
    map to opposite arms of the precursor with the required overhang, and
    keep the pairs that pass the alignment and abundance filters

    Args:
        mappedTagsToPrecursors: Dictionary of tag information mapping
            to the precursor, identified by the precursor name
        IRDict: Dictionary of the inverted repeats in one chromosome
        libDict: The entire library dictionary to be queried for
            abundances
        overhang: Specific length of overhang that a duplex must have

    Returns:
        Dictionary of all precursors and the miRNA:miRNA* duplexes within
        that pass all filters. It is keyed by precursor name, and each
        value is a dictionary that maps the candidate miRNA sequence to
        its duplex tuple

    Raises:
        SystemExit: If the sequence located in an IR arm does not match
            the tag once its gaps are removed

    """

    # Initialize our logger
    logger = log.setupLogger("filterPrecursors")

    # Translation table used to complement tags. Built once as it never
    # changes
    complementTable = str.maketrans("ACGT", "TGCA")

    # Initialize a dictionary to store our final candidates that pass
    # all filters for this library
    finalCandidates = {}

    # Begin to loop though all of the candidate precursors for the
    # various filters
    for precursorName, mappedTagsTuple in mappedTagsToPrecursors.items():
        precursor = IRDict[precursorName]

        # Store the elements of the precursor that the filters need
        strand = precursor[5]
        arm5 = precursor[6]
        arm3 = precursor[8]

        # Store the various elements of the mapped tags tuple
        mappedTagsDict5 = mappedTagsTuple[0]
        mappedTagsDict3 = mappedTagsTuple[1]
        totalAbun5 = mappedTagsTuple[2]
        totalAbun3 = mappedTagsTuple[3]
        loopAbun = mappedTagsTuple[4]

        # Begin a series of loops to identify if there are any tags on
        # the 5' and 3' strands that overlap within a short, user defined
        # overhang
        for candidate5Pos, mappedTagList5 in mappedTagsDict5.items():
            for mapped5Tag in mappedTagList5:
                # Get the length of the 5' candidate tag so that we can
                # determine local positions on the precursors
                tag5Length = len(mapped5Tag[0])
                tag5Abun = mapped5Tag[1]

                # If the length of the tag is not between 20 and 24,
                # just move to the next tag. We do this here because
                # we have to store tags that are 1 nt variants of
                # candidate miRNA or miRNA* sequences
                if (tag5Length < 20 or tag5Length > 24):
                    continue

                # If the strand is w, the sequence will require no
                # modifications
                if (strand == "w"):
                    sequence5 = mapped5Tag[0]

                # If the strand is c, we need to reverse complement
                # the mapped sequence so that we can find it on the
                # IR arm
                else:
                    sequence5 = mapped5Tag[0].translate(
                        complementTable)[::-1]

                oldSequence5 = sequence5

                # If we are unable to find the sequence in the IR arm as
                # is, there can be gaps in the alignment, so ask
                # findSequenceInIR for the aligned sequence and its local
                # start and end positions
                if (sequence5 not in arm5):
                    sequence5, local5Start, local5End = \
                        findSequenceInIR(sequence5, arm5, tag5Length)

                # If the sequence can be found, update the local positions
                # as they may be shifted due to gaps prior
                else:
                    local5Start = arm5.find(sequence5)
                    local5End = local5Start + tag5Length - 1

                # Check to confirm that the sequence with gaps is the
                # same sequence as before
                if (oldSequence5 != sequence5.replace("-", "")):
                    logger.error("findSequenceInIR messed up for %s. "
                        "Contact Reza to debug" % oldSequence5)
                    # The values are passed as logging arguments to a
                    # real format string so that they are actually written
                    # to the log
                    logger.error("%s %s %s %s %s", precursorName,
                        oldSequence5, sequence5, local5Start, local5End)
                    sys.exit()

                # Loop through all mapped tags in the 3' dictionary to
                # identify any candidate miRNA:miRNA* pairs with the
                # current 5' mapped tag
                for candidate3Pos, mappedTagList3 in mappedTagsDict3.items():
                    for mapped3Tag in mappedTagList3:
                        # Get the length of the 3' candidate tag so that
                        # we can determine local positions on the
                        # precursor for mapping comparisons. A candidate
                        # will be recorded if a miRNA:miRNA* pair can
                        # be identified
                        tag3Length = len(mapped3Tag[0])
                        tag3Abun = mapped3Tag[1]

                        # If the length of the tag is not between 20 and
                        # 24, just move to the next tag. We do this here
                        # because we have to store tags that are 1 nt
                        # variants of candidate miRNA or miRNA* sequences
                        if (tag3Length < 20 or tag3Length > 24):
                            continue

                        # If the strand is w, the sequence needs to be
                        # reversed because it is on the 3' arm of the IR
                        if (strand == "w"):
                            sequence3 = mapped3Tag[0][::-1]

                        # If the strand is c, the sequence needs to be
                        # complemented (but not reversed) because it is
                        # on the 3' arm of the IR, but the reverse strand
                        # of the genome
                        else:
                            sequence3 = mapped3Tag[0].translate(
                                complementTable)

                        oldSequence3 = sequence3

                        # If we are unable to find the sequences in the
                        # IR arm, we need to find the alignment sequence,
                        # start, and end positions
                        if (sequence3 not in arm3):
                            sequence3, local3Start, local3End = \
                                findSequenceInIR(sequence3, arm3,
                                tag3Length)

                        else:
                            local3Start = arm3.find(sequence3)
                            local3End = local3Start + tag3Length - 1

                        # Check to confirm that the sequence with gaps is
                        # the same sequence as before
                        if (oldSequence3 != sequence3.replace("-", "")):
                            logger.error("findSequenceInIR messed up for "
                                "%s. Contact Reza to debug" % oldSequence3)
                            logger.error("%s %s %s %s %s", precursorName,
                                oldSequence3, sequence3, local3Start,
                                local3End)
                            sys.exit()

                        # Only a pair whose start and end positions are
                        # both offset by exactly the overhang is a
                        # candidate duplex. The direction of the offset
                        # depends on the strand
                        isCandidateDuplex = (
                            (strand == "c" and
                            (local3Start - local5Start == overhang) and
                            (local3End - local5End == overhang)) or
                            (strand == "w" and
                            (local5End - local3End == overhang) and
                            (local5Start - local3Start == overhang)))

                        if (not isCandidateDuplex):
                            continue

                        # Because we can have overhangs, the alignment
                        # should start and end at the positions just
                        # prior to the overhang
                        alignStart = max(local5Start, local3Start)
                        alignEnd = min(local5End, local3End)

                        # Get the einverted alignment for the two
                        # sequences
                        matchCount, mismatchCount, wobbleCount,\
                            gapCount = getAlignment(arm5, arm3,
                            alignStart, alignEnd)

                        # Only proceed if the alignment meets our filter
                        # specifications
                        if (not (gapCount + mismatchCount +
                                (wobbleCount * .5) <= 5 and gapCount <= 3)):
                            continue

                        # Get the hits information for the 5' and 3'
                        # tags from libDict
                        hits5 = libDict[mapped5Tag[0]][1]
                        hits3 = libDict[mapped3Tag[0]][1]

                        ### Code for the abundance filter
                        variant5Abun = tag5Abun
                        variant3Abun = tag3Abun

                        # Get the abundance of all 1-nt variants of both
                        # 5' and 3' tags
                        variant5AbunList = getVariantAbundance(
                            mappedTagsDict5, mapped5Tag[0],
                            candidate5Pos, strand)
                        variant3AbunList = getVariantAbundance(
                            mappedTagsDict3, mapped3Tag[0],
                            candidate3Pos, strand)

                        # Skip the pair if any variant is more abundant
                        # than the tag itself
                        if (tag5Abun < max(variant5AbunList) or
                                tag3Abun < max(variant3AbunList)):
                            continue

                        if (variant5Abun == -1 or variant3Abun == -1):
                            continue

                        variant5Abun += sum(variant5AbunList)
                        variant3Abun += sum(variant3AbunList)

                        # Get the proportion of reads coming from
                        # the miRNA duplex compared to the rest
                        # of the reads mapping to the duplex
                        proportion = (variant5Abun + variant3Abun) /\
                            (totalAbun5 + totalAbun3 + loopAbun)

                        # The 5' mapping tag will be kept as a candidate
                        # miRNA if it has at least an abundance of 3 RPM
                        if (tag5Abun >= 3):
                            duplex = ("5p", mapped3Tag[0],
                                candidate5Pos, candidate3Pos, tag5Abun,
                                hits5, tag3Abun, hits3, matchCount,
                                mismatchCount, wobbleCount, gapCount,
                                variant5Abun, variant3Abun, totalAbun5,
                                totalAbun3, loopAbun, proportion)

                            # If the two tags and their variants make up
                            # at least 75% of the read abundance in the
                            # entire precursor, add the duplex to the
                            # candidates dictionary under the precursor
                            # name, creating that entry if it does not
                            # exist yet
                            if (proportion >= .75):
                                finalCandidates.setdefault(precursorName,
                                    {})[mapped5Tag[0]] = duplex

                        # The 3' mapping tag will be kept as a
                        # candidate miRNA if it has an abundance
                        # of at least 3 RPM
                        if (tag3Abun >= 3):
                            duplex = ("3p", mapped5Tag[0],
                                candidate3Pos, candidate5Pos, tag3Abun,
                                hits3, tag5Abun, hits5, matchCount,
                                mismatchCount, wobbleCount, gapCount,
                                variant3Abun, variant5Abun, totalAbun3,
                                totalAbun5, loopAbun, proportion)

                            # Same proportion filter as for the 5' tag
                            if (proportion >= .75):
                                finalCandidates.setdefault(precursorName,
                                    {})[mapped3Tag[0]] = duplex

    log.closeLogger(logger)

    return (finalCandidates)
from hardware import *
from so import *
import log


##
##  MAIN
##
# Entry point of the emulator: sets up logging and the hardware, creates the
# kernel and builds the programs to be run.
if __name__ == '__main__':
    log.setupLogger()
    log.logger.info('Starting emulator')

    ## setup our hardware and set memory size to 25 "cells"
    HARDWARE.setup(25)

    ## Switch on computer
    HARDWARE.switchOn()

    ## now create the Operating System kernel
    # we "boot" the operating system
    # schedulers: FCFS, Priority, PreemptivePriority, RoundRobin(quantum) -- Round Robin is the default scheduler.
    kernel = Kernel()

    # Now we are going to try to run 3 programs at the same time
    ##################
    # Each program is a name plus the list of instructions built with the
    # ASM helpers
    prg1 = Program( "prg1.exe", [ASM.CPU(2), ASM.IO(), ASM.CPU(3), ASM.IO(), ASM.CPU(2)])
    prg2 = Program("prg2.exe", [ASM.CPU(7)])
def __init__(self):
    """Initialise the application: run the parent constructor, set up
    logging, connect the models and create the NDSCTL helper."""
    super(BaseApp, self).__init__()
    # Logging is configured before anything else is created
    setupLogger()
    # Keep a reference to the models object on the instance and call its
    # connect() -- presumably this opens the database connection; verify
    # against the models module
    self.models = models
    self.models.connect()
    self.ndsctl = NDSCTL()