def runPairWiseDiffs(self, fastaFileNames): print 'Calculating pairwise diffenrences...', # Read in fasta sequences into a dictionary: completeSets = {} for fastaFileName in fastaFileNames: baseName = os.path.splitext(os.path.basename(fastaFileName))[0] #baseName = os.path.basename(fastaFileName).split(".")[0] completeSets[baseName] = {} fastaFile = open(fastaFileName, 'r') fastaIterator = Fasta.Iterator(fastaFile, parser=Fasta.RecordParser()) for fastaRecord in fastaIterator: newName = safeName(copy.copy(fastaRecord.title)) #completeSets[baseName][fastaRecord.title.strip()] = fastaRecord.sequence completeSets[baseName][newName] = fastaRecord.sequence fastaFile.close() # Load existing alignment matrix alignmentMatrices = {} for fastaFileBaseName in completeSets.keys(): if not alignmentMatrices.has_key(fastaFileBaseName): alignmentMatrices[fastaFileBaseName] = {} alignmentMatrixFileName = os.path.join( self.options.statsdir, fastaFileBaseName + "_matrix.pickle") if os.path.exists(alignmentMatrixFileName) and os.path.getsize( alignmentMatrixFileName) > 0: alignmentMatrixFile = open(alignmentMatrixFileName, 'r') alignmentMatrices[fastaFileBaseName] = pickle.load( alignmentMatrixFile) alignmentMatrixFile.close() # Add any new alignments to alignment matrix (and save to them to file) self.updateAlignmentMatrix(alignmentMatrices, completeSets) print 'done'
def fixAndMoveInput(self, files, outputDir=None):
    """Sanitize input sequence files and write cleaned fasta copies.

    Each input file (fasta or nexus, per self.options.inputformat) is
    converted to fasta, line endings are normalised, sequence titles are
    made safe and unique (case-insensitively), the gap characters '~' and
    '.' are stripped, and residues outside the IUPAC ambiguous-DNA alphabet
    are replaced by 'N'.

    Returns (inputFileList, sequenceCount, sequenceNameMap) where
    sequenceNameMap maps file base name -> {safe title: original title}.

    Raises AnalysisTerminated if a file is missing or the format is unknown.
    """
    if not outputDir:
        outputDir = self.options.datadir
    # Upper- and lower-case IUPAC ambiguous DNA letters are allowed:
    allowedLetters = IUPAC.IUPACAmbiguousDNA().letters + IUPAC.IUPACAmbiguousDNA().letters.lower()
    sequenceCount = 0
    inputFileList = []
    sequenceNameMap = {}
    for inputFileName in files:
        if not os.path.exists(inputFileName):
            raise AnalysisTerminated(1, " Input filename %s does not exist!" % inputFileName)

        # Convert the input to a single fasta-formatted string:
        if self.options.inputformat == 'nexus':
            nex = Nexus.Nexus(inputFileName)
            fileContent = ''
            for name, seq in nex.matrix.items():
                fileContent += ">%s\n%s\n" % (name, str(seq))
        elif self.options.inputformat == 'fasta':
            fileContent = readFile(inputFileName)
        else:
            raise AnalysisTerminated(1, "The supported input formats are 'fasta' and 'nexus'")

        # Normalise Mac/Windows line endings:
        fileContent = re.sub(r'\r+', '\n', fileContent)
        tmpOutputFileName = os.path.join(outputDir, "%s.tmp" % os.path.split(inputFileName)[-1])
        writeFile(tmpOutputFileName, fileContent)

        usedIDs = {}  # lower-cased titles already seen in this file
        inputFile = open(tmpOutputFileName, 'r')
        fastaIterator = Fasta.Iterator(inputFile, parser=Fasta.RecordParser())

        # Build a safe output file name: strip odd characters and prefix a
        # leading digit with 'n' (identifiers may not start with a digit).
        outputFileBaseName = os.path.split(inputFileName)[-1]
        newOutputFileBaseName = re.sub(r'[^0-9a-zA-Z.]+', '', outputFileBaseName)
        if re.match(r'\d', newOutputFileBaseName):
            newOutputFileBaseName = 'n' + newOutputFileBaseName
        outputFileName = os.path.join(outputDir, newOutputFileBaseName)
        baseName = os.path.splitext(os.path.basename(outputFileName))[0]
        sequenceNameMap[baseName] = {}
        inputFileList.append(outputFileName)
        outputFile = open(outputFileName, 'w')
        try:
            for fastaRecord in fastaIterator:
                sequenceCount += 1
                origName = fastaRecord.title
                fastaRecord.title = safeName(fastaRecord.title)
                # Make ids unique; lower-cased so that titles differing only
                # in case are still treated as duplicates.
                if fastaRecord.title.lower() in usedIDs:
                    i = 1
                    # BUGFIX: probe with the lower-cased candidate - usedIDs
                    # holds only lower-cased keys, so the original mixed-case
                    # probe could miss collisions for upper-case titles.
                    while ("%s_%d" % (fastaRecord.title, i)).lower() in usedIDs:
                        i += 1
                    fastaRecord.title = "%s_%d" % (fastaRecord.title, i)
                usedIDs[fastaRecord.title.lower()] = True
                sequenceNameMap[baseName][fastaRecord.title] = origName
                # Strip the sequence of gap characters:
                fastaRecord.sequence = fastaRecord.sequence.replace('~', '')
                fastaRecord.sequence = fastaRecord.sequence.replace('.', '')
                # Replace anything outside the allowed alphabet (keeping '-') by N:
                fastaRecord.sequence = re.sub('[^%s-]' % allowedLetters, 'N', fastaRecord.sequence)
                # Write the record only if some sequence is left:
                if len(fastaRecord.sequence) > 0:
                    outputFile.write(str(fastaRecord) + "\n")
        finally:
            inputFile.close()
            outputFile.close()
        # Remove this file's temporary copy before moving on:
        os.remove(tmpOutputFileName)
    return inputFileList, sequenceCount, sequenceNameMap
def checkCacheConsistency(self, files): idList = [] for inputFileName in files: inputFile = open(inputFileName, 'r') fastaIterator = Fasta.Iterator(inputFile, parser=Fasta.RecordParser()) baseName = os.path.splitext(os.path.split(inputFileName)[-1])[0] for fastaRecord in fastaIterator: idList.append("%s_%s" % (baseName, fastaRecord.title)) pickleFileName = os.path.join( self.options.project, os.path.split(self.options.project)[1] + '.sap') if os.path.exists(pickleFileName): # See if the options have changed - and if they have: # remove the selected parts of the cache. pickleFile = open(pickleFileName, 'r') prevOptions = pickle.load(pickleFile) pickleFile.close() # Lists of options that deprecates cache entries: deleteBlastCacheList = [ "database", "maxblasthits", "limitquery", "minsignificance", "nolowcomplexfilter", "blastwordsize" ] #deleteHomologueCacheList = [ "unclassified", "notruncate", "quickcompile", "minidentity", "forceidentity", "subspecieslevel", "fillinall", "fillineven", "fillintomatch", "individuals", "significance", "nrsignificant", deleteHomologueCacheList = [ "notruncate", "quickcompile", "minidentity", "forceidentity", "subspecieslevel", "fillinall", "fillineven", "fillintomatch", "individuals", "significance", "nrsignificant", "relbitscore", "phyla", "classes", "orders", "families", "genera", "besthits", "alignmentlimit", "minimaltaxonomy", "harddiversity", "forceincludefile", "forceincludegilist", "forceexcludegilist" ] deleteHomologueCacheList.extend(deleteBlastCacheList) deleteAlignmentCacheList = ["alignment", "alignmentoption"] deleteAlignmentCacheList.extend(deleteHomologueCacheList) deleteTreesCacheList = [] deleteTreesCacheList.extend(deleteAlignmentCacheList) deleteTreeStatsCacheList = ["assignment", "prunelevel"] deleteTreeStatsCacheList.extend(deleteTreesCacheList) print "Checking cache for deprecated entries" for option in self.options.__dict__.keys(): # This serves to map between new and older option names: prevVersionOption = 
option if not prevOptions.__dict__.has_key(option): if option == 'assignment': prevVersionOption = 'sampler' elif option == 'dbcache': prevVersionOption = 'genbankcache' elif option == 'minsignificance': prevVersionOption = 'evaluecutoff' elif option == 'significance': prevVersionOption = 'evaluesignificance' elif option == 'nrsignificant': prevVersionOption = 'minsignificant' else: print "New option for SAP \"%s\" will take default value \"%s\"" % ( option, self.options.__dict__[option]) if option in deleteBlastCacheList and self.options.__dict__[ option] != prevOptions.__dict__[prevVersionOption]: print '\tBlast cache' for queryID in idList: for entry in glob.glob( os.path.join(self.options.blastcache, queryID) + '*'): print "\t\t" + os.path.split(entry)[-1] os.remove(entry) deleteBlastCacheList = [] if option in deleteHomologueCacheList and self.options.__dict__[ option] != prevOptions.__dict__[prevVersionOption]: # Delete the homologcache for the entries in the input files: print '\tHomologue and alignment cache' for queryID in idList: for entry in glob.glob( os.path.join(self.options.homologcache, queryID) + '.*'): print "\t\t" + os.path.split(entry)[-1] os.remove(entry) for entry in glob.glob( os.path.join(self.options.alignmentcache, queryID) + '.*'): print "\t\t" + os.path.split(entry)[-1] os.remove(entry) deleteHomologueCacheList = [] if option in deleteTreesCacheList and self.options.__dict__[ option] != prevOptions.__dict__[prevVersionOption]: # Delete the tree statistics cache for the entries in the input files: print '\tTree sampling cache' for queryID in idList: for entry in glob.glob( os.path.join(self.options.treescache, queryID) + '.*'): print "\t\t" + os.path.split(entry)[-1] os.remove(entry) deleteTreesCacheList = [] if option in deleteTreeStatsCacheList and self.options.__dict__[ option] != prevOptions.__dict__[prevVersionOption]: # Delete the tree statistics cache for the entries in the input files: print '\tTree statistics cache' for queryID 
in idList: for entry in glob.glob( os.path.join(self.options.treestatscache, queryID) + '.*'): print "\t\t" + os.path.split(entry)[-1] os.remove(entry) deleteTreeStatsCacheList = [] # Dump the options specified: pickleFile = open(pickleFileName, 'w') pickle.dump(self.options, pickleFile) pickleFile.close() print
def align(self, fastaFileName): baseName = os.path.splitext(os.path.split(fastaFileName)[-1])[0] alignmentFileName = os.path.join(self.options.alignmentcache, baseName + ".nex") print "%s: Alignment: " % baseName, sys.stdout.flush() if os.path.exists( alignmentFileName) and os.path.getsize(alignmentFileName) > 0: print "Using cached results." sys.stdout.flush() else: print "Computing...", sys.stdout.flush() # Read in all the fasta entries: fastaFile = open(fastaFileName, 'r') fastaIterator = Fasta.Iterator(fastaFile, parser=Fasta.RecordParser()) fastaDict = {} for fastaRecord in fastaIterator: fastaDict[fastaRecord.title] = fastaRecord.sequence # Get the query sequence: queryName = baseName querySeq = fastaDict[queryName] del fastaDict[queryName] alignment = Nexus.Nexus() seqList = [[queryName, Seq.Seq(querySeq)]] for title, sequence in fastaDict.items(): alignmentList = pairwise2.align.globalms(querySeq, sequence, 1, 0, -10, -.5, one_alignment_only=1) queryAlnString = alignmentList[0][0] homologueAlnString = alignmentList[0][1] # Delete the columns that introcudes gaps in the querySeq: deleteList = [] for i, char in enumerate(queryAlnString): if char == '-': deleteList.append(i) prunedHomologueAlnString = '' for i, char in enumerate(homologueAlnString): if i not in deleteList: prunedHomologueAlnString += char homologueAlnString = prunedHomologueAlnString # Make a list of tuple with truncated title and Seq object and add it the list: tup = (title, Seq.Seq(homologueAlnString)) seqList.append(tup) # Write the alignment to a file: writeNexusFile(alignmentFileName, seqList) print "done", sys.stdout.flush()