Example #1
0
    def runPairWiseDiffs(self, fastaFileNames):

        print 'Calculating pairwise diffenrences...',

        # Read in fasta sequences into a dictionary:
        completeSets = {}
        for fastaFileName in fastaFileNames:
            baseName = os.path.splitext(os.path.basename(fastaFileName))[0]
            #baseName = os.path.basename(fastaFileName).split(".")[0]
            completeSets[baseName] = {}

            fastaFile = open(fastaFileName, 'r')
            fastaIterator = Fasta.Iterator(fastaFile,
                                           parser=Fasta.RecordParser())
            for fastaRecord in fastaIterator:
                newName = safeName(copy.copy(fastaRecord.title))
                #completeSets[baseName][fastaRecord.title.strip()] = fastaRecord.sequence
                completeSets[baseName][newName] = fastaRecord.sequence
            fastaFile.close()

        # Load existing alignment matrix
        alignmentMatrices = {}
        for fastaFileBaseName in completeSets.keys():
            if not alignmentMatrices.has_key(fastaFileBaseName):
                alignmentMatrices[fastaFileBaseName] = {}

            alignmentMatrixFileName = os.path.join(
                self.options.statsdir, fastaFileBaseName + "_matrix.pickle")
            if os.path.exists(alignmentMatrixFileName) and os.path.getsize(
                    alignmentMatrixFileName) > 0:
                alignmentMatrixFile = open(alignmentMatrixFileName, 'r')
                alignmentMatrices[fastaFileBaseName] = pickle.load(
                    alignmentMatrixFile)
                alignmentMatrixFile.close()

        # Add any new alignments to alignment matrix (and save to them to file)
        self.updateAlignmentMatrix(alignmentMatrices, completeSets)

        print 'done'
Example #2
0
File: Initialize.py  Project: cbirdlab/sap
    def fixAndMoveInput(self, files, outputDir=None):
        """Sanitize the input sequence files and write cleaned copies to outputDir.

        Each input file (fasta or nexus, per self.options.inputformat) is
        converted to fasta, line endings are normalized, record titles are
        made filesystem-safe and unique, gap characters are stripped and
        disallowed sequence characters are replaced by 'N'.

        files     -- iterable of input file paths.
        outputDir -- destination directory; defaults to self.options.datadir.

        Returns (inputFileList, sequenceCount, sequenceNameMap) where
        inputFileList is the list of cleaned output file paths,
        sequenceCount the total number of records seen, and
        sequenceNameMap maps {file base name: {safe title: original title}}.

        Raises AnalysisTerminated for missing files or unsupported formats.
        """

        if not outputDir:
            outputDir = self.options.datadir

        # Upper- and lower-case IUPAC ambiguous DNA codes; any other sequence
        # character (except '-') is later replaced by the wildcard 'N'.
        allowedLetters = IUPAC.IUPACAmbiguousDNA(
        ).letters + IUPAC.IUPACAmbiguousDNA().letters.lower()

        sequenceCount = 0
        inputFileList = []
        sequenceNameMap = {}
        for inputFileName in files:

            if not os.path.exists(inputFileName):
                raise AnalysisTerminated(
                    1, " Input filename %s does not exist!" % inputFileName)
#                 print " Input filename %s does not exist!" % inputFileName
#                 sys.exit(1)

            # Convert the input to fasta-formatted text in memory:
            if self.options.inputformat == 'nexus':
                nex = Nexus.Nexus(inputFileName)
                fileContent = ''
                for name, seq in nex.matrix.items():
                    fileContent += ">%s\n%s\n" % (name, str(seq))
            elif self.options.inputformat == 'fasta':
                fileContent = readFile(inputFileName)
            else:
                raise AnalysisTerminated(
                    1, "The supported input formats are 'fasta' and 'nexus'")


#                print "The supported input formats are 'fasta' and 'nexus'"
#                sys.exit()

            # Normalize Mac/Windows line endings, then stage the content in a
            # temp file so the fasta parser reads the cleaned version:
            fileContent = re.sub(r'\r+', '\n', fileContent)
            tmpOutputFileName = os.path.join(
                outputDir, "%s.tmp" % os.path.split(inputFileName)[-1])
            writeFile(tmpOutputFileName, fileContent)

            usedIDs = {}
            inputFile = open(tmpOutputFileName, 'r')
            fastaIterator = Fasta.Iterator(inputFile,
                                           parser=Fasta.RecordParser())
            # Build a safe output file name: keep only alphanumerics and '.',
            # and prefix with 'n' if the name would start with a digit.
            outputFileBaseName = os.path.split(inputFileName)[-1]
            newOutputFileBaseName = re.sub(r'[^0-9a-zA-Z.]+', '',
                                           outputFileBaseName)
            if re.match(r'\d', newOutputFileBaseName):
                newOutputFileBaseName = 'n' + newOutputFileBaseName

            outputFileName = os.path.join(outputDir, newOutputFileBaseName)

            baseName = os.path.splitext(os.path.basename(outputFileName))[0]
            sequenceNameMap[baseName] = {}

            inputFileList.append(outputFileName)
            outputFile = open(outputFileName, 'w')
            for fastaRecord in fastaIterator:
                sequenceCount += 1
                origName = fastaRecord.title
                fastaRecord.title = safeName(fastaRecord.title)

                # Make sure ids are unique by appending a _<n> suffix:
                if usedIDs.has_key(
                        fastaRecord.title.lower()
                ):  # we use lower to make sure they don't just differ in case.
                    i = 1
                    while usedIDs.has_key("%s_%d" % (fastaRecord.title, i)):
                        i += 1
                    fastaRecord.title = "%s_%d" % (fastaRecord.title, i)
                usedIDs[fastaRecord.title.lower()] = True

                # Remember the original title so results can be reported
                # under the user's own sequence names:
                sequenceNameMap[baseName][fastaRecord.title] = origName

                # Strip sequence of gap chars ('-' is deliberately kept):
                #fastaRecord.sequence = fastaRecord.sequence.replace('-', '')
                fastaRecord.sequence = fastaRecord.sequence.replace('~', '')
                fastaRecord.sequence = fastaRecord.sequence.replace('.', '')

                #                 if allowedLetters is None:
                #                    if len(re.findall(IUPAC.IUPACAmbiguousDNA().letters, fastaRecord.sequence)) / len(fastaRecord.sequence) > 0.5:
                #                       allowedLetters = IUPAC.IUPACAmbiguousDNA().letters
                #                       wildcard = 'N'
                #                    else:
                #                       allowedLetters = ExtendedIUPACProtein().letters
                #                       wildcard = 'X'
                #                 fastaRecord.sequence = re.sub('[^%s-]' % allowedLetters, wildcard, fastaRecord.sequence)

                # fastaRecord.sequence = re.sub('[^%s]' % allowedLetters, 'N', fastaRecord.sequence)
                # Replace any character that is not an allowed DNA code or
                # '-' with the wildcard 'N':
                fastaRecord.sequence = re.sub('[^%s-]' % allowedLetters, 'N',
                                              fastaRecord.sequence)

                # Print only if there is some sequence left:
                if len(fastaRecord.sequence) > 0:
                    outputFile.write(str(fastaRecord) + "\n")
            inputFile.close()
            outputFile.close()
            os.remove(tmpOutputFileName)

        return inputFileList, sequenceCount, sequenceNameMap
Example #3
0
File: Initialize.py  Project: cbirdlab/sap
    def checkCacheConsistency(self, files):
        """Invalidate cached results that are deprecated by changed options.

        Compares the current self.options to the options pickled by the
        previous run (<project>/<project>.sap). For every option whose value
        changed, the corresponding cache layers (blast, homologue/alignment,
        trees, tree statistics) are purged for the query IDs found in the
        given input fasta files. Finally the current options are pickled so
        the next run can repeat the check.

        files -- iterable of cleaned input fasta file paths.
        """

        # Collect the query IDs ("<fileBaseName>_<recordTitle>") used as
        # cache-file name prefixes:
        idList = []
        for inputFileName in files:
            inputFile = open(inputFileName, 'r')
            fastaIterator = Fasta.Iterator(inputFile,
                                           parser=Fasta.RecordParser())
            baseName = os.path.splitext(os.path.split(inputFileName)[-1])[0]
            for fastaRecord in fastaIterator:
                idList.append("%s_%s" % (baseName, fastaRecord.title))

        pickleFileName = os.path.join(
            self.options.project,
            os.path.split(self.options.project)[1] + '.sap')

        if os.path.exists(pickleFileName):
            # See if the options have changed - and if they have:
            # remove the selected parts of the cache.
            pickleFile = open(pickleFileName, 'r')
            prevOptions = pickle.load(pickleFile)
            pickleFile.close()

            # Lists of options that deprecates cache entries. Each later list
            # extends the previous one: a change that invalidates the blast
            # cache also invalidates everything computed from it.
            deleteBlastCacheList = [
                "database", "maxblasthits", "limitquery", "minsignificance",
                "nolowcomplexfilter", "blastwordsize"
            ]

            #deleteHomologueCacheList = [ "unclassified", "notruncate", "quickcompile", "minidentity", "forceidentity", "subspecieslevel", "fillinall", "fillineven", "fillintomatch", "individuals", "significance", "nrsignificant",
            deleteHomologueCacheList = [
                "notruncate", "quickcompile", "minidentity", "forceidentity",
                "subspecieslevel", "fillinall", "fillineven", "fillintomatch",
                "individuals", "significance", "nrsignificant", "relbitscore",
                "phyla", "classes", "orders", "families", "genera", "besthits",
                "alignmentlimit", "minimaltaxonomy", "harddiversity",
                "forceincludefile", "forceincludegilist", "forceexcludegilist"
            ]
            deleteHomologueCacheList.extend(deleteBlastCacheList)

            deleteAlignmentCacheList = ["alignment", "alignmentoption"]
            deleteAlignmentCacheList.extend(deleteHomologueCacheList)

            deleteTreesCacheList = []
            deleteTreesCacheList.extend(deleteAlignmentCacheList)

            deleteTreeStatsCacheList = ["assignment", "prunelevel"]
            deleteTreeStatsCacheList.extend(deleteTreesCacheList)

            print "Checking cache for deprecated entries"

            for option in self.options.__dict__.keys():

                # This serves to map between new and older option names:
                prevVersionOption = option
                if not prevOptions.__dict__.has_key(option):
                    if option == 'assignment':
                        prevVersionOption = 'sampler'
                    elif option == 'dbcache':
                        prevVersionOption = 'genbankcache'
                    elif option == 'minsignificance':
                        prevVersionOption = 'evaluecutoff'
                    elif option == 'significance':
                        prevVersionOption = 'evaluesignificance'
                    elif option == 'nrsignificant':
                        prevVersionOption = 'minsignificant'
                    else:
                        print "New option for SAP \"%s\" will take default value \"%s\"" % (
                            option, self.options.__dict__[option])

                # Each cache list is emptied after its first triggered purge so
                # the same files are not deleted again for a later option:
                if option in deleteBlastCacheList and self.options.__dict__[
                        option] != prevOptions.__dict__[prevVersionOption]:
                    print '\tBlast cache'
                    for queryID in idList:
                        for entry in glob.glob(
                                os.path.join(self.options.blastcache, queryID)
                                + '*'):
                            print "\t\t" + os.path.split(entry)[-1]
                            os.remove(entry)
                    deleteBlastCacheList = []

                if option in deleteHomologueCacheList and self.options.__dict__[
                        option] != prevOptions.__dict__[prevVersionOption]:
                    # Delete the homologcache for the entries in the input files:
                    print '\tHomologue and alignment cache'
                    for queryID in idList:
                        for entry in glob.glob(
                                os.path.join(self.options.homologcache,
                                             queryID) + '.*'):
                            print "\t\t" + os.path.split(entry)[-1]
                            os.remove(entry)
                        for entry in glob.glob(
                                os.path.join(self.options.alignmentcache,
                                             queryID) + '.*'):
                            print "\t\t" + os.path.split(entry)[-1]
                            os.remove(entry)
                    deleteHomologueCacheList = []

                if option in deleteTreesCacheList and self.options.__dict__[
                        option] != prevOptions.__dict__[prevVersionOption]:
                    # Delete the tree statistics cache for the entries in the input files:
                    print '\tTree sampling cache'
                    for queryID in idList:
                        for entry in glob.glob(
                                os.path.join(self.options.treescache, queryID)
                                + '.*'):
                            print "\t\t" + os.path.split(entry)[-1]
                            os.remove(entry)
                    deleteTreesCacheList = []

                if option in deleteTreeStatsCacheList and self.options.__dict__[
                        option] != prevOptions.__dict__[prevVersionOption]:
                    # Delete the tree statistics cache for the entries in the input files:
                    print '\tTree statistics cache'
                    for queryID in idList:
                        for entry in glob.glob(
                                os.path.join(self.options.treestatscache,
                                             queryID) + '.*'):
                            print "\t\t" + os.path.split(entry)[-1]
                            os.remove(entry)
                    deleteTreeStatsCacheList = []

        # Dump the options specified so the next run can compare against them:
        pickleFile = open(pickleFileName, 'w')
        pickle.dump(self.options, pickleFile)
        pickleFile.close()

        print
Example #4
0
File: Align.py  Project: cbirdlab/sap
    def align(self, fastaFileName):
        """Align homologues in a fasta file against its query sequence.

        The fasta file is expected to contain the query sequence under a
        title equal to the file's base name, plus its homologue sequences.
        Each homologue is globally aligned to the query with pairwise2
        (match=1, mismatch=0, gap open=-10, gap extend=-0.5); columns that
        would introduce gaps in the query are dropped so all aligned
        sequences keep the query's coordinates. The result is written as a
        Nexus alignment to self.options.alignmentcache/<baseName>.nex; an
        existing non-empty alignment file is reused as a cache.

        fastaFileName -- path to the fasta file to align.
        """

        baseName = os.path.splitext(os.path.split(fastaFileName)[-1])[0]
        alignmentFileName = os.path.join(self.options.alignmentcache,
                                         baseName + ".nex")

        print "%s: Alignment: " % baseName,
        sys.stdout.flush()

        # A non-empty cached alignment short-circuits the computation:
        if os.path.exists(
                alignmentFileName) and os.path.getsize(alignmentFileName) > 0:
            print "Using cached results."
            sys.stdout.flush()
        else:
            print "Computing...",
            sys.stdout.flush()

            # Read in all the fasta entries:
            fastaFile = open(fastaFileName, 'r')
            fastaIterator = Fasta.Iterator(fastaFile,
                                           parser=Fasta.RecordParser())
            fastaDict = {}
            for fastaRecord in fastaIterator:
                fastaDict[fastaRecord.title] = fastaRecord.sequence

            # Get the query sequence (removed from the dict so only the
            # homologues are iterated below):
            queryName = baseName
            querySeq = fastaDict[queryName]
            del fastaDict[queryName]

            # NOTE(review): 'alignment' appears unused in the visible lines —
            # possibly a leftover, or used further down if this method
            # continues beyond this excerpt.
            alignment = Nexus.Nexus()

            seqList = [[queryName, Seq.Seq(querySeq)]]
            for title, sequence in fastaDict.items():
                # one_alignment_only=1 keeps just the first optimal alignment:
                alignmentList = pairwise2.align.globalms(querySeq,
                                                         sequence,
                                                         1,
                                                         0,
                                                         -10,
                                                         -.5,
                                                         one_alignment_only=1)
                queryAlnString = alignmentList[0][0]
                homologueAlnString = alignmentList[0][1]

                # Delete the columns that introduce gaps in the querySeq so
                # the homologue stays in the query's coordinate system:
                deleteList = []
                for i, char in enumerate(queryAlnString):
                    if char == '-':
                        deleteList.append(i)
                prunedHomologueAlnString = ''
                for i, char in enumerate(homologueAlnString):
                    if i not in deleteList:
                        prunedHomologueAlnString += char
                homologueAlnString = prunedHomologueAlnString

                # Make a list of tuple with truncated title and Seq object and add it the list:
                tup = (title, Seq.Seq(homologueAlnString))
                seqList.append(tup)

            # Write the alignment to a file:
            writeNexusFile(alignmentFileName, seqList)

            print "done",
            sys.stdout.flush()