# NOTE(review): stray top-level def — it duplicates Evaluator.__init__ below
# and references 'self' at module scope, so it looks like leftover paste
# residue from the class. Confirm it is unused and delete it.
def __init__(self, dbPath): self.wordExplorer = WordExplorer(dbPath); self.initWordCaptureTally(); self.verbosity = Verbosity.NONE;
class Evaluator(object):
    '''
    Measures how well EchoTrees built from a given ngram database predict
    the words of real sentences. Per-sentence results are collected as
    SentencePerformance instances in self.performanceTally and can be
    exported to CSV via toCSV()/measurePerformance().
    '''

    def __init__(self, dbPath):
        '''
        @param dbPath: path to the Bigram/Trigram probabilities Sqlite3 db.
        @type dbPath: string
        '''
        # WordExplorer builds EchoTrees from the ngram Sqlite db:
        self.wordExplorer = WordExplorer(dbPath)
        self.initWordCaptureTally()
        self.verbosity = Verbosity.NONE

    def getMaxDepthAllSentences(self):
        '''
        Runs through all sentences this Evaluator instance has measured,
        and returns the deepest depth of all sentences.
        @return: maximum depth over all tallied sentences (0 if none tallied).
        @rtype: int
        '''
        maxDepth = 0
        for sentencePerf in self.performanceTally:
            maxDepth = max(sentencePerf.getDeepestDepth(), maxDepth)
        return maxDepth

    def toCSV(self, outFileFD=None):
        '''
        Render all tallied sentence performances as CSV.
        @param outFileFD: optional open, writable file object. When given,
                          the CSV is also written to it and flushed.
        @type outFileFD: file
        @return: the complete CSV string, header line included.
        @rtype: string
        '''
        csv = self.getCSVHeader() + '\n'
        for sentencePerf in self.performanceTally:
            csv += sentencePerf.toCSV() + '\n'
        if outFileFD is not None:
            try:
                outFileFD.write(csv)
                outFileFD.flush()
            except IOError:
                # Bug fix: the original concatenated str(fd) onto the
                # '%s' format string instead of interpolating into it.
                print("Warning: could not write to outfile FD: %s" % str(outFileFD))
        return csv

    def getCSVHeader(self):
        '''
        Build the CSV header line used by toCSV(). One Depth_<n> column is
        emitted for each depth up to the deepest depth seen so far.
        @return: comma-separated header string (no trailing newline).
        @rtype: string
        '''
        header = 'EmailID,SentenceID,SentenceLen,Failures,OutofSeq,InputSavings'
        for depthIndex in range(1, self.getMaxDepthAllSentences() + 1):
            header += ',Depth_' + str(depthIndex)
        header += ',DepthWeightedScore'
        return header

    def extractWordSet(self, jsonEchoTreeStr):
        '''
        Given a JSON EchoTree, return the root word and a flat set of all
        follow-on words; all words are lower-cased.
        @param jsonEchoTreeStr: JSON EchoTree structure of any depth/breadth
        @type jsonEchoTreeStr: string
        @return: (rootWord, set of all non-root words in the tree)
        @rtype: (string, set)
        '''
        pythonEchoTree = json.loads(jsonEchoTreeStr)
        flatTreeStr = self.extractWordSeqsHelper(pythonEchoTree)
        # Bug fix: extractWordSeqsHelper() returns a string with a LEADING
        # space, so split(" ") produced '' as the first element and the
        # "root word" was always the empty string. Whitespace split()
        # discards the empty leading field.
        lowerCaseFlatTreeList = [word.lower() for word in flatTreeStr.split()]
        rootWord = lowerCaseFlatTreeList[0]
        flatSet = set(lowerCaseFlatTreeList[1:])
        return (rootWord, flatSet)

    def getDepthFromWord(self, pythonEchoTree, word):
        '''
        Given a word, return its depth in the tree. Root position is 0.
        @param pythonEchoTree: Python encoded EchoTree
        @type pythonEchoTree: Dict
        @param word: word to find in the EchoTree
        @type word: string
        @return: the shallowest depth at which the word occurs in the
                 tree, or None if it is not present.
        @rtype: {int | None}
        '''
        resultDepths = []
        self.getDepthFromWordHelper(pythonEchoTree, word, resultDepths, depth=0)
        try:
            return min(resultDepths)
        except ValueError:
            # resultDepths is empty: word not found anywhere in the tree.
            return None

    def getDepthFromWordHelper(self, pythonEchoTree, wordToFind, resultDepths, depth=0):
        '''
        Depth-first search for wordToFind; all depths at which it occurs
        are appended to resultDepths (side effect). Always returns None.
        '''
        if pythonEchoTree is None:
            return None
        # While 'wordToFind' is always a single word, a node's 'word'
        # entry holds two words for a trigram system but one word for a
        # bigram system. Check whether *any* word of the node matches:
        if wordToFind in pythonEchoTree['word'].split():
            resultDepths.append(depth)
            return
        # No match; recursively check the subtrees. (Cleanup: the original
        # collected the helper's return value, which is always None, into
        # resultDepths via an unreachable branch.)
        for subtree in pythonEchoTree['followWordObjs']:
            self.getDepthFromWordHelper(subtree, wordToFind, resultDepths, depth=depth + 1)
        return None

    def extractSentences(self, jsonEchoTreeStr):
        '''
        Print all sentences that can be made from the EchoTree.
        NOTE: not implemented yet.
        @param jsonEchoTreeStr: JSON encoded EchoTree
        @type jsonEchoTreeStr: string
        '''
        # sentenceStructs = self.extractWordSeqs(jsonEchoTreeStr);
        pass

    def extractWordSeqs(self, jsonEchoTreeStr):
        '''
        Given a JSON EchoTree structure, return a structure representing
        all 'sentences' generated by the tree via a depth-first walk.

        Example: for the tree
                      root
                 pig       tree
             truffle mud  deep broad
        generates:
            deque([root,
                   OrderedDict([(tree, deque([broad, deep]))]),
                   OrderedDict([(pig, deque([mud, truffle]))])])
        from which one can generate:
            root tree broad / root tree deep / root pig mud / root pig truffle

        @param jsonEchoTreeStr: JSON encoded EchoTree
        @type jsonEchoTreeStr: string
        '''
        pythonEchoTree = json.loads(jsonEchoTreeStr)
        flatTree = self.extractWordSeqsHelper(pythonEchoTree)
        flatQueue = deque(flatTree.split())
        # Number of sibling groups: breadth ** (depth-1) + 1
        numSibPops = WORD_TREE_BREADTH ** (WORD_TREE_DEPTH - 2)
        # Root word first:
        resDictQueue = deque([flatQueue[0]])
        for dummy in range(numSibPops):
            sibs = deque([])
            parentDict = OrderedDict()
            resDictQueue.append(parentDict)
            # Pop one full sibling group off the right end of the flat list:
            for dummy in range(WORD_TREE_BREADTH):
                sibs.append(flatQueue.pop())
            parentDict[flatQueue.pop()] = sibs
        return resDictQueue

    def extractWordSeqsHelper(self, pythonEchoTreeDict):
        '''
        Depth-first flatten of an EchoTree dict into a string of words.
        Each word is preceded by a space, so the result has a LEADING space.
        Example input (abridged):
            {u'word': u'reliability',
             u'followWordObjs': [
                {u'word': u'new', u'followWordObjs': [
                    {u'word': u'power', u'followWordObjs': []}, ...]},
                ...]}
        @param pythonEchoTreeDict: Python encoded EchoTree
        @type pythonEchoTreeDict: dict
        @return: space-separated words in depth-first order (leading space).
        @rtype: string
        '''
        res = ''
        word = pythonEchoTreeDict['word']
        res += ' ' + word
        if len(pythonEchoTreeDict['followWordObjs']) == 0:
            return res
        for subtree in pythonEchoTreeDict['followWordObjs']:
            res += self.extractWordSeqsHelper(subtree)
        return res

    def initWordCaptureTally(self):
        # Reset the per-sentence performance records:
        self.performanceTally = []

    def tallyWordCapture(self, sentenceTokens, emailID=-1, sentenceID=None, removeStopwords=False):
        '''
        Measures overlap of each sentence token with trees created by this
        evaluator's database. Stopwords are removed here when requested.

        Measures:
           - sentenceLen: number of words that are not stopwords.
           - failures: number of times a tree did not contain one of the
                       words, and a new tree needed to be constructed by
                       typing in the word.
           - outOfSeqs: number of times a future word in the sentence was
                       in an earlier tree.
           - depths: for each tree depth, how many of the sentence's words
                       appeared at that depth.

        Creates a SentencePerformance instance that stores the result
        measures, and appends it to this evaluator's performanceTally array.

        @param sentenceTokens: tokens that make up the sentence.
        @type sentenceTokens: [string]
        @param emailID: optional ID to identify from which email the given sentence was taken.
        @type emailID: <any>
        @param sentenceID: optional ID to identify the given sentence within its email.
        @type sentenceID: <any>
        @param removeStopwords: whether or not to remove stopwords.
        @type removeStopwords: boolean
        @return: a list of all words successfully predicted (at any level of the tree)
        @rtype: [string]
        '''
        # We'll modify sentenceTokens in the loop below, so iterate over
        # a shallow copy:
        tokenCopy = copy.copy(sentenceTokens)
        for word in tokenCopy:
            if len(word) == 0:
                sentenceTokens.remove(word)
                continue
            if removeStopwords and (word.lower() in STOPWORDS):
                sentenceTokens.remove(word)
                continue
        if len(sentenceTokens) == 0:
            # The sentence was all empty words, or stopwords we were asked
            # to remove. (Bug fix: used to return "", while the docstring
            # promises an array; callers only extend() the result.)
            return []
        # Bug fix: assign the default sentence ID *before* the debug print
        # below, which interpolates sentenceID with %d and crashed on None:
        if sentenceID is None:
            sentenceID = len(self.performanceTally)
        if self.verbosity == Verbosity.DEBUG:
            print("Sentence %d tokens after cleanup: %s" % (sentenceID, str(sentenceTokens)))
        # New SentencePerformance instance, given this evaluator, the
        # stopword-free tokens, and the sentence's identifiers:
        sentencePerf = SentencePerformance(self, sentenceTokens, emailID=emailID, sentenceID=sentenceID)
        predictedWords = []
        # Start for real: build the initial tree from the first word:
        tree = self.wordExplorer.makeWordTree(sentenceTokens[0], self.arity)
        treeRoot, treeWordSet = self.extractWordSet(self.wordExplorer.makeJSONTree(tree))
        prevWord = sentenceTokens[0]
        for wordPos, word in enumerate(sentenceTokens[1:]):
            wordDepth = self.getDepthFromWord(tree, word)
            if self.verbosity == Verbosity.DEBUG:
                print("    Word '%s' score:\t\t%f %f" % (prevWord,
                                                         1.0 if wordDepth == 1 else 0.0,
                                                         0.5 if wordDepth == 2 else 0.0))
            if wordDepth is None:
                # Wanted word is not in the tree anywhere:
                sentencePerf.addFailure()
                # Is any future sentence word in the tree's word set?
                # (Bug fix: the original guarded this with
                # 'wordDepth < len(...)', comparing None to an int
                # (vacuously true in Py2, TypeError in Py3), and tested
                # membership in the (root, set) TUPLE, so addOutOfSeq()
                # could never fire. Now the actual word set is checked.)
                for futureWord in sentenceTokens[wordPos + 1:]:
                    futureLower = futureWord.lower()
                    if futureLower == treeRoot or futureLower in treeWordSet:
                        sentencePerf.addOutOfSeq()
            else:
                # Found word in tree:
                sentencePerf.addWordDepth(wordDepth)
                sentencePerf.addPredictedWord(word)
                predictedWords.append(word)
            # Build a new tree from the (virtually) typed-in current word:
            tree = self.wordExplorer.makeWordTree(word, self.arity)
            treeRoot, treeWordSet = self.extractWordSet(self.wordExplorer.makeJSONTree(tree))
            prevWord = word
        # Finished looking at every token in the sentence:
        self.performanceTally.append(sentencePerf)
        if self.verbosity == Verbosity.DEBUG:
            performance = self.performanceTally[-1]
            totalDepthWeightedScore = performance.getDepthWeightedSuccessSentence()
            totalDepth1Score = performance.getDepthCount(1)
            totalDepth2Score = performance.getDepthCount(2)
            print("\t\t\tTotal: \t%f %f %f" % (totalDepth1Score,
                                               totalDepth2Score * 0.5,
                                               totalDepthWeightedScore))
            print("\t\t\t \t-------------------------------")
        return predictedWords

    def readSentence(self, fd):
        '''
        Read the next '['...']'-delimited sentence from fd. Spaces and
        PUNCTUATION characters are dropped from the accumulated result.
        @param fd: open file object over a Stanford-tokenizer token file.
        @type fd: file
        @return: the sentence's characters, or None at EOF / on an
                 unterminated sentence.
        @rtype: {string | None}
        '''
        sentenceOpener = '['
        sentenceCloser = ']'
        res = ''
        # Find start of next sentence:
        while 1:
            try:
                letter = fd.read(1)
                if letter == sentenceOpener:
                    # Found start of sentence:
                    break
                if len(letter) == 0:
                    # Gone through the whole file:
                    return None
            except IOError:
                return None
        while 1:
            try:
                letter = fd.read(1)
                if len(letter) == 0:
                    # Reached end of file before the closing bracket:
                    raise IOError
            except IOError:
                print("Warning: ignoring unfinished sentence: %s." % res)
                return None
            if letter == sentenceCloser:
                return res
            if letter == " " or letter in PUNCTUATION:
                continue
            res += letter

    def checksum(self, theStr):
        '''
        Returns the sum of all the given string's ASCII values.
        @param theStr: string to be checksummed.
        @type theStr: string
        @return: sum of ASCII values as checksum (0 for the empty string).
        @rtype: int
        '''
        # Idiom fix: sum() instead of reduce(lambda...); also no longer
        # raises TypeError on an empty string.
        return sum(map(ord, theStr))

    def measurePerformance(self, csvFilePath, dbFilePath, arity, tokenFilePaths,
                           verbosity=Verbosity.NONE, removeStopwords=False):
        '''
        Token files must hold a string as produced by the Stanford NLP core
        tokenizer/sentence segmenter. Ex: "[foo, bar, fum]". Notice the
        ',<space>' after each token: that is the token separator.
        Assumed that the db file is readable, that the csv file can be
        opened/created for output, and that the token file paths are readable.

        @param csvFilePath: path to which to write the sentence-by-sentence CSV lines
        @type csvFilePath: string
        @param dbFilePath: path to the Bigram/Trigram probabilities table Sqlite3 db to use
        @type dbFilePath: string
        @param arity: arity of ngrams to use in the trees
        @type arity: int
        @param tokenFilePaths: fully qualified paths to each token file.
        @type tokenFilePaths: [string]
        @param verbosity: Verbosity.NONE: silent; Verbosity.LOG: msg every
                          PROGRESS_RATE sentences; Verbosity.DEBUG for debugging.
        @type verbosity: Verbosity
        @param removeStopwords: whether or not to remove ngrams with stopwords from the echo trees
        @type removeStopwords: boolean
        @return: average depth-weighted performance over all sentences
                 (0.0 if no sentences were measured).
        @rtype: float
        '''
        # Robustness fix: initialize progress counters unconditionally.
        # They used to be bound only when verbosity > 0, but the DEBUG
        # prints below are governed by self.verbosity, which persists
        # across calls, so a NameError was possible.
        numSentencesDone = 0
        reportEvery = PROGRESS_RATE  # progress msg every PROGRESS_RATE sentences
        if verbosity > 1:
            # Be debug-level verbose:
            self.verbosity = verbosity
        self.arity = arity
        self.initWordCaptureTally()
        # Total length of all words in all sentences that will be tested:
        allWordsLen = 0
        # A list of all words that were predicted successfully:
        allPredictedWords = []
        for tokenFilePath in tokenFilePaths:
            msgID = self.checksum(tokenFilePath)
            sentenceID = 0
            with open(tokenFilePath, 'r') as tokenFD:
                while 1:
                    # Get one sentence as a comma-separated string of tokens:
                    pythonSentenceTokens = self.readSentence(tokenFD)
                    if self.verbosity == Verbosity.DEBUG:
                        print("Sentence %d tokens: %s" % (numSentencesDone, str(pythonSentenceTokens)))
                    if pythonSentenceTokens is None:
                        # Done with one file:
                        break
                    tokenArray = pythonSentenceTokens.split(',')
                    # Sentence length in characters, adding a space (or
                    # closing period) for each token:
                    for token in tokenArray:
                        allWordsLen += len(token) + 1
                    # Do the stats:
                    predictedWordsThisSentence = self.tallyWordCapture(tokenArray,
                                                                       emailID=msgID,
                                                                       sentenceID=sentenceID,
                                                                       removeStopwords=removeStopwords)
                    if self.verbosity == Verbosity.DEBUG:
                        print("Words predicted in sentence %d: %s." % (numSentencesDone, predictedWordsThisSentence))
                        print("Typing saved: " + str(self.performanceTally[-1].getPercentTypeSavings()))
                    allPredictedWords.extend(predictedWordsThisSentence)
                    sentenceID += 1
                    if self.verbosity != Verbosity.NONE:
                        numSentencesDone += 1
                        if numSentencesDone % reportEvery == 0:
                            print("At file %s. Done %d sentences." % (os.path.basename(tokenFilePath),
                                                                      numSentencesDone))
        # Percentage of typing saved over all sentences together. Note
        # that we cannot subtract one char for the automatically generated
        # space after each word, because users do have to click on the word:
        numCharsSaved = 0
        for word in allPredictedWords:
            numCharsSaved += len(word)
        # NOTE(review): typingSaved is computed but never used here —
        # kept for debugging. (Guard added for empty input.)
        typingSaved = numCharsSaved * 100.0 / allWordsLen if allWordsLen > 0 else 0.0
        with open(csvFilePath, 'w') as CsvFd:
            csvAll = self.toCSV(outFileFD=CsvFd)
        if self.verbosity == Verbosity.DEBUG:
            print(csvAll)
        # Compute mean sentence performance:
        totalPerfDbAndArity = 0.0
        for sentenceID, sentencePerformance in enumerate(self.performanceTally):
            totalPerfDbAndArity += sentencePerformance.getDepthWeightedSuccessSentence()
            if self.verbosity == Verbosity.DEBUG:
                print("Sentence %d tally: %f" % (sentenceID, sentencePerformance.getDepthWeightedSuccessSentence()))
        numSentences = len(self.performanceTally)
        if numSentences == 0:
            # Robustness fix: avoid ZeroDivisionError when no sentences
            # were measured.
            return 0.0
        if self.verbosity == Verbosity.DEBUG:
            print("Total score (sumSentenceScores/numSentences): %f / %d = %f" %
                  (totalPerfDbAndArity, numSentences, totalPerfDbAndArity / numSentences))
        return totalPerfDbAndArity / numSentences