import json
import os
from collections import deque, OrderedDict

# WordExplorer, SentencePerformance, STOPWORDS, WORD_TREE_BREADTH, and
# WORD_TREE_DEPTH are assumed to be provided by this project's companion
# modules; they are referenced but not defined in this file.

class Evaluator(object):
    
    def __init__(self, dbPath):
        self.wordExplorer = WordExplorer(dbPath);
        self.initWordCaptureTally();
        
    def getMaxDepthAllSentences(self):
        '''
        Runs through all sentences this Evaluator instance has
        measured and returns the deepest depth among all of them.
        '''
        maxDepth = 0;
        for sentencePerf in self.performanceTally:
            maxDepth = max(sentencePerf.getDeepestDepth(), maxDepth);
        return maxDepth;
        
    def toCSV(self, outFileFD=None):
        csv = self.getCSVHeader() + '\n';
        for sentencePerf in self.performanceTally:
            csv += sentencePerf.toCSV() + '\n';
        if outFileFD is not None:
            try:
                outFileFD.write(csv);
                outFileFD.flush();
            except IOError:
                print "Warning: could not write to outfile FD: %s" + str(outFileFD);
        return csv;
            
    def getCSVHeader(self):
        header = 'EmailID,SentenceID,SentenceLen,Failures,OutofSeq,NetFailure,NetSuccess';        
        for depthIndex in range(1,self.getMaxDepthAllSentences() + 1):
            header += ',Depth_' + str(depthIndex);
        return header;
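    # For example, if the deepest word capture across all tallied sentences
    # was at depth 2, getCSVHeader() yields:
    #   'EmailID,SentenceID,SentenceLen,Failures,OutofSeq,NetFailure,NetSuccess,Depth_1,Depth_2'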
    
    def extractWordSet(self, jsonEchoTreeStr):
        '''
        Given a JSON Echo Tree, return the root word and a flat set of
        all follow-on words.
        @param jsonEchoTreeStr: JSON EchoTree structure of any depth/breadth
        @type jsonEchoTreeStr: string
        '''
        pythonEchoTree = json.loads(jsonEchoTreeStr);
        flatTree  = self.extractWordSeqsHelper(pythonEchoTree);
        flatList  = flatTree.split();
        rootWord = flatList[0];
        flatSet = set(flatList[1:]);
        return (rootWord, flatSet);
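    # A minimal sketch of extractWordSet on a hypothetical two-node tree:
    #   ev.extractWordSet('{"word": "root", "followWordObjs": '
    #                     '[{"word": "pig", "followWordObjs": []}]}')
    # returns (u'root', set([u'pig'])).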
    
    def getDepthFromWord(self, pythonEchoTree, word):
        '''
        Given a word, return its depth in the tree. The root's position is 0.
        @param pythonEchoTree: Python encoded EchoTree
        @type pythonEchoTree: dict
        @param word: word to find in the EchoTree
        @type word: string
        @return: the depth at which the word occurs in the tree, or None if not present.
        @rtype: {int | None}
        '''
        return self.getDepthFromWordHelper(pythonEchoTree, word, depth=0);
    
    def getDepthFromWordHelper(self, pythonEchoTree, wordToFind, depth=0):
        if pythonEchoTree is None:
            return None;
        if pythonEchoTree['word'] == wordToFind: 
            return depth;
        for subtree in pythonEchoTree['followWordObjs']:
            newDepth = self.getDepthFromWordHelper(subtree, wordToFind, depth=depth+1);
            if newDepth is not None:
                return newDepth;
        return None;
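    # Example: for the tree
    #   {'word': 'root', 'followWordObjs': [{'word': 'pig', 'followWordObjs': []}]}
    # getDepthFromWord(tree, 'root') returns 0, getDepthFromWord(tree, 'pig')
    # returns 1, and getDepthFromWord(tree, 'cow') returns None.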
    
    
    def extractSentences(self, jsonEchoTreeStr):
        '''
        Print all sentences that can be made from the EchoTree.
        A minimal sketch, assuming the depth-3 structures produced by
        extractWordSeqs() below.
        @param jsonEchoTreeStr: JSON encoded EchoTree
        @type jsonEchoTreeStr: string
        '''
        sentenceStructs = self.extractWordSeqs(jsonEchoTreeStr);
        rootWord = sentenceStructs.popleft();
        for parentDict in sentenceStructs:
            for parentWord, siblings in parentDict.items():
                for sibling in siblings:
                    print '%s %s %s' % (rootWord, parentWord, sibling);
        
    
    def extractWordSeqs(self, jsonEchoTreeStr):
        '''
        Given a JSON EchoTree structure, return a structure representing all
        'sentences' generated by the tree via a depth-first walk. Example:
        root  pig    truffle
                     mud
              tree   deep
                     broad
        generates: 
            deque([root, OrderedDict([(tree, deque([broad, deep]))]), 
                         OrderedDict([(pig, deque([mud, truffle]))])])
        from which one can generate:
            - root tree broad
            - root tree deep
            - root pig mud
            - root pig truffle
            
        @param jsonEchoTreeStr: JSON encoded EchoTree
        @type jsonEchoTreeStr: string
        '''
        pythonEchoTree = json.loads(jsonEchoTreeStr);
        flatTree  = self.extractWordSeqsHelper(pythonEchoTree);
        flatQueue = deque(flatTree.split());
        # Number of sibling groups at the deepest level: breadth ** (depth - 2):
        numSibPops = WORD_TREE_BREADTH ** (WORD_TREE_DEPTH - 2);
        # Root word first:
        resDictQueue = deque([flatQueue[0]]);
        for dummy in range(numSibPops):
            sibs = deque([]);
            parentDict = OrderedDict();
            resDictQueue.append(parentDict);
            for dummy in range(WORD_TREE_BREADTH):
                sibs.append(flatQueue.pop());
            parentDict[flatQueue.pop()] = sibs;
        return resDictQueue;
    
    def extractWordSeqsHelper(self, pythonEchoTreeDict):
        '''
        Too-long example (it's what I had on hand):
        {u'word': u'reliability', 
         u'followWordObjs': [
                {u'word': u'new', 
                 u'followWordObjs': [
                     {u'word': u'power', 
                      u'followWordObjs': []}, 
                     {u'word': u'generation', 
                      u'followWordObjs': []}, 
                     {u'word': u'business', 
                      u'followWordObjs': []}, 
                     {u'word': u'product', 
                      u'followWordObjs': []}, 
                     {u'word': u'company', 
                      u'followWordObjs': []}]}, 
                {u'word': u'issues', 
                 u'followWordObjs': [
                     {u'word': u'related', 
                      u'followWordObjs': []}, 
                     {u'word': u'need', 
                      u'followWordObjs': []}, 
                     {u'word': u'raised', 
                      u'followWordObjs': []}, 
                     {u'word': u'such', 
                      u'followWordObjs': []}, 
                     {u'word': u'addressed', 
                      u'followWordObjs': []}]}, 
                {u'word': u'legislation', 
                 u'followWordObjs': [
                     {u'word': u'passed', 
                      u'followWordObjs': []}, 
                     {u'word': u'allow', 
                      u'followWordObjs': []}, 
                     {u'word': u'introduced', 
                      u'followWordObjs': []}, 
                     {u'word': u'require', 
                      u'followWordObjs': []}, 
                     {u'word': u'provide', 
                      u'followWordObjs': []}]}, 
                {u'word': u'standards', 
                 u'followWordObjs': [
                     {u'word': u'conduct', 
                      u'followWordObjs': []}, 
                     {u'word': u'set', 
                      u'followWordObjs': []}, 
                     {u'word': u'needed', 
                      u'followWordObjs': []}, 
                     {u'word': u'facilitate', 
                      u'followWordObjs': []}, 
                     {u'word': u'required', 
                      u'followWordObjs': []}]}, 
                {u'word': u'problems', 
                 u'followWordObjs': [
                     {u'word': u'please', 
                      u'followWordObjs': []}, 
                     {u'word': u'California', 
                      u'followWordObjs': []}, 
                     {u'word': u'accessing', 
                      u'followWordObjs': []}, 
                     {u'word': u'arise', 
                      u'followWordObjs': []}, 
                     {u'word': u'occur', 
                     u'followWordObjs': []}]}]}        
        
        @param pythonEchoTreeDict: EchoTree as a nested Python dictionary
        @type pythonEchoTreeDict: dict
        '''
        res = '';
        word = pythonEchoTreeDict['word'];
        res += ' ' + word;
        if len(pythonEchoTreeDict['followWordObjs']) == 0:
            return res;
        for subtree in pythonEchoTreeDict['followWordObjs']:
            res += self.extractWordSeqsHelper(subtree);
        return res;
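    # For the docstring example above, this helper returns the preorder
    # (depth-first) flattening ' reliability new power generation business
    # product company issues related ...' (note the leading space).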
            
    def initWordCaptureTally(self):
        self.performanceTally = [];
        
    def tallyWordCapture(self, sentenceTokens, emailID=-1, sentenceID=None):
        '''
        Measures overlap of each sentence token with trees created
        by this evaluator's database. Stopwords are removed here. Measures:
        
           - sentenceLen: number of words that are not stopwords.
           - failures: number of times a tree did not contain one of the words, and a new tree needed to 
                       be constructed by typing in the word.
           - outOfSeqs: number of times a word later in the sentence had already appeared in an earlier tree.
           - depths: for each tree depth, how many of the sentence's words appeared at that depth.

        Creates a SentencePerformance instance that stores the result measures. Adds
        that instance to this evaluator's performanceTally array.
                     
        @param sentenceTokens: tokens that make up the sentence.
        @type sentenceTokens: [string]
        @param emailID: optional ID to identify from which email the given sentence was taken.
        @type emailID: <any>
        @param sentenceID: optional ID to identify the given sentence within its email.
        @type sentenceID: <any>
        '''
        # Drop stopwords and punctuation tokens. (Removing from a list while
        # iterating over it skips elements, so build a new list instead.)
        sentenceTokens = [word for word in sentenceTokens
                          if word.lower() not in STOPWORDS
                          and word not in [';', ',', ':', '!', '%']];
        # Make a new SentencePerformance instance, passing this evaluator,
        # the array of stopword-free tokens, and the index in the self.performanceTally
        # array at which this new SentencePerformance instance will reside:
        if sentenceID is None:
            sentenceID = len(self.performanceTally);
        sentencePerf = SentencePerformance(self, sentenceTokens, emailID=emailID, sentenceID=sentenceID);
        
        # Start for real:
        tree = self.wordExplorer.makeWordTree(sentenceTokens[0]);
        # extractWordSet() returns (rootWord, wordSet); only the set is needed here:
        treeWords = self.extractWordSet(self.wordExplorer.makeJSONTree(tree))[1];
        for wordPos, word in enumerate(sentenceTokens[1:]):
            word = word.lower();
            wordDepth = self.getDepthFromWord(tree, word);
            if wordDepth is None:
                # wanted word is not in tree anywhere:
                sentencePerf.addFailure();
                # Is any of the future sentence words in the tree's word set?
                # (The current word is sentenceTokens[wordPos+1], since the
                # enumeration started at sentenceTokens[1:].)
                for futureWord in sentenceTokens[wordPos+2:]:
                    if futureWord in treeWords:
                        sentencePerf.addOutOfSeq();
                # Build a new tree by (virtually) typing in the word:
                tree = self.wordExplorer.makeWordTree(word);
                treeWords = self.extractWordSet(self.wordExplorer.makeJSONTree(tree))[1];
                continue;
            # Found word in tree:
            sentencePerf.addWordDepth(wordDepth);
        
        # Finished looking at every token in the sentence.
        self.performanceTally.append(sentencePerf);
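    # Example: for tokens ['discussed', 'reliability', 'standards'], a tree is
    # grown from 'discussed'; each subsequent word is either found in the
    # current tree (its depth is tallied) or counted as a failure, in which
    # case a fresh tree is grown from that word.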
        
    def readSentence(self, fd):
        sentenceOpener = '['
        sentenceCloser = ']'
        res = '';
        # Find start of next sentence:
        while 1:
            try:
                letter = fd.read(1);
                if letter == sentenceOpener:
                    # Found start of sentence
                    res = letter;
                    break;
                if len(letter) == 0:
                    # Gone through the whole file:
                    return None;
            except IOError:
                return None
        while 1:
            try:
                letter = fd.read(1);
                # Reached end of file before closing bracket:
                if len(letter) == 0:
                    raise IOError;
            except IOError:
                print "Warning: ignoring unfinished sentence: %s." % res;
                return None
            res += letter;
            if letter == sentenceCloser:
                return res;
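    # Example: for a file containing '[foo, bar, fum] [x, y]', the first call
    # returns '[foo, bar, fum]' and the second returns '[x, y]'.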
            
    def checksum(self, theStr):
        '''
        Returns the sum of all the given string's ASCII values.
        @param theStr: string to be checksummed.
        @type theStr: string
        @return: sum of ASCII values as checksum
        @rtype: int
        '''
        return reduce(lambda x,y:x+y, map(ord, theStr))
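    # Example: checksum('ab') == ord('a') + ord('b') == 97 + 98 == 195.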
            
            
    def measurePerformance(self, csvFilePath, dbFilePath, tokenFilePaths, verbose=False):
        '''
        Token files must hold a string as produced by the Stanford NLP core 
        tokenizer/sentence segmenter. Ex: "[foo, bar, fum]". Notice the ',<space>'
        after each token. That is the token separator.
        
        Assumed that db file is accessible for reading, that csv file can be
        opened/created for output, and that the token file paths are accessible
        for reading.
        
        @param csvFilePath: path of the CSV file to create/overwrite with the results.
        @type csvFilePath: string
        @param dbFilePath: path to the word-statistics database file.
        @type dbFilePath: string
        @param tokenFilePaths: fully qualified paths to each token file.
        @type tokenFilePaths: [string]
        @param verbose: if True, print progress and the final CSV table.
        @type verbose: boolean
        @return: CSV formatted table.
        @rtype: string
        '''
        if verbose:
            numSentencesDone = 0;
            reportEvery = 10; # progress every 10 sentences
            
        self.initWordCaptureTally();
        for tokenFilePath in tokenFilePaths:
            msgID = self.checksum(tokenFilePath);
            sentenceID = 0;
            with open(tokenFilePath, 'r') as tokenFD:
                while 1:
                    pythonSentenceTokens = self.readSentence(tokenFD);
                    if pythonSentenceTokens is None:
                        # Done with one file.
                        break;
                    self.tallyWordCapture(pythonSentenceTokens.split(', '), emailID=msgID, sentenceID=sentenceID);
                    sentenceID += 1;
                    if verbose:
                        numSentencesDone += 1;
                        if numSentencesDone % reportEvery == 0:
                            print "At file %s. Done %d sentences." % (os.path.basename(tokenFilePath), numSentencesDone);
                            
        with open(csvFilePath, 'w') as csvFd:
            csvAll = self.toCSV(outFileFD=csvFd);
        if verbose:
            print csvAll;
        return csvAll;
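
# Usage sketch (hypothetical paths; assumes a word-statistics db as expected
# by WordExplorer, and token files produced by the Stanford tokenizer as
# described in measurePerformance()):
if __name__ == '__main__':
    evaluator = Evaluator('/path/to/wordStats.db');
    csvTable = evaluator.measurePerformance('/tmp/treeEval.csv',
                                            '/path/to/wordStats.db',
                                            ['/path/to/email1Tokens.txt'],
                                            verbose=True);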