class Evaluator(object):
    
    def __init__(self, dbPath):
        self.wordExplorer = WordExplorer(dbPath);
        self.initWordCaptureTally();
        self.verbosity = Verbosity.NONE;
        
    def getMaxDepthAllSentences(self):
        '''
        Runs through all sentences this Evaluator instance has
        measured, and returns the deepest tree depth observed
        across all of them.
        '''
        maxDepth = 0;
        for sentencePerf in self.performanceTally:
            maxDepth = max(sentencePerf.getDeepestDepth(), maxDepth);
        return maxDepth;
        
    def toCSV(self, outFileFD=None):
        csv = self.getCSVHeader() + '\n';
        for sentencePerf in self.performanceTally:
            csv += sentencePerf.toCSV() + '\n';
        if outFileFD is not None:
            try:
                outFileFD.write(csv);
                outFileFD.flush();
            except IOError:
                print "Warning: could not write to outfile FD: %s" + str(outFileFD);
        return csv;
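
    # Sketch of the CSV that toCSV() produces; the data row values
    # below are illustrative only:
    #
    #   EmailID,SentenceID,SentenceLen,Failures,OutofSeq,InputSavings,Depth_1,Depth_2,DepthWeightedScore
    #   1234,0,7,2,1,35.0,3,2,4.0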
            
    def getCSVHeader(self):
        header = 'EmailID,SentenceID,SentenceLen,Failures,OutofSeq,InputSavings';        
        for depthIndex in range(1,self.getMaxDepthAllSentences() + 1):
            header += ',Depth_' + str(depthIndex);
        header += ',DepthWeightedScore';
        return header;
    
    def extractWordSet(self, jsonEchoTreeStr):
        '''
        Given a JSON EchoTree, return the root word and a flat set of
        all follow-on words.
        @param jsonEchoTreeStr: JSON EchoTree structure of any depth/breadth
        @type jsonEchoTreeStr: string
        @return: tuple of the root word and the set of all remaining tree words
        @rtype: (string, set)
        '''
        pythonEchoTree = json.loads(jsonEchoTreeStr);
        flatTreeStr  = self.extractWordSeqsHelper(pythonEchoTree);
        lowerCaseFlatTreeList = [];
        # Note: split() without an argument, because the helper's result
        # starts with a space; split(" ") would yield an empty first element:
        for word in flatTreeStr.split():
            lowerCaseFlatTreeList.append(word.lower());
        rootWord = lowerCaseFlatTreeList[0];
        flatSet = set(lowerCaseFlatTreeList[1:]);
        return (rootWord, flatSet);
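
    # Usage sketch for extractWordSet() (the tree below is a made-up,
    # depth-1 example):
    #
    #   jsonStr = ('{"word": "reliability", "followWordObjs": ['
    #              '{"word": "new", "followWordObjs": []},'
    #              '{"word": "issues", "followWordObjs": []}]}')
    #   rootWord, followWords = evaluator.extractWordSet(jsonStr)
    #   # rootWord    -> 'reliability'
    #   # followWords -> set(['new', 'issues'])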
    
    def getDepthFromWord(self, pythonEchoTree, word):
        '''
        Given a word, return its depth in the tree. Root position is 0.
        @param pythonEchoTree: Python encoded EchoTree
        @type pythonEchoTree: dict
        @param word: word to find in the EchoTree
        @type word: string
        @return: the depth at which the word first occurs in the tree, or None if not present.
        @rtype: {int | None}
        '''
        #**********************
        #self.wordExplorer.printWordTree(pythonEchoTree, 2);
        #**********************
        resultDepths = []
        self.getDepthFromWordHelper(pythonEchoTree, word, resultDepths, depth=0);
        try:
            return min(resultDepths);
        except ValueError:
            return None;
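
    # Usage sketch for getDepthFromWord() (the tree shape is hypothetical;
    # a bigram tree rooted at 'power' with the single follow-on 'outage'):
    #
    #   tree = {'word': 'power',
    #           'followWordObjs': [{'word': 'outage', 'followWordObjs': []}]}
    #   evaluator.getDepthFromWord(tree, 'power')   # -> 0
    #   evaluator.getDepthFromWord(tree, 'outage')  # -> 1
    #   evaluator.getDepthFromWord(tree, 'grid')    # -> None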
    
    def getDepthFromWordHelper(self, pythonEchoTree, wordToFind, resultDepths, depth=0):
        '''
        Recursively collect into resultDepths the depth of every node
        whose label contains wordToFind.
        '''
        if pythonEchoTree is None:
            return;
        # While 'wordToFind' is always a single word, the node labels
        # (pythonEchoTree['word']) hold two words for a trigram system,
        # yet one word for bigram systems. Check whether *any* word in
        # the given node's label matches wordToFind:
        if wordToFind in pythonEchoTree['word'].split():
            resultDepths.append(depth);
            return;
        # No match; recursively check the subtrees. Matches accumulate
        # in the shared resultDepths list:
        for subtree in pythonEchoTree['followWordObjs']:
            self.getDepthFromWordHelper(subtree, wordToFind, resultDepths, depth=depth+1);
    
    
    def extractSentences(self, jsonEchoTreeStr):
        '''
        Print all sentences that can be made from the EchoTree.
        Not yet implemented.
        @param jsonEchoTreeStr: JSON encoded EchoTree
        @type jsonEchoTreeStr: string
        '''
        #sentenceStructs = self.extractWordSeqs(jsonEchoTreeStr);
        pass
        
    
    def extractWordSeqs(self, jsonEchoTreeStr):
        '''
        Given a JSON EchoTree structure, return a structure representing all
        'sentences' generated by the tree via a depth-first walk. Example:
        root  pig    truffle
                     mud
              tree   deep
                     broad
        generates: 
            deque([root, OrderedDict([(tree, deque([broad, deep]))]), 
                         OrderedDict([(pig, deque([mud, truffle]))])])
        from which one can generate:
            - root tree broad
            - root tree deep
            - root pig mud
            - root pig truffle
            
        @param jsonEchoTreeStr: JSON encoded EchoTree
        @type jsonEchoTreeStr:string
        '''
        pythonEchoTree = json.loads(jsonEchoTreeStr);
        flatTree  = self.extractWordSeqsHelper(pythonEchoTree);
        flatQueue = deque(flatTree.split());
        # Number of parent-plus-siblings groups to pop off the flat
        # queue, i.e. the number of nodes one level above the leaves:
        numSibPops = WORD_TREE_BREADTH ** (WORD_TREE_DEPTH - 2);
        # Root word first:
        resDictQueue = deque([flatQueue[0]]);
        for dummy in range(numSibPops):
            sibs = deque([]);
            parentDict = OrderedDict();
            resDictQueue.append(parentDict);
            for dummy in range(WORD_TREE_BREADTH):
                sibs.append(flatQueue.pop());
            parentDict[flatQueue.pop()] = sibs;
        return resDictQueue;
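
    # A minimal sketch (variable names are illustrative) of walking the
    # structure that extractWordSeqs() returns in order to print the
    # sentences listed in the docstring above:
    #
    #   seqs = evaluator.extractWordSeqs(jsonEchoTreeStr)
    #   root = seqs.popleft()
    #   for parentDict in seqs:
    #       for parent, siblings in parentDict.items():
    #           for leaf in siblings:
    #               print root, parent, leaf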
    
    def extractWordSeqsHelper(self, pythonEchoTreeDict):
        '''
        Flatten the tree into one space-separated string of its words,
        depth first. Example input (longer than needed; it is what was
        on hand):
        {u'word': u'reliability', 
         u'followWordObjs': [
                {u'word': u'new', 
                 u'followWordObjs': [
                     {u'word': u'power', 
                      u'followWordObjs': []}, 
                     {u'word': u'generation', 
                      u'followWordObjs': []}, 
                     {u'word': u'business', 
                      u'followWordObjs': []}, 
                     {u'word': u'product', 
                      u'followWordObjs': []}, 
                     {u'word': u'company', 
                      u'followWordObjs': []}]}, 
                {u'word': u'issues', 
                 u'followWordObjs': [
                     {u'word': u'related', 
                      u'followWordObjs': []}, 
                     {u'word': u'need', 
                      u'followWordObjs': []}, 
                     {u'word': u'raised', 
                      u'followWordObjs': []}, 
                     {u'word': u'such', 
                      u'followWordObjs': []}, 
                     {u'word': u'addressed', 
                      u'followWordObjs': []}]}, 
                {u'word': u'legislation', 
                 u'followWordObjs': [
                     {u'word': u'passed', 
                      u'followWordObjs': []}, 
                     {u'word': u'allow', 
                      u'followWordObjs': []}, 
                     {u'word': u'introduced', 
                      u'followWordObjs': []}, 
                     {u'word': u'require', 
                      u'followWordObjs': []}, 
                     {u'word': u'provide', 
                      u'followWordObjs': []}]}, 
                {u'word': u'standards', 
                 u'followWordObjs': [
                     {u'word': u'conduct', 
                      u'followWordObjs': []}, 
                     {u'word': u'set', 
                      u'followWordObjs': []}, 
                     {u'word': u'needed', 
                      u'followWordObjs': []}, 
                     {u'word': u'facilitate', 
                      u'followWordObjs': []}, 
                     {u'word': u'required', 
                      u'followWordObjs': []}]}, 
                {u'word': u'problems', 
                 u'followWordObjs': [
                     {u'word': u'please', 
                      u'followWordObjs': []}, 
                     {u'word': u'California', 
                      u'followWordObjs': []}, 
                     {u'word': u'accessing', 
                      u'followWordObjs': []}, 
                     {u'word': u'arise', 
                      u'followWordObjs': []}, 
                     {u'word': u'occur', 
                     u'followWordObjs': []}]}]}        
        
        @param pythonEchoTreeDict: Python encoded EchoTree
        @type pythonEchoTreeDict: dict
        @return: flat string of all tree words, depth first, with a leading space
        @rtype: string
        '''
        res = '';
        word = pythonEchoTreeDict['word'];
        res += ' ' + word;
        if len(pythonEchoTreeDict['followWordObjs']) == 0:
            return res;
        for subtree in pythonEchoTreeDict['followWordObjs']:
            res += self.extractWordSeqsHelper(subtree);
        return res;
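
    # For the example tree in the docstring above, extractWordSeqsHelper()
    # returns the depth-first flattening, i.e. the single string
    # ' reliability new power generation business product company issues
    # related need raised such addressed ...' (note the leading space).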
            
    def initWordCaptureTally(self):
        self.performanceTally = [];
        
    def tallyWordCapture(self, sentenceTokens, emailID=-1, sentenceID=None, removeStopwords=False):
        '''
        Measures overlap of each sentence token with trees created
        by this evaluator's database. Empty tokens, and (if requested)
        stopwords, are removed here. Measures:
        
           - sentenceLen: number of words that are not stopwords.
           - failures: number of times a tree did not contain one of the words, and a new tree needed to 
                       be constructed by typing in the word.
           - outOfSeqs: number of times a future word in the sentence was in an earlier tree.
           - depths: for each tree depth, how many of the sentence's words appeared at that depth.
           
        Creates a SentencePerformance instance that stores the result measures. Adds
        that instance to this evaluator's performanceTally array.
                     
        @param sentenceTokens: tokens that make up the sentence.
        @type sentenceTokens: [string]
        @param emailID: optional ID to identify from which email the given sentence was taken.
        @type emailID: <any>
        @param sentenceID: optional ID to identify the given sentence within its email.
        @type sentenceID: <any>
        @param removeStopwords: whether or not to remove stopwords.
        @type removeStopwords: boolean
        @return: an array of all words successfully predicted (at any level of the tree)
        @rtype: [string]
        '''
        # We'll modify sentenceTokens in the loop
        # below, so get a shallow copy for the loop:
        tokenCopy = copy.copy(sentenceTokens);
        for word in tokenCopy:
            if len(word) == 0:
                sentenceTokens.remove(word);
                continue;
            if removeStopwords and (word.lower() in STOPWORDS):
                sentenceTokens.remove(word);
                continue;
        if len(sentenceTokens) == 0:
            # The sentence was all empty words, or stopwords that we were asked to remove:
            return [];
        if self.verbosity == Verbosity.DEBUG:
            # sentenceID may still be None at this point, so use %s:
            print("Sentence %s tokens after cleanup: %s" % (sentenceID,str(sentenceTokens)));        
                
        # Make a new SentencePerformance instance, passing this evaluator,
        # the array of stopword-free tokens, and the index in the self.performanceTally
        # array at which this new SentencePerformance instance will reside:
        if sentenceID is None:
            sentenceID = len(self.performanceTally);
        sentencePerf = SentencePerformance(self, sentenceTokens, emailID=emailID, sentenceID=sentenceID);
        predictedWords = [];
        
        # Start for real:
        tree = self.wordExplorer.makeWordTree(sentenceTokens[0], self.arity);
        # extractWordSet() returns (rootWord, setOfFollowWords); keep the set:
        treeWords = self.extractWordSet(self.wordExplorer.makeJSONTree(tree))[1];
        prevWord = sentenceTokens[0];
        for wordPos, word in enumerate(sentenceTokens[1:]):
            #word = word.lower();
            wordDepth = self.getDepthFromWord(tree, word);
            if self.verbosity == Verbosity.DEBUG:
                print("   Word '%s' score:\t\t%f  %f" % (prevWord,
                                                         1.0 if wordDepth == 1 else 0.0, 
                                                         0.5 if wordDepth == 2 else 0.0))
            if wordDepth is None:
                # wanted word is not in tree anywhere:
                sentencePerf.addFailure();
                # Is any of the future sentence words in the tree's word set?
                # (The current word is sentenceTokens[wordPos+1], since the
                # enumeration starts at sentenceTokens[1]):
                if wordPos < len(sentenceTokens) - 2:
                    for futureWord in sentenceTokens[wordPos+2:]:
                        if futureWord in treeWords:
                            sentencePerf.addOutOfSeq();
            else:
                # Found word in tree:
                sentencePerf.addWordDepth(wordDepth);
                sentencePerf.addPredictedWord(word);
                predictedWords.append(word);
            # Build a new tree from the (virtually) typed-in current word:
            tree =  self.wordExplorer.makeWordTree(word, self.arity);
            treeWords = self.extractWordSet(self.wordExplorer.makeJSONTree(tree))[1];
            prevWord = word;
        
        # Finished looking at every token in the sentence.
        self.performanceTally.append(sentencePerf);
        if self.verbosity == Verbosity.DEBUG:
            totalDepthWeightedScore = 0.0
            totalDepth1Score = 0;
            totalDepth2Score = 0;
            performance = self.performanceTally[-1]
            totalDepthWeightedScore += performance.getDepthWeightedSuccessSentence();
            totalDepth1Score        += performance.getDepthCount(1);
            totalDepth2Score        += performance.getDepthCount(2);
                
            print("\t\t\tTotal: \t%f  %f  %f" % (totalDepth1Score,
                                                 totalDepth2Score * 0.5,
                                                 totalDepthWeightedScore));
            print("\t\t\t       \t-------------------------------");
        return predictedWords;
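
    # Minimal usage sketch for tallyWordCapture() (db path and tokens are
    # hypothetical; note that self.arity is normally set by
    # measurePerformance() before this method runs):
    #
    #   ev = Evaluator('ngrams.db')
    #   ev.arity = 2
    #   predicted = ev.tallyWordCapture(['power', 'outage', 'reported'],
    #                                   emailID=42, sentenceID=0)
    #   print ev.performanceTally[-1].toCSV()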
    
    def readSentence(self, fd):
        sentenceOpener = '[';
        sentenceCloser = ']';
        res = '';
        # Find start of next sentence:
        while 1:
            try:
                letter = fd.read(1);
                if letter == sentenceOpener:
                    # Found start of sentence
                    break;
                if len(letter) == 0:
                    # Gone through the whole file:
                    return None;
            except IOError:
                return None
        while 1:
            try:
                letter = fd.read(1);
                # Reached end of file before closing bracket:
                if len(letter) == 0:
                    raise IOError;
            except IOError:
                print "Warning: ignoring unfinished sentence: %s." % res;
                return None
            if letter == sentenceCloser:
                return res;
            if letter == " " or letter in PUNCTUATION:
                continue;
            res += letter;
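
    # readSentence() consumes one '[...]' group per call, dropping spaces
    # and punctuation (PUNCTUATION is assumed not to include the comma
    # separator). For a token file containing
    #   [Power, outage, reported][Please, advise]
    # successive calls return 'Power,outage,reported', then 'Please,advise',
    # and finally None at end of file.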
            
    def checksum(self, theStr):
        '''
        Returns the sum of all the given string's ASCII values.
        @param theStr: string to be checksummed.
        @type theStr: string
        @return: sum of ASCII values as checksum
        @rtype: int
        '''
        return reduce(lambda x,y:x+y, map(ord, theStr))
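
    # Example: checksum('ab') returns ord('a') + ord('b') = 97 + 98 = 195.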
            
            
    def measurePerformance(self, csvFilePath, dbFilePath, arity, tokenFilePaths, verbosity=Verbosity.NONE, removeStopwords=False):
        '''
        Token files must hold a string as produced by the Stanford NLP core 
        tokenizer/sentence segmenter. Ex: "[foo, bar, fum]". Notice the ',<space>'
        between tokens; that is the token separator.
        
        It is assumed that the db file is accessible for reading, that the csv file
        can be opened/created for output, and that the token file paths are
        accessible for reading.
        
        @param csvFilePath: path to which to write the sentence-by-sentence CSV lines
        @type csvFilePath: string
        @param dbFilePath: path to the Bigram/Trigram probabilities table Sqlite3 db to use
        @type dbFilePath: string
        @param arity: arity of ngrams to use in the trees
        @type arity: int
        @param tokenFilePaths: fully qualified paths to each token file.
        @type tokenFilePaths: string
        @param verbosity: if Verbosity.NONE: silent; if Verbosity.LOG: msg every PROGRESS_RATE sentences. For debugging: Verbosity.DEBUG
        @type verbosity: Verbosity
        @param removeStopwords: whether or not to remove ngrams with stopwords from the echo trees
        @type  removeStopwords: boolean
        @return: Average of depth-weighted performance of all sentences
        @rtype: float.
        '''
        # Progress reporting state. Remember the verbosity on the instance
        # so that tallyWordCapture() sees it as well; at Verbosity.LOG and
        # above a progress message appears every PROGRESS_RATE sentences:
        numSentencesDone = 0;
        reportEvery = PROGRESS_RATE;
        self.verbosity = verbosity;
            
        self.arity = arity;
        
        self.initWordCaptureTally();
        # Total length of all words in all sentences that will be tested
        allWordsLen = 0;
        # A list of all words that were predicted successfully:
        allPredictedWords = [];
        for tokenFilePath in tokenFilePaths:
            msgID = self.checksum(tokenFilePath);
            sentenceID = 0;
            with open(tokenFilePath, 'r') as tokenFD:
                while 1:
                    # Get one sentence as a comma-separated string of tokens:
                    pythonSentenceTokens = self.readSentence(tokenFD);
                    if self.verbosity == Verbosity.DEBUG:
                        print("Sentence %d tokens: %s" % (numSentencesDone,str(pythonSentenceTokens)));
                    if pythonSentenceTokens is None:
                        # Done with one file.
                        break;
                    tokenArray = pythonSentenceTokens.split(',');
                    # Compute the sentence length in characters, adding
                    # a space (or closing period) for each token:
                    for token in tokenArray:
                        allWordsLen += len(token) + 1;
                    # Do the stats:
                    predictedWordsThisSentence = self.tallyWordCapture(tokenArray, emailID=msgID, sentenceID=sentenceID, removeStopwords=removeStopwords);
                    if self.verbosity == Verbosity.DEBUG:
                        print("Words predicted in sentence %d: %s." % (numSentencesDone, predictedWordsThisSentence));
                        print("Typing saved: " + str(self.performanceTally[-1].getPercentTypeSavings()));
                    allPredictedWords.extend(predictedWordsThisSentence);
                    sentenceID += 1;
                    if self.verbosity != Verbosity.NONE:
                        numSentencesDone += 1;
                        if numSentencesDone % reportEvery == 0:
                            print "At file %s. Done %d sentences." % (os.path.basename(tokenFilePath), numSentencesDone);
                            
        numCharsSaved = 0;
        # Compute the percentage of typing saved over all sentences together.
        # Note that we cannot subtract one char for the automatically
        # generated space after each word, because users do have to
        # click on the word:
        for word in allPredictedWords:
            numCharsSaved += len(word);
        # Use float arithmetic; under Python 2 the '/' operator would
        # otherwise truncate the percentage:
        typingSaved = numCharsSaved * 100.0 / allWordsLen;
         
        with open(csvFilePath,'w') as CsvFd:
            csvAll = self.toCSV(outFileFD=CsvFd);
        if self.verbosity == Verbosity.DEBUG:
            print csvAll;
        # Compute mean sentence performance:
        totalPerfDbAndArity = 0.0;
        sentenceID = 0;
        for sentencePerformance in self.performanceTally:
            totalPerfDbAndArity += sentencePerformance.getDepthWeightedSuccessSentence();
            if self.verbosity == Verbosity.DEBUG:
                print("Sentence %d tally: %f" % (sentenceID, sentencePerformance.getDepthWeightedSuccessSentence()));
                sentenceID += 1;
        if self.verbosity == Verbosity.DEBUG:
            print("Total score (sumSentenceScores/numSentences): %f / %d = %f" % (totalPerfDbAndArity, 
                                                                                  len(self.performanceTally), 
                                                                                  totalPerfDbAndArity/len(self.performanceTally)));
        return totalPerfDbAndArity/len(self.performanceTally)
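
# Minimal driver sketch for a full measurement run (file names are
# hypothetical):
#
#   ev = Evaluator('bigrams.db')
#   meanScore = ev.measurePerformance('perf.csv', 'bigrams.db', 2,
#                                     ['email1.tokens', 'email2.tokens'],
#                                     verbosity=Verbosity.LOG)
#   print 'Mean depth-weighted score: %f' % meanScore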