# NOTE(review): this top-level __init__ looks like an accidental, partial
# duplicate of Annotator.__init__ defined below (identical body, odd
# 5-space indentation). At module level it merely binds a function named
# '__init__' that nothing appears to call — presumably a stray paste;
# confirm with the file's history and remove.
def __init__(self):
     self.parser = Parser()
     self.np_tf = {}
     self.np_itf = {}
     self.tfidf = {}
     self.definitions = []
     self.parse_log = {}
class Annotator():

    counter = 0

    def __init__(self):
        self.parser = Parser()
        self.np_tf = {}
        self.np_itf = {}
        self.tfidf = {}
        self.definitions = []
        self.parse_log = {}
        
        
 

    def indexConcepts(self, id, nps):
        if not id in self.np_tf.keys() :
            self.np_tf[id] = {}       
        
        for entry in nps :
            nouns = entry['nouns']
            concept = entry['concept']
            
            nouns.append(concept)
            
            for np in nouns :
                if np in self.np_tf[id].keys() :
                    self.np_tf[id][np] += 1
                else :
                    self.np_tf[id][np] = 1
                    
                if not np in self.np_itf.keys():
                    self.np_itf[np] = {}
                   
                if id in self.np_itf[np].keys():
                    self.np_itf[np][id] += 1
                else :
                    self.np_itf[np][id] = 1 
                
    def scoreConcepts(self):
        # Get the total number of documents
        doc_count = len(self.np_tf.keys())
        
        self.tfidf['count'] = doc_count
        self.tfidf['nps'] = {}
        self.tfidf['docs'] = {}
        # For every noun phrase in the inverse term frequency dictionary
        for np in self.np_itf :
            # Get the total number of documents (articles) in which this noun phrase occurs 
            np_total_doc_count = len(self.np_itf[np].keys())
            
            # Calculate the inverse document frequency (IDF) as the natural logarithm of the total 
            # number of documents divided by the number of documents containing this noun phrase
            idf = math.log(float(doc_count)/float(np_total_doc_count))
            print np,idf
            
            # Initialise the TFIDF for this noun phrase
            self.tfidf['nps'][np] = {}
            
            # For every document in which this noun phrase occurs
            for doc_id in self.np_itf[np] :
                # Initialise the TFIDF per noun phrase for this document
                self.tfidf['nps'][np][doc_id] = {}
                
                # Initialise the TFIDF per document
                if not doc_id in self.tfidf['docs'].keys() :
                    self.tfidf['docs'][doc_id] = {}
                
                # Initialise the TFIDF per document for this noun phrase
                self.tfidf['docs'][doc_id][np] = {}
                
                # Get the number of times this noun phrase occurs in the document
                np_occ_count = self.np_itf[np][doc_id]
                
                # Get the count for the noun phrase that occurs the most in this document.
                max_np_count = 0
                for onp in self.np_tf[doc_id] :
                    if self.np_tf[doc_id][onp] > max_np_count :
                        max_np_count = self.np_tf[doc_id][onp]
                

                # Calculate the normalized term frequency as the number of times the noun phrase occurs, divided by the maximum 
                # number of times any noun phrase occurs in this document
                tf = float(np_occ_count) / float(max_np_count)
                
                # Calculate TFIDF
                tfidf = tf*idf

                self.tfidf['nps'][np][doc_id]['tc'] = np_occ_count
                self.tfidf['nps'][np][doc_id]['tf'] = tf
                self.tfidf['nps'][np][doc_id]['idf'] = idf
                self.tfidf['nps'][np][doc_id]['max'] = max_np_count
                self.tfidf['nps'][np][doc_id]['tfidf'] = tfidf
                self.tfidf['nps'][np][doc_id]['dc'] = doc_count
                self.tfidf['nps'][np][doc_id]['ndc'] = np_total_doc_count
                
                self.tfidf['docs'][doc_id][np]['tc'] = np_occ_count
                self.tfidf['docs'][doc_id][np]['tf'] = tf
                self.tfidf['docs'][doc_id][np]['idf'] = idf
                self.tfidf['docs'][doc_id][np]['max'] = max_np_count
                self.tfidf['docs'][doc_id][np]['tfidf'] = tfidf          
                self.tfidf['docs'][doc_id][np]['dc'] = doc_count
                self.tfidf['docs'][doc_id][np]['ndc'] = np_total_doc_count 
                
    def getConcepts(self, definitions):
        concepts = []
        for (id, num, scope, term, mod, definition, stext) in definitions :
            concepts.append(term)
        
        return concepts
                
    def annotate(self, id, text):
        print "== Annotating {} ==".format(id)
        print "Tokenizing..."
        tokenized = self.parser.tokenizeText(text)
        print "Tagging..."
        tagged = self.parser.tagText(tokenized)
        print "Parsing..."
        parsed = self.parser.parseText(tagged)
        
        self.parse_log[id] = parsed
        
        stdout.write("Scanning for definitions...")
        dm = DefinitionMatcher()
        definitions_for_id = dm.match(id, tagged)
        stdout.write(" {} found.\n".format(len(definitions_for_id)))
        self.definitions.extend(definitions_for_id)
            
        stdout.write("Extracting concepts...")
        concepts = Util.extractConcepts(parsed)
        stdout.write(" {} found.\n".format(len(concepts)))
        
#        NB: This does not currently work, as the concepts from definitions should be in a dictionary      
#        stdout.write("Appending concepts from definitions to concept list...")
#        concepts.extend(self.getConcepts(self.definitions))
#        stdout.write(" {} total.\n".format(len(definitions_for_id)))

        if len(concepts) < 1 :
            print "No concepts found..."
        else :
            print "Adding concepts to index..."
            self.indexConcepts(id, concepts)
#            print "Linking concepts to Cornetto Wordnet..."
#            cl = ConceptLinker()
#            cl.link(id, concepts)

        print "=== NEXT ===\n\n"
        return definitions_for_id