Example #1
0
 def _extract_concepts(document, terminology, MaxMatcher):
     """
     Extract candidate concept ids from a tokenized document using the
     MaxMatcher approximate dictionary-lookup scheme.

     document:    sequence of tokens (already tokenized document text)
     terminology: name prefix of the "<terminology>_mm" DB table mapping
                  words to (concept id, significance) rows
     MaxMatcher:  cache dict {token: [(cid, sig), ...]}; extended in place
                  with DB results for tokens not seen before

     returns:
             Concept list: ids whose accumulated significance score
             exceeds the threshold
     """
     op = Operation()
     threshold = 0.95      # minimum accumulated significance for a concept
     doc_token = document
     candidate_concepts = []

     # Warm the MaxMatcher cache: one DB lookup per previously unseen token.
     # (dict membership test is O(1); the old `.keys()` test was O(n) per token)
     for token_row in doc_token:
         if token_row not in MaxMatcher:
             # NOTE(review): SQL built by string concatenation is vulnerable to
             # SQL injection if tokens are untrusted; DB._execute's signature is
             # not visible here -- switch to a parameterized query when possible.
             extracted_concepts = DB._execute("select cid, sig from " + terminology + "_mm where word = '" + token_row + "'")
             MaxMatcher[token_row] = extracted_concepts

     for current_token_counter in range(len(doc_token) - 3):  # skip the last 3 tokens
         current_token = doc_token[current_token_counter]
         skip_counter = 0       # consecutive next-tokens sharing no concept
         skip_limit = 2         # give up extending the phrase after this many skips
         # T_c: concepts containing the current token, with their significances.
         # builtin set() replaces the deprecated sets.Set (removed in Python 3).
         current_token_concepts = set()
         current_token_score = dict()
         for cid, sig in MaxMatcher[current_token]:
             current_token_concepts.add(cid)
             current_token_score[cid] = sig
         next_token_counter = 1
         next_token = doc_token[current_token_counter + next_token_counter]
         while skip_counter < skip_limit:
             # Concepts containing the candidate next token.
             next_token_concepts = set()
             next_token_score = dict()
             for cid, sig in MaxMatcher[next_token]:
                 next_token_concepts.add(cid)
                 next_token_score[cid] = sig
             mutual_concepts = next_token_concepts & current_token_concepts
             if not mutual_concepts:
                 skip_counter += 1
             else:
                 # Keep only concepts shared so far; accumulate their scores.
                 current_token_concepts = mutual_concepts
                 for cid in current_token_concepts:
                     current_token_score[cid] += next_token_score[cid]
             next_token_counter += 1
             if (current_token_counter + next_token_counter) < len(doc_token):
                 next_token = doc_token[current_token_counter + next_token_counter]
             else:
                 break
         candidate_concepts = op.union(
             candidate_concepts,
             [cid for cid in current_token_concepts if current_token_score[cid] > threshold])
     return candidate_concepts
Example #2
0
 def Indexing(self):
     '''
     IR Indexing Operations
         - Elimination of Stopwords
         - 
     '''
     DB._execute("DELETE from collection_index")
     print "Indexing is started..."
     tp = TextProcessor() 
     Collection._load()
     Collection._load_tags() #loading document with PMID, tags and abstracts
     for doc in Collection._documents:
         index_list = []
         for term in doc.abstract:
             index_list.append(term)
         if GlobalVariables.global_context_activated:
             for term in doc.tag:
                 index_list.append(term)
         index_list = tp.EliminateStopWords(index_list)
         index_list = tp.Stem(index_list)
         doc.set_index(index_list)
     print "Indexing is Done!"