def _extract_concepts(document, terminology, MaxMatcher):
    """Extract candidate concept ids from a tokenized document (MaxMatcher).

    document    : sequence of tokens (the tokenized document text)
    terminology : name prefix of the "<terminology>_mm" DB table mapping
                  words to (cid, sig) rows
    MaxMatcher  : dict cache {token: [(cid, sig), ...]} -- filled in place,
                  so repeated calls reuse earlier DB lookups
    returns     : list of concept ids whose accumulated significance score
                  exceeds the threshold
    """
    op = Operation()
    threshold = 0.95  # minimum accumulated significance for a concept to be kept

    doc_token = document
    candidate_concepts = []

    # Warm the MaxMatcher cache: one DB lookup per distinct unseen token.
    for token_row in doc_token:
        if token_row not in MaxMatcher:  # dict membership test, no .keys() scan
            # NOTE(review): string-built SQL. Doubling single quotes prevents
            # a token like "don't" from breaking the statement; prefer a
            # parameterized query if DB._execute supports it -- TODO confirm.
            safe_word = token_row.replace("'", "''")
            extracted_concepts = DB._execute(
                "select cid, sig from " + terminology +
                "_mm where word = '" + safe_word + "'")
            MaxMatcher[token_row] = extracted_concepts

    for current_token_counter in range(len(doc_token) - 3):  # skip the last 3 tokens
        current_token = doc_token[current_token_counter]
        skip_counter = 0   # number of words skipped so far (no shared concepts)
        skip_limit = 2     # stop extending the match after this many skips

        # T_c: concept set of the current token, with running scores per concept.
        extracted_concepts = MaxMatcher[current_token]
        current_token_concepts = set()
        current_token_score = dict()
        for c in extracted_concepts:
            current_token_concepts.add(c[0])
            current_token_score[c[0]] = c[1]

        next_token_counter = 1  # offset of the next word being merged in
        next_token = doc_token[current_token_counter + next_token_counter]

        while skip_counter < skip_limit:
            extracted_concepts = MaxMatcher[next_token]
            next_token_concepts = set()
            next_token_score = dict()
            for c in extracted_concepts:
                next_token_concepts.add(c[0])
                next_token_score[c[0]] = c[1]

            mutual_concepts = next_token_concepts & current_token_concepts
            if len(mutual_concepts) == 0:
                # No shared concepts with this word: count it as a skip.
                skip_counter = skip_counter + 1
            else:
                # Narrow to the shared concepts and accumulate their scores.
                current_token_concepts = mutual_concepts
                for c in current_token_concepts:
                    current_token_score[c] += next_token_score[c]

            next_token_counter += 1
            if (current_token_counter + next_token_counter) < len(doc_token):
                next_token = doc_token[current_token_counter + next_token_counter]
            else:
                break

        candidate_concepts = op.union(
            candidate_concepts,
            [c for c in current_token_concepts if current_token_score[c] > threshold])

    return candidate_concepts
def Indexing(self): ''' IR Indexing Operations - Elimination of Stopwords - ''' DB._execute("DELETE from collection_index") print "Indexing is started..." tp = TextProcessor() Collection._load() Collection._load_tags() #loading document with PMID, tags and abstracts for doc in Collection._documents: index_list = [] for term in doc.abstract: index_list.append(term) if GlobalVariables.global_context_activated: for term in doc.tag: index_list.append(term) index_list = tp.EliminateStopWords(index_list) index_list = tp.Stem(index_list) doc.set_index(index_list) print "Indexing is Done!"