def retrive_documents(self, query_id):
    """Score every document in the Collection against query *query_id*
    with the BM25 (Okapi best-match) ranking function.

    Returns a dict mapping each document's PMID to its BM25 score.

    NOTE(review): the original body had a stray ``return`` right after
    query expansion, which made the whole scoring loop unreachable and
    left the function always returning None; it is removed here and the
    computed ``score`` dict is returned instead.
    """
    # BM25 tuning parameters.
    k1 = 1.2
    k3 = 8.00
    avg_dl = 122   # assumed collection average document length -- TODO confirm
    b = 1          # length-normalisation weight; tune from 0.25 to 2.00 in 0.25 steps

    q = Query(query_id)
    #q.set_concepts(self.QueryConceptExtraction(q.text))
    self._expand_query(q)

    print("Retrieving Documents for: " + str(q.text))

    # Load the collection and all of its indexes into memory.
    Collection._load()
    Collection._load_go()
    Collection._load_tags()
    Collection._load_indexes()  # loads documents into _documents with PMID and index

    score = dict()
    N = Collection._count

    # Document frequency of every query term, computed once up front.
    Nt = dict()
    for term in q.text:
        Nt[term] = Collection._get_frequency(term)

    # The query as one string, hoisted out of the scoring loops
    # (it does not depend on the document or the term).
    query_string = ' '.join(q.text)

    for doc in Collection._documents:
        summation = 0
        dl = doc.length * 1.00  # force float so dl/avg_dl is a real ratio
        for t in q.text:
            tfn = doc.get_frequency(t)                       # term frequency in the document
            qtf = Document._term_frequency(query_string, t)  # term frequency in the query
            K = k1 * ((1 - b) + b * (dl / avg_dl))
            # Robertson/Sparck Jones IDF; negative when the term occurs
            # in more than half of the collection.
            w = log((N - Nt[t] + 0.5) / (Nt[t] + 0.5), 2)
            if w < 0:
                # A negative weight would drag the score below zero;
                # stop accumulating terms for this document (original behaviour).
                break
            p1 = ((k1 + 1) * tfn) / (K + tfn)
            p2 = ((k3 + 1) * qtf) / (k3 + qtf)
            summation += p1 * p2 * w
        score[doc.PMID] = summation
    return score
def DocumentExpantion(self):
    '''Weight every extracted GO concept of every document with a
    BM25-style term weighting model and persist the results.

    For each document, each concept's variant terms are scored against
    the collection; concepts whose average term weight exceeds
    ``weight_threshold`` are stored in the ``collection_concepts`` table
    as comma-separated ``concept;weight`` pairs and their variant terms
    are attached to the document as tags.
    '''
    print("Calculating weights is started...")
    weight_threshold = 0.10
    tp = TextProcessor()
    ontology = Ontology()
    db = DB()
    db.Query("delete from collection_concepts;")
    Collection._load()
    Collection._load_go()
    N = Collection._count
    # Terminologies are ('go','mesh','icd10','snomed') corresponding with columns 2,3,4,5
    T = ontology.GetDict('go')  # bring the whole ontology into memory to be faster!
    # BM25 tuning parameters.  doc_avg_len is a float so that
    # doc_len / doc_avg_len is a real ratio -- as an int this was silent
    # Python 2 integer division, zeroing the normalisation term for every
    # document shorter than the average.
    doc_avg_len = 122.0
    k1 = 1.2
    b = 1.00
    print(Collection._count)
    for d in Collection._documents:
        doc_len = d.length
        weight = dict()
        for C in d.go:
            C = C.replace(' ', '')
            # Collect the distinct variant terms for concept C.
            var = ' '
            for variant in T[C]:
                var += ' {0} '.format(variant)
            terms = set(var.split(tp.WordSplitter()))
            tp.remove_values_from_list(terms, '')
            l = len(terms)
            if l == 0:
                # No usable variant terms: skip to avoid dividing by zero below.
                continue
            summation = 0
            for term in terms:
                term_weight = 0
                tf = d.get_frequency(term)                 # term frequency in this document
                n_k = Collection._get_frequency(term)      # document frequency in the collection
                try:
                    term_weight = tf * ((log10((N - n_k + 0.50) / (n_k + 0.50)))
                                        / (k1 + ((1 - b) + b) * (doc_len / doc_avg_len) + tf))
                except ZeroDivisionError:
                    # Degenerate denominator; leave term_weight at 0.
                    pass
                summation += term_weight
            if (summation / l) > weight_threshold:
                weight[C] = (1.00 / l) * summation
        # Store concepts and weights in the database; concept and weight are
        # semicolon separated, pairs are comma separated.
        values = ''
        concept_list = []
        for row in weight:
            row = row.replace(" ", '')
            for term in T[row]:
                concept_list.append(term)
            if values == '':
                values = str(row) + ';' + str(weight[row])
            else:
                values += ',' + str(row) + ';' + str(weight[row])
        d.set_tag(concept_list)  # attach the variant terms to the document as tags
        # NOTE(review): string-built SQL -- tolerable only because PMID and the
        # weights are numeric; prefer a parameterised query if DB supports one.
        query = 'Insert into collection_concepts (PMID, Concepts) values({0}, "{1}")'.format(d.PMID, values)
        #print query
        db.Query(query)
    print("Calculating weights is Done! Concepts are added to Database")