Code Example #1
File: IR.py  Project: armanfatahi/ContextSensitiveIR
 from math import log    # needed for the idf weight computed below

 def retrive_documents(self, query_id):
     k1 = 1.2
     k3 = 8.00
     avg_dl = 122
     b = 1                # tuning parameter; swept from 0.25 to 2.00 in steps of 0.25
     q = Query(query_id)
     #q.set_concepts(self.QueryConceptExtraction(q.text))
     self._expand_query(q)
     print "Retrieving Documents for: ", q.text
     Collection._load()
     Collection._load_go()
     Collection._load_tags()
     Collection._load_indexes()      #Loads documents into _documents with PMID and Index
     score = dict()
     N = Collection._count
     Nt = dict()
     for term in q.text:
         Nt[term] = Collection._get_frequency(term)
     query_text = ' '.join(q.text)    # query as a single string, for query-term frequency lookups
     for doc in Collection._documents:
         summation = 0
         dl = doc.length * 1.00
         for t in q.text:
             tfn = doc.get_frequency(t)
             qtf = Document._term_frequency(query_text, t)    # frequency of t in the query itself
             K = k1*((1-b)+b*(dl/avg_dl))
             w = log((N-Nt[t]+0.5)/(Nt[t]+0.5),2)
             if w < 0:
                 # a negative idf weight would push the score below zero; breaking
                 # keeps the accumulated score non-negative (it also skips the remaining terms)
                 break
             p1 = (((k1+1)*tfn)/(K+tfn))
             p2 = ((k3+1)*qtf/(k3+qtf))
             p3 = w
             summation += p1*p2*p3
         score[doc.PMID] = summation
     return score    # assumed intent: hand back the PMID -> score map for ranking
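
For orientation, the loop above implements an Okapi BM25-style score with a query-term saturation component. A sketch in the snippet's own variable names (tfn = term frequency in the document, qtf = term frequency in the query, N = Collection._count, Nt = Collection._get_frequency(t)):

    \[
    \mathrm{score}(D, q) = \sum_{t \in q}
        \frac{(k_1 + 1)\,\mathit{tfn}}{K + \mathit{tfn}} \cdot
        \frac{(k_3 + 1)\,\mathit{qtf}}{k_3 + \mathit{qtf}} \cdot
        \log_2 \frac{N - N_t + 0.5}{N_t + 0.5},
    \qquad
    K = k_1 \Bigl( (1 - b) + b \, \frac{\mathit{dl}}{\mathit{avg\_dl}} \Bigr)
    \]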
Code Example #2
File: IR.py  Project: armanfatahi/ContextSensitiveIR
 from math import log    # needed for the idf weight computed below

 def _expand_query(self, q):
     #--STEP 1----------Extract TOP DOCUMENTS ----------------------------
     tp = TextProcessor()
     param = Parameter()
     k1      = 1.2
     k3      = 8.00
     avg_dl  = 122
     b       = 1                 # tuning parameter; swept from 0.25 to 2.00 in steps of 0.25
     Collection._load_indexes()      # Loads indexes into _documents
     N = len(Collection._documents)
     score = dict()
     for D in Collection._documents:
         summation = 0
         dl = D.length * 1.00
         for t in q.text:
             Nt = Collection._get_frequency(t)
             tfn = D.get_frequency(t)
             qtf = q.get_frequency(t)
             K = k1*((1-b)+b*(dl/avg_dl))
             w = log((N-Nt+0.5)/(Nt+0.5),2)
             if w < 0:
                 # a negative idf weight would push the score below zero; breaking
                 # keeps the accumulated score non-negative (it also skips the remaining terms)
                 break
             p1 = (((k1+1)*tfn)/(K+tfn))
             p2 = ((k3+1)*qtf/(k3+qtf))
             p3 = w
             summation += p1*p2*p3
             
         score[D.PMID] = summation
     M = param.GetDocNumberForLocalContext()
     # keep only documents that matched the query with a positive score
     new_score = dict()
     for item in score.iterkeys():
         if score[item] > 0:
             new_score[item] = score[item]
     # top M documents by score, best first
     TopDocs = sorted(new_score, key=new_score.get, reverse=True)[:M]
     Display._plot(new_score, q)
     # NOTE: as written this tokenizes an empty string; the texts of the documents
     # in TopDocs are presumably meant to be concatenated into TopDocsTexts first
     TopDocsTexts = ''
     TopDocsTexts = tp.Tokenize(TopDocsTexts)
     TopDocsTexts = TextProcessor._remove_stop_words(TopDocsTexts)
     #---STEP 2---------Calculate the weight of each candidate expansion term (Bo1)----------------------------
     K = TopDocsTexts
     K_text = ' '.join(K)             # top-document term pool as a single string
     Beta = 0.4
     weight = dict()
     MaxTFQ = 0.001
     for term in TopDocsTexts:
         tfq = q.get_frequency(term)
         if tfq > MaxTFQ:
             MaxTFQ = tfq
     # Bo1 information content of each term; note that in the standard Bo1 model
     # Lambda is the term's collection frequency over the number of documents,
     # whereas here both Lambda and Freq_t_k come from the top-document pool
     info = dict()
     MaxInfo = 0
     for term in TopDocsTexts:
         Lambda = Document._term_frequency(K_text, term)
         Freq_t_k = Document._term_frequency(K_text, term)
         info[term] = -log(1.00/(1.00+Lambda), 2) - Freq_t_k * log(Lambda/(1.00+Lambda), 2)
         if info[term] > MaxInfo:
             MaxInfo = info[term]
     for term in TopDocsTexts:
         tfqN = (q.get_frequency(term) + 0.00) / MaxTFQ    # query tf, normalized to [0, 1]
         if MaxInfo > 0:
             weight[term] = tfqN + Beta * (info[term] / MaxInfo)
         else:
             weight[term] = 0
     QPrime = []
     for term in weight.iterkeys():
         if weight[term] > 0.25:       # expansion threshold on the normalized weight
             QPrime.append(term)
     return QPrime
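
For reference, the term weighting that STEP 2 computes matches the Bo1 (Bose-Einstein) divergence-from-randomness expansion model, written here in the snippet's own names (lambda and F(t, K) are both taken from the top-document pool K; tfq is the term's frequency in the original query):

    \[
    \mathrm{InfoBO1}(t) = -\log_2 \frac{1}{1 + \lambda} - F(t, K)\,\log_2 \frac{\lambda}{1 + \lambda},
    \qquad
    w(t) = \frac{\mathit{tfq}(t)}{\mathit{MaxTFQ}} + \beta \, \frac{\mathrm{InfoBO1}(t)}{\mathrm{MaxInfo}}
    \]

Terms with w(t) > 0.25 are kept and returned as the expanded query QPrime (beta = 0.4).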