Example 1
def load_KL_Data(goldstd_dir, mprc_result_dir, prc_result_dir, medline_dir, mprc_eval_dir, sampleSize):
    '''Load PRC top hits and MPRC top hits'''
    evaluation = Evaluation(goldstd_dir, mprc_result_dir, prc_result_dir, medline_dir, mprc_eval_dir, sampleSize)
    evaluation.loadMPRChits()
    evaluation.loadPRChits()
    prc_tophits = evaluation.PRCtophits
    mprc_tophits = evaluation.tophits
    sample = {}
    for key in prc_tophits:
        sample[key] = [mprc_tophits[key], prc_tophits[key]]
    return sample
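
For orientation, a minimal usage sketch of load_KL_Data follows; the directory paths and sample size are placeholders rather than values from the original project, and the Evaluation class must be importable for the call to work:

# Hypothetical arguments, for illustration only.
sample = load_KL_Data("gold_standard/", "mprc_results/", "prc_results/",
                      "medline/", "mprc_eval/", sampleSize=100)
for query_pmid, (mprc_hits, prc_hits) in sample.items():
    print(query_pmid, mprc_hits, prc_hits)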
Example 2
import os
import pickle

import numpy as np

# MPRC and Evaluation are assumed to be importable from the project's own modules.

class SimilarityAnalysis(MPRC):
    '''Interpret the similarity between two documents and identify matched terms and their associated scores.'''
    def __init__(self, pairDict, base_dir, database_dir, stemmed_corpus_dir, vocab_dir, knnterm_dir, model, goldstd_dir, mprc_result_dir, prc_result_dir, medline_dir, interpret_dir, sampleSize):
        super(SimilarityAnalysis, self).__init__(pairDict, base_dir, database_dir, stemmed_corpus_dir, vocab_dir, mprc_result_dir, knnterm_dir, model)
        self.query = list(pairDict.keys())[0] # current query PMID
        self.interpret_dir = interpret_dir
        self.interpret_file = os.path.join(interpret_dir, str(self.query))
        self.eval = Evaluation(goldstd_dir, mprc_result_dir, prc_result_dir, medline_dir, self.interpret_file, sampleSize)
        self.output = {}
        self.pklout = {}
        self.knnTermDict = {}
        
    def run_MPRC_SKG(self):
        # get original PRC weights
        self.getVocab() # the vocabulary of the articles in pairDict
        self.vectorizeText()
#         self.getKNNterms()
        self.buildDocFreq() # get the document frequency for every word in the vocabulary
        self.calPRCscores() # calculate the weights
#         self.cal_PRC_Similarity() # calculate the similarity
        orig_wtMatrix = self.prc_matrix # the weight matrix from PRC
        self.adjustWeights()
        self.buildDocFreq() # get the document frequency for every word in the vocabulary
        self.calPRCscores() # calculate the weights
        skg_wtMatrix = self.prc_matrix # the weight matrix from MPRC_SKG
        # print the precision on this query
        self.eval.loadMPRChits()
        self.analyzeResults_mprc_skg(orig_wtMatrix, skg_wtMatrix)
#         self.analyzeResults_mprc()
#         self.saveOutput()
        
    def run_PRC(self):
        # get original PRC weights
        self.getVocab() # the vocabulary of the articles in pairDict
        self.vectorizeText()
        self.buildDocFreq() # get the document frequency for every word in the vocabulary
        self.calPRCscores() # calculate the weights
        orig_wtMatrix = self.prc_matrix # the weight matrix from PRC
        # print the precision on this query
        self.eval.loadPRChits()
        self.analyzeResults_prc(orig_wtMatrix)
        self.saveOutput()
    
    def run_MPRC(self):
        '''Compare PRC's and MPRC's selections in terms of matched terms.'''
        self.getVocab() # the vocabulary of the articles in pairDict, 100 articles in the corpus, pmidList size 100
        self.vectorizeText()
        orig_doc_term_matrix = self.doc_term_matrix
        self.buildDocFreq() # get the document frequency for every word in the vocabulary
        self.getKNNterms()
        self.calMPRCscores() # calculate the weights
        self.eval.loadMPRChits()
        self.analyzeResults_mprc(orig_doc_term_matrix)   
        self.saveOutput()     
 
    def analyzeResults_prc(self, orig_wtMatrix):
        summary = ''
        if self.query not in self.eval.PRCtophits.keys():
            print "This query %s does not exist in pre-calculated PRC top hits."%self.query
            return
        for similar in self.eval.PRCtophits[self.query]: # PRC selected similar articles
            matchTermScoreDict = self.analyzeEachPair_prc(similar, orig_wtMatrix)
            self.pklout[similar] = matchTermScoreDict
            # output this pair of articles, their matched terms and weight changes
            summary += "Current pair: %s - %s\n" %(self.query, similar)
            for k, v in matchTermScoreDict.items():
                summary += "%s: %s\n"%(k,str(v))
        if self.query not in self.output.keys():
            self.output[self.query] = [summary]
        else:
            self.output[self.query].append(summary)  
    
    def analyzeEachPair_prc(self, similar, orig_wtMatrix):
        '''Analyze PRC outputs'''
        query_vocab_index = np.where(self.doc_term_matrix[self.pmidList.index(self.query),:]>0)[0].tolist() # query_vocab_index is a list of vocabulary indices, not actual terms
        # get the vocabulary of the similar text
        similar_vocab_index = np.where(self.doc_term_matrix[self.pmidList.index(similar),:]>0)[0].tolist() # similar article vocabulary indices
        similar_vocab = [self.vocab[index] for index in similar_vocab_index]
        match = {}
        # matched terms in the similar article
        for index in query_vocab_index:
            ori_term = self.vocab[index]
            match[ori_term]=[] # initialize match term dictionary            
        # term weights in the query
        for term in match.keys():
            if term in similar_vocab:
                query_orig_wt = orig_wtMatrix[0,self.vocab.index(term)]
                similar_orig_wt = orig_wtMatrix[self.pmidList.index(similar),self.vocab.index(term)]
                match[term] = [query_orig_wt, similar_orig_wt]
            else:
                query_orig_wt = orig_wtMatrix[0,self.vocab.index(term)]
                match[term] = [query_orig_wt, 0] # 0 means the similar article does not contain this term         
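        # Illustrative shape of the returned dictionary (values made up for illustration):
        #   {"cancer": [0.031, 0.027], "therapi": [0.012, 0]}
        # i.e. term -> [weight in the query, weight in the similar article],
        # where 0 means the similar article does not contain the term.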
        return match

    def analyzeResults_mprc(self,orig_doc_term_matrix):
        '''Extract every pair of articles and call the analyzer function'''
        for similar in self.eval.tophits[self.query]: # MPRC selected similar articles
            if similar not in self.pmidList: # skip if MPRC's selection is not in the original BM25 top 100; this should not happen
                continue
            self.analyzeEachPair_mprc(similar,orig_doc_term_matrix)
        
    def analyzeEachPair_mprc(self,similar,orig_doc_term_matrix):
        '''Analyze a pair of articles: the query text and a model-predicted similar text.'''
        # get the vocabulary of the query text
        query_vocab_index = np.where(orig_doc_term_matrix[self.pmidList.index(self.query),:]>0)[0].tolist() # query_vocab_index is a list of vocabulary indices, not actual terms
        query_vocab = [self.vocab[index] for index in query_vocab_index]
        # get the vocabulary of the similar text
        similar_vocab_index = np.where(orig_doc_term_matrix[self.pmidList.index(similar),:]>0)[0].tolist() # similar article vocabulary indices
        similar_vocab = [self.vocab[index] for index in similar_vocab_index]
        match = {}
        # get the expanded vocabulary of the query text and the matched terms in the similar article
        for index in query_vocab_index:
            ori_term = self.vocab[index]
            overlap=[]
            if ori_term in self.knnTermDict.keys():
                knn_termList = self.knnTermDict[ori_term]
                knn_termList = [t for t in knn_termList if t in self.vocab]
                overlap = list(set([ori_term]+knn_termList).intersection(set(similar_vocab)))
            else:
                overlap = list(set([ori_term]).intersection(set(similar_vocab)))
            if overlap:
                match[ori_term] = overlap
        # output the summary of matched terms
        summary = "Current pair: %s - %s\n" %(self.query, similar)
        summary += "Word count of the query %s: %d\n"%(self.query,np.sum(self.doc_term_matrix[self.pmidList.index(self.query),:]))
        summary += "Word count of the similar article %s: %d\n"%(similar,np.sum(self.doc_term_matrix[self.pmidList.index(similar),:]))
        for k, v in match.items():
            summary += "%s: %s\n"%(k,";".join(v))
        summary += "\n"
        if self.query not in self.output.keys():
            self.output[self.query] = [summary]
        else:
            self.output[self.query].append(summary)
        
    def analyzeResults_mprc_skg(self, orig_wtMatrix, skg_wtMatrix):
        '''Extract every pair of articles and call the analyzer function'''
        summary = ''
        for similar in self.eval.tophits[self.query]: # MPRC_SKG selected similar articles
            matchTermScoreDict = self.analyzeEachPair_mprc_skg(similar, orig_wtMatrix, skg_wtMatrix)
            # output this pair of articles, their matched terms and weight changes
            summary += "Current pair: %s - %s\n" %(self.query, similar)
            for k, v in matchTermScoreDict.items():
                summary += "%s: %s\n"%(k,str(v))
                summary += "\n"
        if self.query not in self.output.keys():
            self.output[self.query] = [summary]
        else:
            self.output[self.query].append(summary)
            
    def analyzeEachPair_mprc_skg(self, similar, orig_wtMatrix, skg_wtMatrix):
        '''Analyze MPRC_SKG outputs'''
        query_vocab_index = np.where(self.doc_term_matrix[self.pmidList.index(self.query),:]>0)[0].tolist() # query_vocab_index is a list of vocabulary indices, not actual terms
#         query_vocab = [self.vocab[index] for index in query_vocab_index]
        # get the vocabulary of the similar text
        similar_vocab_index = np.where(self.doc_term_matrix[self.pmidList.index(similar),:]>0)[0].tolist() # similar article vocabulary indices
        similar_vocab = [self.vocab[index] for index in similar_vocab_index]
        match = {}
        # matched terms in the similar article
        for index in query_vocab_index:
            ori_term = self.vocab[index]
            if ori_term in similar_vocab:
                match[ori_term]=[] # initialize match term dictionary
        # term weights in the query
        for term in match.keys():
            query_orig_wt = orig_wtMatrix[0,self.vocab.index(term)]
            query_new_wt = skg_wtMatrix[0,self.vocab.index(term)]
            similar_orig_wt = orig_wtMatrix[self.pmidList.index(similar),self.vocab.index(term)]
            similar_new_wt = skg_wtMatrix[self.pmidList.index(similar),self.vocab.index(term)]
            match[term] = [query_new_wt/query_orig_wt,similar_new_wt/similar_orig_wt]
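        # Each value holds weight ratios [query_new/query_orig, similar_new/similar_orig];
        # a ratio above 1 means the term's weight increased after adjustWeights().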
        return match
                    
    def saveOutput(self):
        # write the human-readable summaries
        with open(self.interpret_file, "w") as fout:
            for summary in self.output.values():
                for s in summary:
                    fout.write(s)
        # pickle the matched-term score dictionaries for downstream analysis
        pklFile = self.interpret_file + ".pkl"
        with open(pklFile, "wb") as pklfout:
            pickle.dump(self.pklout, pklfout)
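
To show how the class is meant to be driven, here is a minimal, hypothetical usage sketch; the pairDict structure, the model placeholder, every path, and the sample size are assumptions, not values from the original project:

# All inputs below are placeholders, for illustration only.
model = None  # assumed: a pre-trained term-similarity model; its type is not shown in this snippet
pairDict = {"12345678": ["23456789", "34567890"]}  # assumed structure: query PMID -> candidate PMIDs
analysis = SimilarityAnalysis(pairDict, "base/", "database/", "stemmed_corpus/", "vocab/",
                              "knnterms/", model, "gold_standard/", "mprc_results/",
                              "prc_results/", "medline/", "interpretations/", sampleSize=100)
analysis.run_PRC()  # writes the matched-term summaries and a pickled score dictionary under interpretations/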