def load_KL_Data(goldstd_dir, mprc_result_dir, prc_result_dir, medline_dir, mprc_eval_dir, sampleSize):
    '''Load PRC top hits and MPRC top hits.

    Builds an ``Evaluation`` from the given arguments, asks it to load the
    MPRC hits and then the PRC hits, and returns a dict that maps every key
    of the PRC top hits to the two-item list
    ``[MPRC top hits for that key, PRC top hits for that key]``.

    A key that is present in the PRC top hits but missing from the MPRC top
    hits raises ``KeyError``.
    '''
    evaluation = Evaluation(goldstd_dir, mprc_result_dir, prc_result_dir,
                            medline_dir, mprc_eval_dir, sampleSize)
    # Same loading order as before: MPRC hits first, then PRC hits.
    evaluation.loadMPRChits()
    evaluation.loadPRChits()

    prc_hits = evaluation.PRCtophits
    mprc_hits = evaluation.tophits
    return {query_id: [mprc_hits[query_id], prc_hits[query_id]]
            for query_id in prc_hits}
class SimilarityAnalysis(MPRC):
    '''Interpret the similarity between two documents and identify matched
    terms and associated scores.

    One instance handles a single query article (the first key of
    ``pairDict``).  The ``run_*`` methods compute the matrices through the
    inherited ``MPRC`` helpers and then fill:

    * ``self.output`` -- ``{query: [summary text, ...]}``
    * ``self.pklout`` -- ``{similar article: {term: scores}}`` (only filled
      by ``analyzeResults_prc``)

    ``saveOutput`` writes both to ``<interpret_dir>/<query>`` and
    ``<interpret_dir>/<query>.pkl``.
    '''

    def __init__(self, pairDict, base_dir, database_dir, stemmed_corpus_dir,
                 vocab_dir, knnterm_dir, model, goldstd_dir, mprc_result_dir,
                 prc_result_dir, medline_dir, interpret_dir, sampleSize):
        '''Set up the parent ``MPRC`` state, the ``Evaluation`` helper and the
        empty result containers.

        Raises ``IndexError`` when ``pairDict`` is empty.
        '''
        super(SimilarityAnalysis, self).__init__(
            pairDict, base_dir, database_dir, stemmed_corpus_dir, vocab_dir,
            mprc_result_dir, knnterm_dir, model)
        # list(...) keeps this working where dict.keys() returns a view
        # (Python 3) instead of a list.
        self.query = list(pairDict.keys())[0]  # current query PMID
        self.interpret_dir = interpret_dir
        self.interpret_file = os.path.join(interpret_dir, "%s" % self.query)
        self.eval = Evaluation(goldstd_dir, mprc_result_dir, prc_result_dir,
                               medline_dir, self.interpret_file, sampleSize)
        self.output = {}
        self.pklout = {}
        self.knnTermDict = {}

    def run_MPRC_SKG(self):
        '''Compare the weight matrix before and after ``adjustWeights``.

        The results are only accumulated in ``self.output``; unlike
        ``run_PRC`` and ``run_MPRC`` this method does not call
        ``saveOutput``.
        '''
        # get original PRC weights
        self.getVocab()       # the vocabulary of the articles in pairDict
        self.vectorizeText()
        self.buildDocFreq()   # document frequency for every word in the vocabulary
        self.calPRCscores()   # calculate the weights
        # NOTE(review): this keeps a reference, not a copy.  It assumes the
        # second calPRCscores() call rebinds self.prc_matrix instead of
        # updating it in place -- confirm, otherwise both names alias the
        # same matrix.
        orig_wtMatrix = self.prc_matrix   # the weight matrix from PRC

        self.adjustWeights()
        self.buildDocFreq()   # document frequency for every word in the vocabulary
        self.calPRCscores()   # calculate the weights
        skg_wtMatrix = self.prc_matrix    # the weight matrix from MPRC_SKG

        self.eval.loadMPRChits()
        self.analzeResults_mprc_skg(orig_wtMatrix, skg_wtMatrix)

    def run_PRC(self):
        '''Compute the PRC weights, analyze the PRC top hits of the query and
        save the output.'''
        # get original PRC weights
        self.getVocab()       # the vocabulary of the articles in pairDict
        self.vectorizeText()
        self.buildDocFreq()   # document frequency for every word in the vocabulary
        self.calPRCscores()   # calculate the weights
        orig_wtMatrix = self.prc_matrix   # the weight matrix from PRC

        self.eval.loadPRChits()
        self.analyzeResults_prc(orig_wtMatrix)
        self.saveOutput()

    def run_MPRC(self):
        '''Compare the difference between PRC's selection and MPRC's
        selections in terms of matched terms.'''
        self.getVocab()       # the vocabulary of the articles in pairDict
        self.vectorizeText()
        # Keep the document-term matrix as it is right after vectorizing;
        # the pair analysis reads term presence from this one.
        orig_doc_term_matrix = self.doc_term_matrix
        self.buildDocFreq()   # document frequency for every word in the vocabulary
        self.getKNNterms()
        self.calMPRCscores()  # calculate the weights

        self.eval.loadMPRChits()
        self.analyzeResults_mprc(orig_doc_term_matrix)
        self.saveOutput()

    def analyzeResults_prc(self, orig_wtMatrix):
        '''Analyze every PRC top hit of the query.

        Stores each pair's term/weight dictionary in ``self.pklout[similar]``
        and appends one text summary covering all pairs to
        ``self.output[self.query]``.  Prints a message and returns without
        recording anything when the query has no PRC top hits.
        '''
        prc_hits = self.eval.PRCtophits
        if self.query not in prc_hits:
            print("This query %s does not exist in pre-calculated PRC top hits." % self.query)
            return

        lines = []
        for similar in prc_hits[self.query]:  # PRC selected similar articles
            matchTermScoreDict = self.analyzeEachPair_prc(similar, orig_wtMatrix)
            self.pklout[similar] = matchTermScoreDict
            # this pair of articles, their matched terms and weights
            lines.append("Current pair: %s - %s\n" % (self.query, similar))
            lines.extend("%s: %s\n" % (term, str(weights))
                         for term, weights in matchTermScoreDict.items())
        self.output.setdefault(self.query, []).append("".join(lines))

    def analyzeEachPair_prc(self, similar, orig_wtMatrix):
        '''Analyze PRC outputs for the pair (query, ``similar``).

        Returns ``{term: [query weight, similar weight]}`` for every term
        with a positive count in the query row of ``self.doc_term_matrix``.
        The similar weight is ``0`` when the similar article does not contain
        the term.  Raises ``ValueError`` when the query or ``similar`` is not
        in ``self.pmidList``.
        '''
        query_row = self.pmidList.index(self.query)
        similar_row = self.pmidList.index(similar)
        # indices into self.vocab, not actual terms
        query_vocab_index = np.where(self.doc_term_matrix[query_row, :] > 0)[0].tolist()
        similar_vocab_index = np.where(self.doc_term_matrix[similar_row, :] > 0)[0].tolist()
        # a set gives constant-time membership tests in the loop below
        similar_terms = set(self.vocab[index] for index in similar_vocab_index)

        match = {}
        for index in query_vocab_index:
            term = self.vocab[index]
            column = self.vocab.index(term)  # looked up once per term
            # NOTE(review): the query's weights are read from row 0 while the
            # similar article's row is looked up in pmidList; this assumes
            # the query is the first entry of pmidList -- confirm.
            query_orig_wt = orig_wtMatrix[0, column]
            if term in similar_terms:
                similar_orig_wt = orig_wtMatrix[similar_row, column]
            else:
                similar_orig_wt = 0  # the similar article does not contain this term
            match[term] = [query_orig_wt, similar_orig_wt]
        return match

    def analyzeResults_mprc(self, orig_doc_term_matrix):
        '''Extract every pair of articles and call the analyzer function.'''
        for similar in self.eval.tophits[self.query]:  # MPRC selected similar articles
            # An MPRC selection outside pmidList should not happen; skip it.
            if similar not in self.pmidList:
                continue
            self.analyzeEachPair_mprc(similar, orig_doc_term_matrix)

    def analyzeEachPair_mprc(self, similar, orig_doc_term_matrix):
        '''Analyze a pair of query text and model predicted similar text.

        For every term of the query, the term itself plus its entries in
        ``self.knnTermDict`` (restricted to ``self.vocab``) are intersected
        with the vocabulary of the similar article.  Non-empty overlaps are
        written into a text summary that is appended to
        ``self.output[self.query]``.  Term presence is read from
        ``orig_doc_term_matrix``; the word counts in the summary are read
        from ``self.doc_term_matrix``.
        '''
        query_row = self.pmidList.index(self.query)
        similar_row = self.pmidList.index(similar)
        # indices into self.vocab, not actual terms
        query_vocab_index = np.where(orig_doc_term_matrix[query_row, :] > 0)[0].tolist()
        similar_vocab_index = np.where(orig_doc_term_matrix[similar_row, :] > 0)[0].tolist()
        similar_terms = set(self.vocab[index] for index in similar_vocab_index)
        vocab_terms = set(self.vocab)  # built once instead of scanning the list per term

        # expanded vocabulary of the query text matched against the similar article
        match = {}
        for index in query_vocab_index:
            ori_term = self.vocab[index]
            knn_termList = [t for t in self.knnTermDict.get(ori_term, [])
                            if t in vocab_terms]
            overlap = list(set([ori_term] + knn_termList).intersection(similar_terms))
            if overlap:
                match[ori_term] = overlap

        # output the summary of matched terms
        parts = [
            "Current pair: %s - %s\n" % (self.query, similar),
            "Word count of the query %s: %d\n"
            % (self.query, np.sum(self.doc_term_matrix[query_row, :])),
            "Word count of the similar article %s: %d\n"
            % (similar, np.sum(self.doc_term_matrix[similar_row, :])),
        ]
        parts.extend("%s: %s\n" % (term, ";".join(matched))
                     for term, matched in match.items())
        parts.append("\n")
        self.output.setdefault(self.query, []).append("".join(parts))

    def analzeResults_mprc_skg(self, orig_wtMatrix, skg_wtMatrix):
        '''Extract every pair of articles and call the analyzer function.

        Appends one text summary covering all pairs to
        ``self.output[self.query]``.
        '''
        lines = []
        for similar in self.eval.tophits[self.query]:  # MPRC_SKG selected similar articles
            matchTermScoreDict = self.analyzeEachPair_mprc_skg(
                similar, orig_wtMatrix, skg_wtMatrix)
            # this pair of articles, their matched terms and weight changes
            lines.append("Current pair: %s - %s\n" % (self.query, similar))
            lines.extend("%s: %s\n" % (term, str(ratios))
                         for term, ratios in matchTermScoreDict.items())
            lines.append("\n")
        self.output.setdefault(self.query, []).append("".join(lines))

    def analyzeEachPair_mprc_skg(self, similar, orig_wtMatrix, skg_wtMatrix):
        '''Analyze MPRC_SKG outputs for the pair (query, ``similar``).

        Returns ``{term: [query ratio, similar ratio]}`` for every term with
        a positive count in both articles, where each ratio is the weight in
        ``skg_wtMatrix`` divided by the weight in ``orig_wtMatrix``.
        '''
        query_row = self.pmidList.index(self.query)
        similar_row = self.pmidList.index(similar)
        # indices into self.vocab, not actual terms
        query_vocab_index = np.where(self.doc_term_matrix[query_row, :] > 0)[0].tolist()
        similar_vocab_index = np.where(self.doc_term_matrix[similar_row, :] > 0)[0].tolist()
        similar_terms = set(self.vocab[index] for index in similar_vocab_index)

        match = {}  # matched terms in the similar article
        for index in query_vocab_index:
            term = self.vocab[index]
            if term not in similar_terms:
                continue
            column = self.vocab.index(term)  # looked up once per term
            # NOTE(review): the query's weights are read from row 0 while the
            # similar article's row is looked up in pmidList; this assumes
            # the query is the first entry of pmidList -- confirm.
            query_orig_wt = orig_wtMatrix[0, column]
            query_new_wt = skg_wtMatrix[0, column]
            similar_orig_wt = orig_wtMatrix[similar_row, column]
            similar_new_wt = skg_wtMatrix[similar_row, column]
            # NOTE(review): no guard against a zero original weight here.
            match[term] = [query_new_wt / query_orig_wt,
                           similar_new_wt / similar_orig_wt]
        return match

    def saveOutput(self):
        '''Write the text summaries to ``self.interpret_file`` and pickle
        ``self.pklout`` to ``self.interpret_file + ".pkl"``.'''
        # "with" closes the handles even if a write fails.
        with open(self.interpret_file, "w") as fout:
            for summaries in self.output.values():
                fout.writelines(summaries)
        pklFile = self.interpret_file + ".pkl"
        # pickle output is bytes, so the file is opened in binary mode
        with open(pklFile, "wb") as pkl_out:
            pickle.dump(self.pklout, pkl_out)