def embedded_query_expansion_qi(self, interpolated_alpha, m):
    query_embedded = self.query_embedded
    query_wordcount = self.query_wordcount
    collection = self.collection
    collection_total_similarity = self.collection_total_similarity
    word2vec = self.word2vec
    # load the query model (note: embedded_query_expansion aliases the same dict)
    query_model = Pickle.load(open("model/query_model.pkl", "rb"))
    embedded_query_expansion = query_model
    update_embedded_query_expansion = {}
    if os.path.isfile("model/update_embedded_query_expansion_qi.pkl"):
        # load the cached expansion if the file exists
        update_embedded_query_expansion = Pickle.load(open("model/update_embedded_query_expansion_qi.pkl", "rb"))
    else:
        # calculate every query
        for query, query_word_count_dict in query_wordcount.items():
            top_prob_dict = {}
            # calculate every word in the collection
            for word in collection.keys():
                query_length = ProcDoc.word_sum(query_word_count_dict) * 1.0
                # p(w|q): sum over the current query's terms (qi variant)
                p_w_q = 0
                if not word in query_word_count_dict:
                    for word_sq, word_sq_count in query_word_count_dict.items():
                        total_probability = collection_total_similarity[word_sq]
                        if word_sq in query_embedded:
                            cur_word_similarity = word2vec.getWordSimilarity(collection[word], query_embedded[word_sq])
                            p_w_q += (cur_word_similarity / total_probability) * (word_sq_count / query_length)
                # store the probability
                top_prob_dict[word] = p_w_q
            # softmax
            top_prob_dict = ProcDoc.softmax(top_prob_dict)
            # sort top_prob_dict by value (probability)
            top_prob_list = sorted(top_prob_dict.items(), key=operator.itemgetter(1), reverse=True)
            # store the updated query model value
            update_embedded_query_expansion[query] = top_prob_list
        Pickle.dump(update_embedded_query_expansion, open("model/update_embedded_query_expansion_qi.pkl", "wb"), True)
    # update query model
    for update_query, update_query_word_list in update_embedded_query_expansion.items():
        filepath = "visual/" + update_query + "_qi.png"
        if not os.path.isfile(filepath):
            visualization.visualization(collection, update_query_word_list, filepath)
        for update_word, update_count in update_query_word_list[:m]:
            update = update_count
            origin = 0
            if update_word in query_model[update_query]:
                origin = query_model[update_query][update_word]
            query_model[update_query].pop(update_word, None)
            embedded_query_expansion[update_query][update_word] = interpolated_alpha * origin + (1 - interpolated_alpha) * update
        for un_changed_word in query_model[update_query].keys():
            embedded_query_expansion[update_query][un_changed_word] *= interpolated_alpha
        # softmax
        embedded_query_expansion[update_query] = ProcDoc.softmax(embedded_query_expansion[update_query])
    return embedded_query_expansion
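# Example use of the method above (illustrative only; `expander` stands for an
# instance of the surrounding class with its query_embedded / collection /
# word2vec members already set, and both hyperparameter values are made up):
# interpolate with alpha = 0.5 and keep the top 50 expansion terms per query.
expanded = expander.embedded_query_expansion_qi(0.5, 50)
# each entry of `expanded` is a query model re-normalized with ProcDoc.softmax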
def __init__(self, num_of_homo_feats=10, max_qry_length=1794, max_doc_length=2907, query_path=None, document_path=None, corpus="TDT2"):
    res_pos = True
    str2int = True
    self.num_vocab = 51253
    self.max_qry_length = max_qry_length
    self.max_doc_length = max_doc_length
    self.num_of_homo_feats = num_of_homo_feats
    if query_path is None:
        query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
    if document_path is None:
        document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"
    # read documents, preserving position information
    doc = ProcDoc.read_file(document_path)
    self.doc = ProcDoc.doc_preprocess(doc, res_pos, str2int)
    # read queries, preserving position information
    qry = ProcDoc.read_file(query_path)
    self.qry = ProcDoc.query_preprocess(qry, res_pos, str2int)
    # HMM training set
    self.hmm_training_set = ProcDoc.read_relevance_dict()
    self.homo_feats = self.__genFeature(num_of_homo_feats)
def specific_modeling(feedback_doc):
    # normalize so that, for each word, its probabilities over the feedback documents sum to 1
    feedback_w_doc = ProcDoc.inverted_word_doc(dict(feedback_doc))
    for word, doc_unigram in feedback_w_doc.items():
        feedback_w_doc[word] = ProcDoc.softmax(dict(doc_unigram))
    # specific modeling:
    # if a term is supported by almost all feedback documents,
    # it is penalized because of its low specificity.
    specific_model = {}
    for word, doc_unigram in feedback_w_doc.items():
        # entropy of the word's distribution over the feedback documents
        word_specific_level = 0
        for doc_name, prob in doc_unigram.items():
            cur_doc_word_prob = prob
            '''
            for other_doc_name, other_prob in doc_unigram.items():
                if doc_name == other_doc_name:
                    continue
                cur_doc_word_prob *= (1 - other_prob)
            '''
            # word_specific_level += cur_doc_word_prob
            word_specific_level += -1 * cur_doc_word_prob * log(cur_doc_word_prob)
        # specific_model[word] = word_specific_level
        specific_model[word] = sigmoid(1.0 / (0.5 + word_specific_level))
    # softmax
    specific_model = ProcDoc.softmax(dict(specific_model))
    return specific_model
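# Illustration (not part of the pipeline): the entropy-to-specificity mapping above
# rewards words concentrated in few feedback documents. A hypothetical check with
# the same formula; _sigmoid is a stand-in for this module's sigmoid():
from math import log, exp

def _sigmoid(x):
    return 1.0 / (1.0 + exp(-x))

concentrated = {"d1": 0.97, "d2": 0.03}                     # word mostly in one doc
spread = {"d1": 0.25, "d2": 0.25, "d3": 0.25, "d4": 0.25}   # word spread evenly
for dist in (concentrated, spread):
    entropy = sum(-p * log(p) for p in dist.values())
    print _sigmoid(1.0 / (0.5 + entropy))  # concentrated -> ~0.83, spread -> ~0.63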
def feedback(query_docs_point_dict, query_model, doc_unigram, doc_wordcount, general_model, background_model, topN):
    lambda_bg = 0.1
    lambda_fb = 0.8
    lambda_ir_fb = 0.2
    lambda_q = 0.1
    specific_model = {}
    for q_key, docs_point_list in query_docs_point_dict.items():
        feedback_doc = {}
        feedback_doc_wc = {}
        # extract the top-N feedback documents (items are (doc_name, point) pairs)
        for doc_name, point in docs_point_list[0:topN]:
            feedback_doc[doc_name] = copy.deepcopy(doc_unigram[doc_name])
            feedback_doc_wc[doc_name] = copy.deepcopy(doc_wordcount[doc_name])
        # generate the specific model
        specific_model = specific_modeling(dict(feedback_doc))
        # generate the significant model
        significant_model = significant_modeling(general_model, specific_model, feedback_doc, feedback_doc_wc)
        '''
        ir_feedback_doc = {}
        ir_feedback_doc_wc = {}
        # extract the irrelevant (bottom-ranked) feedback documents
        for doc_name, point in docs_point_list[len(docs_point_list)-topN:]:
            ir_feedback_doc[doc_name] = doc_unigram[doc_name]
            ir_feedback_doc_wc[doc_name] = doc_wordcount[doc_name]
        ir_specific_model = specific_modeling(dict(ir_feedback_doc))
        ir_significant_model = significant_modeling(general_model, ir_specific_model, ir_feedback_doc, ir_feedback_doc_wc)
        '''
        for word, fb_w_prob in significant_model.items():
            original_prob = 0.0
            if word in query_model[q_key]:
                original_prob = query_model[q_key][word]
            # update the query unigram
            query_model[q_key][word] = (lambda_q * original_prob) + (lambda_fb * fb_w_prob) + (lambda_bg * background_model[word])
        '''
        for word, ir_fb_w_prob in ir_significant_model.items():
            if word in query_model[q_key]:
                query_model[q_key][word] = (1 - lambda_ir_fb) * query_model[q_key][word] + lambda_ir_fb * ir_fb_w_prob
        '''
        query_model[q_key] = ProcDoc.softmax(dict(query_model[q_key]))
    query_model, query_IDs = ProcDoc.dict2np(query_model)
    # plot_diagram.plotModel(general_model, specific_model, significant_model, feedback_doc_wc, feedback_doc)
    return [query_model, query_IDs]
def run():
    INIT_PROBABILITY = 1.0 / 60
    topic_word_prob_dict = ProcDoc.read_clusters()   # read cluster P(W|T), {T: {W: prob}}
    doc_topic_prob_dict = defaultdict(dict)          # P(T|D), {D: {T: prob}}
    doc_word_topic_prob_dict = defaultdict(dict)     # P(T|w,D), {D: {W: {T: prob}}}
    doc_wc_dict = ProcDoc.read_doc_dict()            # read documents (Doc No., Doc content)
    doc_wc_dict = ProcDoc.doc_preprocess(doc_wc_dict)
    # convert (Doc No., Doc content) to (Doc No., {word: count})
    for docName, content in doc_wc_dict.items():
        temp_dict = ProcDoc.word_count(content, {})
        doc_wc_dict[docName] = temp_dict
    # initialize P(T|D) uniformly
    print "Initialize P(T|D)"
    for docName, wordCount in doc_wc_dict.items():
        for topic, wordProb in topic_word_prob_dict.items():
            doc_topic_prob_dict[docName][topic] = INIT_PROBABILITY
    '''
    print "Initialize P(T| w, D)"
    for docName, wordCount in doc_wc_dict.items():
        word_list = {}
        for word, frequency in wordCount.items():
            topic_prob = {}
            for topic, wordProb in topic_word_prob_dict.items():
                topic_prob[topic] = 0.0
            word_list[word] = topic_prob
        doc_word_topic_prob_dict[docName] = word_list
    '''
    print "start PLSA"
    [topic_word_prob_dict, doc_topic_prob_dict] = PLSA.Probability_LSA(doc_wc_dict, doc_topic_prob_dict, topic_word_prob_dict, doc_word_topic_prob_dict)
    print "end PLSA"
    # fold the topics out: P(w|D) = sum_T P(w|T) * P(T|D)
    p_plsa = {}  # PLSA P(W|D), {D: {W: prob}}
    for doc, topic_prob_list in doc_topic_prob_dict.items():
        p_plsa_word = {}
        for topic, doc_prob in topic_prob_list.items():
            for word, word_prob in topic_word_prob_dict[topic].items():
                # print word, word_prob  # debug output for every (word, topic) pair
                if word in p_plsa_word:
                    p_plsa_word[word] += word_prob * doc_prob
                else:
                    p_plsa_word[word] = word_prob * doc_prob
        p_plsa[doc] = p_plsa_word
    return p_plsa
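# Quick sanity check of the fold-in above (illustrative numbers only): with two
# topics, P(w|D) = P(w|T1)P(T1|D) + P(w|T2)P(T2|D) must itself sum to 1 over the
# vocabulary whenever P(W|T) and P(T|D) are proper distributions.
topic_word = {"T1": {"w1": 0.7, "w2": 0.3}, "T2": {"w1": 0.2, "w2": 0.8}}
doc_topic = {"T1": 0.4, "T2": 0.6}
p_w_d = {}
for topic, d_prob in doc_topic.items():
    for word, w_prob in topic_word[topic].items():
        p_w_d[word] = p_w_d.get(word, 0.0) + w_prob * d_prob
print p_w_d  # {'w1': 0.4, 'w2': 0.6}; the values sum to 1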
def __genFeature(self, num_of_homo_feats):
    ###################### TODO ######################
    print "generate h features"
    qry = self.qry
    doc = self.doc
    homo_feats = {}
    df = ProcDoc.docFreq(doc)
    for q_id, q_terms in qry.items():
        npscq = np.asarray([self.scq(df, q_term) for q_term in q_terms])
        homo_feats[q_id] = np.asarray([np.sum(npscq), np.amax(npscq), np.amin(npscq), np.mean(npscq)])
    # other candidate aggregates over the per-term SCQ values:
    # np.sum(a)
    # np.amax(a)
    # np.amin(a)
    # np.mean(a)
    # a.prod() ** (1.0 / len(a))     # geometric mean
    # len(a) / np.sum(1.0 / a)       # harmonic mean
    # var = variation(a, axis=0)
    # idmax = np.argmax(var)         # kept commented out: `var` is only defined above
    return homo_feats
def significant_modeling(general_model, specific_model, feedback_doc, feedback_doc_wc):
    lambda_sw = 0.1
    lambda_s = 0.2
    lambda_g = 0.7
    significant_model = {}
    # initialize: uniform distribution over the feedback vocabulary
    feedback_word = []
    for doc_name, word_count in feedback_doc_wc.items():
        for word, count in word_count.items():
            if word in feedback_word:
                continue
            feedback_word.append(word)
    for s_word in feedback_word:
        significant_model[s_word] = 1.0 / len(feedback_word)
    hidden_significant_doc_word = {}
    objective_value_list = []
    # EM training
    for step in range(100):
        # E-step: posterior that a word occurrence was generated by the significant model
        for doc_name, word_count in feedback_doc_wc.items():
            hidden_word_variable = {}
            for word, count in word_count.items():
                denominator = lambda_sw * significant_model[word] + lambda_s * specific_model[word] + lambda_g * general_model[word]
                hidden_word_variable[word] = lambda_sw * significant_model[word] / denominator
            hidden_significant_doc_word[doc_name] = hidden_word_variable
        # M-step: re-estimate the significant model from the expected counts
        denominator = 0.0
        for word in list(significant_model.keys()):
            word_sum = 0
            for doc_name, word_count in feedback_doc_wc.items():
                if word in word_count:
                    word_sum += word_count[word] * hidden_significant_doc_word[doc_name][word]
            denominator += word_sum
            significant_model[word] = word_sum
        significant_model = {word: word_sum / denominator for word, word_sum in dict(significant_model).items()}
        # softmax
        significant_model = ProcDoc.softmax(dict(significant_model))
        # objective function: log-likelihood of the feedback documents under the mixture
        objective_value = 0.0
        for doc_name, word_count in feedback_doc_wc.items():
            for word, count in word_count.items():
                objective_value += count * log(lambda_sw * significant_model[word] + lambda_g * general_model[word] + lambda_s * specific_model[word])
        objective_value_list.append(objective_value)
    # plot_diagram.plotList(objective_value_list)
    return significant_model
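# A toy walk-through of the E-step above (illustrative only; the probabilities for
# the single word are made up). The posterior computed per word occurrence is
#   P(significant | w) = lambda_sw * P_sig(w)
#                        / (lambda_sw * P_sig(w) + lambda_s * P_spec(w) + lambda_g * P_gen(w))
lambda_sw, lambda_s, lambda_g = 0.1, 0.2, 0.7
P_sig, P_spec, P_gen = 0.5, 0.2, 0.01  # hypothetical probabilities for one word
posterior = lambda_sw * P_sig / (lambda_sw * P_sig + lambda_s * P_spec + lambda_g * P_gen)
print posterior  # ~0.52: this occurrence is mostly attributed to the significant model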
def main():
    documents = ProcDoc.read_doc()
    texts = [[word for word in document.lower().split()] for document in documents]
    total_docs = len(texts) * 1.0
    # term frequency per document and document frequency over the collection
    term_freq = []
    doc_freq = {}
    for text in texts:
        cur_term_freq = {}
        for token in text:
            if token in cur_term_freq:
                cur_term_freq[token] += 1
            else:
                cur_term_freq[token] = 1
                # first occurrence in this document: update document frequency
                if token in doc_freq:
                    doc_freq[token] += 1
                else:
                    doc_freq[token] = 1
        term_freq.append(cur_term_freq)
    # TF-IDF weights
    tfidf = []
    for doc_tf in term_freq:
        doc_tfidf = {}
        for term, tf in doc_tf.items():
            idf = log(1 + total_docs / doc_freq[term])
            doc_tfidf[term] = tf * idf  # was tf / idf, which inverts the IDF weighting
        tfidf.append(doc_tfidf)
    # dense document-term matrix
    _tfidf = []
    for doc_tfidf in tfidf:
        vector = []
        for token in doc_freq.keys():
            if token in doc_tfidf:
                vector.append(doc_tfidf[token])
            else:
                vector.append(0)
        _tfidf.append(vector)
    _tfidf = np.array(_tfidf)
    # split the matrix into four chunks and compute similarities in parallel
    output = Queue()
    n = len(_tfidf)
    pipeline = [_tfidf[:n * 1 / 4],
                _tfidf[n * 1 / 4:n * 2 / 4],
                _tfidf[n * 2 / 4:n * 3 / 4],
                _tfidf[n * 3 / 4:]]
    processes = [Process(target=my_cosine_similarity, args=(output, x_func, _tfidf)) for x_func in pipeline]
    for p in processes:
        p.start()
    results = [output.get() for p in processes]
    for p in processes:
        p.join()
    results.sort()
    cosine_sim = []
    for r in results:
        cosine_sim += r[1]
    cosine_sim = sparse.csr_matrix(cosine_sim)
    print cosine_sim
    return cosine_sim
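# my_cosine_similarity is referenced above but not defined in this file. The
# results are sorted after collection, so each worker presumably tags its output
# with an order key. A minimal sketch under that assumption (the `order` argument
# is hypothetical and would have to be added to the Process(...) call above):
def my_cosine_similarity(output, chunk, full_matrix, order=0):
    # cosine similarity between each row of `chunk` and every row of `full_matrix`
    chunk_norms = np.linalg.norm(chunk, axis=1, keepdims=True)
    full_norms = np.linalg.norm(full_matrix, axis=1, keepdims=True)
    sim = np.dot(chunk, full_matrix.T) / (chunk_norms * full_norms.T + 1e-12)
    output.put((order, sim.tolist()))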
def feedback(query_docs_point_dict, query_model, doc_unigram, doc_wordcount, general_model, background_model, topN):
    lambda_bg = 0.1
    lambda_fb = 0.8
    lambda_ir_fb = 0.2
    lambda_q = 0.1
    specific_model = {}
    significant_model_dict = {}
    for q_key, docs_point_list in query_docs_point_dict.items():
        feedback_doc = {}
        feedback_doc_wc = {}
        # extract the top-N feedback documents
        for doc_name, point in docs_point_list[0:topN]:
            feedback_doc[doc_name] = copy.deepcopy(doc_unigram[doc_name])
            feedback_doc_wc[doc_name] = copy.deepcopy(doc_wordcount[doc_name])
        # generate the specific model
        specific_model = specific_modeling(dict(feedback_doc))
        # generate the significant model
        significant_model = significant_modeling(general_model, specific_model, feedback_doc, feedback_doc_wc)
        '''
        ir_feedback_doc = {}
        ir_feedback_doc_wc = {}
        # extract the irrelevant (bottom-ranked) feedback documents
        for doc_name, point in docs_point_list[len(docs_point_list)-topN:]:
            ir_feedback_doc[doc_name] = doc_unigram[doc_name]
            ir_feedback_doc_wc[doc_name] = doc_wordcount[doc_name]
        ir_specific_model = specific_modeling(dict(ir_feedback_doc))
        ir_significant_model = significant_modeling(general_model, ir_specific_model, ir_feedback_doc, ir_feedback_doc_wc)
        '''
        for word, fb_w_prob in significant_model.items():
            original_prob = 0.0
            if word in query_model[q_key]:
                original_prob = query_model[q_key][word]
            # update the query unigram
            query_model[q_key][word] = (lambda_q * original_prob) + (lambda_fb * fb_w_prob) + (lambda_bg * background_model[word])
        '''
        for word, ir_fb_w_prob in ir_significant_model.items():
            if word in query_model[q_key]:
                query_model[q_key][word] = (1 - lambda_ir_fb) * query_model[q_key][word] + lambda_ir_fb * ir_fb_w_prob
        '''
        query_model[q_key] = ProcDoc.softmax(dict(query_model[q_key]))
    # plot_diagram.plotModel(general_model, specific_model, significant_model, feedback_doc_wc, feedback_doc)
    if topN is None:
        with open("rel_supervised_swlm_entropy_s.pkl", "wb") as file:
            Pickle.dump(query_model, file, True)
    else:
        with open("rel_swlm_entropy_S_" + str(topN) + ".pkl", "wb") as file:
            Pickle.dump(query_model, file, True)
    return query_model
def calculate(pred_relevance, split_idx):
    rel_query_model = pred_relevance
    print type(rel_query_model)
    print rel_query_model.shape.eval()
    with open("query_model.pkl", "rb") as file:
        query_model = Pickle.load(file)[:split_idx]
    with open("query_list.pkl", "rb") as file:
        query_list = Pickle.load(file)[:split_idx]
    with open("doc_model.pkl", "rb") as file:
        doc_model = Pickle.load(file)
    with open("doc_list.pkl", "rb") as file:
        doc_list = Pickle.load(file)
    #with open("relevance_model_RM.pkl", "rb") as file: rel_query_model = Pickle.load(file)
    #with open("query_relevance_model_RLE.pkl", "rb") as file: rel_query_model = Pickle.load(file)
    background_model = ProcDoc.read_background_dict()
    qry_eval = evaluate.evaluate_model(True)
    # document smoothing
    for doc_idx in range(doc_model.shape[0]):
        doc_vec = doc_model[doc_idx]
        doc_model[doc_idx] = (1 - doc_lambda) * doc_vec + doc_lambda * background_model
    mAP_list = []
    query_rel_list = []
    query_bg_list = []
    doc_model = np.log(doc_model)
    for rel_qry_lambda in np.linspace(0, 1., num=11):
        # query smoothing: interpolate the original query model with the
        # predicted relevance model
        with open("query_model.pkl", "rb") as file:
            query_model = Pickle.load(file)[:split_idx]
        X = T.matrix()
        Y = (1 - rel_qry_lambda) * X + rel_qry_lambda * rel_query_model
        f = theano.function([X], Y)
        query_model = f(query_model)
        # rank documents by query-likelihood score (batched for speed)
        result = np.argsort(-np.dot(query_model, doc_model.T), axis=1)
        query_docs_ranking = {}
        for q_idx in range(len(query_list)):
            docs_ranking = []
            for doc_idx in result[q_idx]:
                docs_ranking.append(doc_list[doc_idx])
            query_docs_ranking[query_list[q_idx]] = docs_ranking
        '''
        # per-query version (slower):
        for query_key, query_vec in zip(query_list, query_model):
            print len(query_docs_ranking.keys())
            query_result = np.argsort(-(query_vec * doc_model).sum(axis=1))
            docs_ranking = []
            for doc_idx in query_result:
                docs_ranking.append(doc_list[doc_idx])
            query_docs_ranking[query_key] = docs_ranking
        mAP = eval.mean_average_precision(query_docs_ranking)
        print mAP, qry_lambda, rel_qry_lambda
        '''
        mAP = qry_eval.mean_average_precision(query_docs_ranking)
        mAP_list.append(mAP)
    return max(mAP_list)
def plotModel(general_model, specific_model, significant_model, feedback_doc_wc, feedback_doc_unigram):
    general_model_softmax = {}
    general_list = []
    specific_list = []
    significant_list = []
    unigram_list = []
    feedback_wc = {}
    feedback_wu = {}
    # aggregate word counts (and unigram-implied counts) over the feedback documents
    for doc, wc in feedback_doc_wc.items():
        total_word_sum = ProcDoc.word_sum(wc)
        for word, count in wc.items():
            if word in feedback_wc:
                feedback_wc[word] += count
                feedback_wu[word] += total_word_sum * feedback_doc_unigram[doc][word]
            else:
                feedback_wc[word] = count
                feedback_wu[word] = total_word_sum * feedback_doc_unigram[doc][word]
    feedback_wc = sorted(feedback_wc.items(), key=operator.itemgetter(1), reverse=True)
    total_word_sum = ProcDoc.word_sum(dict(feedback_wc))
    for word, count in feedback_wc:
        general_list.append(count)
        specific_list.append(total_word_sum * specific_model[word])
        significant_list.append(total_word_sum * significant_model[word])
        unigram_list.append(feedback_wu[word])
    import matplotlib.pyplot as plt
    plt.figure(8)
    plt.plot(range(len(general_list)), general_list, label='general')
    plt.plot(range(len(specific_list)), specific_list, label='specific')
    # plt.plot(range(len(significant_list)), significant_list, label='significant')
    # plt.plot(range(len(unigram_list)), unigram_list, label='unigram')
    plt.title('Feedback word distributions')  # was 'Loss' then 'Accuracy', copy-paste leftovers
    plt.legend(loc='upper left')
    plt.show()
    r = raw_input()  # keep the window open until the user presses Enter
def __init__(self, qry_path=None, rel_path=None, isTraining=True, doc_path=None):
    # default to the training setting
    if qry_path is None:
        qry_path = "../Corpus/TDT2/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
    if doc_path is None:
        doc_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
    if rel_path is None:
        rel_path = "../Corpus/TDT2/Train/QDRelevanceTDT2_forHMMOutSideTrain"
    self.vocab_size = 51253
    # relevance set
    self.rel_set = ProcDoc.readRELdict(rel_path, isTraining)
    self.evaluate_model = EvaluateModel(rel_path, isTraining)
    # read documents
    doc = ProcDoc.readFile(doc_path)
    self.doc = ProcDoc.docPreproc(doc)
    self.doc_len = Statistical.compLenAcc(self.doc)
    # read queries
    qry = ProcDoc.readFile(qry_path)
    self.qry_tf = ProcDoc.qryPreproc(qry, self.rel_set)
    self.qry_len = Statistical.compLenAcc(self.qry_tf)
    [self.qry, self.doc] = Statistical.TFIDF(self.qry_tf, self.doc, self.qry_len, self.doc_len)
    # dict to numpy
    self.qry_tf, self.qry_tf_IDs = self.__dict2np(self.qry_tf)
    self.qry, self.qry_IDs = self.__dict2np(self.qry, self.qry_tf_IDs)
    self.doc, self.doc_IDs = self.__dict2np(self.doc)
    # L2-normalize the documents (precomputes their vector lengths)
    self.doc = Statistical.l2Normalize(self.doc)
def __init__(self, query_model, isSpoken=False):
    smoothing = 0.0
    with open("test_query_list.pkl", "rb") as file:
        self.query_list = pickle.load(file)
    with open("doc_list.pkl", "rb") as file:
        self.doc_list = pickle.load(file)
    if isSpoken:
        with open("doc_model_wc_s.pkl", "rb") as file:
            doc_model = pickle.load(file)
    else:
        with open("doc_model_wc.pkl", "rb") as file:
            doc_model = pickle.load(file)
    background_model = ProcDoc.read_background_dict()
    self.query_model = copy.deepcopy(query_model)
    self.vocabulary_size = 51253
    self.doc_model = copy.deepcopy(doc_model)  # the deepcopy was immediately overwritten by a raw reference
    self.background_model = background_model
def __init__(self, query_model):
    self.query_model = copy.deepcopy(query_model)
    self.vocabulary_size = 51253
    smoothing = 0.1
    with open("test_query_list.pkl", "rb") as file:
        self.query_list = pickle.load(file)
    with open("doc_list.pkl", "rb") as file:
        self.doc_list = pickle.load(file)
    with open("doc_model.pkl", "rb") as file:
        doc_model = pickle.load(file)
    self.background_model = ProcDoc.read_background_dict()
    # Jelinek-Mercer smoothing of each document model with the background model
    for d_idx, doc_vec in enumerate(doc_model):
        doc_model[d_idx] = (1 - smoothing) * doc_vec + smoothing * self.background_model
    self.doc_model = doc_model
def __genFeature(self, num_of_homo_feats):
    print "generate h features"
    qry = self.qry
    doc = self.doc
    homo_feats = {}
    df = ProcDoc.docFreq(doc)
    for q_id, q_terms in qry.items():
        npscq = np.asarray([self.__scq(df, q_term) for q_term in q_terms])
        harm_mean = self.__harm_mean(npscq)
        geo_mean = self.__geo_mean(npscq)
        # seven aggregates of the per-term SCQ values
        homo_feats[q_id] = np.asarray([np.std(npscq), np.sum(npscq), np.amax(npscq), np.amin(npscq), np.mean(npscq), harm_mean, geo_mean])
    return homo_feats
def MStep(doc_wc_dict, doc_topic_prob_dict, topic_word_prob_dict, doc_word_topic_prob_dict):
    # P(w|T)
    for tp, w_prob_list in topic_word_prob_dict.items():
        for word, word_prob in w_prob_list.items():
            # denominator: expected count of all words assigned to topic tp
            denominator = 0.0
            for w, w_p in w_prob_list.items():
                for doc_name, doc_wc_list in doc_wc_dict.items():
                    try:
                        d_w_c = doc_wc_list[w]
                        d_w_t_p = doc_word_topic_prob_dict[doc_name][w][tp]
                        denominator += d_w_c * d_w_t_p
                    except KeyError:
                        pass
            # numerator: expected count of `word` assigned to topic tp
            numerator = 0.0
            for doc_name, doc_wc_list in doc_wc_dict.items():
                try:
                    d_w_c = doc_wc_list[word]
                    d_w_t_p = doc_word_topic_prob_dict[doc_name][word][tp]
                    numerator += d_w_c * d_w_t_p
                except KeyError:
                    pass
            if denominator != 0.0:
                topic_word_prob_dict[tp][word] = numerator / denominator
    # P(T|D)
    for doc_name, topic_list in doc_topic_prob_dict.items():
        denominator = ProcDoc.word_sum(doc_wc_dict[doc_name]) * 1.0
        for tp, tp_prob in topic_list.items():
            numerator = 0.0
            for d_w, doc_wc in doc_wc_dict[doc_name].items():
                try:
                    d_w_c = doc_wc
                    d_w_t_p = doc_word_topic_prob_dict[doc_name][d_w][tp]
                    numerator += d_w_c * d_w_t_p / denominator
                except KeyError:
                    pass
            doc_topic_prob_dict[doc_name][tp] = numerator
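# Note on the P(w|T) update above: the denominator is the same for every word of a
# topic but is recomputed inside the word loop, making the step roughly
# O(|T| * |V|^2 * |D|). With dense count and posterior matrices the same update is
# a single reduction. A sketch under assumed array shapes (hypothetical: counts C
# is |D| x |V|, and P_t is |D| x |V| holding P(T=t | w, d) for one topic t):
import numpy as np

def m_step_topic_word(C, P_t):
    # numerator_w = sum_d c(w,d) * P(t|w,d); one shared denominator per topic
    numerator = (C * P_t).sum(axis=0)   # shape (|V|,)
    denominator = numerator.sum()       # scalar, identical for every word
    return numerator / denominator if denominator != 0.0 else numerator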
import sys
sys.path.append("../Tools")
import os
import numpy as np
import cPickle as pickle
import ProcDoc
from PLSA_class import pLSA
from Clustering import ClusterModel

np.random.seed(1337)
corpus = "TDT2"
doc_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"
cluster_dir = "Topic"
num_of_topic = 4
iterations = 20
doc = ProcDoc.readFile(doc_path)
doc_dict = ProcDoc.docPreproc(doc)

# general model: collection-wide word counts
collection = {}
for doc_ID, word_count in doc_dict.items():
    for word, count in word_count.items():
        if word in collection:
            collection[word] += count
        else:
            collection[word] = count

if not os.path.isfile(cluster_dir + "/pwz_list.pkl"):
    with open("exp/w_IDs.pkl", "wb") as wIDs_file:
        pickle.dump(collection.keys(), wIDs_file, True)
    cluster_mdl = ClusterModel(doc_dict, collection.keys(), num_of_topic)
    cluster_mdl.save(cluster_dir)
import os
import cPickle as Pickle
import numpy as np
import ProcDoc

data = {}               # content of each document (doc, content)
background_model = {}   # word counts over the 2265 documents (word, count)
general_model = {}
query = {}              # queries
vocabulary = np.zeros(51253)
#document_path = "../Corpus/Spoken_Doc"
document_path = "../Corpus/SPLIT_DOC_WDID_NEW"
query_path = "../Corpus/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
# read documents
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)
# HMM training set
HMMTraingSetDict = ProcDoc.read_relevance_dict()
query_relevance = {}
query = ProcDoc.read_file(query_path)
query = ProcDoc.query_preprocess(query)
query_wordcount = {}
for q, q_content in query.items():
    query_wordcount[q] = ProcDoc.word_count(q_content, {})
query_unigram = ProcDoc.unigram(query_wordcount)
path = CommonPath(is_training, is_short, is_spoken)
log_filename = path.getLogFilename()
qry_path = path.getQryPath()
doc_path = path.getDocPath()
rel_path = path.getRelPath()
dict_path = path.getDictPath()
bg_path = path.getBGPath()

print("Vector-Space-Model")
# read the relevance set for queries and documents
eval_mdl = Evaluate.EvaluateModel(rel_path, is_training)
rel_set = eval_mdl.getAset()
# preprocess queries and documents
qry_file = ProcDoc.readFile(qry_path)
doc_file = ProcDoc.readFile(doc_path)
# term frequency
qry_mdl_dict = ProcDoc.qryPreproc(qry_file, rel_set)
doc_mdl_dict = ProcDoc.docPreproc(doc_file)
# convert dictionaries to numpy arrays (feasible to compute with)
qry_mdl_np_, qry_IDs = ProcDoc.dict2npSparse(qry_mdl_dict)
doc_mdl_np_, doc_IDs = ProcDoc.dict2npSparse(doc_mdl_dict)
# TF-IDF
print("TF-IDF")
[qry_mdl_np, doc_mdl_np] = Statistical.TFIDF(qry_mdl_np_, doc_mdl_np_, {"qry": [3, 3], "doc": [3, 3]})
# Cosine Similarity (see the sketch below)
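# The cosine-similarity step itself is not shown above; a minimal sketch with
# numpy, assuming qry_mdl_np and doc_mdl_np behave as dense arrays after the
# TF-IDF step (if the rows were already L2-normalized, e.g. with
# Statistical.l2Normalize, the plain dot product alone would be the cosine):
import numpy as np
qry_norms = np.linalg.norm(qry_mdl_np, axis=1, keepdims=True)
doc_norms = np.linalg.norm(doc_mdl_np, axis=1, keepdims=True)
cos_sim = np.dot(qry_mdl_np, doc_mdl_np.T) / (qry_norms * doc_norms.T + 1e-12)
# rank documents for each query by descending similarity
rankings = {qry_IDs[i]: [doc_IDs[j] for j in np.argsort(-cos_sim[i])] for i in range(len(qry_IDs))}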
with open(model_path + "doc_list.pkl", "rb") as f:
    doc_list = Pickle.load(f)
with open(model_path + "query_list.pkl", "rb") as f:
    qry_list = Pickle.load(f)
with open(model_path + "test_query_list.pkl", "rb") as f:
    tstQry_list = Pickle.load(f)

wordModel = word2vec_model.word2vec_model()
wordVec = wordModel.getWord2Vec()
vocab_length = wordModel.vocabulary_length
print vocab_length

# documents
doc = ProcDoc.read_file(document_path)
doc = ProcDoc.doc_preprocess(doc)
#[docTmpList, docEmbList] = content2Emb(doc, wordVec, 100)
#doc_emb = rePermute(docTmpList, docEmbList, doc_list)
#doc_emb = content2List(doc, doc_list)
#doc_emb = np.asarray(doc_emb)
#print doc_emb.shape
#np.save(model_path + "doc_id_fix_pad.npy", doc_emb)

# training queries
query = ProcDoc.read_file(query_path)
query = ProcDoc.query_preprocess(query)
#[qryTmpList, qryEmbList] = content2Emb(query, wordVec, 100)
#qry_emb = rePermute(qryTmpList, qryEmbList, qry_list)
qry_emb = content2List(query, qry_list)
qry_emb = np.asarray(qry_emb)
type_feat = "sparse" # or embeddings query_path = None document_path = None QDrel_file_path = None corpus = "TDT2" # qry and doc if query_path == None: query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW" if document_path == None: document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW" if QDrel_file_path == None: QDrel_file_path = "../Significant-Words-Language-Models/train-qry-results-0.675969697596.txt" # relevancy set hmm_training_set = ProcDoc.readRELdict() # read document, reserve position doc = ProcDoc.readFile(document_path) doc = ProcDoc.docPreproc(doc, RES_POS) # read query, reserve position qry = ProcDoc.readFile(query_path) qry = ProcDoc.qryPreproc(qry, hmm_training_set, RES_POS) QDrel = RelPrep.readQDRel(QDrel_file_path) print len(qry), len(doc) print len(QDrel) NRMprep.getTrainAndValidation(qry, doc, QDrel, NUM_VOCAB, type_rank, type_feat) # (pointwise or pairwise) and (sparse or embeddings) # prepare data and label
import codecs

ID_map = {}

def ID2Word(proc_dict, ID_map):
    # map every token ID in each query/document back to its word
    for key, content in proc_dict.items():
        for i, ID in enumerate(content):
            content[i] = ID_map[ID]
    return proc_dict

# read the relevance set for queries and documents
eval_mdl = Evaluate.EvaluateModel(rel_path, is_training)
rel_set = eval_mdl.getAset()
# read queries and documents
qry_file = ProcDoc.readFile(qry_path)
doc_file = ProcDoc.readFile(doc_path)
# preprocess, preserving position information
qry_mdl_dict = ProcDoc.qryPreproc(qry_file, rel_set, True)
doc_mdl_dict = ProcDoc.docPreproc(doc_file, True)
# read the dictionary (ID, word)
with codecs.open(dict_path, 'r', encoding='utf-8') as rf:
    for idx, line in enumerate(rf.readlines()):
        info = line.split("\r\n")[0].split(" ")
        ID_map[idx] = info[-1]
qry_mdl_dict = ID2Word(qry_mdl_dict, ID_map)
doc_mdl_dict = ID2Word(doc_mdl_dict, ID_map)
import numpy as np
import ProcDoc
import cPickle as Pickle

corpus = "TDT2"
model_path = "../Corpus/model/" + corpus + "/UM/"
with open(model_path + "query_model.pkl", "rb") as f:
    qry_model = Pickle.load(f)
with open(model_path + "doc_model.pkl", "rb") as f:
    doc_model = Pickle.load(f)

qry_smooth_alpha = 0.
doc_smooth_alpha = 0.8
background_model = ProcDoc.read_background_dict()  # was read twice under two names
print background_model.shape
# Jelinek-Mercer smoothing with the collection (background) model
for idx, vec in enumerate(doc_model):
    doc_model[idx] = (1 - doc_smooth_alpha) * vec + doc_smooth_alpha * background_model
for idx, vec in enumerate(qry_model):
    qry_model[idx] = (1 - qry_smooth_alpha) * vec + qry_smooth_alpha * background_model
# query-likelihood score: sum_w P(w|Q) * log P(w|D)
LM_score = np.dot(qry_model, np.log(doc_model).T)
with open("LM_score.pkl", "wb") as f:
    Pickle.dump(LM_score, f, True)
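# Sketch of turning LM_score into a per-query document ranking, assuming ID-list
# pickles exist alongside the models (the file names follow the pattern used
# elsewhere in this repo but are an assumption for this directory):
with open(model_path + "query_list.pkl", "rb") as f:
    qry_IDs = Pickle.load(f)
with open(model_path + "doc_list.pkl", "rb") as f:
    doc_IDs = Pickle.load(f)
rankings = {}
for q_idx in range(LM_score.shape[0]):
    order = np.argsort(-LM_score[q_idx])  # higher log-likelihood first
    rankings[qry_IDs[q_idx]] = [doc_IDs[d_idx] for d_idx in order]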
# -*- coding: utf-8 -*-
import ProcDoc
from gensim import corpora, models, matutils
from sklearn.cluster import KMeans

documents = ProcDoc.read_doc()
documents = ProcDoc.doc_preprocess(documents)
# tokenize (no stop-word removal is applied here)
texts = [[word for word in document.lower().split()] for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
print "TFIDF:"
corpus_tfidf = matutils.corpus2csc(corpus_tfidf).transpose()
print corpus_tfidf
print "__________________________________________"

# cluster the TF-IDF vectors with k-means
num_of_clusters = 64
kmeans = KMeans(n_clusters=num_of_clusters)
doc_cluster = kmeans.fit_predict(corpus_tfidf)
clusters = [[] for i in range(num_of_clusters)]
for doc_index, cluster in enumerate(doc_cluster):
    clusters[cluster].append(doc_index)
import cPickle as Pickle
import plot_diagram
import ProcDoc

data = {}               # content of each document (doc, content)
background_model = {}   # word counts over the 2265 documents (word, count)
general_model = {}
query = {}              # queries
query_lambda = 0.4
doc_lambda = 0.8
document_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
query_path = "../Corpus/TDT2/QUERY_WDID_NEW_middle"
#query_path = "../Corpus/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
# document model
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)
doc_unigram = ProcDoc.unigram(doc_wordcount)
#word_idf = ProcDoc.inverse_document_frequency(doc_wordcount)
# background model
background_model = ProcDoc.read_background_dict()
# general model: collection-wide word counts
collection = {}
for key, value in doc_wordcount.items():
    for word, count in value.items():
        if word in collection:
            collection[word] += count
        else:
            collection[word] = count
import numpy as np
from collections import defaultdict
import ProcDoc
import Expansion
import timeit
import evaluate
import cPickle as Pickle

data = {}       # content of each document (doc, content)
query = {}      # queries
doc_freq = {}
document_path = "../../Corpus/TDT2/Spoken_Doc"
query_path = "../../Corpus/TDT2/QUERY_WDID_NEW"
#with open("HMMTraingSetDict.pkl", "rb") as file: HMMTraingSetDict = Pickle.load(file)
# document model
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)
total_docs = len(doc_wordcount.keys()) * 1.0
[doc_model, doc_freq] = ProcDoc.compute_TFIDF(doc_wordcount)
# query model
query = ProcDoc.read_file(query_path)
query = ProcDoc.query_preprocess(query)
query_wordcount = {}
for q_key, q_content in query.items():
    query_wordcount[q_key] = ProcDoc.word_count(q_content, {})
query_model = defaultdict(dict)
for q_key, word_count_dict in query_wordcount.items():
    max_freq = np.max(np.array(word_count_dict.values()), axis=0)
doc_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW" nn_method += ".h5" results_file += ".txt" rel_lambda = 0.5 dict_path = "../Corpus/TDT2/LDC_Lexicon.txt" bg_path = "../Corpus/background" # read relevant set for queries and documents eval_mdl = Evaluate.EvaluateModel(rel_path, is_training) rel_set = eval_mdl.getAset() alpha = 0.8 beta = 0.4 qry_file = ProcDoc.readFile(qry_path) doc_file = ProcDoc.readFile(doc_path) qry_mdl_dict = ProcDoc.qryPreproc(qry_file, rel_set) doc_mdl_dict = ProcDoc.docPreproc(doc_file) qry_unimdl_dict = ProcDoc.unigram(qry_mdl_dict) doc_unimdl_dict = ProcDoc.unigram(doc_mdl_dict) # origin query model qry_mdl_np, qry_IDs = ProcDoc.dict2npSparse(qry_unimdl_dict) # refine query model ref_qry_mdl_np, qry_IDs = ProcDoc.dict2npSparse(qry_unimdl_dict) doc_mdl_np, doc_IDs = ProcDoc.dict2npSparse(doc_unimdl_dict) NRM_mdl_np = nn_model.predict(nn_method, qry_mdl_np)
def embedded_query_expansion_ci(query_embedded, query_wordcount, collection, collection_total_similarity, word2vec, interpolated_alpha, m):
    # load the query model (note: embedded_query_expansion aliases the same dict)
    query_model = Pickle.load(open("model/query_model.pkl", "rb"))
    embedded_query_expansion = query_model
    update_embedded_query_expansion = {}
    if os.path.isfile("model/update_embedded_query_expansion_ci.pkl"):
        # load the cached expansion if the file exists
        update_embedded_query_expansion = Pickle.load(open("model/update_embedded_query_expansion_ci.pkl", "rb"))
    else:
        # calculate every query
        for query, query_word_count_dict in query_wordcount.items():
            top_prob_dict = {}
            # calculate every word in the collection
            for word in collection.keys():
                total_probability = collection_total_similarity[word]
                p_w_q = 0
                if not word in query_word_count_dict:
                    # p(w|q) by the total probability rule, taking the product
                    # over every query term (ci variant)
                    p_w_q = total_probability
                    for query_term in query_word_count_dict.keys():
                        if query_term in query_embedded:
                            cur_word_similarity = word2vec.getWordSimilarity(query_embedded[query_term], collection[word])
                            p_w_q *= (cur_word_similarity / total_probability)
                # store the probability
                top_prob_dict[word] = p_w_q
            # softmax
            top_prob_dict = ProcDoc.softmax(top_prob_dict)
            # sort top_prob_dict by value (probability)
            top_prob_list = sorted(top_prob_dict.items(), key=operator.itemgetter(1), reverse=True)
            update_embedded_query_expansion[query] = top_prob_list
        # save the updated expansion
        Pickle.dump(update_embedded_query_expansion, open("model/update_embedded_query_expansion_ci.pkl", "wb"), True)
    # update query model
    for update_query, update_query_word_list in update_embedded_query_expansion.items():
        filepath = "visual/" + update_query + "_ci.png"
        if not os.path.isfile(filepath):
            visualization.visualization(collection, update_query_word_list, filepath)
        for update_word, update_count in update_query_word_list[:m]:
            update = update_count
            origin = 0
            if update_word in query_model[update_query]:
                origin = query_model[update_query][update_word]
            query_model[update_query].pop(update_word, None)
            embedded_query_expansion[update_query][update_word] = interpolated_alpha * origin + (1 - interpolated_alpha) * update
        for un_changed_word in query_model[update_query].keys():
            embedded_query_expansion[update_query][un_changed_word] *= interpolated_alpha
        # softmax
        embedded_query_expansion[update_query] = ProcDoc.softmax(embedded_query_expansion[update_query])
    return embedded_query_expansion
def __init__(self, len_feats, type_rank, type_feat, query_path=None, document_path=None, corpus="TDT2"):
    #ranks = ["pointwise", "pairwise"]
    #feats = ["sparse", "emb"]
    res_pos = True
    self.num_vocab = 51253
    self.num_feats = len_feats
    self.type_rank = type_rank
    self.type_feat = type_feat
    # qry and doc
    if query_path is None:
        query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
    if document_path is None:
        document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"
    # relevance set
    self.hmm_training_set = ProcDoc.readRELdict()
    # read documents, preserving position information
    doc = ProcDoc.readFile(document_path)
    self.doc = ProcDoc.docPreproc(doc, res_pos)
    # read queries, preserving position information
    qry = ProcDoc.readFile(query_path)
    self.qry = ProcDoc.qryPreproc(qry, self.hmm_training_set, res_pos)
    # generate homogeneous features
    self.input_feats = self.__genFeature(self.num_feats)

def genTrainValidSet(self, percent=None, isTest=False):
    print "generate training set and validation set"
    if percent is None:
        percent = 80
background_model = {}   # word counts over the 2265 documents (word, count)
general_model = {}
query = {}              # queries
query_lambda = 0
doc_lambda = 0.9
#remove_list = ["update_embedded_query_expansion_ci.pkl", "update_embedded_query_expansion_qi.pkl", "collection_embedded.pkl", "query_embedded.pkl", "collection_total_similarity.pkl"]
remove_list = []
document_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
query_path = "../Corpus/TDT2/QUERY_WDID_NEW_middle"
word_emb_path = "data/word2vec_dict.pkl"
relevance_path = "../Corpus/TDT2/AssessmentTrainSet/AssessmentTrainSet.txt"
# document model
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)
doc_unigram = ProcDoc.unigram(dict(doc_wordcount))
doc_mdl, doc_IDs = ProcDoc.dict2np(doc_unigram)
# background model
background_model = ProcDoc.read_background_dict()
background_model_np = ProcDoc.read_background_np()
# document smoothing with the background model
for doc_idx in xrange(doc_mdl.shape[0]):
    doc_vec = doc_mdl[doc_idx]
    doc_mdl[doc_idx] = (1 - doc_lambda) * doc_vec + doc_lambda * background_model_np
# general model: collection-wide word counts
collection = {}