def __init__(self, num_of_homo_feats=10, max_qry_length=1794, max_doc_length=2907,
             query_path=None, document_path=None, corpus="TDT2"):
    res_pos = True     # preserve word positions during preprocessing
    str2int = True     # convert word IDs from strings to integers
    self.num_vocab = 51253
    self.max_qry_length = max_qry_length
    self.max_doc_length = max_doc_length
    self.num_of_homo_feats = num_of_homo_feats
    if query_path is None:
        query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
    if document_path is None:
        document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"
    # read documents, preserving word positions
    doc = ProcDoc.read_file(document_path)
    self.doc = ProcDoc.doc_preprocess(doc, res_pos, str2int)
    # read queries, preserving word positions
    qry = ProcDoc.read_file(query_path)
    self.qry = ProcDoc.query_preprocess(qry, res_pos, str2int)
    # HMM training set (query-document relevance judgments)
    self.hmm_training_set = ProcDoc.read_relevance_dict()
    self.homo_feats = self.__genFeature(num_of_homo_feats)
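# A minimal usage sketch for the constructor above, kept as a comment since the
# class body continues below. The enclosing class name (DataReader) is a
# placeholder -- this snippet only shows __init__:
#
#   reader = DataReader(num_of_homo_feats=10, corpus="TDT2")
#   print len(reader.doc), len(reader.qry)   # preprocessed documents and queries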
from collections import defaultdict

import ProcDoc
import PLSA


def run():
    INIT_PROBABILITY = 1.0 / 60
    topic_word_prob_dict = ProcDoc.read_clusters()    # P(W|T), {T: {W: prob}}
    doc_topic_prob_dict = defaultdict(dict)           # P(T|D), {D: {T: prob}}
    doc_word_topic_prob_dict = defaultdict(dict)      # P(T|w,D), {D: {W: {T: prob}}}
    doc_wc_dict = ProcDoc.read_doc_dict()             # {doc name: doc content}
    doc_wc_dict = ProcDoc.doc_preprocess(doc_wc_dict)

    # convert {doc name: doc content} to {doc name: {word: count}}
    for docName, content in doc_wc_dict.items():
        doc_wc_dict[docName] = ProcDoc.word_count(content, {})

    # initialize P(T|D) uniformly over all topics
    print "Initialize P(T|D)"
    for docName, wordCount in doc_wc_dict.items():
        for topic, wordProb in topic_word_prob_dict.items():
            doc_topic_prob_dict[docName][topic] = INIT_PROBABILITY
    '''
    print "Initialize P(T| w, D)"
    for docName, wordCount in doc_wc_dict.items():
        word_list = {}
        for word, frequency in wordCount.items():
            topic_prob = {}
            for topic, wordProb in topic_word_prob_dict.items():
                topic_prob[topic] = 0.0
            word_list[word] = topic_prob
        doc_word_topic_prob_dict[docName] = word_list
    '''
    print "start PLSA"
    [topic_word_prob_dict, doc_topic_prob_dict] = PLSA.Probability_LSA(
        doc_wc_dict, doc_topic_prob_dict, topic_word_prob_dict, doc_word_topic_prob_dict)
    print "end PLSA"

    # fold the topics back into one word distribution per document:
    # P(W|D) = sum over T of P(W|T) * P(T|D)
    p_plsa = {}    # {D: {W: prob}}
    for doc, topic_prob_list in doc_topic_prob_dict.items():
        p_plsa_word = {}
        for topic, doc_prob in topic_prob_list.items():
            for word, word_prob in topic_word_prob_dict[topic].items():
                if word in p_plsa_word:
                    p_plsa_word[word] += word_prob * doc_prob
                else:
                    p_plsa_word[word] = word_prob * doc_prob
        p_plsa[doc] = p_plsa_word
    return p_plsa
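# A tiny self-contained check of the fold-in step above, with made-up numbers
# (the dictionaries mirror the shapes used in run(); the values are illustrative):
if __name__ == "__main__":
    topic_word = {"t1": {"w": 0.5}, "t2": {"w": 0.1}}   # P(W|T)
    doc_topic = {"d1": {"t1": 0.8, "t2": 0.2}}          # P(T|D)
    p_w_d = sum(topic_word[t]["w"] * p for t, p in doc_topic["d1"].items())
    print p_w_d    # 0.5 * 0.8 + 0.1 * 0.2 = 0.42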
import os

import numpy as np

import ProcDoc

data = {}                  # document contents {doc name: content}
background_model = {}      # background word counts over the 2265 documents {word: count}
general_model = {}
query = {}                 # query contents {query name: content}
vocabulary = np.zeros(51253)

#document_path = "../Corpus/Spoken_Doc"
document_path = "../Corpus/SPLIT_DOC_WDID_NEW"
query_path = "../Corpus/Train/XinTrainQryTDT2/QUERY_WDID_NEW"

# read and preprocess documents
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)

# HMM training set (query-document relevance judgments)
HMMTraingSetDict = ProcDoc.read_relevance_dict()
query_relevance = {}

# read queries and build a word count per query
query = ProcDoc.read_file(query_path)
query = ProcDoc.query_preprocess(query)
query_wordcount = {}
for q, q_content in query.items():
    query_wordcount[q] = ProcDoc.word_count(q_content, {})
query_unigram = ProcDoc.unigram(query_wordcount)
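# For reference, a maximum-likelihood unigram model of the kind ProcDoc.unigram
# presumably returns: P(w|Q) = count(w, Q) / |Q|. This helper is a hedged sketch,
# not ProcDoc's actual implementation.
def unigram_sketch(wordcount_dict):
    unigram = {}
    for name, wordcount in wordcount_dict.items():
        total = float(sum(wordcount.values()))
        unigram[name] = dict((w, c / total) for w, c in wordcount.items())
    return unigram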
import cPickle as Pickle    # assumption: "Pickle" below is cPickle under this alias
import numpy as np

import ProcDoc
import word2vec_model

# model_path, document_path and query_path are defined earlier in this file
with open(model_path + "doc_list.pkl", "rb") as f:
    doc_list = Pickle.load(f)
with open(model_path + "query_list.pkl", "rb") as f:
    qry_list = Pickle.load(f)
with open(model_path + "test_query_list.pkl", "rb") as f:
    tstQry_list = Pickle.load(f)

wordModel = word2vec_model.word2vec_model()
wordVec = wordModel.getWord2Vec()
vocab_length = wordModel.vocabulary_length
print vocab_length

# documents
doc = ProcDoc.read_file(document_path)
doc = ProcDoc.doc_preprocess(doc)
#[docTmpList, docEmbList] = content2Emb(doc, wordVec, 100)
#doc_emb = rePermute(docTmpList, docEmbList, doc_list)
#doc_emb = content2List(doc, doc_list)
#doc_emb = np.asarray(doc_emb)
#print doc_emb.shape
#np.save(model_path + "doc_id_fix_pad.npy", doc_emb)

# training queries
query = ProcDoc.read_file(query_path)
query = ProcDoc.query_preprocess(query)
#[qryTmpList, qryEmbList] = content2Emb(query, wordVec, 100)
#qry_emb = rePermute(qryTmpList, qryEmbList, qry_list)
qry_emb = content2List(query, qry_list)
qry_emb = np.asarray(qry_emb)
print qry_emb.shape
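# content2List is called above but not defined in this snippet. A plausible
# sketch, assuming it maps each name in the pickled list to that item's word-ID
# sequence, zero-padded/truncated to a fixed length (the name, parameters, and
# details here are guesses, not the original implementation):
def content2List_sketch(content_dict, name_list, fix_length=100):
    result = []
    for name in name_list:
        ids = [int(t) for t in content_dict[name].split()]
        # truncate to fix_length, then pad with zeros up to fix_length
        ids = ids[:fix_length] + [0] * max(0, fix_length - len(ids))
        result.append(ids)
    return result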
# -*- coding: utf-8 -*-
import ProcDoc
from gensim import corpora, models, matutils
from sklearn.cluster import KMeans

documents = ProcDoc.read_doc()
documents = ProcDoc.doc_preprocess(documents)

# tokenize each document (the corpus is word-ID encoded, so a plain split suffices)
texts = [document.lower().split() for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# weight the bag-of-words corpus by TF-IDF and convert it to a sparse matrix
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
print "TFIDF:"
corpus_tfidf = matutils.corpus2csc(corpus_tfidf).transpose()
print corpus_tfidf
print "__________________________________________"

# cluster the documents into 64 groups with k-means
num_of_clusters = 64
kmeans = KMeans(n_clusters=num_of_clusters)
doc_cluster = kmeans.fit_predict(corpus_tfidf)

# invert the assignment: clusters[c] holds the indices of the documents in cluster c
clusters = [[] for i in range(num_of_clusters)]
for doc_index, cluster in enumerate(doc_cluster):
    clusters[cluster].append(doc_index)
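# The cluster assignments can be turned into topic-word distributions P(W|T),
# the shape the PLSA script reads via ProcDoc.read_clusters(). A hedged sketch
# (unsmoothed MLE, illustrative only): pool each cluster's word counts and
# normalize; "texts" is reused from above.
topic_word_prob = {}
for topic, doc_indices in enumerate(clusters):
    counts = {}
    for i in doc_indices:
        for token in texts[i]:
            counts[token] = counts.get(token, 0) + 1
    total = float(sum(counts.values()))
    if total > 0:
        topic_word_prob[topic] = dict((w, c / total) for w, c in counts.items())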
# check whether the file exists before reading it
if os.path.isfile(query_item_path):
    with open(query_item_path, 'r') as f:
        # read the content of the query document (doc, content)
        query[query_item] = f.read()

# accumulate background word counts over all documents and queries
for key, value in data.items():
    background_word = ProcDoc.word_count(value, dict(background_word))
for key, value in query.items():
    background_word = ProcDoc.word_count(value, dict(background_word))
background_word_sum = ProcDoc.word_sum(background_word)

# document preprocessing
data = ProcDoc.doc_preprocess(data)

# query preprocessing
query = ProcDoc.query_preprocess(query)
query_word_count = {}
for q, q_content in query.items():
    query_word_count[q] = ProcDoc.word_count(q_content, {})

# evaluation: sweep the interpolation weight lambda over [0, 1)
assessment = readAssessment.get_assessment()
lambda_test = {0: 0}    # lambda values that have already been evaluated
interval = 0.1
isBreak = "run"
while isBreak != "exit":
    for my_lambda in np.arange(0, 1, interval):
        if my_lambda in lambda_test:
            continue
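# The sweep above tunes a Jelinek-Mercer-style interpolation between a document
# language model and the background model; per word, the smoothed probability
# being tuned is, in effect (a sketch of the formula, not code from this file):
#
#   P(w|D) = my_lambda * P_ml(w|D) + (1 - my_lambda) * background_word[w] / background_word_sum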
        # read the content of the query document (doc, content)
        query[query_item] = f.read()

# accumulate background word counts over all documents and queries
for key, value in data.items():
    background_word = ProcDoc.word_count(value, dict(background_word))
for key, value in query.items():
    background_word = ProcDoc.word_count(value, dict(background_word))
background_word_sum = ProcDoc.word_sum(background_word)
#background_word = sorted(background_word.items(), key=lambda x: x[1], reverse=True)
#print(background_word)

# document preprocessing (also returns per-document term frequencies)
[data_tf, data] = ProcDoc.doc_preprocess(data)

# query preprocessing
query = ProcDoc.query_preprocess(query)
query_word_count = {}
for q, q_content in query.items():
    query_word_count[q] = ProcDoc.word_count(q_content, {})
feedback_model = []

# evaluation: sweep the interpolation weight lambda over [0, 1)
assessment = readAssessment.get_assessment()
lambda_test = {0: 0}    # lambda values that have already been evaluated
interval = 0.1
isBreak = "run"
while isBreak != "exit":
    isWrite = True