def __init__(self, num_of_homo_feats=10, max_qry_length=1794, max_doc_length=2907, query_path=None, document_path=None, corpus="TDT2"):
    """Load queries and documents (positions reserved) and build homogeneous features.

    Args:
        num_of_homo_feats: number of homogeneous features to generate.
        max_qry_length: maximum query length (in tokens).
        max_doc_length: maximum document length (in tokens).
        query_path: path to the query file; defaults to the corpus training queries.
        document_path: path to the document file; defaults to the corpus split docs.
        corpus: corpus name used to build the default paths.
    """
    res_pos = True  # reserve word-position information during preprocessing
    self.num_vocab = 51253  # fixed vocabulary size for this corpus
    self.max_qry_length = max_qry_length
    self.max_doc_length = max_doc_length
    self.num_of_homo_feats = num_of_homo_feats
    # PEP 8: compare to None with `is`, not `==`
    if query_path is None:
        query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
    if document_path is None:
        document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"
    # relevance set
    self.hmm_training_set = ProcDoc.readRELdict()
    # read document, reserve position (truncate/limit 200 — see ProcDoc.docPreproc)
    doc = ProcDoc.readFile(document_path)
    self.doc = ProcDoc.docPreproc(doc, res_pos, 200)
    # read query, reserve position
    qry = ProcDoc.readFile(query_path)
    self.qry = ProcDoc.qryPreproc(qry, self.hmm_training_set, res_pos, 200)
    # generate homogeneous features
    self.homo_feats = self.__genFeature(num_of_homo_feats)
def __init__(self, qry_path=None, rel_path=None, isTraining=True, doc_path=None):
    """Build TF-IDF query/document models plus the evaluation helper.

    Args:
        qry_path: query file path; defaults to the TDT2 training queries.
        rel_path: relevance-judgement path; defaults to the HMM training set.
        isTraining: whether to load the training split of the relevance set.
        doc_path: document file path; defaults to the TDT2 split documents.
    """
    # default training step — PEP 8: compare to None with `is`, not `==`
    if qry_path is None:
        qry_path = "../Corpus/TDT2/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
    if doc_path is None:
        doc_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
    if rel_path is None:
        rel_path = "../Corpus/TDT2/Train/QDRelevanceTDT2_forHMMOutSideTrain"
    self.vocab_size = 51253  # fixed vocabulary size for this corpus
    # relevance set
    self.rel_set = ProcDoc.readRELdict(rel_path, isTraining)
    self.evaluate_model = EvaluateModel(rel_path, isTraining)
    # read documents
    doc = ProcDoc.readFile(doc_path)
    self.doc = ProcDoc.docPreproc(doc)
    self.doc_len = Statistical.compLenAcc(self.doc)
    # read queries (only those appearing in the relevance set)
    qry = ProcDoc.readFile(qry_path)
    self.qry_tf = ProcDoc.qryPreproc(qry, self.rel_set)
    self.qry_len = Statistical.compLenAcc(self.qry_tf)
    # weight raw term frequencies by TF-IDF
    [self.qry, self.doc] = Statistical.TFIDF(self.qry_tf, self.doc, self.qry_len, self.doc_len)
    # dict to numpy; queries reuse the TF index order so rows align
    self.qry_tf, self.qry_tf_IDs = self.__dict2np(self.qry_tf)
    self.qry, self.qry_IDs = self.__dict2np(self.qry, self.qry_tf_IDs)
    self.doc, self.doc_IDs = self.__dict2np(self.doc)
    # precompute len(document): L2-normalize document vectors once up front
    self.doc = Statistical.l2Normalize(self.doc)
sys.path.append("../Tools")
import numpy as np
import cPickle as pickle
import ProcDoc
from PLSA_class import pLSA
from Clustering import ClusterModel

np.random.seed(1337)  # reproducible clustering initialisation

corpus = "TDT2"
doc_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"
cluster_dir = "Topic"
num_of_topic = 4
iterations = 20

# Read and preprocess the document collection.
doc = ProcDoc.readFile(doc_path)
doc_dict = ProcDoc.docPreproc(doc)

# General (background) model: accumulate corpus-wide term counts.
# dict.get avoids the membership-test-then-index double lookup.
collection = {}
for doc_ID, word_count in doc_dict.items():
    for word, count in word_count.items():
        collection[word] = collection.get(word, 0) + count

# NOTE(review): `os` is used below but not imported in this chunk —
# confirm it is imported earlier in the file.
if not os.path.isfile(cluster_dir + "/pwz_list.pkl"):
    # Persist the vocabulary (word IDs) before clustering.
    with open("exp/w_IDs.pkl", "wb") as wIDs_file:
        pickle.dump(collection.keys(), wIDs_file, True)
    cluster_mdl = ClusterModel(doc_dict, collection.keys(), num_of_topic)
    cluster_mdl.save(cluster_dir)
# Vector-Space-Model retrieval pipeline: TF -> TF-IDF -> L2 norm -> cosine.
dict_path = path.getDictPath()
bg_path = path.getBGPath()
print("Vector-Space-Model")
# read relevant set for queries and documents
eval_mdl = Evaluate.EvaluateModel(rel_path, is_training)
rel_set = eval_mdl.getAset()
# Preprocess for queries and documents
qry_file = ProcDoc.readFile(qry_path)
doc_file = ProcDoc.readFile(doc_path)
# Term Frequency (queries restricted to the relevance set)
qry_mdl_dict = ProcDoc.qryPreproc(qry_file, rel_set)
doc_mdl_dict = ProcDoc.docPreproc(doc_file)
# Convert dictionary to numpy array (feasible to compute)
qry_mdl_np_, qry_IDs = ProcDoc.dict2npSparse(qry_mdl_dict)
doc_mdl_np_, doc_IDs = ProcDoc.dict2npSparse(doc_mdl_dict)
# TF-IDF
print("TF-IDF")
# presumably [3, 3] selects the TF/IDF weighting variants per side — TODO confirm against Statistical.TFIDF
[qry_mdl_np, doc_mdl_np] = Statistical.TFIDF(qry_mdl_np_, doc_mdl_np_, {"qry":[3, 3], "doc": [3, 3]})
# Cosine Similarity
# L2-normalize so a dot product below equals cosine similarity
qry_mdl_np = Statistical.l2Norm(qry_mdl_np)
doc_mdl_np = Statistical.l2Norm(doc_mdl_np)
def retrieval(qry_mdl, doc_mdl):
corpus = "TDT2" # qry and doc if query_path == None: query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW" if document_path == None: document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW" if QDrel_file_path == None: QDrel_file_path = "../Significant-Words-Language-Models/train-qry-results-0.675969697596.txt" # relevancy set hmm_training_set = ProcDoc.readRELdict() # read document, reserve position doc = ProcDoc.readFile(document_path) doc = ProcDoc.docPreproc(doc, RES_POS) # read query, reserve position qry = ProcDoc.readFile(query_path) qry = ProcDoc.qryPreproc(qry, hmm_training_set, RES_POS) QDrel = RelPrep.readQDRel(QDrel_file_path) print len(qry), len(doc) print len(QDrel) NRMprep.getTrainAndValidation(qry, doc, QDrel, NUM_VOCAB, type_rank, type_feat) # (pointwise or pairwise) and (sparse or embeddings) # prepare data and label # NRMPrep.getTrainAndValidation(qry, doc, type_rank, type_feat, percent) # return train.data, train.label, val.data, val.label # create model
# Tail of the enclosing __init__ (its `def` line is outside this chunk).
self.num_vocab = 51253  # fixed vocabulary size for this corpus
self.num_feats = len_feats
self.type_rank = type_rank  # pointwise or pairwise — see genTrainValidSet callers
self.type_feat = type_feat  # sparse or embeddings
# qry and doc
if query_path == None:
    query_path = "../Corpus/" + corpus + "/Train/XinTrainQryTDT2/QUERY_WDID_NEW"
if document_path == None:
    document_path = "../Corpus/" + corpus + "/SPLIT_DOC_WDID_NEW"
# relevancy set
self.hmm_training_set = ProcDoc.readRELdict()
# read document, reserve position
doc = ProcDoc.readFile(document_path)
self.doc = ProcDoc.docPreproc(doc, res_pos)
# read query, reserve position
qry = ProcDoc.readFile(query_path)
self.qry = ProcDoc.qryPreproc(qry, self.hmm_training_set, res_pos)
# generate homogeneous features
self.input_feats = self.__genFeature(self.num_feats)

def genTrainValidSet(self, percent = None, isTest = False):
    # Split queries/documents into training and validation sets.
    # percent: share (in %) used for training; defaults to 80.
    # NOTE: method body continues beyond this chunk — incomplete here.
    print "generate training set and validation set"
    if percent == None:
        percent = 80
    qry = self.qry
    doc = self.doc
    total_qry = len(qry.keys())
    total_doc = len(doc.keys())
# Tail return of a function whose definition is outside this chunk.
return proc_dict, att_dict

# read relevant set for queries and documents
print(rel_path, is_training)
eval_mdl = Evaluate.EvaluateModel(rel_path, is_training)
rel_set = eval_mdl.getAset()
print(qry_path, doc_path)
# read queries and documents
qry_file = ProcDoc.readFile(qry_path)
doc_file = ProcDoc.readFile(doc_path)
# preprocess + reserve position information
qry_mdl_dict = ProcDoc.qryPreproc(qry_file, rel_set, True, True)
doc_mdl_dict = ProcDoc.docPreproc(doc_file, True, True)
# bag of words (no position information)
qry_bow_dict = ProcDoc.qryPreproc(qry_file, rel_set)
doc_bow_dict = ProcDoc.docPreproc(doc_file)
# attribute selection by index in att_types — presumably 0 = unigram, 1 = binary occurrence; confirm against att_types definition
if att_types.index(att) == 0:
    # unigram language model
    qry_att_dict = ProcDoc.unigram(qry_bow_dict)
    doc_att_dict = ProcDoc.unigram(doc_bow_dict)
elif att_types.index(att) == 1:
    # binarise query term weights to the string "1.0"
    qry_att_dict = qry_bow_dict
    for q_key, q_cont in qry_att_dict.items():
        for q_w, q_w_uni in q_cont.items():
            qry_att_dict[q_key][q_w] = "1.0"
# NOTE: branch continues beyond this chunk — incomplete here.
if is_spoken: