@classmethod
def get_model(cls, data_store=None, regen=False, num_topics=None, save=True):
    md_file_path = get_md_path()
    dict_file_path = get_dict_path()
    simmx_file_path = get_simmx_path()
    num_topics_file_path = get_num_topic_path()
    if not os.path.isfile(md_file_path) or not \
            os.path.isfile(dict_file_path) or not \
            os.path.isfile(simmx_file_path) or not \
            os.path.isfile(num_topics_file_path) or regen:
        engine_logger.info("Generating LDA models.")
        dictionary, corpus = docs_to_corpus(data_store.doc_set, rm_stop_words=True)
        # the LDA model is trained on all the docs
        model = models.ldamodel.LdaModel(corpus, num_topics=num_topics,
                                         id2word=dictionary)
        sim_matrix = similarities.MatrixSimilarity(model[corpus])
        if save:
            dictionary.save_as_text(dict_file_path)
            model.save(md_file_path)
            sim_matrix.save(simmx_file_path)
            write_num_topics(num_topics_file_path, num_topics)
    else:
        engine_logger.info("Loading existing LDA models.")
        dictionary = corpora.Dictionary.load_from_text(dict_file_path)
        # load with the same classes used to save above:
        # LdaModel and MatrixSimilarity, not TfidfModel/SparseMatrixSimilarity
        model = models.ldamodel.LdaModel.load(md_file_path)
        sim_matrix = similarities.MatrixSimilarity.load(simmx_file_path)
        num_topics = read_num_topics(num_topics_file_path)
    return LdaModelStruct(model=model, dictionary=dictionary,
                          sim_matrix=sim_matrix, num_topics=num_topics)
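# A minimal query sketch against the LDA index built above, assuming the same
# preprocessing as docs_to_corpus; `tokenize` is a hypothetical stand-in for
# whatever tokenizer that helper uses.
def query_lda(model_struct, raw_query, limit=5):
    bow = model_struct.dictionary.doc2bow(tokenize(raw_query))  # hypothetical tokenize
    topic_vec = model_struct.model[bow]        # project the query into topic space
    sims = model_struct.sim_matrix[topic_vec]  # similarity against every doc
    return sorted(enumerate(sims), key=lambda t: -t[1])[:limit]  # [(doc_id, score), ...]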
def cross_validate(self):
    iter_num = 0
    train_test_data_tuples = split_raw_data_k_fold(self.raw_data, self.num_folds)
    for train_data_store, train_data, test_data in train_test_data_tuples:
        engine_logger.info("Cross validation iter: %d" % iter_num)
        eval_model = self._get_eval_model(train_data_store, train_data, test_data)
        eval_model.run_eval()
        train_accuracy, train_relevance, test_accuracy, test_relevance = \
            eval_model.report_metrics()
        print train_accuracy, train_relevance, test_accuracy, test_relevance
        self.each_run_train_accuracy.append(train_accuracy)
        self.each_run_train_relevance.append(train_relevance)
        self.each_run_test_accuracy.append(test_accuracy)
        self.each_run_test_relevance.append(test_relevance)
        if iter_num == 0 and options.write_output:
            eval_model.write_output()
        iter_num += 1
    self.avg_train_accuracy = sum(self.each_run_train_accuracy) / float(self.num_folds)
    self.avg_train_relevance = sum(self.each_run_train_relevance) / float(self.num_folds)
    self.avg_test_accuracy = sum(self.each_run_test_accuracy) / float(self.num_folds)
    self.avg_test_relevance = sum(self.each_run_test_relevance) / float(self.num_folds)
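# split_raw_data_k_fold is defined elsewhere; a plausible sketch of its
# contract, under the assumption that it yields num_folds
# (train_data_store, train_data, test_data) tuples, each holding out one
# contiguous 1/k slice as test data. `DataStore` is a hypothetical name for
# the data-store class whose constructor appears later in this module.
def split_raw_data_k_fold_sketch(raw_data, num_folds):
    fold_size = len(raw_data) // num_folds
    for i in range(num_folds):
        test_data = raw_data[i * fold_size:(i + 1) * fold_size]
        train_data = raw_data[:i * fold_size] + raw_data[(i + 1) * fold_size:]
        yield DataStore(train_data), train_data, test_data  # hypothetical DataStore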
def write_output(self):
    show_topic_words = isinstance(self.model, TopicWordLookupModelStruct)
    engine_logger.info("Writing output")
    with open('cv_test.log', 'w') as f:
        f.write("%f, %f\n" % (self.test_data_set.accuracy,
                              self.test_data_set.avg_relevance_score))
        for idx in range(len(self.test_data_set.questions)):
            f.write("Question: %s\n" %
                    self.test_data_set.questions[idx].encode('utf-8'))
            if show_topic_words:
                f.write("Question topic words: %s\n" %
                        self.test_data_set.question_topic_words[idx])
            f.write("Correct answer: %s\n" %
                    self.test_data_set.top_answers[idx].encode('utf-8'))
            if show_topic_words:
                f.write("Answer topic words: %s\n" %
                        self.test_data_set.answer_topic_words[idx])
            f.write("Retrieved question: %s\n" %
                    self.test_data_set.retrieved_questions[idx].encode('utf-8'))
            f.write("Retrieved answer: %s\n" %
                    self.test_data_set.retrieved_answers[idx].encode('utf-8'))
            f.write("Label: %s, relevance score: %f\n" %
                    (self.test_data_set.judgement_labels[idx],
                     self.test_data_set.relevance_scores[idx]))
            f.write("==================================================================\n")
    with open('cv_train.log', 'w') as f:
        f.write(">>>>>>>> Training data\n")
        for idx in range(len(self.train_data_set.questions)):
            f.write("Question: %s\n" %
                    self.train_data_set.questions[idx].encode('utf-8'))
            if show_topic_words:
                f.write("Question topic words: %s\n" %
                        self.train_data_set.question_topic_words[idx])
            f.write("Answer: %s\n" %
                    self.train_data_set.top_answers[idx].encode('utf-8'))
            if show_topic_words:
                f.write("Answer topic words: %s\n" %
                        self.train_data_set.answer_topic_words[idx])
            f.write("==================================================================\n")
def train_model(self):
    bin_path = get_bin_path()
    train_data_path = get_train_data_path()
    model_path = get_md_path()
    if self.svm_kernel_type == self.RANK_SVM_KERNEL_TYPE_LINEAR:
        subprocess.call([bin_path, '-c', str(self.c),
                         train_data_path, model_path])
        engine_logger.info("Finished training linear SVM model")
    elif self.svm_kernel_type == self.RANK_SVM_KERNEL_TYPE_RBF:
        subprocess.call([bin_path, '-c', str(self.c), '-t', '2',
                         '-g', str(self.gamma), train_data_path, model_path])
        engine_logger.info("Finished training RBF kernel based SVM model")
def write_training_data(self):
    query_id = self.query_id_offset
    engine_logger.info("Writing training data. Num of queries: %s" %
                       len(self.rank_data))
    file_path = "%s.offset%s" % (get_train_data_path(), self.query_id_offset)
    with open(file_path, 'w') as f:
        for question, pairs in self.rank_data:
            engine_logger.debug("Writing for query %s" % query_id)
            f.write("# query %s\n" % query_id)
            qa_pairs = [t[0] for t in pairs]
            features = self.matcher.match(question, qa_pairs)
            for idx, feature in enumerate(features):
                labeled_score = pairs[idx][1]
                f.write("%s qid:%s %s\n" % (labeled_score, query_id, str(feature)))
            query_id += 1
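# For reference, the file written above follows the SVMlight/SVMrank training
# format, one candidate per line (values below are illustrative only):
#
#   # query 1
#   3 qid:1 1:0.53 2:0.12 3:0.87
#   1 qid:1 1:0.13 2:0.44 3:0.09
#
# Candidates sharing a qid are compared pairwise during training, so
# str(feature) above must already render as "1:v1 2:v2 ..." pairs.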
@classmethod
def get_new_model(cls, n_estimators, train_data, train_labels,
                  cross_validation_folds, save=True):
    engine_logger.info("Training new calibrated boosted decision stumps")
    md_file_path = get_md_path()
    # boosted decision stumps: AdaBoost over depth-1 trees
    boosted_decision_stumps = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=1), n_estimators=n_estimators)
    # wrap with sigmoid (Platt) calibration so scores become probabilities
    calibrated_bds = CalibratedClassifierCV(boosted_decision_stumps,
                                            method='sigmoid',
                                            cv=cross_validation_folds)
    calibrated_bds.fit(train_data, train_labels)
    if save:
        joblib.dump(calibrated_bds, md_file_path)
    return calibrated_bds
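# Usage sketch with toy data; the class name is hypothetical, and the shapes
# (2-D feature rows, binary labels) are assumptions. predict_proba returns
# calibrated probabilities rather than raw AdaBoost decision scores.
import numpy as np

X = np.array([[0.1, 0.9], [0.8, 0.2], [0.2, 0.8], [0.9, 0.1],
              [0.3, 0.7], [0.7, 0.3], [0.4, 0.6], [0.6, 0.4]])
y = np.array([1, 0, 1, 0, 1, 0, 1, 0])
clf = CalibratedBoostedDecisionStumps.get_new_model(  # hypothetical class name
    n_estimators=50, train_data=X, train_labels=y,
    cross_validation_folds=2, save=False)
print(clf.predict_proba(X)[:, 1])  # calibrated P(label == 1)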
@classmethod
def get_model(cls, data_store=None, regen=False, save=True):
    md_file_path = get_md_path()
    dict_file_path = get_dict_path()
    q_simmx_file_path = get_q_simmx_path()
    a_simmx_file_path = get_a_simmx_path()
    if not os.path.isfile(md_file_path) or not \
            os.path.isfile(dict_file_path) or not \
            os.path.isfile(q_simmx_file_path) or not \
            os.path.isfile(a_simmx_file_path) or regen:
        engine_logger.info("Generating TF_IDF models.")
        dictionary, corpus = docs_to_corpus(data_store.doc_set)
        # the vocabulary and tf-idf weights are computed on the whole doc space,
        # while query similarities are executed on separate spaces,
        # i.e. the question space and the answer space
        model = models.TfidfModel(corpus)
        # extract question corpus
        question_corpus = [corpus[qid] for qid in data_store.question_set]
        question_sim_matrix = similarities.SparseMatrixSimilarity(
            model[question_corpus], num_features=len(dictionary))
        # extract answer corpus
        answer_corpus = [corpus[aid] for aid in data_store.answer_set]
        answer_sim_matrix = similarities.SparseMatrixSimilarity(
            model[answer_corpus], num_features=len(dictionary))
        if save:
            dictionary.save_as_text(dict_file_path)
            model.save(md_file_path)
            question_sim_matrix.save(q_simmx_file_path)
            answer_sim_matrix.save(a_simmx_file_path)
    else:
        engine_logger.info("Loading existing TF_IDF models.")
        dictionary = corpora.Dictionary.load_from_text(dict_file_path)
        model = models.TfidfModel.load(md_file_path)
        question_sim_matrix = similarities.SparseMatrixSimilarity.load(q_simmx_file_path)
        answer_sim_matrix = similarities.SparseMatrixSimilarity.load(a_simmx_file_path)
    return TfIdfModelStruct(model=model, dictionary=dictionary,
                            question_sim_matrix=question_sim_matrix,
                            answer_sim_matrix=answer_sim_matrix)
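# Query sketch against the question space. Note that the similarity indices
# are positions in question_corpus, so they map back to doc ids through
# data_store.question_set[idx], not directly to corpus ids. `tokenize` is
# again a hypothetical stand-in for the docs_to_corpus tokenizer.
def query_questions(model_struct, data_store, raw_query, limit=5):
    bow = model_struct.dictionary.doc2bow(tokenize(raw_query))
    sims = model_struct.question_sim_matrix[model_struct.model[bow]]
    ranked = sorted(enumerate(sims), key=lambda t: -t[1])[:limit]
    return [(data_store.question_set[idx], score) for idx, score in ranked]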
def load_models(self):
    """Load all the models from files."""
    self.tfidf_model = TfIdfModelStruct.get_model()
    self.lda_model = LdaModelStruct.get_model()
    self.topic_word_lookup_model = TopicWordLookupModelStruct.get_model()
    self.word2vec_model = Word2VecModel()
    self.matcher = Matcher(self.tfidf_model, self.lda_model,
                           self.topic_word_lookup_model, self.word2vec_model)
    self.rank_model = read_rank_model_from_file()
    engine_logger.info("Rank based query engine is up")
    self.is_up = True
def read_rank_model_from_file():
    md_file_path = get_md_path()
    if not os.path.isfile(md_file_path):
        raise Exception("Missing model file: %s" % md_file_path)
    with open(md_file_path, 'r') as f:
        contents = f.readlines()
    # SVMlight model file layout: line 2 holds the kernel type
    type_line = contents[1]
    type_line_splits = type_line.split(" ")
    if type_line_splits[0] == "0":  # kernel type: linear
        # the second-to-last line holds the threshold b; the last line holds
        # the single weight vector as "index:value" pairs
        threshold_line = contents[-2]
        threshold = float(threshold_line.split(" ")[0])
        sv_line = contents[-1]
        splits = sv_line.split(" ")[1:-1]
        weight_vec = []
        for part in splits:
            weight_vec.append(float(part.split(":")[1]))
        engine_logger.info("Loaded linear SVM rank model from file")
        return LinearSVMRankModel(weight_vec=weight_vec, threshold=threshold)
    elif type_line_splits[0] == "2":  # kernel type: rbf
        # line 4 holds gamma, line 11 the threshold b, and each remaining
        # line holds one "alpha*y sv_vector" pair per support vector
        gamma_param_line = contents[3]
        gamma = float(gamma_param_line.split(" ")[0])
        threshold_line = contents[10]
        threshold = float(threshold_line.split(" ")[0])
        alphays = []
        svs = []
        for line in contents[11:]:
            line_splits = line.split(" ")
            alphays.append(float(line_splits[0]))
            sv_vec = []
            for part in line_splits[1:-1]:
                sv_vec.append(float(part.split(":")[1]))
            svs.append(sv_vec)
        engine_logger.info("Loaded RBF kernel SVM rank model from file")
        return RBFSVMRankModel(alphays, svs, gamma, threshold)
    else:
        engine_logger.error("Unrecognized model file format")
        return None
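# Scoring sketches for the two loaded model types. These are the standard SVM
# decision functions and presumably mirror what LinearSVMRankModel and
# RBFSVMRankModel do internally; treat them as assumptions, not the repo's
# actual implementations.
import math

def linear_rank_score(weight_vec, feature_vec):
    # ranking only needs the dot product; the threshold b matters only for
    # classification-style cutoffs
    return sum(w * x for w, x in zip(weight_vec, feature_vec))

def rbf_rank_score(alphays, svs, gamma, feature_vec):
    score = 0.0
    for ay, sv in zip(alphays, svs):
        sq_dist = sum((a - b) ** 2 for a, b in zip(sv, feature_vec))
        score += ay * math.exp(-gamma * sq_dist)  # sum of alpha_i*y_i*K(sv_i, x)
    return score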
@classmethod
def get_model(cls, data_store=None, regen=False, save=True):
    dict_file_path = get_dict_path()
    simmx_file_path = get_simmx_path()
    if not os.path.isfile(dict_file_path) or not os.path.isfile(simmx_file_path) or \
            regen:
        engine_logger.info("Generating topic word lookup model")
        # construct a refined vocabulary of topic words only
        topic_words_across_all_docs = []
        for pair in data_store.topic_word_docs:
            topic_words_per_doc = pair[1]
            normed_topic_words_per_doc = [cls.normalize_word(word)
                                          for word in topic_words_per_doc]
            topic_words_across_all_docs.append(normed_topic_words_per_doc)
        dictionary = corpora.Dictionary(topic_words_across_all_docs)
        instance = TopicWordLookupModelStruct(dictionary, None)
        topic_word_vecs = [instance.get_topic_word_vec(doc)
                           for doc in data_store.doc_set]
        simmx = similarities.SparseMatrixSimilarity(topic_word_vecs,
                                                    num_features=len(dictionary))
        instance.simmx = simmx
        if save:
            simmx.save(simmx_file_path)
            dictionary.save_as_text(dict_file_path)
        return instance
    else:
        engine_logger.info("Loading existing topic word lookup model")
        dictionary = corpora.Dictionary.load_from_text(dict_file_path)
        simmx = similarities.SparseMatrixSimilarity.load(simmx_file_path)
        return TopicWordLookupModelStruct(dictionary, simmx)
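# Query sketch: documents are matched purely on overlap with the topic-word
# vocabulary. get_topic_word_vec is defined elsewhere in this class and
# presumably maps a doc to a bag-of-words vector over the topic-word
# dictionary, so querying mirrors the other model structs:
def query_topic_words(model_struct, doc):
    vec = model_struct.get_topic_word_vec(doc)
    return model_struct.simmx[vec]  # similarity against every doc in doc_set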
def __init__(self, raw_json_object, load_rank_training_data=True):
    engine_logger.info("Initializing data store.")
    # doc set stores all the documents
    self.doc_set = list()
    self.doc_to_id = dict()
    self.qa_context = dict()
    # set of question doc ids
    self.question_set = list()
    # set of answer doc ids
    self.answer_set = list()
    # each question has only one answer
    self.qid_to_qa_pair = dict()
    # one answer can correspond to multiple questions
    self.aid_to_qa_pairs = dict()
    # data to train svm rank
    self.rank_data = dict()
    self.topic_word_docs = []
    for segment in raw_json_object:
        question = segment['_question']
        qid, added = self._add_question(question)
        if added:
            # context
            if 'context' in segment:
                self.qa_context[qid] = segment['context']
            answer = segment['answer']
            aid, added = self._add_answer(answer)
            self._add_qa_pair(qid, aid)
            # store data for topic words
            if '_question_topic_words' in segment:
                self.topic_word_docs.append(
                    (question, segment['_question_topic_words']))
            if 'answer_topic_words' in segment:
                self.topic_word_docs.append(
                    (answer, segment['answer_topic_words']))
    if load_rank_training_data:
        # loop for the ranked answers; add them last
        for segment in raw_json_object:
            if 'qa_pairs_with_matching_score' in segment and \
                    len(segment['qa_pairs_with_matching_score']) > 0:
                self.rank_data[segment['_question']] = []
                for pair_dict in segment['qa_pairs_with_matching_score']:
                    qid, q_added = self._add_question(pair_dict['_question'])
                    aid, a_added = self._add_answer(pair_dict['answer'])
                    if q_added:
                        # don't override a previous question-answer pair
                        self._add_qa_pair(qid, aid)
                    elif a_added:
                        # the question is not new but the answer is;
                        # usually we shouldn't see this
                        if aid not in self.aid_to_qa_pairs:
                            self.aid_to_qa_pairs[aid] = []
                        self.aid_to_qa_pairs[aid].append((qid, aid))
                    self.rank_data[segment['_question']].append(
                        ((pair_dict['_question'], pair_dict['answer']),
                         pair_dict['score']))
    engine_logger.info("# docs loaded: %s" % len(self.doc_set))
    engine_logger.info("# questions loaded: %s" % len(self.question_set))
    engine_logger.info("# answers loaded: %s" % len(self.answer_set))
    engine_logger.info("# topic word labeled docs: %s" % len(self.topic_word_docs))
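# Shape of one raw JSON segment, as implied by the parsing above (field
# values are illustrative; only '_question' and 'answer' are required):
#
# {
#     "_question": "How do I reset my password?",
#     "answer": "Use the account settings page.",
#     "context": "...",                                  # optional
#     "_question_topic_words": ["reset", "password"],    # optional
#     "answer_topic_words": ["account", "settings"],     # optional
#     "qa_pairs_with_matching_score": [                  # optional rank data
#         {"_question": "...", "answer": "...", "score": 3}
#     ]
# }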
def __init__(self, eager_loading=True):
    engine_logger.info("Loading word2vec: %s" % filepath)
    self.model = None
    if eager_loading:
        self.model = word2vec.load(filepath, encoding='ISO-8859-1')
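# Usage sketch: given the load(..., encoding=...) call above, this is
# presumably the danielfrg `word2vec` package, where vectors are looked up
# by token and `vocab` supports membership checks. `filepath` is assumed to
# be a module-level constant pointing at the trained model file.
w2v = Word2VecModel()
if 'question' in w2v.model.vocab:
    vec = w2v.model['question']  # numpy vector for the token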
data = list(data_store.rank_data.iteritems())
# sort by question to guarantee a deterministic order across runs
data.sort(key=lambda t: t[0])

if options.write_rank_data:
    pair = options.pair.split(",")
    start_end = map(int, pair)
    sw = StopWatch()
    tfidf_model_struct = tfidf_model.TfIdfModelStruct.get_model()
    lda_model_struct = lda_train.LdaModelStruct.get_model()
    topic_word_lookup_model_struct = \
        topic_word_lookup.TopicWordLookupModelStruct.get_model()
    word2vec_model = Word2VecModel()
    matcher = Matcher(tfidf_model_struct, lda_model_struct,
                      topic_word_lookup_model_struct, word2vec_model)
    data_part = data[start_end[0]:start_end[1]]
    engine_logger.info("data pair: %s, length: %s" % (start_end, len(data_part)))
    rank_model = RankTrainingDataGenerator(matcher, data_part,
                                           query_id_offset=start_end[0])
    rank_model.write_training_data()
    print "total time to write rank data: %s seconds" % sw.stop()
else:
    num_splits = int(options.num_splits)
    unit = total_num_data / num_splits
    pairs = []
    for i in range(num_splits):
        if i == num_splits - 1:
            pairs.append((i * unit, total_num_data))
        else:
            pairs.append((i * unit, (i + 1) * unit))
    print pairs
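# The printed (start, end) pairs are presumably fed back through the pair
# option (e.g. "--pair 0,250") in separate processes, so that rank-feature
# extraction can be parallelized across slices of the sorted data.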
@classmethod
def load_existing_model(cls):
    md_file_path = get_md_path()
    engine_logger.info("Loading existing calibrated boosted decision stumps")
    model = joblib.load(md_file_path)
    return model
print "not indexed matches avg: %f" % (sum(map(lambda t: t[1], not_index_results)) / float(len(not_index_results))) print not_index_results if __name__ == '__main__': (options, args) = parser.parse_args() if options.data_file: raw_data = load_raw_data(options.data_file) if options.silence: engine_logger.setLevel(logging.INFO) if options.eval_tfidf: engine_logger.info("Cross validation on TFIDF model for %s folds" % options.num_folds) cv = SingleModelCrossValidationRunner( raw_data, int(options.num_folds), TfIdfModelStruct, EvaluateQuestionRetrieveSingleModel, {}) cv.cross_validate() print cv.report() if options.eval_lda: engine_logger.info("Cross validation on LDA model for %s folds" % options.num_folds) cv = SingleModelCrossValidationRunner( raw_data, int(options.num_folds), LdaModelStruct, EvaluateDocRetrieveSingleModel, {'num_topics': int(options.num_topics)}) cv.cross_validate() print cv.report()
def run_eval(self):
    engine_logger.info("Evaluate on training data set")
    self._eval_on_data_set(self.train_data_set)
    engine_logger.info("Evaluate on test data set")
    self._eval_on_data_set(self.test_data_set)
def __init__(self, svm_kernel_type, c, gamma=0.5):
    engine_logger.info("Initializing RankModelTrainer. Kernel type: %s, c: %s, gamma: %s"
                       % (svm_kernel_type, c, gamma))
    self.svm_kernel_type = svm_kernel_type
    self.c = c
    self.gamma = gamma
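# Usage sketch (the class name comes from the log message above; the
# kernel-type constants are referenced in train_model, though their values
# are not shown here):
trainer = RankModelTrainer(RankModelTrainer.RANK_SVM_KERNEL_TYPE_LINEAR, c=3.0)
trainer.train_model()  # shells out to the svm_rank binary at get_bin_path()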