Example #1
0
    def get_model(cls, data_store=None, regen=False, num_topics=None, save=True):
        md_file_path = get_md_path()
        dict_file_path = get_dict_path()
        simmx_file_path = get_simmx_path()
        num_topics_file_path = get_num_topic_path()

        if not os.path.isfile(md_file_path) or not \
                os.path.isfile(dict_file_path) or not \
                os.path.isfile(simmx_file_path) or not \
                os.path.isfile(num_topics_file_path) or regen:
            engine_logger.info("Generating LDA models.")

            dictionary, corpus = docs_to_corpus(data_store.doc_set, rm_stop_words=True)
            # generate LDA model
            # LDA model is trained on all the docs
            model = models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary)
            sim_matrix = similarities.MatrixSimilarity(model[corpus])

            if save:
                # saving
                dictionary.save_as_text(dict_file_path)
                model.save(md_file_path)
                sim_matrix.save(simmx_file_path)
                write_num_topics(num_topics_file_path, num_topics)
        else:
            engine_logger.info("Loading existing LDA models.")
            dictionary = corpora.Dictionary.load_from_text(dict_file_path)
            model = models.ldamodel.LdaModel.load(md_file_path)
            sim_matrix = similarities.MatrixSimilarity.load(simmx_file_path)
            num_topics = read_num_topics(num_topics_file_path)

        return LdaModelStruct(model=model, dictionary=dictionary, sim_matrix=sim_matrix, num_topics=num_topics)
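For illustration, a minimal usage sketch of the cache-or-regen pattern above. DataStore and load_raw_data are stand-ins borrowed from the other examples, and the file name is hypothetical:

    # First call trains and saves; later calls load the cached files
    # unless regen=True forces a rebuild.
    raw_data = load_raw_data("qa_data.json")  # hypothetical data file
    data_store = DataStore(raw_data)
    lda = LdaModelStruct.get_model(data_store=data_store, regen=True,
                                   num_topics=100)
    cached = LdaModelStruct.get_model()  # subsequent load from disk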
Example #2
0
    def cross_validate(self):
        iter_num = 0
        train_test_data_tuples = split_raw_data_k_fold(self.raw_data, self.num_folds)
        for train_data_store, train_data, test_data in train_test_data_tuples:
            engine_logger.info("Cross validation iter: %d" % iter_num)

            eval_model = self._get_eval_model(train_data_store, train_data, test_data)
            eval_model.run_eval()

            train_accuracy, train_relevance, test_accuracy, test_relevance = eval_model.report_metrics()
            print train_accuracy, train_relevance, test_accuracy, test_relevance
            self.each_run_train_accuracy.append(train_accuracy)
            self.each_run_train_relevance.append(train_relevance)
            self.each_run_test_accuracy.append(test_accuracy)
            self.each_run_test_relevance.append(test_relevance)

            if iter_num == 0 and options.write_output:
                eval_model.write_output()

            iter_num += 1

        self.avg_train_accuracy = sum(self.each_run_train_accuracy) / float(self.num_folds)
        self.avg_train_relevance = sum(self.each_run_train_relevance) / float(self.num_folds)
        self.avg_test_accuracy = sum(self.each_run_test_accuracy) / float(self.num_folds)
        self.avg_test_relevance = sum(self.each_run_test_relevance) / float(self.num_folds)
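split_raw_data_k_fold is not shown in these examples; the loop above only requires that it yield (train_data_store, train_data, test_data) tuples, one per fold. A minimal sketch of one way it could work, assuming DataStore accepts a raw JSON list as in the data store example further below:

    def split_raw_data_k_fold(raw_data, num_folds):
        # Round-robin the segments into k folds; each fold serves once
        # as the held-out test set. This is an assumed implementation,
        # not the project's actual one.
        folds = [raw_data[i::num_folds] for i in range(num_folds)]
        for k in range(num_folds):
            test_data = folds[k]
            train_data = [seg for j, fold in enumerate(folds)
                          if j != k for seg in fold]
            yield DataStore(train_data), train_data, test_data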
Example #3
0
    def cross_validate(self):
        iter_num = 0
        train_test_data_tuples = split_raw_data_k_fold(self.raw_data,
                                                       self.num_folds)
        for train_data_store, train_data, test_data in train_test_data_tuples:
            engine_logger.info("Cross validation iter: %d" % iter_num)

            eval_model = self._get_eval_model(train_data_store, train_data,
                                              test_data)
            eval_model.run_eval()

            train_accuracy, train_relevance, test_accuracy, test_relevance = \
                eval_model.report_metrics()
            print train_accuracy, train_relevance, test_accuracy, test_relevance
            self.each_run_train_accuracy.append(train_accuracy)
            self.each_run_train_relevance.append(train_relevance)
            self.each_run_test_accuracy.append(test_accuracy)
            self.each_run_test_relevance.append(test_relevance)

            if iter_num == 0 and options.write_output:
                eval_model.write_output()

            iter_num += 1

        self.avg_train_accuracy = sum(self.each_run_train_accuracy) / float(
            self.num_folds)
        self.avg_train_relevance = sum(self.each_run_train_relevance) / float(
            self.num_folds)
        self.avg_test_accuracy = sum(self.each_run_test_accuracy) / float(
            self.num_folds)
        self.avg_test_relevance = sum(self.each_run_test_relevance) / float(
            self.num_folds)
Example #4
0
    def write_output(self):
        show_topic_words = isinstance(self.model, TopicWordLookupModelStruct)
        engine_logger.info("Writing output")
        with open('cv_test.log', 'w') as f:
            f.write("%f, %f\n" % (self.test_data_set.accuracy, self.test_data_set.avg_relevance_score))
            for idx in range(len(self.test_data_set.questions)):
                f.write("Question: %s\n" % self.test_data_set.questions[idx].encode('utf-8'))
                if show_topic_words:
                    f.write("Question topic words: %s\n" % self.test_data_set.question_topic_words[idx])
                f.write("Correct answer: %s\n" % self.test_data_set.top_answers[idx].encode('utf-8'))
                if show_topic_words:
                    f.write("Answer topic words: %s\n" % self.test_data_set.answer_topic_words[idx])
                f.write("Retrieved question: %s\n" % self.test_data_set.retrieved_questions[idx].encode('utf-8'))
                f.write("Retrieved answer: %s\n" % self.test_data_set.retrieved_answers[idx].encode('utf-8'))
                f.write("Label: %s, relevance score: %f\n" %
                        (self.test_data_set.judgement_labels[idx], self.test_data_set.relevance_scores[idx]))
                f.write("================================================================== \n")

        with open('cv_train.log', 'w') as f:
            f.write(">>>>>>>> Training data\n")
            for idx in range(len(self.train_data_set.questions)):
                f.write("Question: %s\n" % self.train_data_set.questions[idx].encode('utf-8'))
                if show_topic_words:
                    f.write("Question topic words: %s\n" % self.train_data_set.question_topic_words[idx])
                f.write("Answer: %s\n" % self.train_data_set.top_answers[idx].encode('utf-8'))
                if show_topic_words:
                    f.write("Answer topic words: %s\n" % self.train_data_set.answer_topic_words[idx])
                f.write("================================================================== \n")
Example #5
0
 def train_model(self):
     bin_path = get_bin_path()
     train_data_path = get_train_data_path()
     model_path = get_md_path()
     if self.svm_kernel_type == self.RANK_SVM_KERNEL_TYPE_LINEAR:
         subprocess.call([bin_path, '-c', str(self.c), train_data_path, model_path])
         engine_logger.info("Finished training linear SVM model")
     elif self.svm_kernel_type == self.RANK_SVM_KERNEL_TYPE_RBF:
         subprocess.call(
             [bin_path, '-c', str(self.c), '-t', '2', '-g', str(self.gamma), train_data_path, model_path])
         engine_logger.info("Finished RBF kernel based SVM model")
Example #6
0
    def write_training_data(self):
        query_id = self.query_id_offset
        engine_logger.info("Writing training data. Num of queries: %s" % len(self.rank_data))
        file_path = "%s.offset%s" % (get_train_data_path(), self.query_id_offset)
        with open(file_path, 'w') as f:
            for question, pairs in self.rank_data:
                engine_logger.debug("Writing for query %s" % query_id)
                f.write("# query %s\n" % query_id)
                qa_pairs = [t[0] for t in pairs]
                features = self.matcher.match(question, qa_pairs)
                for idx, feature in enumerate(features):
                    labeled_score = pairs[idx][1]
                    f.write("%s qid:%s %s\n" % (labeled_score, query_id, str(feature)))

                query_id += 1
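The file written above follows the SVMlight/SVMrank ranking format, so str(feature) is presumably already serialized as space-separated index:value pairs. A sketch of what a two-query file might look like (values illustrative):

    # query 17
    3 qid:17 1:0.81 2:0.43 3:0.12
    1 qid:17 1:0.24 2:0.17 3:0.09
    # query 18
    2 qid:18 1:0.66 2:0.05 3:0.31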
Example #7
0
    def write_output(self):
        show_topic_words = isinstance(self.model, TopicWordLookupModelStruct)
        engine_logger.info("Writing output")
        with open('cv_test.log', 'w') as f:
            f.write("%f, %f\n" % (self.test_data_set.accuracy,
                                  self.test_data_set.avg_relevance_score))
            for idx in range(len(self.test_data_set.questions)):
                f.write("Question: %s\n" %
                        self.test_data_set.questions[idx].encode('utf-8'))
                if show_topic_words:
                    f.write("Question topic words: %s\n" %
                            self.test_data_set.question_topic_words[idx])
                f.write("Correct answer: %s\n" %
                        self.test_data_set.top_answers[idx].encode('utf-8'))
                if show_topic_words:
                    f.write("Answer topic words: %s\n" %
                            self.test_data_set.answer_topic_words[idx])
                f.write(
                    "Retrieved question: %s\n" %
                    self.test_data_set.retrieved_questions[idx].encode('utf-8')
                )
                f.write(
                    "Retrieved answer: %s\n" %
                    self.test_data_set.retrieved_answers[idx].encode('utf-8'))
                f.write("Label: %s, relevance score: %f\n" %
                        (self.test_data_set.judgement_labels[idx],
                         self.test_data_set.relevance_scores[idx]))
                f.write(
                    "================================================================== \n"
                )

        with open('cv_train.log', 'w') as f:
            f.write(">>>>>>>> Training data\n")
            for idx in range(len(self.train_data_set.questions)):
                f.write("Question: %s\n" %
                        self.train_data_set.questions[idx].encode('utf-8'))
                if show_topic_words:
                    f.write("Question topic words: %s\n" %
                            self.train_data_set.question_topic_words[idx])
                f.write("Answer: %s\n" %
                        self.train_data_set.top_answers[idx].encode('utf-8'))
                if show_topic_words:
                    f.write("Answer topic words: %s\n" %
                            self.train_data_set.answer_topic_words[idx])
                f.write(
                    "================================================================== \n"
                )
Example #8
0
 def get_new_model(cls,
                   n_estimators,
                   train_data,
                   train_labels,
                   cross_validation_folds,
                   save=True):
     engine_logger.info("Training new calibrated boosted decision stumps")
     md_file_path = get_md_path()
     boosted_decision_stumps = AdaBoostClassifier(
         DecisionTreeClassifier(max_depth=1), n_estimators=n_estimators)
     calibrated_bds = CalibratedClassifierCV(boosted_decision_stumps,
                                             method='sigmoid',
                                             cv=cross_validation_folds)
     calibrated_bds.fit(train_data, train_labels)
     if save:
         joblib.dump(calibrated_bds, md_file_path)
     return calibrated_bds
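A usage sketch for the calibrated model above. The enclosing class is not named in the snippet, so Classifier here is hypothetical; predict_proba is available because CalibratedClassifierCV exposes the standard scikit-learn classifier API:

    model = Classifier.get_new_model(n_estimators=50,
                                     train_data=X_train,
                                     train_labels=y_train,
                                     cross_validation_folds=5)
    # Sigmoid calibration makes these scores usable as matching
    # confidences rather than raw AdaBoost margins.
    scores = model.predict_proba(X_test)[:, 1]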
Example #9
0
    def get_model(cls, data_store=None, regen=False, save=True):
        md_file_path = get_md_path()
        dict_file_path = get_dict_path()
        q_simmx_file_path = get_q_simmx_path()
        a_simmx_file_path = get_a_simmx_path()
        if not os.path.isfile(md_file_path) or not \
                os.path.isfile(dict_file_path) or not \
                os.path.isfile(q_simmx_file_path) or not \
                os.path.isfile(a_simmx_file_path) or regen:
            engine_logger.info("Generating TF_IDF models.")

            dictionary, corpus = docs_to_corpus(data_store.doc_set)

            # vocabulary and tf-idf are computed on the whole space
            model = models.TfidfModel(corpus)
            # while query similarities are computed on separate spaces, i.e. the question space and the answer space
            # extract question corpus
            question_corpus = [corpus[qid] for qid in data_store.question_set]
            question_sim_matrix = similarities.SparseMatrixSimilarity(
                model[question_corpus], num_features=len(dictionary))
            # extract answer corpus
            answer_corpus = [corpus[aid] for aid in data_store.answer_set]
            answer_sim_matrix = similarities.SparseMatrixSimilarity(
                model[answer_corpus], num_features=len(dictionary))

            if save:
                # saving
                dictionary.save_as_text(dict_file_path)
                model.save(md_file_path)
                question_sim_matrix.save(q_simmx_file_path)
                answer_sim_matrix.save(a_simmx_file_path)
        else:
            engine_logger.info("Loading existing TF_IDF models.")

            dictionary = corpora.Dictionary.load_from_text(dict_file_path)
            model = models.TfidfModel.load(md_file_path)
            question_sim_matrix = similarities.SparseMatrixSimilarity.load(
                q_simmx_file_path)
            answer_sim_matrix = similarities.SparseMatrixSimilarity.load(
                a_simmx_file_path)

        return TfIdfModelStruct(model=model,
                                dictionary=dictionary,
                                question_sim_matrix=question_sim_matrix,
                                answer_sim_matrix=answer_sim_matrix)
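A hedged sketch of how the returned struct would typically be queried with gensim: map a tokenized query through the dictionary and the TF-IDF model, then score it against the question similarity index. tokenize is a stand-in for whatever tokenizer docs_to_corpus uses:

    struct = TfIdfModelStruct.get_model()
    query_bow = struct.dictionary.doc2bow(tokenize("how do I reset my password"))
    query_tfidf = struct.model[query_bow]
    # One similarity score per indexed question.
    sims = struct.question_sim_matrix[query_tfidf]
    best_qid = max(range(len(sims)), key=lambda i: sims[i])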
Example #10
0
 def load_models(self):
     """
     Load all the models from files
     :return
     """
     self.tfidf_model = TfIdfModelStruct.get_model()
     self.lda_model = LdaModelStruct.get_model()
     self.topic_word_lookup_model = TopicWordLookupModelStruct.get_model()
     self.word2vec_model = Word2VecModel()
     self.matcher = Matcher(
         self.tfidf_model,
         self.lda_model,
         self.topic_word_lookup_model,
         self.word2vec_model
     )
     self.rank_model = read_rank_model_from_file()
     engine_logger.info("Rank based query engine is up")
     self.is_up = True
Example #11
0
    def get_model(cls,
                  data_store=None,
                  regen=False,
                  num_topics=None,
                  save=True):
        md_file_path = get_md_path()
        dict_file_path = get_dict_path()
        simmx_file_path = get_simmx_path()
        num_topics_file_path = get_num_topic_path()

        if not os.path.isfile(md_file_path) or not \
                os.path.isfile(dict_file_path) or not \
                os.path.isfile(simmx_file_path) or not \
                os.path.isfile(num_topics_file_path) or regen:
            engine_logger.info("Generating LDA models.")

            dictionary, corpus = docs_to_corpus(data_store.doc_set,
                                                rm_stop_words=True)
            # generate LDA model
            # LDA model is trained on all the docs
            model = models.ldamodel.LdaModel(corpus,
                                             num_topics=num_topics,
                                             id2word=dictionary)
            sim_matrix = similarities.MatrixSimilarity(model[corpus])

            if save:
                # saving
                dictionary.save_as_text(dict_file_path)
                model.save(md_file_path)
                sim_matrix.save(simmx_file_path)
                write_num_topics(num_topics_file_path, num_topics)
        else:
            engine_logger.info("Loading existing LDA models.")
            dictionary = corpora.Dictionary.load_from_text(dict_file_path)
            model = models.ldamodel.LdaModel.load(md_file_path)
            sim_matrix = similarities.MatrixSimilarity.load(simmx_file_path)
            num_topics = read_num_topics(num_topics_file_path)

        return LdaModelStruct(model=model,
                              dictionary=dictionary,
                              sim_matrix=sim_matrix,
                              num_topics=num_topics)
Example #12
0
def read_rank_model_from_file():
    md_file_path = get_md_path()
    if not os.path.isfile(md_file_path):
        raise Exception("Missing model file: %s" % md_file_path)

    with open(md_file_path, 'r') as f:
        contents = f.readlines()
        type_line = contents[1]
        type_line_splits = type_line.split(" ")
        if type_line_splits[0] == "0":
            # kernel type: linear
            threshold_line = contents[-2]
            threshold = float(threshold_line.split(" ")[0])
            sv_line = contents[-1]
            splits = sv_line.split(" ")[1:-1]
            weight_vec = []
            for part in splits:
                weight_vec.append(float(part.split(":")[1]))

            engine_logger.info("Load linear SVM rank model from file")
            return LinearSVMRankModel(weight_vec=weight_vec,
                                      threshold=threshold)

        elif type_line_splits[0] == "2":
            # kernel type: rbf
            gamma_param_line = contents[3]
            gamma = float(gamma_param_line.split(" ")[0])
            threshold_line = contents[10]
            threshold = float(threshold_line.split(" ")[0])
            alphays = []
            svs = []
            for line in contents[11:]:
                line_splits = line.split(" ")
                alphays.append(float(line_splits[0]))
                sv_vec = []
                for part in line_splits[1:-1]:
                    sv_vec.append(float(part.split(":")[1]))
                svs.append(sv_vec)

            engine_logger.info("Load RBF kernel SVM rank model from file")
            return RBFSVMRankModel(alphays, svs, gamma, threshold)
        else:
            engine_logger.error("Un-recognized model file format")
Example #13
0
    def get_model(cls, data_store=None, regen=False, save=True):
        md_file_path = get_md_path()
        dict_file_path = get_dict_path()
        q_simmx_file_path = get_q_simmx_path()
        a_simmx_file_path = get_a_simmx_path()
        if not os.path.isfile(md_file_path) or not \
                os.path.isfile(dict_file_path) or not \
                os.path.isfile(q_simmx_file_path) or not \
                os.path.isfile(a_simmx_file_path) or regen:
            engine_logger.info("Generating TF_IDF models.")

            dictionary, corpus = docs_to_corpus(data_store.doc_set)

            # vocabulary and tf-idf are computed on the whole space
            model = models.TfidfModel(corpus)
            # while query similarities are computed on separate spaces, i.e. the question space and the answer space
            # extract question corpus
            question_corpus = [corpus[qid] for qid in data_store.question_set]
            question_sim_matrix = similarities.SparseMatrixSimilarity(model[question_corpus],
                                                                      num_features=len(dictionary))
            # extract answer corpus
            answer_corpus = [corpus[aid] for aid in data_store.answer_set]
            answer_sim_matrix = similarities.SparseMatrixSimilarity(model[answer_corpus], num_features=len(dictionary))

            if save:
                # saving
                dictionary.save_as_text(dict_file_path)
                model.save(md_file_path)
                question_sim_matrix.save(q_simmx_file_path)
                answer_sim_matrix.save(a_simmx_file_path)
        else:
            engine_logger.info("Loading existing TF_IDF models.")

            dictionary = corpora.Dictionary.load_from_text(dict_file_path)
            model = models.TfidfModel.load(md_file_path)
            question_sim_matrix = similarities.SparseMatrixSimilarity.load(q_simmx_file_path)
            answer_sim_matrix = similarities.SparseMatrixSimilarity.load(a_simmx_file_path)

        return TfIdfModelStruct(model=model,
                                dictionary=dictionary,
                                question_sim_matrix=question_sim_matrix,
                                answer_sim_matrix=answer_sim_matrix)
Example #14
0
    def get_model(cls, data_store=None, regen=False, save=True):
        dict_file_path = get_dict_path()
        simmx_file_path = get_simmx_path()
        if not os.path.isfile(dict_file_path) or not os.path.isfile(simmx_file_path) or \
                regen:
            engine_logger.info("Generating topic word lookup model")

            # construct the dictionary on a refined vocabulary of topic words only
            topic_words_across_all_docs = []
            for pair in data_store.topic_word_docs:
                topic_words_per_doc = pair[1]
                normed_topic_words_per_doc = [
                    cls.normalize_word(word) for word in topic_words_per_doc
                ]
                topic_words_across_all_docs.append(normed_topic_words_per_doc)

            dictionary = corpora.Dictionary(topic_words_across_all_docs)
            instance = TopicWordLookupModelStruct(dictionary, None)

            topic_word_vecs = [
                instance.get_topic_word_vec(doc) for doc in data_store.doc_set
            ]
            simmx = \
                similarities.SparseMatrixSimilarity(topic_word_vecs,
                                                    num_features=len(dictionary))

            instance.simmx = simmx
            if save:
                # saving
                simmx.save(simmx_file_path)
                dictionary.save_as_text(dict_file_path)

            return instance
        else:
            engine_logger.info("Loading existing topic word lookup model")
            dictionary = corpora.Dictionary.load_from_text(dict_file_path)
            simmx = similarities.SparseMatrixSimilarity.load(simmx_file_path)

            return TopicWordLookupModelStruct(dictionary, simmx)
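get_topic_word_vec is not shown in the snippet; given that the similarity index is built over the topic-word dictionary, a plausible sketch (an assumption, including the whitespace tokenization) is:

    def get_topic_word_vec(self, doc):
        # Keep only tokens that survive normalization into the
        # topic-word dictionary; doc2bow drops out-of-vocabulary words.
        normed = [self.normalize_word(word) for word in doc.split()]
        return self.dictionary.doc2bow(normed)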
Example #15
0
    def get_model(cls, data_store=None, regen=False, save=True):
        dict_file_path = get_dict_path()
        simmx_file_path = get_simmx_path()
        if not os.path.isfile(dict_file_path) or not os.path.isfile(simmx_file_path) or \
                regen:
            engine_logger.info("Generating topic word lookup model")

            # construct the dictionary on a refined vocabulary of topic words only
            topic_words_across_all_docs = []
            for pair in data_store.topic_word_docs:
                topic_words_per_doc = pair[1]
                normed_topic_words_per_doc = [cls.normalize_word(word) for word in topic_words_per_doc]
                topic_words_across_all_docs.append(normed_topic_words_per_doc)

            dictionary = corpora.Dictionary(topic_words_across_all_docs)
            instance = TopicWordLookupModelStruct(dictionary,
                                                  None)

            topic_word_vecs = [instance.get_topic_word_vec(doc) for doc in data_store.doc_set]
            simmx = \
                similarities.SparseMatrixSimilarity(topic_word_vecs,
                                                    num_features=len(dictionary))

            instance.simmx = simmx
            if save:
                # saving
                simmx.save(simmx_file_path)
                dictionary.save_as_text(dict_file_path)

            return instance
        else:
            engine_logger.info("Loading existing topic word lookup model")
            dictionary = corpora.Dictionary.load_from_text(dict_file_path)
            simmx = similarities.SparseMatrixSimilarity.load(simmx_file_path)

            return TopicWordLookupModelStruct(dictionary, simmx)
Example #16
0
        if options.write_rank_data:
            pair = options.pair.split(",")
            start_end = map(int, pair)
            sw = StopWatch()
            tfidf_model_struct = tfidf_model.TfIdfModelStruct.get_model()
            lda_model_struct = lda_train.LdaModelStruct.get_model()
            topic_word_lookup_model_struct = topic_word_lookup.TopicWordLookupModelStruct.get_model()
            word2vec_model = Word2VecModel()
            matcher = Matcher(
                tfidf_model_struct,
                lda_model_struct,
                topic_word_lookup_model_struct,
                word2vec_model
            )
            data_part = data[start_end[0]:start_end[1]]
            engine_logger.info("data pair: %s, length: %s" % (start_end, len(data_part)))
            rank_model = RankTrainingDataGenerator(
                matcher, data_part, query_id_offset=start_end[0])
            rank_model.write_training_data()
            print "total time to write rank data: %s seconds" % sw.stop()
        else:
            num_splits = int(options.num_splits)
            unit = total_num_data / num_splits
            pairs = []
            for i in range(num_splits):
                if i == num_splits - 1:
                    pairs.append((i*unit, total_num_data))
                else:
                    pairs.append((i*unit, (i+1)*unit))
            print pairs
Example #17
0
    def __init__(self, raw_json_object, load_rank_training_data=True):
        engine_logger.info("Initializing data store.")
        # doc set stores all the documents
        self.doc_set = list()
        self.doc_to_id = dict()
        self.qa_context = dict()
        # set of question doc ids
        self.question_set = list()
        # set of answer doc ids
        self.answer_set = list()
        # each question has only one answer
        self.qid_to_qa_pair = dict()
        # one answer can correspond to multiple questions
        self.aid_to_qa_pairs = dict()
        # data to train svm rank
        self.rank_data = dict()

        self.topic_word_docs = []

        for segment in raw_json_object:
            question = segment['_question']
            qid, added = self._add_question(question)
            if added:
                # context
                if 'context' in segment:
                    self.qa_context[qid] = segment['context']

            answer = segment['answer']
            aid, added = self._add_answer(answer)

            self._add_qa_pair(qid, aid)

            # store data for topic words
            if '_question_topic_words' in segment:
                self.topic_word_docs.append((question, segment['_question_topic_words']))
            if 'answer_topic_words' in segment:
                self.topic_word_docs.append((answer, segment['answer_topic_words']))

        if load_rank_training_data:
            # loop over the ranked answers, adding them last
            for segment in raw_json_object:
                if 'qa_pairs_with_matching_score' in segment and \
                                len(segment['qa_pairs_with_matching_score']) > 0:
                    self.rank_data[segment['_question']] = []
                    for pair_dict in segment['qa_pairs_with_matching_score']:
                        qid, q_added = self._add_question(pair_dict['_question'])
                        aid, a_added = self._add_answer(pair_dict['answer'])
                        if q_added:
                            # don't override previous question-answer pair
                            self._add_qa_pair(qid, aid)
                        elif a_added:
                            # the question is not new, but the answer is new; usually we shouldn't see this
                            if aid not in self.aid_to_qa_pairs:
                                self.aid_to_qa_pairs[aid] = []
                            self.aid_to_qa_pairs[aid].append((qid, aid))

                        self.rank_data[segment['_question']].append((
                            (
                                pair_dict['_question'],
                                pair_dict['answer']
                            ),
                            pair_dict['score'])
                        )
        engine_logger.info("# docs loaded: %s" % len(self.doc_set))
        engine_logger.info("# questions loaded: %s" % len(self.question_set))
        engine_logger.info("# answers loaded: %s" % len(self.answer_set))
        engine_logger.info("# topic word labeled docs: %s" % len(self.topic_word_docs))
Example #18
0
 def __init__(self, eager_loading=True):
     engine_logger.info("Loading word2vec: %s" % filepath)
     self.model = None
     if eager_loading:
         self.model = word2vec.load(filepath, encoding='ISO-8859-1')
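With eager_loading=False the model stays None, so a lazy accessor is presumably expected elsewhere in the class. A minimal sketch of one, under that assumption:

    def get_model(self):
        # Defer the expensive word2vec load until first use.
        if self.model is None:
            self.model = word2vec.load(filepath, encoding='ISO-8859-1')
        return self.model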
Example #19
0
    def __init__(self, raw_json_object, load_rank_training_data=True):
        engine_logger.info("Initializing data store.")
        # doc set stores all the documents
        self.doc_set = list()
        self.doc_to_id = dict()
        self.qa_context = dict()
        # set of question doc ids
        self.question_set = list()
        # set of answer doc ids
        self.answer_set = list()
        # each question has only one answer
        self.qid_to_qa_pair = dict()
        # one answer can correspond to multiple questions
        self.aid_to_qa_pairs = dict()
        # data to train svm rank
        self.rank_data = dict()

        self.topic_word_docs = []

        for segment in raw_json_object:
            question = segment['_question']
            qid, added = self._add_question(question)
            if added:
                # context
                if 'context' in segment:
                    self.qa_context[qid] = segment['context']

            answer = segment['answer']
            aid, added = self._add_answer(answer)

            self._add_qa_pair(qid, aid)

            # store data for topic words
            if '_question_topic_words' in segment:
                self.topic_word_docs.append(
                    (question, segment['_question_topic_words']))
            if 'answer_topic_words' in segment:
                self.topic_word_docs.append(
                    (answer, segment['answer_topic_words']))

        if load_rank_training_data:
            # loop over the ranked answers, adding them last
            for segment in raw_json_object:
                if 'qa_pairs_with_matching_score' in segment and \
                                len(segment['qa_pairs_with_matching_score']) > 0:
                    self.rank_data[segment['_question']] = []
                    for pair_dict in segment['qa_pairs_with_matching_score']:
                        qid, q_added = self._add_question(
                            pair_dict['_question'])
                        aid, a_added = self._add_answer(pair_dict['answer'])
                        if q_added:
                            # don't override previous question-answer pair
                            self._add_qa_pair(qid, aid)
                        elif a_added:
                            # the question is not new, but the answer is new; usually we shouldn't see this
                            if aid not in self.aid_to_qa_pairs:
                                self.aid_to_qa_pairs[aid] = []
                            self.aid_to_qa_pairs[aid].append((qid, aid))

                        self.rank_data[segment['_question']].append(
                            ((pair_dict['_question'], pair_dict['answer']),
                             pair_dict['score']))
        engine_logger.info("# docs loaded: %s" % len(self.doc_set))
        engine_logger.info("# questions loaded: %s" % len(self.question_set))
        engine_logger.info("# answers loaded: %s" % len(self.answer_set))
        engine_logger.info("# topic word labeled docs: %s" %
                           len(self.topic_word_docs))
Example #20
0
        data = list(data_store.rank_data.iteritems())
        # sort by question to guarantee the order
        data.sort(key=lambda t: t[0])
        if options.write_rank_data:
            pair = options.pair.split(",")
            start_end = map(int, pair)
            sw = StopWatch()
            tfidf_model_struct = tfidf_model.TfIdfModelStruct.get_model()
            lda_model_struct = lda_train.LdaModelStruct.get_model()
            topic_word_lookup_model_struct = \
                topic_word_lookup.TopicWordLookupModelStruct.get_model()
            word2vec_model = Word2VecModel()
            matcher = Matcher(tfidf_model_struct, lda_model_struct,
                              topic_word_lookup_model_struct, word2vec_model)
            data_part = data[start_end[0]:start_end[1]]
            engine_logger.info("data pair: %s, length: %s" %
                               (start_end, len(data_part)))
            rank_model = RankTrainingDataGenerator(
                matcher, data_part, query_id_offset=start_end[0])
            rank_model.write_training_data()
            print "total time to write rank data: %s seconds" % sw.stop()
        else:
            num_splits = int(options.num_splits)
            unit = total_num_data / num_splits
            pairs = []
            for i in range(num_splits):
                if i == num_splits - 1:
                    pairs.append((i * unit, total_num_data))
                else:
                    pairs.append((i * unit, (i + 1) * unit))
            print pairs
Example #21
0
 def load_existing_model(cls):
     md_file_path = get_md_path()
     engine_logger.info(
         "Loading existing calibrated boosted decision stumps")
     model = joblib.load(md_file_path)
     return model
Example #22
0
    print "not indexed matches avg: %f" % (sum(map(lambda t: t[1], not_index_results)) / float(len(not_index_results)))
    print not_index_results


if __name__ == '__main__':
    (options, args) = parser.parse_args()

    if options.data_file:
        raw_data = load_raw_data(options.data_file)

    if options.silence:
        engine_logger.setLevel(logging.INFO)

    if options.eval_tfidf:
        engine_logger.info("Cross validation on TFIDF model for %s folds" % options.num_folds)
        cv = SingleModelCrossValidationRunner(
            raw_data, int(options.num_folds), TfIdfModelStruct, EvaluateQuestionRetrieveSingleModel, {})
        cv.cross_validate()
        print cv.report()

    if options.eval_lda:
        engine_logger.info("Cross validation on LDA model for %s folds" % options.num_folds)
        cv = SingleModelCrossValidationRunner(
            raw_data,
            int(options.num_folds),
            LdaModelStruct,
            EvaluateDocRetrieveSingleModel,
            {'num_topics': int(options.num_topics)})
        cv.cross_validate()
        print cv.report()
Example #23
0
 def run_eval(self):
     engine_logger.info("Evaluate on training data set")
     self._eval_on_data_set(self.train_data_set)
     engine_logger.info("Evaluate on test data set")
     self._eval_on_data_set(self.test_data_set)
Example #24
0
 def run_eval(self):
     engine_logger.info("Evaluate on training data set")
     self._eval_on_data_set(self.train_data_set)
     engine_logger.info("Evaluate on test data set")
     self._eval_on_data_set(self.test_data_set)
Example #25
0
    print "not indexed matches avg: %f" % (
        sum(map(lambda t: t[1], not_index_results)) /
        float(len(not_index_results)))
    print not_index_results


if __name__ == '__main__':
    (options, args) = parser.parse_args()

    if options.data_file:
        raw_data = load_raw_data(options.data_file)

    if options.silence:
        engine_logger.setLevel(logging.INFO)

    if options.eval_tfidf:
        engine_logger.info("Cross validation on TFIDF model for %s folds" %
                           options.num_folds)
        cv = SingleModelCrossValidationRunner(
            raw_data, int(options.num_folds), TfIdfModelStruct,
            EvaluateQuestionRetrieveSingleModel, {})
        cv.cross_validate()
        print cv.report()

    if options.eval_lda:
        engine_logger.info("Cross validation on LDA model for %s folds" %
                           options.num_folds)
        cv = SingleModelCrossValidationRunner(
            raw_data, int(options.num_folds), LdaModelStruct,
            EvaluateDocRetrieveSingleModel,
            {'num_topics': int(options.num_topics)})
        cv.cross_validate()
        print cv.report()
Example #26
0
 def __init__(self, svm_kernel_type, c, gamma=0.5):
     engine_logger.info("Initializeing RankModelTrainer. Kernel type: %s, c: %s, gamma: %s" %
                        (svm_kernel_type, c, gamma))
     self.svm_kernel_type = svm_kernel_type
     self.c = c
     self.gamma = gamma
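A usage sketch, assuming this __init__ belongs to the same class as the train_model snippet in Example #5 (plausible from the shared attribute names, but not shown):

    trainer = RankModelTrainer(
        svm_kernel_type=RankModelTrainer.RANK_SVM_KERNEL_TYPE_RBF,
        c=3.0, gamma=0.5)
    trainer.train_model()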