def process(self):
        """Train a doc2vec model on the labeled articles, then train and
        evaluate three classifiers (logistic regression, SVM, naive Bayes)
        on inferred document vectors.

        Side effects: saves the doc2vec model and the logistic-regression
        model to disk; logs precision/recall/accuracy for each classifier.
        """
        log.info("Commencing execution")

        tagged_docs = TaggedLineDocument(self.labeled_articles_file_path)

        log.info("Training Doc2Vec model")
        doc2vec_model = doc2vec_helper.init_model(tagged_docs)
        doc2vec_model.save(self.doc2vec_model_file_path)
        log.info("Learnt vocab from training set and saved doc2vec model")

        def infer_vectors(file_path):
            # One inferred vector per line of the file.
            # NOTE(review): infer_vector is given the raw line (a string);
            # gensim's Doc2Vec.infer_vector normally expects a token list —
            # confirm this is intentional.
            with open(file_path) as doc_file:
                return [doc2vec_model.infer_vector(line) for line in doc_file]

        def log_scores(model_name, y_pred):
            # Log precision/recall/accuracy for one classifier's predictions.
            log.info(model_name)
            log.info("Precision: " + str(metrics.precision_score(y_pred=y_pred, y_true=y_true)))
            log.info("Recall: " + str(metrics.recall_score(y_pred=y_pred, y_true=y_true)))
            log.info("Accuracy: " + str(metrics.accuracy_score(y_pred=y_pred, y_true=y_true)))

        x_train = infer_vectors(self.labeled_articles_file_path)
        # Training labels: first half class 0, second half class 1.
        y_train = [0] * self.samples_per_class_train
        y_train.extend([1] * self.samples_per_class_train)

        x_test = infer_vectors(self.articles_source_file_path)
        # NOTE(review): test labels are ordered 1s-then-0s, the reverse of
        # y_train's 0s-then-1s — confirm the test file really is ordered
        # this way.
        y_true = [1] * self.samples_per_class_test
        y_true.extend([0] * self.samples_per_class_test)

        # Only the logistic-regression model is persisted, as in the
        # original pipeline.
        ml_model_logreg = scikit_ml_helper.train_logistic_reg_classifier(x_train, y_train)
        scikit_ml_helper.persist_model_to_disk(ml_model_logreg, self.ml_model_file_path)
        log_scores("Logistic Regression", ml_model_logreg.predict(x_test))

        ml_model_svm = scikit_ml_helper.train_svm_classifier(x_train, y_train)
        log_scores("SVM", ml_model_svm.predict(x_test))

        ml_model_nb = scikit_ml_helper.train_gnb_classifier(x_train, y_train)
        log_scores("Naive Bayes", ml_model_nb.predict(x_test))

        log.info("Completed execution")
# Ejemplo n.º 2
    def process(self):
        """Train a doc2vec model on FPB sentences, fit a linear regressor on
        the inferred document vectors, and log r2/semeval scores for the test
        headlines.
        """
        log.info("Began Processing")

        fpb_training_docs = FPBTaggedLineDocument(
            self.options.fpb_sentences_file_path)

        doc2vec_model = \
            doc2vec_helper.init_model(
                fpb_training_docs, self.options.docvec_dimension_size, self.options.docvec_iteration_count
            )
        log.info("Doc2vec model initialized with " +
                 str(self.options.docvec_dimension_size) + " dimensions and " +
                 str(self.options.docvec_iteration_count) + " iterations")
        label_list = fpb_training_docs.get_label_list()

        log.info("Re-training document vectors")
        x_train = list()
        # `range` replaces Python-2-only `xrange`, consistent with the other
        # process() implementations in this file.
        for _ in range(len(label_list)):
            # NOTE(review): infer_vector is called on the full get_phrases()
            # result on every iteration, so every x_train entry is inferred
            # from the same input (the loop index is unused) — confirm
            # whether get_phrases() should be indexed per document.
            x_vector = doc2vec_model.infer_vector(
                fpb_training_docs.get_phrases())
            x_train.append(x_vector)

        log.info("Training ML model")
        linear_regression_model = ml_helper.train_linear_regressor(
            x_train, label_list)

        log.info("Predicting test set")
        x_test_articles, y_true = file_helper.get_article_details(
            self.options.test_headlines_data_path)
        x_test = [doc2vec_model.infer_vector(article)
                  for article in x_test_articles]
        y_pred = linear_regression_model.predict(x_test)

        # Collect run parameters and evaluation metrics for logging.
        test_result_dict = dict()
        test_result_dict['dimension_size'] = self.options.docvec_dimension_size
        test_result_dict['iteration_count'] = self.options.docvec_iteration_count
        test_result_dict['r2_score'] = metrics.r2_score(y_true, y_pred)
        test_result_dict['semeval_score'] = evaluation_helper.evaluate_task_score(
            y_pred, y_true)

        log.info("Test result: " + str(test_result_dict))

        log.info("Completed Processing")
# Ejemplo n.º 3
    def process(self):
        """Score a LinearSVR with 10-fold cross-validation on doc2vec
        vectors inferred for the pooled train + test headline sets.
        """
        log.info("Began Processing")

        semeval_train_docs = SemevalTaggedLineDocument(
            self.options.train_headlines_data_path)

        doc2vec_model = doc2vec_helper.init_model(
            semeval_train_docs,
            self.options.docvec_dimension_size,
            self.options.docvec_iteration_count)
        log.info("Doc2vec model initialized with " +
                 str(self.options.docvec_dimension_size) + " dimensions and " +
                 str(self.options.docvec_iteration_count) + " iterations")

        # Load headline text and gold labels for both splits.
        train_articles, y_train = file_helper.get_article_details(
            self.options.train_headlines_data_path)

        # Infer one document vector per training headline.
        x_train = [doc2vec_model.infer_vector(article)
                   for article in train_articles]

        test_articles, y_true = file_helper.get_article_details(
            self.options.test_headlines_data_path)
        custom_scorer = make_scorer(evaluate_task_score)

        # Infer one document vector per test headline.
        x_test = [doc2vec_model.infer_vector(article)
                  for article in test_articles]

        # Pool both splits; cross-validation re-partitions them anyway.
        x_train.extend(x_test)
        y_train.extend(y_true)
        scores = model_selection.cross_val_score(
            svm.LinearSVR(), x_train, y_train, cv=10, scoring=custom_scorer)

        log.info("Accuracy: %0.2f (+/- %0.2f)" %
                 (scores.mean(), scores.std() * 2))

        log.info("Completed Processing")
    def process(self):
        """Train a doc2vec model on the combined Veriday + Amazon document
        sets and fit a linear sentiment model on the resulting vectors.
        Saves both models to disk.
        """
        log.info("Commencing execution")

        # Tagged documents built from the Veriday article source.
        log.info("Getting tagged Veriday articles ... ")
        raw_veriday_articles = file_helper.get_articles_list(
            self.articles_source_file_path)
        veriday_docs = doc2vec_helper.get_tagged_articles_veriday(
            raw_veriday_articles)

        # Tagged Amazon reviews plus their sentiment scores.
        log.info("Getting tagged Amazon reviews ... ")
        tagged_articles, sentiment_scores_dict = \
            doc2vec_helper.get_tagged_amazon_reviews(self.labeled_articles_file_path)

        # Train on the union of both document sets.
        tagged_articles.extend(veriday_docs)

        # Build the model vocabulary.
        log.info("Initializing the doc2vec model ...")
        doc2vec_model = doc2vec_helper.init_model(tagged_articles)

        # Repeated shuffle-and-train passes over the corpus.
        log.info("Training the doc2vec model ...")
        shuffles_left = self.shuffle_count
        while shuffles_left > 0:
            log.info("Shuffles remaining: " + str(shuffles_left))
            doc2vec_helper.shuffle_and_train_articles(
                doc2vec_model, tagged_articles)
            shuffles_left -= 1

        # Persist the trained doc2vec model.
        doc2vec_model.save(self.doc2vec_model_file_path)

        # Fit the regressor on the learned document vectors.
        x_docvecs, y_scores = scikit_ml_helper.extract_training_parameters(
            doc2vec_model, sentiment_scores_dict)
        log.info("Training the ML model ...")
        ml_model = scikit_ml_helper.train_linear_model(x_docvecs, y_scores)

        # Persist the regressor.
        scikit_ml_helper.persist_model_to_disk(ml_model,
                                               self.ml_model_file_path)

        log.info("Completed execution")
    def process(self):
        """Build doc2vec vectors for the combined review set, then train and
        persist three classifiers (logistic regression, naive Bayes, linear
        SVM) on them.
        """
        log.info("Commencing execution")

        review_iterator = file_helper.get_reviews_iterator(
            self.classification_sources_file_path)

        # Materialize the iterator; the document count drives the parameter
        # extraction below.
        sentences = list(review_iterator)
        doc_count = len(sentences)

        doc2vec_model = doc2vec_helper.init_model(sentences)
        log.info("Learnt vocab from training set")

        # Persist the doc2vec model.
        doc2vec_model.save(self.doc2vec_model_file_path)

        # Extract features/labels for the classifiers.
        x_docvecs, y_scores = doc2vec_helper.extract_classification_parameters(
            doc2vec_model, doc_count)

        log.info("Training the ML models")
        trained_models = [
            (".docvec.log_reg",
             scikit_ml_helper.train_logistic_reg_classifier(x_docvecs, y_scores)),
            (".docvec.nb",
             scikit_ml_helper.train_gnb_classifier(x_docvecs, y_scores)),
            (".docvec.svm_linear",
             scikit_ml_helper.train_svm_classifier(x_docvecs, y_scores)),
        ]

        log.info("Saving the ML models to disk")
        for suffix, model in trained_models:
            scikit_ml_helper.persist_model_to_disk(
                model, self.ml_model_file_path + suffix)

        log.info("Completed execution")
# Ejemplo n.º 6
    def process(self):
        """Train a doc2vec model on tagged Semeval articles and fit a
        Gaussian naive-Bayes classifier on the extracted document vectors.
        Saves both models to disk.
        """
        log.info("Commencing execution")

        # Load and tag the Semeval article set.
        log.info("Getting Semeval articles ... ")
        raw_articles = file_helper.get_articles_list(
            self.labeled_articles_file_path)
        tagged_articles, document_sentiment_classes = \
            doc2vec_helper.get_tagged_semeval_articles(raw_articles)

        # Build the model vocabulary.
        log.info("Initializing the doc2vec model ...")
        doc2vec_model = doc2vec_helper.init_model(tagged_articles)

        # Repeated shuffle-and-train passes over the corpus.
        log.info("Training the doc2vec model ...")
        remaining = self.shuffle_count
        while remaining > 0:
            log.info("Shuffles remaining: " + str(remaining))
            doc2vec_helper.shuffle_and_train_articles(
                doc2vec_model, tagged_articles)
            remaining -= 1

        # Persist the trained doc2vec model.
        doc2vec_model.save(self.doc2vec_model_file_path)

        # Fit the classifier on the learned document vectors; a GNB
        # classifier is used here rather than a linear model.
        x_docvecs, y_scores = scikit_ml_helper.extract_training_parameters(
            doc2vec_model, document_sentiment_classes)
        log.info("Training the ML model ...")
        ml_model = scikit_ml_helper.train_gnb_classifier(x_docvecs, y_scores)

        # Persist the classifier.
        scikit_ml_helper.persist_model_to_disk(ml_model,
                                               self.ml_model_file_path)

        log.info("Completed execution")
# Ejemplo n.º 7
    def process(self):
        """Run one of two modes over the Semeval headline data.

        Both modes train a doc2vec model on the training headlines, fit a
        linear regressor on the inferred vectors, and predict the test set:

        * ``validate`` — log r2 and semeval scores for the predictions.
        * ``annotate`` — write the predictions back into the test set file.

        Raises:
            RuntimeError: if neither ``options.validate`` nor
                ``options.annotate`` is set.
        """
        log.info("Began Processing")

        # Guard clause: fail fast before doing any expensive training.
        if not (self.options.validate or self.options.annotate):
            raise RuntimeError("Invalid run mode. Valid modes are 'validate' and 'annotate'")

        # --- shared pipeline (previously duplicated in both branches) ---
        semeval_train_docs = SemevalTaggedLineDocument(self.options.train_headlines_data_path)

        doc2vec_model = \
            doc2vec_helper.init_model(
                semeval_train_docs, self.options.docvec_dimension_size, self.options.docvec_iteration_count
            )
        log.info("Doc2vec model initialized with " + str(self.options.docvec_dimension_size) +
                 " dimensions and " + str(self.options.docvec_iteration_count) + " iterations")

        x_articles, y_train = file_helper.get_article_details(self.options.train_headlines_data_path)
        # Infer one document vector per training headline.
        x_train = [doc2vec_model.infer_vector(article) for article in x_articles]

        linear_regression_model = ml_helper.train_linear_regressor(x_train, y_train)

        x_test_articles, y_true = file_helper.get_article_details(self.options.test_headlines_data_path)
        # Infer one document vector per test headline.
        x_test = [doc2vec_model.infer_vector(article) for article in x_test_articles]

        y_pred = linear_regression_model.predict(x_test)

        # --- mode-specific tail (validate takes precedence, as before) ---
        if self.options.validate:
            # Collect run parameters and evaluation metrics for logging.
            test_result_dict = dict()
            test_result_dict['dimension_size'] = self.options.docvec_dimension_size
            test_result_dict['iteration_count'] = self.options.docvec_iteration_count
            test_result_dict['r2_score'] = metrics.r2_score(y_true, y_pred)
            test_result_dict['semeval_score'] = evaluation_helper.evaluate_task_score(y_pred, y_true)

            log.info(test_result_dict)
        else:
            log.info("Annotating test set")
            file_helper.annotate_test_set(self.options.test_headlines_data_path, y_pred)

        log.info("Completed Processing")