def process(self):
    """Train a Doc2Vec model and evaluate three classifiers on inferred vectors.

    Steps:
      1. Build a Doc2Vec model from ``self.labeled_articles_file_path`` and
         save it to ``self.doc2vec_model_file_path``.
      2. Infer one vector per line of the training file and of
         ``self.articles_source_file_path`` (used as the test set).
      3. Train logistic regression, SVM and Gaussian naive Bayes classifiers
         and log precision, recall and accuracy for each.

    Only the logistic regression model is persisted (to
    ``self.ml_model_file_path``).
    """
    log.info("Commencing execution")

    tagged_docs = TaggedLineDocument(self.labeled_articles_file_path)

    log.info("Training Doc2Vec model")
    doc2vec_model = doc2vec_helper.init_model(tagged_docs)
    doc2vec_model.save(self.doc2vec_model_file_path)
    log.info("Learnt vocab from training set and saved doc2vec model")

    def infer_vectors(path):
        # infer_vector expects a list of word tokens. Passing the raw line
        # (a str) makes it iterate over single characters instead of words.
        # Whitespace splitting mirrors how gensim's TaggedLineDocument
        # tokenises each line (assuming that is the class in use here).
        with open(path) as lines:
            return [doc2vec_model.infer_vector(line.split()) for line in lines]

    def log_scores(title, y_pred):
        # Emit the same three metrics for every classifier.
        log.info(title)
        log.info("Precision: " + str(metrics.precision_score(y_pred=y_pred, y_true=y_true)))
        log.info("Recall: " + str(metrics.recall_score(y_pred=y_pred, y_true=y_true)))
        log.info("Accuracy: " + str(metrics.accuracy_score(y_pred=y_pred, y_true=y_true)))

    x_train = infer_vectors(self.labeled_articles_file_path)
    y_train = [0] * self.samples_per_class_train + [1] * self.samples_per_class_train

    x_test = infer_vectors(self.articles_source_file_path)
    # NOTE(review): the test labels are ordered 1-then-0 while the training
    # labels are 0-then-1 -- confirm this matches the layout of both files.
    y_true = [1] * self.samples_per_class_test + [0] * self.samples_per_class_test

    ml_model_logreg = scikit_ml_helper.train_logistic_reg_classifier(x_train, y_train)
    scikit_ml_helper.persist_model_to_disk(ml_model_logreg, self.ml_model_file_path)
    log_scores("Logistic Regression", ml_model_logreg.predict(x_test))

    ml_model_svm = scikit_ml_helper.train_svm_classifier(x_train, y_train)
    log_scores("SVM", ml_model_svm.predict(x_test))

    ml_model_nb = scikit_ml_helper.train_gnb_classifier(x_train, y_train)
    log_scores("Naive Bayes", ml_model_nb.predict(x_test))

    log.info("Completed execution")
def process(self):
    """Train a linear regressor on FPB document vectors and score the test set.

    A Doc2Vec model is built from ``self.options.fpb_sentences_file_path``
    using the configured dimension size and iteration count. Vectors are then
    inferred for the training phrases, a linear regressor is fitted against
    the FPB labels, and predictions for
    ``self.options.test_headlines_data_path`` are evaluated with the R2 score
    and the task score from ``evaluation_helper``. The result is logged.
    """
    log.info("Began Processing")

    options = self.options
    fpb_training_docs = FPBTaggedLineDocument(options.fpb_sentences_file_path)

    doc2vec_model = doc2vec_helper.init_model(
        fpb_training_docs,
        options.docvec_dimension_size,
        options.docvec_iteration_count
    )
    log.info("Doc2vec model initialized with " +
             str(options.docvec_dimension_size) +
             " dimensions and " +
             str(options.docvec_iteration_count) + " iterations")

    label_list = fpb_training_docs.get_label_list()

    log.info("Re-training document vectors")
    # The previous loop passed the *entire* phrase collection to
    # infer_vector on every iteration (the loop index was never used), so
    # every training row was inferred from the same input. Infer one vector
    # per phrase instead.
    # NOTE(review): assumes get_phrases() yields one phrase per entry of
    # label_list, in the same order -- confirm against FPBTaggedLineDocument.
    phrases = fpb_training_docs.get_phrases()
    x_train = [doc2vec_model.infer_vector(phrase)
               for phrase, _label in zip(phrases, label_list)]

    log.info("Training ML model")
    linear_regression_model = ml_helper.train_linear_regressor(
        x_train, label_list)

    log.info("Predicting test set")
    x_test_articles, y_true = file_helper.get_article_details(
        options.test_headlines_data_path)
    x_test = [doc2vec_model.infer_vector(article)
              for article in x_test_articles]

    y_pred = linear_regression_model.predict(x_test)

    test_result_dict = dict()
    test_result_dict['dimension_size'] = options.docvec_dimension_size
    test_result_dict['iteration_count'] = options.docvec_iteration_count
    test_result_dict['r2_score'] = metrics.r2_score(y_true, y_pred)
    test_result_dict['semeval_score'] = evaluation_helper.evaluate_task_score(
        y_pred, y_true)

    log.info("Test result: " + str(test_result_dict))
    log.info("Completed Processing")
def process(self):
    """Cross-validate a LinearSVR on Doc2Vec vectors of all headlines.

    The Doc2Vec model is built from the training headlines only. Vectors are
    inferred for both the training and the test headlines, the two sets are
    pooled, and a 10-fold cross-validation is run with a scorer wrapping
    ``evaluate_task_score``. The mean and twice the standard deviation of the
    fold scores are logged.
    """
    log.info("Began Processing")

    opts = self.options
    dimension_size = opts.docvec_dimension_size
    iteration_count = opts.docvec_iteration_count

    training_corpus = SemevalTaggedLineDocument(opts.train_headlines_data_path)
    doc2vec_model = doc2vec_helper.init_model(
        training_corpus, dimension_size, iteration_count)
    log.info("Doc2vec model initialized with " + str(dimension_size) +
             " dimensions and " + str(iteration_count) + " iterations")

    # Training headlines -> vectors.
    train_articles, y_train = file_helper.get_article_details(
        opts.train_headlines_data_path)
    x_train = [doc2vec_model.infer_vector(text) for text in train_articles]

    # Test headlines -> vectors.
    test_articles, y_true = file_helper.get_article_details(
        opts.test_headlines_data_path)
    x_test = [doc2vec_model.infer_vector(text) for text in test_articles]

    # Pool both sets; cross-validation does its own splitting.
    x_train.extend(x_test)
    y_train.extend(y_true)

    task_scorer = make_scorer(evaluate_task_score)
    regressor = svm.LinearSVR()
    scores = model_selection.cross_val_score(
        regressor, x_train, y_train, cv=10, scoring=task_scorer)

    # The logged figure is the mean of the custom task scorer across folds.
    spread = scores.std() * 2
    log.info("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), spread))

    log.info("Completed Processing")
def process(self):
    """Train Doc2Vec on Veriday articles plus Amazon reviews, then fit a linear model.

    Both the Doc2Vec model and the downstream ML model are written to disk
    (``self.doc2vec_model_file_path`` and ``self.ml_model_file_path``).
    """
    log.info("Commencing execution")

    # Veriday articles.
    log.info("Getting tagged Veriday articles ... ")
    raw_veriday = file_helper.get_articles_list(self.articles_source_file_path)
    veriday_docs = doc2vec_helper.get_tagged_articles_veriday(raw_veriday)

    # Amazon reviews, which come with sentiment scores.
    log.info("Getting tagged Amazon reviews ... ")
    amazon_result = doc2vec_helper.get_tagged_amazon_reviews(
        self.labeled_articles_file_path)
    corpus, review_scores = amazon_result

    # Single corpus: Amazon reviews followed by Veriday articles.
    corpus.extend(veriday_docs)

    # Build the vocabulary.
    log.info("Initializing the doc2vec model ...")
    doc2vec_model = doc2vec_helper.init_model(corpus)

    # Repeated shuffle-and-train passes, counting down in the log.
    log.info("Training the doc2vec model ...")
    for remaining in range(self.shuffle_count, 0, -1):
        log.info("Shuffles remaining: " + str(remaining))
        doc2vec_helper.shuffle_and_train_articles(doc2vec_model, corpus)

    # Persist the document-vector model.
    doc2vec_model.save(self.doc2vec_model_file_path)

    # Fit the downstream model on document vectors against review scores.
    training_params = scikit_ml_helper.extract_training_parameters(
        doc2vec_model, review_scores)
    x_docvecs, y_scores = training_params

    log.info("Training the ML model ...")
    fitted_model = scikit_ml_helper.train_linear_model(x_docvecs, y_scores)

    # Persist the downstream model.
    scikit_ml_helper.persist_model_to_disk(fitted_model, self.ml_model_file_path)

    log.info("Completed execution")
def process(self):
    """Train Doc2Vec on the review corpus and fit three classifiers on its vectors.

    The Doc2Vec model is saved to ``self.doc2vec_model_file_path``. The
    logistic regression, Gaussian naive Bayes and linear SVM models are saved
    next to ``self.ml_model_file_path`` with the suffixes
    ``.docvec.log_reg``, ``.docvec.nb`` and ``.docvec.svm_linear``.
    """
    log.info("Commencing execution")

    # Materialise the iterator so the documents can be counted as well as
    # handed to the model.
    review_docs = file_helper.get_reviews_iterator(
        self.classification_sources_file_path)
    sentences = list(review_docs)
    doc_count = len(sentences)

    doc2vec_model = doc2vec_helper.init_model(sentences)
    log.info("Learnt vocab from training set")

    # Persist the document-vector model.
    doc2vec_model.save(self.doc2vec_model_file_path)

    # Features and labels for the classifiers.
    x_docvecs, y_scores = doc2vec_helper.extract_classification_parameters(
        doc2vec_model, doc_count)

    log.info("Training the ML models")
    logreg = scikit_ml_helper.train_logistic_reg_classifier(x_docvecs, y_scores)
    naive_bayes = scikit_ml_helper.train_gnb_classifier(x_docvecs, y_scores)
    svm_linear = scikit_ml_helper.train_svm_classifier(x_docvecs, y_scores)

    log.info("Saving the ML models to disk")
    base_path = self.ml_model_file_path
    for fitted, suffix in ((logreg, ".docvec.log_reg"),
                           (naive_bayes, ".docvec.nb"),
                           (svm_linear, ".docvec.svm_linear")):
        scikit_ml_helper.persist_model_to_disk(fitted, base_path + suffix)

    log.info("Completed execution")
def process(self):
    """Train Doc2Vec on Semeval articles and fit a Gaussian naive Bayes classifier.

    Both the Doc2Vec model and the classifier are written to disk
    (``self.doc2vec_model_file_path`` and ``self.ml_model_file_path``).
    """
    log.info("Commencing execution")

    # Load and tag the Semeval articles.
    log.info("Getting Semeval articles ... ")
    raw_articles = file_helper.get_articles_list(self.labeled_articles_file_path)
    tagged = doc2vec_helper.get_tagged_semeval_articles(raw_articles)
    semeval_docs, sentiment_classes = tagged

    # Build the vocabulary.
    log.info("Initializing the doc2vec model ...")
    doc2vec_model = doc2vec_helper.init_model(semeval_docs)

    # Repeated shuffle-and-train passes, counting down in the log.
    log.info("Training the doc2vec model ...")
    for remaining in range(self.shuffle_count, 0, -1):
        log.info("Shuffles remaining: " + str(remaining))
        doc2vec_helper.shuffle_and_train_articles(doc2vec_model, semeval_docs)

    # Persist the document-vector model.
    doc2vec_model.save(self.doc2vec_model_file_path)

    # Features and labels for the classifier.
    training_params = scikit_ml_helper.extract_training_parameters(
        doc2vec_model, sentiment_classes)
    x_docvecs, y_scores = training_params

    log.info("Training the ML model ...")
    # Alternative: scikit_ml_helper.train_linear_model(x_docvecs, y_scores)
    classifier = scikit_ml_helper.train_gnb_classifier(x_docvecs, y_scores)

    # Persist the classifier.
    scikit_ml_helper.persist_model_to_disk(classifier, self.ml_model_file_path)

    log.info("Completed execution")
def process(self):
    """Train a Doc2Vec + linear regression pipeline, then validate or annotate.

    The run mode is taken from ``self.options``:

    * ``validate`` -- predict the test headlines and log a dict holding the
      dimension size, iteration count, R2 score and task score.
    * ``annotate`` -- predict the test headlines and write the predictions
      back through ``file_helper.annotate_test_set``.

    ``validate`` takes precedence when both flags are set.

    Raises:
        RuntimeError: if neither ``validate`` nor ``annotate`` is set. This
            is raised before any model is trained.
    """
    log.info("Began Processing")

    options = self.options
    validate = options.validate

    # Reject an invalid mode up front so no training work is wasted.
    if not validate and not options.annotate:
        raise RuntimeError("Invalid run mode. Valid modes are 'validate' and 'annotate'")

    # --- Pipeline shared by both modes (previously duplicated per branch) ---
    semeval_train_docs = SemevalTaggedLineDocument(options.train_headlines_data_path)
    doc2vec_model = doc2vec_helper.init_model(
        semeval_train_docs,
        options.docvec_dimension_size,
        options.docvec_iteration_count
    )
    log.info("Doc2vec model initialized with " +
             str(options.docvec_dimension_size) +
             " dimensions and " +
             str(options.docvec_iteration_count) + " iterations")

    x_articles, y_train = file_helper.get_article_details(
        options.train_headlines_data_path)
    x_train = [doc2vec_model.infer_vector(article) for article in x_articles]

    linear_regression_model = ml_helper.train_linear_regressor(x_train, y_train)

    x_test_articles, y_true = file_helper.get_article_details(
        options.test_headlines_data_path)
    x_test = [doc2vec_model.infer_vector(article) for article in x_test_articles]

    y_pred = linear_regression_model.predict(x_test)

    # --- Mode-specific reporting ---
    if validate:
        test_result_dict = dict()
        test_result_dict['dimension_size'] = options.docvec_dimension_size
        test_result_dict['iteration_count'] = options.docvec_iteration_count
        test_result_dict['r2_score'] = metrics.r2_score(y_true, y_pred)
        test_result_dict['semeval_score'] = evaluation_helper.evaluate_task_score(y_pred, y_true)

        log.info(test_result_dict)
        # Appending the result to a results file is currently disabled:
        # with open(self.options.results_file, 'a') as results_file:
        #     results_file.write(str(json.dumps(test_result_dict)) + "\n")
    else:
        log.info("Annotating test set")
        file_helper.annotate_test_set(options.test_headlines_data_path, y_pred)

    log.info("Completed Processing")