def process(self): log.info("Commencing execution") # Get tagged articles from Veriday log.info("Getting tagged Veriday articles ... ") veriday_articles_raw = file_helper.get_articles_list(self.articles_source_file_path) veriday_tagged_articles = doc2vec_helper.get_tagged_articles_veriday(veriday_articles_raw) # Convert articles file into a Tagged documents for doc2vec log.info("Getting tagged Semeval articles ... ") articles = file_helper.get_articles_list(self.labeled_articles_file_path) tagged_articles, sentiment_scores_dict = doc2vec_helper.get_tagged_articles_scores(articles) # combine both article sets tagged_articles.extend(veriday_tagged_articles) # model initialization and vocab building log.info("Initializing the doc2vec model ...") doc2vec_model = doc2vec_helper.init_model(tagged_articles) # shuffling and training the model log.info("Training the doc2vec model ...") for i in range(self.shuffle_count): log.info("Shuffles remaining: " + str(self.shuffle_count - i)) doc2vec_helper.shuffle_and_train_articles(doc2vec_model, tagged_articles) # saving the doc2vec model to disk doc2vec_model.save(self.doc2vec_model_file_path) # Extracting parameters for and training the ML model x_docvecs, y_scores = scikit_ml_helper.extract_training_parameters(doc2vec_model, sentiment_scores_dict) log.info("Training the ML model ...") ml_model = scikit_ml_helper.train_linear_model(x_docvecs, y_scores) # saving the ml model to disk scikit_ml_helper.persist_model_to_disk(ml_model, self.ml_model_file_path) log.info("Completed execution")
def process(self): log.info("Commencing execution") # Get tagged articles from Semeval log.info("Getting Semeval articles ... ") semeval_articles_raw = file_helper.get_articles_list( self.labeled_articles_file_path) semeval_tagged_articles, document_sentiment_classes = \ doc2vec_helper.get_tagged_semeval_articles(semeval_articles_raw) # model initialization and vocab building log.info("Initializing the doc2vec model ...") doc2vec_model = doc2vec_helper.init_model(semeval_tagged_articles) # shuffling and training the model log.info("Training the doc2vec model ...") for i in range(self.shuffle_count): log.info("Shuffles remaining: " + str(self.shuffle_count - i)) doc2vec_helper.shuffle_and_train_articles(doc2vec_model, semeval_tagged_articles) # saving the doc2vec model to disk doc2vec_model.save(self.doc2vec_model_file_path) # Extracting parameters for and training the ML model x_docvecs, y_scores = scikit_ml_helper.extract_training_parameters( doc2vec_model, document_sentiment_classes) log.info("Training the ML model ...") # ml_model = scikit_ml_helper.train_linear_model(x_docvecs, y_scores) ml_model = scikit_ml_helper.train_gnb_classifier(x_docvecs, y_scores) # saving the ml model to disk scikit_ml_helper.persist_model_to_disk(ml_model, self.ml_model_file_path) log.info("Completed execution")
def __init__(self, source):
    super(SemevalTaggedLineDocument, self).__init__(source)

    self.semeval_articles = file_helper.get_articles_list(source)
    self.counter = -1
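# The rest of SemevalTaggedLineDocument is not shown. The __iter__ below is a
# hypothetical sketch of how the class might yield one gensim TaggedDocument
# per Semeval article, tagged with the running counter; the 'title' field is
# an assumption about the article schema.
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess


def __iter__(self):
    # Tag every article with a unique integer id so that its trained vector
    # can be looked up after training.
    for article in self.semeval_articles:
        self.counter += 1
        yield TaggedDocument(words=simple_preprocess(article['title']),
                             tags=[self.counter])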
def process(self): log.info("Commencing execution") with open(self.classification_sources_file_path) as source_cfg: sources_dict = json.load(source_cfg) x_documents = list() y_scores = list() log.info("Parsing documents") for sentiment in sources_dict: review_file_path = sources_dict[sentiment] with open(review_file_path) as review_file: for line in review_file: x_document = line y_score = sentiment x_documents.append(x_document) y_scores.append(y_score) semeval_classified_articles_file = self.labeled_articles_file_path semeval_classified_articles = file_helper.get_articles_list( semeval_classified_articles_file) y_true = list() semeval_count = 0 for semeval_classified_article in semeval_classified_articles: # article_text = semeval_classified_article['articleText'] article_text = semeval_classified_article['title'] if not article_text: continue semeval_count += 1 x_documents.append(article_text) y_true.append(semeval_classified_article['label']) log.info("Initiating training for document vectors") vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english') x_vectors = vectorizer.fit_transform(x_documents) labelled_docs = x_vectors[(-1 * semeval_count):] log.info("Vectors have been trained") log.info("Training the ML models") ml_model_logreg = scikit_ml_helper.train_logistic_reg_classifier( x_vectors[:(-1 * semeval_count)], y_scores) ml_model_nb = scikit_ml_helper.train_gnb_classifier_dense( x_vectors[:(-1 * semeval_count)], y_scores) ml_model_svm_linear = scikit_ml_helper.train_svm_classifier( x_vectors[:(-1 * semeval_count)], y_scores) log.info("Saving the ML models to disk") scikit_ml_helper.persist_model_to_disk( ml_model_logreg, self.ml_model_file_path + ".tfidf.log_reg") scikit_ml_helper.persist_model_to_disk( ml_model_nb, self.ml_model_file_path + ".tfidf.nb") scikit_ml_helper.persist_model_to_disk( ml_model_svm_linear, self.ml_model_file_path + ".tfidf.svm_linear") predictions_logreg = ml_model_logreg.predict(labelled_docs) predictions_linearsvm = ml_model_svm_linear.predict(labelled_docs) predictions_nb = ml_model_nb.predict(labelled_docs) accuracy_logreg = \ sklearn.metrics.accuracy_score(y_true=y_true, y_pred=predictions_logreg, normalize=True, sample_weight=None) accuracy_linearsvm = \ sklearn.metrics.accuracy_score(y_true=y_true, y_pred=predictions_linearsvm, normalize=True, sample_weight=None) accuracy_nb = \ sklearn.metrics.accuracy_score(y_true=y_true, y_pred=predictions_nb, normalize=True, sample_weight=None) log.info("accuracy_logreg: " + str(accuracy_logreg)) log.info("accuracy_linearsvm: " + str(accuracy_linearsvm)) log.info("accuracy_nb: " + str(accuracy_nb)) log.info("\ncm_logreg\n" + str( scikit_ml_helper.get_confusion_matrix(y_true, predictions_logreg))) log.info("\ncm_linearsvm\n" + str( scikit_ml_helper.get_confusion_matrix(y_true, predictions_linearsvm))) log.info( "\ncm_nb\n" + str(scikit_ml_helper.get_confusion_matrix(y_true, predictions_nb))) log.info("Completed execution")
nb_ml_model_file_path = \
    "/home/v2john/Documents/amazon/models/books_ml_docvec.model.docvec.nb"

doc2vec_model = Doc2Vec.load(doc2vec_model_file_path)
logreg_model = scikit_ml_helper.get_model_from_disk(logreg_ml_model_file_path)
svm_model = scikit_ml_helper.get_model_from_disk(svm_linear_ml_model_file_path)
nb_model = scikit_ml_helper.get_model_from_disk(nb_ml_model_file_path)

semeval_classified_articles_file = \
    "/home/v2john/Dropbox/Personal/Academic/Masters/UWaterloo/Academics/ResearchProject/semeval_task/" + \
    "semeval-2017-task-5-subtask-2/semeval_combined_fulltext_classified.json"
semeval_classified_articles = \
    file_helper.get_articles_list(semeval_classified_articles_file)

article_vectors = list()
count = 0
y_true = list()
for semeval_classified_article in semeval_classified_articles:
    print('working on article ' + str(count))
    count += 1

    article_text = semeval_classified_article['articleText']
    # article_text = semeval_classified_article['title']
    if not article_text:
        continue
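# The loop above is truncated. The evaluate_loaded_models function below is a
# hypothetical sketch of how inference and evaluation might continue, assuming
# each article is embedded with Doc2Vec.infer_vector and each loaded
# classifier is scored with scikit-learn's accuracy_score.
import sklearn.metrics

from gensim.utils import simple_preprocess


def evaluate_loaded_models(doc2vec_model, models, articles):
    # models: dict mapping a display name to a fitted classifier, e.g.
    # {'logreg': logreg_model, 'svm': svm_model, 'nb': nb_model}
    article_vectors, y_true = list(), list()
    for article in articles:
        article_text = article['articleText']
        if not article_text:
            continue
        article_vectors.append(
            doc2vec_model.infer_vector(simple_preprocess(article_text)))
        y_true.append(article['label'])

    # Score every classifier on the inferred document vectors.
    for name, model in models.items():
        predictions = model.predict(article_vectors)
        print(name + ' accuracy: ' +
              str(sklearn.metrics.accuracy_score(y_true, predictions)))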