def apply_word2vec_extratrees(data_frame, classif_level, classif_type, source_path):
    """Score extra-trees pipelines built on two Word2Vec embedding vectorizers.

    The ``text`` and ``classification`` columns of ``data_frame`` are
    overwritten with the output of ``th.tokenize_complex_text``. The frame is
    then expanded by ``ch.get_list_each_text_a_different_classification``,
    split into train and test parts, and a Word2Vec model is built from the
    training texts. One pipeline per embedding vectorizer is fitted,
    evaluated and saved under a model name that names that vectorizer.

    Args:
        data_frame: Frame with ``text`` and ``classification`` columns; both
            columns are replaced in place.
        classif_level: Forwarded unchanged to ``ch.save_results``.
        classif_type: Forwarded unchanged to ``ch.save_results``.
        source_path: Forwarded unchanged to ``ch.save_results``.
    """
    data_frame['text'] = data_frame.apply(
        lambda row: th.tokenize_complex_text(row['text']), axis=1)
    data_frame['classification'] = data_frame.apply(
        lambda row: th.tokenize_complex_text(row['classification']), axis=1)

    # NB: the model does not support multiple targets, so the sources are
    # duplicated and every copy is given a different target.
    df_single_classification = ch.get_list_each_text_a_different_classification(
        data_frame)
    x = df_single_classification['text']
    y = df_single_classification['classification']
    X_train, X_test, y_train, y_test = ch.get_train_test_from_data(x, y)

    model_w2v = wmh.get_word2vec_model(X_train)
    etree_w2v = Pipeline([
        ("word2vec vectorizer", wmh.MeanEmbeddingVectorizer(model_w2v)),
        ("extra trees", pmh.get_extra_tree()),
    ])
    etree_w2v_tfidf = Pipeline([
        ("word2vec vectorizer", wmh.TfidfEmbeddingVectorizer(model_w2v)),
        ("extra trees", pmh.get_extra_tree()),
    ])

    # As before, the classifier name and parameters are read from the
    # still-unfitted mean-embedding pipeline and reused for both runs.
    classifier_name_1, parameters_1 = ch.get_extratree_classifier_information(
        str(etree_w2v))

    # Each label is paired with the pipeline that really uses that vectorizer;
    # previously the two pipelines were swapped relative to their labels, so
    # results were saved under the wrong vectorizer name.
    labelled_pipelines = (
        ('Word2Vec/MeanEmbeddingVectorizer', etree_w2v),
        ('Word2Vec/TfidfEmbeddingVectorizer', etree_w2v_tfidf),
    )
    for vectorizer_name, pipeline in labelled_pipelines:
        y_pred = pmh.fit_predict_functions(pipeline, X_train, y_train, X_test)
        model_name = ('[all classes predictions]' + vectorizer_name + '/'
                      + classifier_name_1)
        # TODO: this should be changed by comparing all the possibilities for
        # a given text (the original data frame can be used for that).
        list_metrics = mh.calculate_metrics(model_name, y_test, y_pred)
        ch.save_results(classifier_name_1, list_metrics, parameters_1,
                        model_name, classif_level, classif_type, source_path)
def apply_label_powerset(X_train, y_train, X_test, y_test, baseline_name,
                         source_path, classif_level=None, classif_type=None):
    """Fit a label-powerset logistic classifier, then score and save it.

    Args:
        X_train: Training features handed to ``pmh.fit_predict_functions``.
        y_train: Training targets handed to ``pmh.fit_predict_functions``.
        X_test: Test features to predict on.
        y_test: Expected targets the predictions are compared against.
        baseline_name: Prefix inserted into the saved model name.
        source_path: Forwarded unchanged to ``ch.save_results``.
        classif_level: Forwarded to ``ch.save_results``. When omitted, a
            module-level ``classif_level`` is used if one exists.
        classif_type: Forwarded to ``ch.save_results``. When omitted, a
            module-level ``classif_type`` is used if one exists.

    Raises:
        NameError: If ``classif_level`` or ``classif_type`` is neither passed
            nor defined at module level.
    """
    # The body used to read classif_level / classif_type as free names even
    # though they are not parameters. Resolve them up front so a missing value
    # fails before the classifier is trained instead of after.
    settings = {'classif_level': classif_level, 'classif_type': classif_type}
    for name in ('classif_level', 'classif_type'):
        if settings[name] is None:
            if name not in globals():
                raise NameError(
                    '{} must be passed to apply_label_powerset or defined '
                    'at module level'.format(name))
            settings[name] = globals()[name]

    classifier = ch.get_label_powerset(pmh.get_logistic())
    classifier_name, parameters = ch.get_complex_classifier_information(
        str(classifier), 1, 1, 2, 0)

    y_pred = pmh.fit_predict_functions(classifier, X_train, y_train, X_test)

    model_name = '[all classes predictions]' + baseline_name + classifier_name
    list_metrics = mh.calculate_metrics(model_name, y_test, y_pred)
    ch.save_results(classifier_name, list_metrics, parameters, model_name,
                    settings['classif_level'], settings['classif_type'],
                    source_path)
def apply_svm(X_train_tfidf, y_train, X_test_tfidf, y_test, classif_level,
              classif_type, source_path):
    """Train the SVC from ``pmh.get_SVC``, then score and save its predictions.

    Args:
        X_train_tfidf: Training features handed to
            ``pmh.fit_predict_functions``.
        y_train: Training targets.
        X_test_tfidf: Test features to predict on.
        y_test: Expected targets the predictions are compared against.
        classif_level: Forwarded unchanged to ``ch.save_results``.
        classif_type: Forwarded unchanged to ``ch.save_results``.
        source_path: Forwarded unchanged to ``ch.save_results``.
    """
    estimator = pmh.get_SVC()
    # The descriptive name/parameters are parsed from the estimator's repr
    # before it is fitted.
    name, params = ch.get_classifier_information(str(estimator))

    predicted = pmh.fit_predict_functions(
        estimator, X_train_tfidf, y_train, X_test_tfidf)

    label = '[all classes predictions]label_encoder/tfidf/' + name
    metrics = mh.calculate_metrics(label, y_test, predicted)
    # Unpacked only to keep the original's four-item shape requirement.
    _none_avg, _binary_avg, _micro_avg, _macro_avg = metrics

    ch.save_results(name, metrics, params, label,
                    classif_level, classif_type, source_path)
def apply_onevsrest(X_train, y_train, X_test, y_test, classes, baseline_name,
                    source_path, classif_level=None, classif_type=None):
    """Fit a one-vs-rest logistic pipeline per class, scoring and saving each.

    For every entry of ``classes`` the pipeline is fitted on
    ``y_train[_class]``, evaluated against ``y_test[_class]`` and the metrics
    are saved. Afterwards precision, recall and F1 are derived from
    ``mh.get_predictions_distribution`` and displayed.

    Args:
        X_train: Training features.
        y_train: Training targets, indexable by each entry of ``classes``.
        X_test: Test features.
        y_test: Expected targets, indexable by each entry of ``classes``.
        classes: Iterable of class keys to process one at a time.
        baseline_name: Prefix inserted into the model names.
        source_path: Forwarded unchanged to ``ch.save_results``.
        classif_level: Forwarded to ``ch.save_results``. When omitted, a
            module-level ``classif_level`` is used if one exists.
        classif_type: Forwarded to ``ch.save_results``. When omitted, a
            module-level ``classif_type`` is used if one exists.

    Raises:
        NameError: If ``classif_level`` or ``classif_type`` is neither passed
            nor defined at module level.
    """
    # The body used to read classif_level / classif_type as free names even
    # though they are not parameters. Resolve them up front so a missing value
    # fails before any training happens.
    settings = {'classif_level': classif_level, 'classif_type': classif_type}
    for name in ('classif_level', 'classif_type'):
        if settings[name] is None:
            if name not in globals():
                raise NameError(
                    '{} must be passed to apply_onevsrest or defined at '
                    'module level'.format(name))
            settings[name] = globals()[name]

    # Alternative base estimators that were tried here (all with n_jobs=-1):
    # pmh.get_SVC() (additionally needs X_train.sort_indices() first),
    # pmh.get_multinomialNB(), pmh.get_decision_tree(), pmh.get_kneighbors(),
    # pmh.get_linear_SVC(), pmh.get_random_forest_classifier(),
    # pmh.get_SGD_classifier().
    custom_pipeline = Pipeline([
        ('clf', OneVsRestClassifier(pmh.get_logistic(), n_jobs=-1)),
    ])
    classifier_name, parameters = ch.get_complex_classifier_information(
        str(custom_pipeline), 3, 1, 4, 0)

    model_name = '[each class predictions]' + baseline_name + classifier_name
    accuracies = []
    for _class in classes:
        print('**Processing {} texts...**'.format(_class))
        y_pred = pmh.fit_predict_functions(
            custom_pipeline, X_train, y_train[_class], X_test)
        accuracies.append(mh.get_accuracy_score(y_test[_class], y_pred))

        list_metrics = mh.calculate_metrics(model_name, y_test[_class], y_pred)
        ch.save_results(classifier_name, list_metrics, parameters, model_name,
                        settings['classif_level'], settings['classif_type'],
                        source_path)
        print('Pipeline score on training {}'.format(
            custom_pipeline.score(X_train, y_train[_class])))

    # NOTE(review): this summary is labelled "all classes" but is computed
    # from the y_test / y_pred of the last class iterated above - confirm
    # whether an aggregate over every class was intended.
    true_positives, false_positives, tpfn = mh.get_predictions_distribution(
        y_test[_class], y_pred)
    model_name = '[all classes predictions]' + baseline_name + classifier_name

    if tpfn == 0 or (true_positives + false_positives) == 0:
        mh.display_directly_metrics(model_name, 0, 0, 0, -1)
    else:
        precision = true_positives / (true_positives + false_positives)
        recall = true_positives / tpfn
        # With no true positives both terms are 0; report F1 as 0 instead of
        # dividing by zero.
        if precision + recall == 0:
            f1_score = 0
        else:
            f1_score = 2 * (precision * recall) / (precision + recall)
        mh.display_directly_metrics(model_name, precision, recall, f1_score, -1)
def apply_adapted_algorithm(X_train, y_train, X_test, y_test, baseline_name,
                            source_path, classif_level=None, classif_type=None):
    """Fit the classifier from ``pmh.get_MLkNN``, then score and save it.

    The train/test inputs are converted with ``th.get_lil_matrices`` before
    fitting.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features to predict on.
        y_test: Expected targets the predictions are compared against.
        baseline_name: Prefix inserted into the saved model name.
        source_path: Forwarded unchanged to ``ch.save_results``.
        classif_level: Forwarded to ``ch.save_results``. When omitted, a
            module-level ``classif_level`` is used if one exists.
        classif_type: Forwarded to ``ch.save_results``. When omitted, a
            module-level ``classif_type`` is used if one exists.

    Raises:
        NameError: If ``classif_level`` or ``classif_type`` is neither passed
            nor defined at module level.
    """
    # The body used to read classif_level / classif_type as free names even
    # though they are not parameters. Resolve them up front so a missing value
    # fails before the classifier is trained instead of after.
    settings = {'classif_level': classif_level, 'classif_type': classif_type}
    for name in ('classif_level', 'classif_type'):
        if settings[name] is None:
            if name not in globals():
                raise NameError(
                    '{} must be passed to apply_adapted_algorithm or defined '
                    'at module level'.format(name))
            settings[name] = globals()[name]

    classifier = pmh.get_MLkNN()
    classifier_name, parameters = ch.get_complex_classifier_information(
        str(classifier), 0, 0, 1, 0)

    X_train, y_train, X_test = th.get_lil_matrices(X_train, y_train, X_test)
    y_pred = pmh.fit_predict_functions(classifier, X_train, y_train, X_test)

    model_name = '[all classes predictions]' + baseline_name + classifier_name
    list_metrics = mh.calculate_metrics(model_name, y_test, y_pred)
    ch.save_results(classifier_name, list_metrics, parameters, model_name,
                    settings['classif_level'], settings['classif_type'],
                    source_path)
def apply_doc2vec_logistic_regression(data_frame, text_vectorizer,
                                      class_vectorizer, classif_level,
                                      classif_type, source_path):
    """Score logistic regression on doc2vec features, then run the follow-up.

    The train/test data, the doc2vec model and the tagged documents all come
    from ``ch.apply_doc2vec_separated_train_test``. After the metrics of the
    logistic regression are saved, ``improved_logistic_regression`` is called
    with the same doc2vec model and classifier.

    Args:
        data_frame: Input frame handed to
            ``ch.apply_doc2vec_separated_train_test``.
        text_vectorizer: Name used in the model label and forwarded on.
        class_vectorizer: Name used in the model label and forwarded on.
        classif_level: Forwarded unchanged to ``ch.save_results``.
        classif_type: Forwarded unchanged to ``ch.save_results``.
        source_path: Forwarded unchanged to ``ch.save_results``.
    """
    baseline_name = ('[all classes predictions]' + text_vectorizer + '/'
                     + class_vectorizer)

    (X_train, y_train, X_test, y_test,
     model_dbow, train_tagged, test_tagged) = ch.apply_doc2vec_separated_train_test(
         data_frame, baseline_name)

    logreg = pmh.get_logistic()
    # Only the doc2vec model's name is needed; its parameters are discarded.
    dbow_name, _dbow_params = ch.get_classifier_information(str(model_dbow))
    logreg_name, logreg_params = ch.get_classifier_information(str(logreg))

    predicted = pmh.fit_predict_functions(logreg, X_train, y_train, X_test)

    label = baseline_name + dbow_name + '/' + logreg_name
    metrics = mh.calculate_metrics(label, y_test, predicted)
    # Unpacked only to keep the original's four-item shape requirement.
    _none_avg, _binary_avg, _micro_avg, _macro_avg = metrics
    ch.save_results(logreg_name, metrics, logreg_params, label,
                    classif_level, classif_type, source_path)

    # Follow-up evaluation that reuses the doc2vec model and the classifier.
    improved_logistic_regression(
        train_tagged, test_tagged, model_dbow, logreg,
        text_vectorizer, class_vectorizer,
        classif_level, classif_type, source_path)
def improved_logistic_regression(train_tagged, test_tagged, model_dbow, logreg,
                                 text_vectorizer, class_vectorizer,
                                 classif_level, classif_type, source_path):
    """Score logistic regression on a concatenation of two doc2vec models.

    A second doc2vec model is trained from ``train_tagged.values`` and merged
    with ``model_dbow`` through ``wmh.get_concatenated_doc2vec``. ``logreg``
    is fitted on vectors from the merged model and its metrics are saved.

    Args:
        train_tagged: Tagged training documents (must expose ``.values``).
        test_tagged: Tagged test documents.
        model_dbow: Already trained doc2vec model to merge with the new one.
        logreg: Classifier that is (re)fitted on the document vectors.
        text_vectorizer: Unused here; kept for the caller's signature.
        class_vectorizer: Unused here; kept for the caller's signature.
        classif_level: Forwarded unchanged to ``ch.save_results``.
        classif_type: Forwarded unchanged to ``ch.save_results``.
        source_path: Forwarded unchanged to ``ch.save_results``.
    """

    def _fit_and_predict(doc2vec_model):
        # Vectorise both splits with the given model, then fit and predict.
        train_labels, train_vectors = wmh.vec_for_learning(
            doc2vec_model, train_tagged)
        test_labels, test_vectors = wmh.vec_for_learning(
            doc2vec_model, test_tagged)
        predicted = pmh.fit_predict_functions(
            logreg, train_vectors, train_labels, test_vectors)
        return test_labels, predicted

    dbow_name, _dbow_params = ch.get_classifier_information(str(model_dbow))
    logreg_name, logreg_params = ch.get_classifier_information(str(logreg))

    model_dmm = wmh.train_doc2vec_with_tagged_data(train_tagged.values)
    dmm_name, _dmm_params = ch.get_classifier_information(str(model_dmm))

    # NOTE(review): the predictions of this model_dmm-only fit are never
    # scored; only the merged model below is evaluated. Confirm whether
    # metrics were meant to be reported for this step as well.
    _fit_and_predict(model_dmm)

    for trained_model in (model_dbow, model_dmm):
        trained_model.delete_temporary_training_data(
            keep_doctags_vectors=True, keep_inference=True)

    merged_model = wmh.get_concatenated_doc2vec(model_dbow, model_dmm)
    y_test, y_pred = _fit_and_predict(merged_model)

    label = ('[all classes predictions]' + dbow_name + '/' + dmm_name + '/'
             + logreg_name)
    metrics = mh.calculate_metrics(label, y_test, y_pred)
    # Unpacked only to keep the original's four-item shape requirement.
    _none_avg, _binary_avg, _micro_avg, _macro_avg = metrics
    ch.save_results(logreg_name, metrics, logreg_params, label,
                    classif_level, classif_type, source_path)
def apply_multi_label_classification_without_pipeline(
        data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, source_path):
    """Train a one-vs-rest logistic classifier per class column and save metrics.

    The data is vectorized by ``ch.apply_df_vectorizer``. With the flags as
    hard-coded below, the classifier is fitted once per class column
    (``y_train[:, i]``), the per-class predictions are stacked row-wise and
    transposed, and the resulting metrics are computed through ``mh`` and
    stored through ``ch.save_results``.

    NOTE(review): the nesting of the loop body and of the final
    ``if not just_once`` section was reconstructed from a source whose
    indentation was lost - confirm against the intended control flow.

    Args:
        data_frame: Input frame handed to ``ch.apply_df_vectorizer``.
        text_vectorizer: Text vectorizer name; also part of the model name.
        class_vectorizer: Class vectorizer name; also part of the model name.
        classif_level: Forwarded unchanged to ``ch.save_results``.
        classif_type: Forwarded unchanged to ``ch.save_results``.
        source_path: Forwarded unchanged to ``ch.save_results``.
    """
    baseline_name = text_vectorizer + '/' + class_vectorizer + '/onevsrest/'
    vectorizer_results = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, '[both]' + baseline_name)
    X_train, X_test, y_train, y_test, classes, n_classes, vocab_processor, len_vocabulary = vectorizer_results
    # len_vocabulary = 57335
    # len_vocabulary = 34736
    print(X_train[0])
    print('len_vocabulary: ', len_vocabulary, ' num_classes: ', n_classes)

    # Run classifier
    classifier = OneVsRestClassifier(pmh.get_logistic())  # here
    # classifier = OneVsRestClassifier(pmh.get_SVC()) # here
    # classifier = OneVsRestClassifier(pmh.get_multinomialNB()) # ERROR WORD2VEC/DOC2VEC (X has negative value)
    # classifier = OneVsRestClassifier(pmh.get_decision_tree()) # here
    # classifier = OneVsRestClassifier(pmh.get_kneighbors()) # here
    # classifier = OneVsRestClassifier(pmh.get_linear_SVC())
    # classifier = OneVsRestClassifier(pmh.get_random_forest_classifier()) # here
    # classifier = OneVsRestClassifier(pmh.get_SGD_classifier())

    # One row per class; np.ndarray leaves the buffers uninitialised, so every
    # row has to be written before it is read.
    train_predictions = np.ndarray(shape=(n_classes, y_train.shape[0]), dtype=int)
    predictions = np.ndarray(shape=(n_classes, y_test.shape[0]), dtype=int)

    ###
    precision = dict()
    recall = dict()
    average_precision = dict()

    classifier_name, parameters = ch.get_complex_classifier_information(str(classifier), 1, 1, 2, 0)

    # Hard-coded switches that select which of the training strategies runs.
    second_training = False
    just_once = True
    another_try = False

    # single train and estimation instead of multi-train-estimation steps (it performs better with svm and logistic)
    if not another_try:
        # With range(1) and second_training starting as False, only the
        # per-class ``else`` branch below is reached.
        for _ in range(1):
            for i in range(n_classes):
                if second_training:
                    if classifier_name in [
                            'DecisionTreeClassifier', 'KNeighborsClassifier',
                            'MultinomialNB', 'RandomForestClassifier'
                    ]:
                        # do not provide the second metrics
                        break
                    elif just_once:
                        # fit again with the whole set of classes - should be better
                        classifier.fit(X_train, y_train)
                        y_score = classifier.decision_function(X_test)
                    precision[i], recall[i], average_precision[
                        i] = mh.calculate_recall_curve_precision_score(
                            y_test[:, i], y_score[:, i], None, y_test[:, i], y_score[:, i])
                    just_once = False
                else:
                    # Fit on a single class column and keep its predictions as row i.
                    predictions[i] = pmh.fit_predict_functions(
                        classifier, X_train, y_train[:, i], X_test)
                    train_predictions[i] = classifier.predict(X_train)
                    print('**Processing classes {0:0.2f} % ...**'.format(
                        ((i + 1) / n_classes) * 100))
            second_training = True
            break
        # Rows were classes; flip so that rows line up with y_test.
        predictions = predictions.transpose()
        print('transposed')
    else:
        predictions = pmh.fit_predict_functions(classifier, X_train, y_train, X_test)
        train_predictions = classifier.predict(X_train)

    # metrics
    model_name = '[each class predictions]' + baseline_name + classifier_name
    manual_metrics = mh.calculate_manual_metrics(model_name, y_test, predictions)
    none_average, binary_average, micro_average, macro_average = manual_metrics

    # metrics
    list_metrics = mh.calculate_metrics(model_name, y_test, predictions)
    none_average, binary_average, micro_average, macro_average = list_metrics

    ch.save_results(classifier_name, list_metrics, parameters, model_name, classif_level, classif_type, source_path)

    # just_once only becomes False in the second-training branch, which is
    # also the only place y_score is assigned.
    if not just_once:
        print('not just once')
        train_predictions = classifier.predict(X_train)
        predictions = classifier.predict(X_test)

        # metrics
        mh.calculate_metrics_with_recall_curve(y_score, y_train, y_test, train_predictions, predictions)

        # metrics
        model_name = '[all classes predictions]' + baseline_name + classifier_name
        list_metrics = mh.calculate_metrics(model_name, y_test, predictions)
        none_average, binary_average, micro_average, macro_average = list_metrics

        ch.save_results(classifier_name, list_metrics, parameters, model_name, classif_level, classif_type, source_path)