def overview_multilabel_models(data_frame):
    """Cross-validate several classifiers per class on a multi-label task.

    For every classifier and every binarized class column, runs
    cross-validation (skipping classes with too few positive samples),
    collects mean accuracy/recall/precision into a benchmark DataFrame,
    prints a summary table and writes the results to CSV.

    :param data_frame: input DataFrame consumed by the ch.* helpers
                       (presumably holds text and classification columns
                       — TODO confirm against ch.apply_tfidf_vectorizer_fit_transform).
    """
    # text: from text to a sparse TF-IDF matrix
    vect_data, patent_ids, vectorizer = ch.apply_tfidf_vectorizer_fit_transform(
        data_frame)
    # classification: from labels to a sparse binary matrix [[0, 1, 0], [1, 0, 1]]
    temp_classification, classes, n_classes = ch.apply_multilabel_binarizer(
        data_frame)

    benchmark_dataframe = pd.DataFrame(
        columns=['algorithm', 'accuracy', 'recall', 'precision', 'class'])

    for algorithm in [
            pmh.get_SVC(),
            pmh.get_decision_tree(),
            pmh.get_kneighbors(),
            pmh.get_logistic(),
            pmh.get_random_forest_classifier(),
            pmh.get_linear_SVC(),
            pmh.get_multinomialNB()
    ]:
        classifier_name, parameters = ch.get_classifier_information(
            str(algorithm))
        print('### ', classifier_name, ' ###')
        for i in range(n_classes):
            unique, counts = np.unique(temp_classification[:, i],
                                       return_counts=True)
            # Skip classes with fewer than two positive samples: cross
            # validation needs at least two instances of the positive label.
            if len(counts) > 1 and counts[1] > 1:
                mean_accuracy, mean_precision, mean_recall = mh.calculate_metrics_for_crossvalidation(
                    algorithm, vect_data, temp_classification[:, i])
                # NOTE: np.append discards the Series index ('Algorithm');
                # this simply tacks the classifier name onto each metric
                # array, turning it into an object-dtype ndarray.
                mean_accuracy = np.append(
                    mean_accuracy,
                    pd.Series([classifier_name], index=['Algorithm']))
                mean_precision = np.append(
                    mean_precision,
                    pd.Series([classifier_name], index=['Algorithm']))
                mean_recall = np.append(
                    mean_recall,
                    pd.Series([classifier_name], index=['Algorithm']))
                benchmark_dataframe.loc[benchmark_dataframe.shape[0] + 1] = [
                    classifier_name, mean_accuracy, mean_recall,
                    mean_precision, classes[i]
                ]

    print("algorithm ", " accuracy ", " recall ", " precision")
    for index, row in benchmark_dataframe.iterrows():
        # Use .iloc for positional access: integer __getitem__ on a
        # label-indexed Series is deprecated (removed in pandas 3.0).
        print(row.iloc[0], " ", row.iloc[1].flatten()[0], " ",
              row.iloc[2].flatten()[0], " ", row.iloc[3].flatten()[0], " ",
              row.iloc[4])

    path_to_csv = ch.get_csv_path('cross_validation_multiclass')
    ch.write_dataframe_as_csv(benchmark_dataframe, path_to_csv)
def apply_doc2vec_logistic_regression(data_frame, text_vectorizer,
                                      class_vectorizer, classif_level,
                                      classif_type, source_path):
    """Train a logistic regression on doc2vec (DBOW) features and save metrics.

    Builds a train/test split with doc2vec features via the ch helper,
    fits a logistic regression, computes and persists evaluation metrics,
    then delegates to improved_logistic_regression for the DM and
    concatenated DBOW+DM variants.

    :param data_frame: input DataFrame consumed by the ch.* helpers.
    :param text_vectorizer: label describing the text vectorizer (used in the model name).
    :param class_vectorizer: label describing the class vectorizer (used in the model name).
    :param classif_level: classification level tag forwarded to ch.save_results.
    :param classif_type: classification type tag forwarded to ch.save_results.
    :param source_path: output location forwarded to ch.save_results.
    """
    baseline_name = '[all classes predictions]' + text_vectorizer + '/' + class_vectorizer

    results_model_dbow = ch.apply_doc2vec_separated_train_test(
        data_frame, baseline_name)
    X_train, y_train, X_test, y_test, model_dbow, train_tagged, test_tagged = results_model_dbow

    logreg = pmh.get_logistic()
    classifier_name_0, parameters_0 = ch.get_classifier_information(
        str(model_dbow))
    classifier_name_1, parameters_1 = ch.get_classifier_information(
        str(logreg))

    y_pred = pmh.fit_predict_functions(logreg, X_train, y_train, X_test)

    model_name = baseline_name + classifier_name_0 + '/' + classifier_name_1
    list_metrics = mh.calculate_metrics(model_name, y_test, y_pred)

    ch.save_results(classifier_name_1, list_metrics, parameters_1, model_name,
                    classif_level, classif_type, source_path)

    # improvement: re-run with a DM doc2vec model and a concatenated DBOW+DM model
    improved_logistic_regression(train_tagged, test_tagged, model_dbow, logreg,
                                 text_vectorizer, class_vectorizer,
                                 classif_level, classif_type, source_path)
def apply_svm(X_train_tfidf, y_train, X_test_tfidf, y_test, classif_level,
              classif_type, source_path):
    """Fit an SVC on TF-IDF features, evaluate it, and persist the results.

    :param X_train_tfidf: TF-IDF feature matrix for training.
    :param y_train: training labels.
    :param X_test_tfidf: TF-IDF feature matrix for testing.
    :param y_test: test labels.
    :param classif_level: classification level tag forwarded to ch.save_results.
    :param classif_type: classification type tag forwarded to ch.save_results.
    :param source_path: output location forwarded to ch.save_results.
    """
    classifier = pmh.get_SVC()
    classifier_name, parameters = ch.get_classifier_information(str(classifier))

    predictions = pmh.fit_predict_functions(classifier, X_train_tfidf, y_train,
                                            X_test_tfidf)

    model_name = '[all classes predictions]label_encoder/tfidf/' + classifier_name
    metrics = mh.calculate_metrics(model_name, y_test, predictions)
    none_average, binary_average, micro_average, macro_average = metrics

    ch.save_results(classifier_name, metrics, parameters, model_name,
                    classif_level, classif_type, source_path)
def improved_logistic_regression(train_tagged, test_tagged, model_dbow, logreg,
                                 text_vectorizer, class_vectorizer,
                                 classif_level, classif_type, source_path):
    """Re-evaluate the logistic regression on DM and concatenated doc2vec features.

    Trains a second (distributed-memory) doc2vec model, then builds a
    concatenated DBOW+DM model and fits/evaluates the logistic regression on
    its vectors; metrics for the concatenated model are saved.

    NOTE(review): the first fit/predict on the DM-only vectors is overwritten
    by the concatenated-model run — presumably intentional warm-up or a
    leftover; confirm whether its result should also be reported.
    """
    dbow_name, _dbow_params = ch.get_classifier_information(str(model_dbow))
    logreg_name, logreg_params = ch.get_classifier_information(str(logreg))

    # Train a distributed-memory doc2vec model on the tagged training docs.
    model_dmm = wmh.train_doc2vec_with_tagged_data(train_tagged.values)
    dmm_name, _dmm_params = ch.get_classifier_information(str(model_dmm))

    y_train, X_train = wmh.vec_for_learning(model_dmm, train_tagged)
    y_test, X_test = wmh.vec_for_learning(model_dmm, test_tagged)
    y_pred = pmh.fit_predict_functions(logreg, X_train, y_train, X_test)

    # Drop training-only state before concatenating the two models.
    model_dbow.delete_temporary_training_data(keep_doctags_vectors=True,
                                              keep_inference=True)
    model_dmm.delete_temporary_training_data(keep_doctags_vectors=True,
                                             keep_inference=True)

    merged_model = wmh.get_concatenated_doc2vec(model_dbow, model_dmm)

    y_train, X_train = wmh.vec_for_learning(merged_model, train_tagged)
    y_test, X_test = wmh.vec_for_learning(merged_model, test_tagged)
    y_pred = pmh.fit_predict_functions(logreg, X_train, y_train, X_test)

    model_name = '[all classes predictions]' + dbow_name + '/' + dmm_name + '/' + logreg_name
    metrics = mh.calculate_metrics(model_name, y_test, y_pred)
    none_average, binary_average, micro_average, macro_average = metrics

    ch.save_results(logreg_name, metrics, logreg_params, model_name,
                    classif_level, classif_type, source_path)
def overview_models(data_frame, classif_level, classif_type, source_path):
    """Cross-validate several baseline classifiers on TF-IDF features.

    Runs cross-validated accuracy scoring for each candidate classifier,
    averages the fold results, tags each row with the classifier name and
    persists the benchmark via ch.save_results_cross_validation.

    :param data_frame: input DataFrame; must expose a 'classification' column.
    :param classif_level: classification level tag forwarded to the save helper.
    :param classif_type: classification type tag forwarded to the save helper.
    :param source_path: output location forwarded to the save helper.
    """
    vect_data, patent_ids, vectorizer = ch.apply_tfidf_vectorizer_fit_transform(
        data_frame)

    benchmark = []
    # candidates considered: word2vec model, svm, decision tree, random forest,
    # hidden markov model, k-nearest
    for algorithm in [
            pmh.get_SVC(),
            pmh.get_decision_tree(),
            pmh.get_kneighbors(),
            pmh.get_logistic()
    ]:
        classifier_name, parameters = ch.get_classifier_information(
            str(algorithm))
        print('### ', classifier_name, ' ###')
        cross_results = mh.get_cross_val_score(algorithm, vect_data,
                                               data_frame['classification'],
                                               'accuracy')
        tmp = pd.DataFrame.from_dict(cross_results).mean(axis=0)
        # Series.append was removed in pandas 2.0; pd.concat is the
        # supported replacement and yields the same combined Series.
        tmp = pd.concat([tmp, pd.Series([classifier_name], index=['algorithm'])])
        benchmark.append(tmp)

    ch.save_results_cross_validation('cross_validation', benchmark, ['cv=5'],
                                     classif_level, classif_type, source_path)