Code Example #1
import numpy as np
import pandas as pd

# Helper modules used throughout these examples (aliases inferred from usage;
# the exact import paths are repository-specific):
#   ch  - corpus/vectorisation and I/O helpers
#   pmh - predictive-model factories (get_SVC, get_logistic, ...)
#   mh  - metric helpers
#   wmh - word/doc2vec model helpers


def overview_multilabel_models(data_frame):
    vect_data, patent_ids, vectorizer = ch.apply_tfidf_vectorizer_fit_transform(
        data_frame)

    ################################################# text: from raw text to a sparse document-term matrix

    # temp_text = ch.apply_count_vectorizer(data_frame)

    ################################################# classification: from class labels to a sparse binary indicator matrix [[0, 1, 0],[1, 0, 1]]

    temp_classification, classes, n_classes = ch.apply_multilabel_binarizer(
        data_frame)

    benchmark_dataframe = pd.DataFrame(
        columns=['algorithm', 'accuracy', 'recall', 'precision', 'class'])

    for algorithm in [
            pmh.get_SVC(),
            pmh.get_decision_tree(),
            pmh.get_kneighbors(),
            pmh.get_logistic(),
            pmh.get_random_forest_classifier(),
            pmh.get_linear_SVC(),
            pmh.get_multinomialNB()
    ]:
        classifier_name, parameters = ch.get_classifier_information(
            str(algorithm))
        print('###  ', classifier_name, '  ###')
        for i in range(n_classes):
            unique, counts = np.unique(temp_classification[:, i],
                                       return_counts=True)
            # skip classes whose positive label occurs at most once;
            # cross-validation cannot split a single positive example
            if len(counts) > 1 and counts[1] > 1:
                mean_accuracy, mean_precision, mean_recall = mh.calculate_metrics_for_crossvalidation(
                    algorithm, vect_data, temp_classification[:, i])

                # append the classifier name to each averaged metric array
                mean_accuracy = np.append(
                    mean_accuracy,
                    pd.Series([classifier_name], index=['Algorithm']))
                mean_precision = np.append(
                    mean_precision,
                    pd.Series([classifier_name], index=['Algorithm']))
                mean_recall = np.append(
                    mean_recall,
                    pd.Series([classifier_name], index=['Algorithm']))

                benchmark_dataframe.loc[benchmark_dataframe.shape[0] + 1] = [
                    classifier_name, mean_accuracy, mean_recall,
                    mean_precision, classes[i]
                ]

    print("algorithm ", " accuracy ", " recall ", " precision")
    for index, row in benchmark_dataframe.iterrows():
        print(row[0], " ", row[1].flatten()[0], " ", row[2].flatten()[0], " ",
              row[3].flatten()[0], " ", row[4])

    path_to_csv = ch.get_csv_path('cross_validation_multiclass')
    ch.write_dataframe_as_csv(benchmark_dataframe, path_to_csv)
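
For reference, here is a minimal, self-contained sketch of the same per-class cross-validation idea written directly against scikit-learn, without the project helpers (ch, pmh, mh). The function name, the column names 'text' and 'classification', the LinearSVC stand-in, and cv=5 are assumptions, not taken from the repository.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score


def sketch_overview_multilabel(data_frame):
    # hypothetical stand-in for ch.apply_tfidf_vectorizer_fit_transform;
    # assumes the raw documents live in a 'text' column
    vect_data = TfidfVectorizer().fit_transform(data_frame['text'])

    # hypothetical stand-in for ch.apply_multilabel_binarizer; assumes the
    # 'classification' column holds an iterable of labels per row
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(data_frame['classification'])

    rows = []
    for i, class_name in enumerate(mlb.classes_):
        # same guard as above: skip classes with at most one positive example
        if y[:, i].sum() > 1:
            scores = cross_val_score(LinearSVC(), vect_data, y[:, i],
                                     cv=5, scoring='accuracy')
            rows.append({'algorithm': 'LinearSVC',
                         'accuracy': scores.mean(),
                         'class': class_name})
    return pd.DataFrame(rows)
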
Code Example #2
def apply_doc2vec_logistic_regression(data_frame, text_vectorizer,
                                      class_vectorizer, classif_level,
                                      classif_type, source_path):
    baseline_name = '[all classes predictions]' + text_vectorizer + '/' + class_vectorizer
    results_model_dbow = ch.apply_doc2vec_separated_train_test(
        data_frame, baseline_name)
    X_train, y_train, X_test, y_test, model_dbow, train_tagged, test_tagged = results_model_dbow

    logreg = pmh.get_logistic()

    classifier_name_0, parameters_0 = ch.get_classifier_information(
        str(model_dbow))
    classifier_name_1, parameters_1 = ch.get_classifier_information(
        str(logreg))

    y_pred = pmh.fit_predict_functions(logreg, X_train, y_train, X_test)

    model_name = baseline_name + classifier_name_0 + '/' + classifier_name_1
    list_metrics = mh.calculate_metrics(model_name, y_test, y_pred)
    none_average, binary_average, micro_average, macro_average = list_metrics

    ch.save_results(classifier_name_1, list_metrics, parameters_1, model_name,
                    classif_level, classif_type, source_path)

    # baseline_name = '[each class predictions]'+text_vectorizer+'/'+class_vectorizer
    # vectorizer_results = ch.apply_df_vectorizer(data_frame, 'doc2vec', 'multi_label', baseline_name)
    # X_train, X_test, y_train, y_test, classes, n_classes, vocab_processor, len_vocabulary = vectorizer_results
    # model_name = baseline_name+'/'+classifier_name_0+'/'+classifier_name_1
    #
    # for i in range(n_classes):
    #     unique, counts = np.unique(y_train[:, i], return_counts=True)
    #     if len(counts) > 1 and counts[1] > 1:
    #         y_pred = pmh.fit_predict_functions(custom_pipeline, X_train, y_train[:, i], X_test)
    #         print('###  ', classes[i] ,'  ###')
    #         list_metrics = mh.calculate_metrics(model_name, y_test[:, i], y_pred)
    #         none_average, binary_average, micro_average, macro_average = list_metrics
    #
    #         ch.save_results(classifier_name_1, list_metrics, parameters_1, model_name, classif_level, classif_type, source_path)

    # improvement: repeat the baseline with a DMM model and a concatenated
    # DBOW+DMM Doc2Vec representation
    improved_logistic_regression(train_tagged, test_tagged, model_dbow, logreg,
                                 text_vectorizer, class_vectorizer,
                                 classif_level, classif_type, source_path)
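
For orientation, a minimal sketch of what the DBOW Doc2Vec + logistic-regression baseline presumably does once the ch/pmh/wmh helpers are unwrapped, written against gensim and scikit-learn. The function name, the column names ('text', 'label'), the split ratio, and the Doc2Vec hyper-parameters are assumptions.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


def sketch_doc2vec_logreg(data_frame):
    # assumed columns: 'text' (raw document) and 'label' (one class per row)
    train_df, test_df = train_test_split(data_frame, test_size=0.3,
                                         random_state=42)

    def tag(df):
        return [TaggedDocument(words=row['text'].split(), tags=[row['label']])
                for _, row in df.iterrows()]

    train_tagged, test_tagged = tag(train_df), tag(test_df)

    # dm=0 selects the distributed bag-of-words (DBOW) training mode;
    # the hyper-parameters are placeholders, not repository values
    model_dbow = Doc2Vec(dm=0, vector_size=300, min_count=2, epochs=30)
    model_dbow.build_vocab(train_tagged)
    model_dbow.train(train_tagged, total_examples=model_dbow.corpus_count,
                     epochs=model_dbow.epochs)

    # roughly what wmh.vec_for_learning appears to do: one inferred vector
    # per tagged document, plus the matching label
    def vectors(tagged):
        y = [doc.tags[0] for doc in tagged]
        X = [model_dbow.infer_vector(doc.words) for doc in tagged]
        return y, X

    y_train, X_train = vectors(train_tagged)
    y_test, X_test = vectors(test_tagged)

    logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    return f1_score(y_test, y_pred, average='micro')
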
Code Example #3
def apply_svm(X_train_tfidf, y_train, X_test_tfidf, y_test, classif_level,
              classif_type, source_path):
    svm = pmh.get_SVC()

    classifier_name, parameters = ch.get_classifier_information(str(svm))

    y_pred = pmh.fit_predict_functions(svm, X_train_tfidf, y_train,
                                       X_test_tfidf)

    model_name = '[all classes predictions]label_encoder/tfidf/' + classifier_name
    list_metrics = mh.calculate_metrics(model_name, y_test, y_pred)
    none_average, binary_average, micro_average, macro_average = list_metrics

    ch.save_results(classifier_name, list_metrics, parameters, model_name,
                    classif_level, classif_type, source_path)
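
A self-contained equivalent of the fit/predict/metrics flow above, using scikit-learn directly; the function name, the kernel choice, and the exact metric set mirrored from mh.calculate_metrics are assumptions (a binary average only applies when the problem has exactly two classes).

from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support


def sketch_apply_svm(X_train_tfidf, y_train, X_test_tfidf, y_test):
    svm = SVC(kernel='linear')  # kernel choice is an assumption
    svm.fit(X_train_tfidf, y_train)
    y_pred = svm.predict(X_test_tfidf)

    # per-class, micro- and macro-averaged precision/recall/F1, roughly the
    # quantities unpacked from mh.calculate_metrics above
    none_average = precision_recall_fscore_support(y_test, y_pred, average=None)
    micro_average = precision_recall_fscore_support(y_test, y_pred,
                                                    average='micro')
    macro_average = precision_recall_fscore_support(y_test, y_pred,
                                                    average='macro')
    return none_average, micro_average, macro_average
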
Code Example #4
def improved_logistic_regression(train_tagged, test_tagged, model_dbow, logreg,
                                 text_vectorizer, class_vectorizer,
                                 classif_level, classif_type, source_path):
    classifier_name_0, parameters_0 = ch.get_classifier_information(
        str(model_dbow))
    classifier_name_1, parameters_1 = ch.get_classifier_information(
        str(logreg))

    model_dmm = wmh.train_doc2vec_with_tagged_data(train_tagged.values)

    classifier_name_2, parameters_2 = ch.get_classifier_information(
        str(model_dmm))

    y_train, X_train = wmh.vec_for_learning(model_dmm, train_tagged)
    y_test, X_test = wmh.vec_for_learning(model_dmm, test_tagged)

    # first pass: logistic regression on the DMM vectors alone
    # (the prediction is recomputed below with the concatenated model)
    y_pred = pmh.fit_predict_functions(logreg, X_train, y_train, X_test)

    # discard training-only state to save memory (gensim < 4.0 API)
    model_dbow.delete_temporary_training_data(keep_doctags_vectors=True,
                                              keep_inference=True)
    model_dmm.delete_temporary_training_data(keep_doctags_vectors=True,
                                             keep_inference=True)

    # pair the DBOW and DMM models so each document gets a concatenated vector
    merged_model = wmh.get_concatenated_doc2vec(model_dbow, model_dmm)

    y_train, X_train = wmh.vec_for_learning(merged_model, train_tagged)
    y_test, X_test = wmh.vec_for_learning(merged_model, test_tagged)

    y_pred = pmh.fit_predict_functions(logreg, X_train, y_train, X_test)

    model_name = '[all classes predictions]' + classifier_name_0 + '/' + classifier_name_2 + '/' + classifier_name_1
    list_metrics = mh.calculate_metrics(model_name, y_test, y_pred)
    none_average, binary_average, micro_average, macro_average = list_metrics

    ch.save_results(classifier_name_1, list_metrics, parameters_1, model_name,
                    classif_level, classif_type, source_path)
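
A rough sketch of the concatenation step: instead of the wmh.get_concatenated_doc2vec wrapper, the DBOW and DMM document vectors can be inferred separately and stacked side by side before the logistic regression is refit. The function name and every parameter here are assumptions about what the helper does.

import numpy as np
from sklearn.linear_model import LogisticRegression


def sketch_concatenated_doc2vec(model_dbow, model_dmm, train_tagged, test_tagged):
    def vectors(tagged):
        y = [doc.tags[0] for doc in tagged]
        # concatenate the two embeddings per document: DBOW vector + DMM vector
        X = np.array([np.hstack([model_dbow.infer_vector(doc.words),
                                 model_dmm.infer_vector(doc.words)])
                      for doc in tagged])
        return y, X

    y_train, X_train = vectors(train_tagged)
    y_test, X_test = vectors(test_tagged)

    logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    return logreg.score(X_test, y_test)
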
Code Example #5
def overview_models(data_frame, classif_level, classif_type, source_path):
    vect_data, patent_ids, vectorizer = ch.apply_tfidf_vectorizer_fit_transform(
        data_frame)

    benchmark = []

    # classifiers actually benchmarked below: SVM, decision tree,
    # k-nearest neighbours, logistic regression
    for algorithm in [
            pmh.get_SVC(),
            pmh.get_decision_tree(),
            pmh.get_kneighbors(),
            pmh.get_logistic()
    ]:
        classifier_name, parameters = ch.get_classifier_information(
            str(algorithm))
        print('###  ', classifier_name, '  ###')
        cross_results = mh.get_cross_val_score(algorithm, vect_data,
                                               data_frame['classification'],
                                               'accuracy')
        tmp = pd.DataFrame.from_dict(cross_results).mean(axis=0)
        # Series.append was removed in pandas 2.0; concatenate instead
        tmp = pd.concat([tmp, pd.Series([classifier_name], index=['algorithm'])])
        benchmark.append(tmp)
    ch.save_results_cross_validation('cross_validation', benchmark, ['cv=5'],
                                     classif_level, classif_type, source_path)
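
The mh.get_cross_val_score call appears to wrap scikit-learn's cross-validation; a minimal equivalent with cross_validate is sketched below. The cv=5 value comes from the string saved above; the function name and everything else are assumptions.

import pandas as pd
from sklearn.model_selection import cross_validate


def sketch_get_cross_val_score(algorithm, vect_data, labels, scoring='accuracy'):
    # cross_validate returns a dict with 'fit_time', 'score_time', 'test_score'
    results = cross_validate(algorithm, vect_data, labels, cv=5, scoring=scoring)
    # averaging over the folds gives one benchmark row per algorithm,
    # matching how the result is used above
    return pd.DataFrame.from_dict(results).mean(axis=0)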