コード例 #1
0
def apply_word2vec_extratrees(data_frame, classif_level, classif_type,
                              source_path):
    """Evaluate two Word2Vec + extra-trees pipelines and save their metrics.

    The ``text`` and ``classification`` columns of ``data_frame`` are
    tokenized in place with ``th.tokenize_complex_text``.  The frame is then
    passed through ``ch.get_list_each_text_a_different_classification`` and
    split into train/test sets.  A Word2Vec model built from the training
    texts feeds two pipelines (mean embeddings and TF-IDF weighted
    embeddings), each ending in the estimator from ``pmh.get_extra_tree``.
    Every pipeline is fitted, scored with ``mh.calculate_metrics`` and stored
    with ``ch.save_results`` under a model name that states the vectorizer
    that actually produced the predictions.

    Args:
        data_frame: frame with ``text`` and ``classification`` columns; both
            columns are overwritten with their tokenized form.
        classif_level: forwarded unchanged to ``ch.save_results``.
        classif_type: forwarded unchanged to ``ch.save_results``.
        source_path: forwarded unchanged to ``ch.save_results``.
    """
    # Column-wise apply: only the column's own value is needed per row.
    for column in ('text', 'classification'):
        data_frame[column] = data_frame[column].apply(
            th.tokenize_complex_text)

    df_single_classification = ch.get_list_each_text_a_different_classification(
        data_frame)

    x = df_single_classification['text']
    y = df_single_classification['classification']

    X_train, X_test, y_train, y_test = ch.get_train_test_from_data(x, y)

    model_w2v = wmh.get_word2vec_model(X_train)

    etree_w2v = Pipeline([("word2vec vectorizer",
                           wmh.MeanEmbeddingVectorizer(model_w2v)),
                          ("extra trees", pmh.get_extra_tree())])
    etree_w2v_tfidf = Pipeline([("word2vec vectorizer",
                                 wmh.TfidfEmbeddingVectorizer(model_w2v)),
                                ("extra trees", pmh.get_extra_tree())])

    # Both pipelines end in the same kind of estimator, so the classifier
    # name/parameters are extracted once and reused for both reports.
    classifier_name, parameters = ch.get_extratree_classifier_information(
        str(etree_w2v))

    # Each label is paired with the pipeline that really uses that
    # vectorizer, so saved results are attributed to the right model.
    evaluations = (
        ('Word2Vec/MeanEmbeddingVectorizer', etree_w2v),
        ('Word2Vec/TfidfEmbeddingVectorizer', etree_w2v_tfidf),
    )

    for vectorizer_label, pipeline in evaluations:
        # NB!!!: the model does not support multi targets, so the sources are
        # duplicated and given different targets.
        y_pred = pmh.fit_predict_functions(pipeline, X_train, y_train, X_test)

        model_name = ('[all classes predictions]' + vectorizer_label + '/' +
                      classifier_name)

        # TODO: this should be changed by comparing all the possibilities for
        # the specified text (the original dataframe can be used).
        list_metrics = mh.calculate_metrics(model_name, y_test, y_pred)

        ch.save_results(classifier_name, list_metrics, parameters, model_name,
                        classif_level, classif_type, source_path)
コード例 #2
0
def apply_label_powerset(X_train, y_train, X_test, y_test, baseline_name,
                         source_path, classif_level=None, classif_type=None):
    """Fit a label-powerset classifier, report its metrics and save them.

    The classifier is ``ch.get_label_powerset(pmh.get_logistic())``.  It is
    fitted and used for prediction through ``pmh.fit_predict_functions``; the
    predictions are scored with ``mh.calculate_metrics`` and stored with
    ``ch.save_results``.

    Args:
        X_train, y_train: training features and targets.
        X_test, y_test: evaluation features and targets.
        baseline_name: prefix inserted in the reported model name, right
            after ``'[all classes predictions]'``.
        source_path: forwarded unchanged to ``ch.save_results``.
        classif_level, classif_type: forwarded to ``ch.save_results``.  When
            omitted (``None``), the module-level names ``classif_level`` /
            ``classif_type`` are used, for compatibility with callers that
            configure them globally.

    Raises:
        NameError: if a setting is neither passed nor defined at module
            level.  Raised before any fitting so no training work is lost.
    """
    module_scope = globals()
    try:
        if classif_level is None:
            classif_level = module_scope['classif_level']
        if classif_type is None:
            classif_type = module_scope['classif_type']
    except KeyError as missing:
        raise NameError(
            '{} must be passed as an argument or defined at module '
            'level'.format(missing)) from None

    classifier = ch.get_label_powerset(pmh.get_logistic())

    classifier_name, parameters = ch.get_complex_classifier_information(
        str(classifier), 1, 1, 2, 0)

    y_pred = pmh.fit_predict_functions(classifier, X_train, y_train, X_test)

    model_name = '[all classes predictions]' + baseline_name + classifier_name
    list_metrics = mh.calculate_metrics(model_name, y_test, y_pred)

    ch.save_results(classifier_name, list_metrics, parameters, model_name,
                    classif_level, classif_type, source_path)
コード例 #3
0
def apply_svm(X_train_tfidf, y_train, X_test_tfidf, y_test, classif_level,
              classif_type, source_path):
    """Fit the SVC from ``pmh.get_SVC``, report its metrics and save them.

    Args:
        X_train_tfidf, y_train: training features and targets.
        X_test_tfidf, y_test: evaluation features and targets.
        classif_level: forwarded unchanged to ``ch.save_results``.
        classif_type: forwarded unchanged to ``ch.save_results``.
        source_path: forwarded unchanged to ``ch.save_results``.
    """
    estimator = pmh.get_SVC()

    # Name and parameters are derived from the estimator's string form.
    estimator_info = ch.get_classifier_information(str(estimator))
    classifier_name, parameters = estimator_info

    predicted = pmh.fit_predict_functions(
        estimator, X_train_tfidf, y_train, X_test_tfidf)

    model_name = ''.join(
        ('[all classes predictions]label_encoder/tfidf/', classifier_name))

    list_metrics = mh.calculate_metrics(model_name, y_test, predicted)
    # The metrics helper is expected to hand back exactly four averages.
    _none_avg, _binary_avg, _micro_avg, _macro_avg = list_metrics

    ch.save_results(
        classifier_name,
        list_metrics,
        parameters,
        model_name,
        classif_level,
        classif_type,
        source_path,
    )
コード例 #4
0
def improved_logistic_regression(train_tagged, test_tagged, model_dbow, logreg,
                                 text_vectorizer, class_vectorizer,
                                 classif_level, classif_type, source_path):
    """Re-evaluate ``logreg`` on vectors from a concatenated Doc2Vec model.

    A second Doc2Vec model is trained from ``train_tagged.values`` and
    concatenated with ``model_dbow`` via ``wmh.get_concatenated_doc2vec``.
    ``logreg`` is fitted on the vectors of the merged model, scored with
    ``mh.calculate_metrics`` and the result is stored with
    ``ch.save_results``.

    Args:
        train_tagged, test_tagged: tagged documents for training/evaluation.
        model_dbow: already trained Doc2Vec model to be merged.
        logreg: estimator handed to ``pmh.fit_predict_functions``.
        text_vectorizer, class_vectorizer: accepted but not read here.
        classif_level, classif_type, source_path: forwarded unchanged to
            ``ch.save_results``.
    """
    dbow_name, _dbow_params = ch.get_classifier_information(str(model_dbow))
    logreg_name, logreg_params = ch.get_classifier_information(str(logreg))

    model_dmm = wmh.train_doc2vec_with_tagged_data(train_tagged.values)
    dmm_name, _dmm_params = ch.get_classifier_information(str(model_dmm))

    # First pass: vectors from the newly trained model only.
    # NOTE(review): the predictions of this pass are overwritten below
    # without being scored -- confirm whether they were meant to be reported.
    y_train, X_train = wmh.vec_for_learning(model_dmm, train_tagged)
    y_test, X_test = wmh.vec_for_learning(model_dmm, test_tagged)
    y_pred = pmh.fit_predict_functions(logreg, X_train, y_train, X_test)

    # Drop temporary training data on both models before merging them.
    for doc2vec_model in (model_dbow, model_dmm):
        doc2vec_model.delete_temporary_training_data(
            keep_doctags_vectors=True, keep_inference=True)

    merged_model = wmh.get_concatenated_doc2vec(model_dbow, model_dmm)

    # Second pass: vectors from the concatenated model; this is what is
    # scored and saved.
    y_train, X_train = wmh.vec_for_learning(merged_model, train_tagged)
    y_test, X_test = wmh.vec_for_learning(merged_model, test_tagged)
    y_pred = pmh.fit_predict_functions(logreg, X_train, y_train, X_test)

    model_name = '/'.join(
        ('[all classes predictions]' + dbow_name, dmm_name, logreg_name))
    list_metrics = mh.calculate_metrics(model_name, y_test, y_pred)
    # The metrics helper is expected to hand back exactly four averages.
    _none_avg, _binary_avg, _micro_avg, _macro_avg = list_metrics

    ch.save_results(
        logreg_name,
        list_metrics,
        logreg_params,
        model_name,
        classif_level,
        classif_type,
        source_path,
    )
コード例 #5
0
def apply_onevsrest(X_train, y_train, X_test, y_test, classes, baseline_name,
                    source_path, classif_level=None, classif_type=None):
    """Fit a one-vs-rest logistic pipeline per class and report its metrics.

    For every entry of ``classes`` the pipeline is fitted on
    ``y_train[_class]`` and evaluated against ``y_test[_class]``; the metrics
    from ``mh.calculate_metrics`` are stored with ``ch.save_results``.  After
    the loop a precision/recall/F1 summary is shown through
    ``mh.display_directly_metrics``.

    Args:
        X_train, X_test: training / evaluation features.
        y_train, y_test: targets, indexable by each entry of ``classes``.
        classes: iterable of class keys to process.
        baseline_name: prefix inserted in the reported model names.
        source_path: forwarded unchanged to ``ch.save_results``.
        classif_level, classif_type: forwarded to ``ch.save_results``.  When
            omitted (``None``), the module-level names ``classif_level`` /
            ``classif_type`` are used, for compatibility with callers that
            configure them globally.

    Raises:
        NameError: if a setting is neither passed nor defined at module
            level.  Raised before any fitting so no training work is lost.
    """
    module_scope = globals()
    try:
        if classif_level is None:
            classif_level = module_scope['classif_level']
        if classif_type is None:
            classif_type = module_scope['classif_type']
    except KeyError as missing:
        raise NameError(
            '{} must be passed as an argument or defined at module '
            'level'.format(missing)) from None

    # X_train.sort_indices() # SVC needs this line in addition
    # Other base estimators that were tried in place of pmh.get_logistic():
    # get_SVC, get_multinomialNB, get_decision_tree, get_kneighbors,
    # get_linear_SVC, get_random_forest_classifier, get_SGD_classifier.
    custom_pipeline = Pipeline([
        ('clf', OneVsRestClassifier(pmh.get_logistic(), n_jobs=-1)),
    ])

    classifier_name, parameters = ch.get_complex_classifier_information(
        str(custom_pipeline), 3, 1, 4, 0)
    model_name = '[each class predictions]' + baseline_name + classifier_name

    # Defined up front so the summary below also works when ``classes`` is
    # empty (it then reports zeros instead of failing on unbound names).
    true_positives = false_positives = tpfn = 0

    # NOTE(review): collected but not consumed anywhere in this function.
    accuracies = []
    for _class in classes:
        print('**Processing {} texts...**'.format(_class))

        y_pred = pmh.fit_predict_functions(custom_pipeline, X_train,
                                           y_train[_class], X_test)

        accuracies.append(mh.get_accuracy_score(y_test[_class], y_pred))

        list_metrics = mh.calculate_metrics(model_name, y_test[_class], y_pred)

        ch.save_results(classifier_name, list_metrics, parameters, model_name,
                        classif_level, classif_type, source_path)

        print('Pipeline score on training {}'.format(
            custom_pipeline.score(X_train, y_train[_class])))

        # NOTE(review): these counts are overwritten on every iteration, so
        # the "all classes" summary reflects only the last class processed --
        # confirm whether they were meant to be accumulated.
        true_positives, false_positives, tpfn = mh.get_predictions_distribution(
            y_test[_class], y_pred)

    model_name = '[all classes predictions]' + baseline_name + classifier_name
    predicted_positives = true_positives + false_positives
    # With no true positives precision and recall are both 0, which would
    # make the F1 denominator 0 as well; report zeros directly instead.
    if true_positives == 0 or tpfn == 0 or predicted_positives == 0:
        mh.display_directly_metrics(model_name, 0, 0, 0, -1)
    else:
        precision = true_positives / predicted_positives
        recall = true_positives / tpfn
        mh.display_directly_metrics(
            model_name, precision, recall,
            2 * (precision * recall) / (precision + recall), -1)
コード例 #6
0
def apply_adapted_algorithm(X_train, y_train, X_test, y_test, baseline_name,
                            source_path, classif_level=None,
                            classif_type=None):
    """Fit the classifier from ``pmh.get_MLkNN``, report and save its metrics.

    The inputs are first converted with ``th.get_lil_matrices``; the
    classifier is then fitted and used for prediction through
    ``pmh.fit_predict_functions``.  Predictions are scored with
    ``mh.calculate_metrics`` and stored with ``ch.save_results``.

    Args:
        X_train, y_train: training features and targets.
        X_test, y_test: evaluation features and targets.
        baseline_name: prefix inserted in the reported model name, right
            after ``'[all classes predictions]'``.
        source_path: forwarded unchanged to ``ch.save_results``.
        classif_level, classif_type: forwarded to ``ch.save_results``.  When
            omitted (``None``), the module-level names ``classif_level`` /
            ``classif_type`` are used, for compatibility with callers that
            configure them globally.

    Raises:
        NameError: if a setting is neither passed nor defined at module
            level.  Raised before any fitting so no training work is lost.
    """
    module_scope = globals()
    try:
        if classif_level is None:
            classif_level = module_scope['classif_level']
        if classif_type is None:
            classif_type = module_scope['classif_type']
    except KeyError as missing:
        raise NameError(
            '{} must be passed as an argument or defined at module '
            'level'.format(missing)) from None

    classifier = pmh.get_MLkNN()

    classifier_name, parameters = ch.get_complex_classifier_information(
        str(classifier), 0, 0, 1, 0)

    X_train, y_train, X_test = th.get_lil_matrices(X_train, y_train, X_test)

    y_pred = pmh.fit_predict_functions(classifier, X_train, y_train, X_test)

    model_name = '[all classes predictions]' + baseline_name + classifier_name
    list_metrics = mh.calculate_metrics(model_name, y_test, y_pred)

    ch.save_results(classifier_name, list_metrics, parameters, model_name,
                    classif_level, classif_type, source_path)
コード例 #7
0
def apply_doc2vec_logistic_regression(data_frame, text_vectorizer,
                                      class_vectorizer, classif_level,
                                      classif_type, source_path):
    """Evaluate logistic regression on Doc2Vec vectors and save the metrics.

    ``ch.apply_doc2vec_separated_train_test`` supplies the train/test
    vectors, the Doc2Vec model and the tagged documents.  The estimator from
    ``pmh.get_logistic`` is fitted, scored with ``mh.calculate_metrics`` and
    stored with ``ch.save_results``.  The same estimator, model and tagged
    documents are then handed to ``improved_logistic_regression`` for a
    second evaluation.

    Args:
        data_frame: input handed to ``ch.apply_doc2vec_separated_train_test``.
        text_vectorizer, class_vectorizer: strings used to build the reported
            model name; also forwarded to ``improved_logistic_regression``.
        classif_level, classif_type, source_path: forwarded unchanged to
            ``ch.save_results`` and ``improved_logistic_regression``.
    """
    baseline_name = ('[all classes predictions]' + text_vectorizer + '/' +
                     class_vectorizer)
    (X_train, y_train, X_test, y_test, model_dbow, train_tagged,
     test_tagged) = ch.apply_doc2vec_separated_train_test(
         data_frame, baseline_name)

    logreg = pmh.get_logistic()

    classifier_name_0, _ = ch.get_classifier_information(str(model_dbow))
    classifier_name_1, parameters_1 = ch.get_classifier_information(
        str(logreg))

    y_pred = pmh.fit_predict_functions(logreg, X_train, y_train, X_test)

    # ``baseline_name`` does not end with a separator, so one is added here
    # to keep the vectorizer and the Doc2Vec model name apart.
    model_name = (baseline_name + '/' + classifier_name_0 + '/' +
                  classifier_name_1)
    list_metrics = mh.calculate_metrics(model_name, y_test, y_pred)

    ch.save_results(classifier_name_1, list_metrics, parameters_1, model_name,
                    classif_level, classif_type, source_path)

    # TODO: a per-class ('[each class predictions]') evaluation based on
    # ch.apply_df_vectorizer(data_frame, 'doc2vec', 'multi_label', ...) was
    # sketched here but never enabled.

    # improvement
    improved_logistic_regression(train_tagged, test_tagged, model_dbow, logreg,
                                 text_vectorizer, class_vectorizer,
                                 classif_level, classif_type, source_path)
コード例 #8
0
def apply_multi_label_classification_without_pipeline(
        data_frame, text_vectorizer, class_vectorizer, classif_level,
        classif_type, source_path, single_fit=False):
    """Evaluate a one-vs-rest logistic classifier on vectorized data.

    ``ch.apply_df_vectorizer`` supplies the train/test matrices and the
    number of classes.  By default the classifier is re-fitted once per
    class column (``y_train[:, i]``) and the per-class predictions are
    assembled into an array with one column per class.  The predictions are
    reported through ``mh.calculate_manual_metrics`` and
    ``mh.calculate_metrics``; the latter's result is stored with
    ``ch.save_results``.

    Args:
        data_frame: input handed to ``ch.apply_df_vectorizer``.
        text_vectorizer, class_vectorizer: vectorizer names; also used to
            build the reported model name.
        classif_level, classif_type, source_path: forwarded unchanged to
            ``ch.save_results``.
        single_fit: when true, fit once on the whole ``y_train`` and predict
            all classes together instead of looping over the class columns.
            Defaults to ``False`` (one fit per class).
    """
    baseline_name = text_vectorizer + '/' + class_vectorizer + '/onevsrest/'
    vectorizer_results = ch.apply_df_vectorizer(data_frame, text_vectorizer,
                                                class_vectorizer,
                                                '[both]' + baseline_name)
    (X_train, X_test, y_train, y_test, _classes, n_classes, _vocab_processor,
     len_vocabulary) = vectorizer_results

    print('len_vocabulary: ', len_vocabulary, ' num_classes: ', n_classes)

    # Other base estimators that were tried in place of pmh.get_logistic():
    # get_SVC, get_decision_tree, get_kneighbors, get_linear_SVC,
    # get_random_forest_classifier, get_SGD_classifier.
    # get_multinomialNB fails with word2vec/doc2vec input (X has negative
    # values).
    classifier = OneVsRestClassifier(pmh.get_logistic())

    classifier_name, parameters = ch.get_complex_classifier_information(
        str(classifier), 1, 1, 2, 0)

    if single_fit:
        predictions = pmh.fit_predict_functions(classifier, X_train, y_train,
                                                X_test)
    else:
        # One row per class while filling; every row is written in the loop,
        # so the buffer does not need to be initialised.
        per_class = np.empty(shape=(n_classes, y_test.shape[0]), dtype=int)
        for i in range(n_classes):
            per_class[i] = pmh.fit_predict_functions(
                classifier, X_train, y_train[:, i], X_test)
            print('**Processing classes {0:0.2f} % ...**'.format(
                ((i + 1) / n_classes) * 100))
        # Flip to one row per sample / one column per class, matching the
        # orientation in which ``y_test`` is indexed above.
        predictions = per_class.transpose()

    model_name = '[each class predictions]' + baseline_name + classifier_name

    # Called for its reporting only; the returned values are not used.
    mh.calculate_manual_metrics(model_name, y_test, predictions)

    list_metrics = mh.calculate_metrics(model_name, y_test, predictions)

    ch.save_results(classifier_name, list_metrics, parameters, model_name,
                    classif_level, classif_type, source_path)