Example #1
0
def apply_multi_label_classification(data_frame, text_vectorizer,
                                     class_vectorizer, classif_type,
                                     classif_level, source_path):
    """Vectorize ``data_frame`` and run the enabled multi-label baselines.

    The vectorized labels are wrapped in data frames whose columns are the
    class names, then handed to ``apply_onevsrest`` and
    ``apply_adapted_algorithm``.

    Args:
        data_frame: input data forwarded to ``ch.apply_df_vectorizer``.
        text_vectorizer: name of the text vectorizer (string).
        class_vectorizer: name of the class vectorizer (string).
        classif_type: accepted for interface compatibility; not used here.
        classif_level: accepted for interface compatibility; not used here.
        source_path: forwarded to the baseline runners.
    """
    # e.g. "<text_vectorizer>/<class_vectorizer>/onevsrest/"
    baseline_name = '/'.join(
        [text_vectorizer, class_vectorizer, 'onevsrest', ''])

    (X_train, X_test, y_train, y_test, classes,
     _n_classes, _vocab_processor, _len_vocabulary) = ch.apply_df_vectorizer(
         data_frame, text_vectorizer, class_vectorizer,
         '[both]' + baseline_name)

    # Label matrices become data frames with one column per class.
    y_train = pd.DataFrame(y_train, columns=classes)
    y_test = pd.DataFrame(y_test, columns=classes)

    split = (X_train, y_train, X_test, y_test)

    apply_onevsrest(*split, classes, baseline_name, source_path)
    # Alternative problem-transformation baselines, currently disabled:
    #   apply_binary_relevance(*split, baseline_name, source_path)   # needs lots of patents!
    #   apply_classifier_chain(*split, baseline_name, source_path)   # needs lots of patents!
    #   apply_label_powerset(*split, baseline_name, source_path)     # best one
    apply_adapted_algorithm(*split, baseline_name, source_path)
Example #2
0
def second_attempt_from_web(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    """Train and evaluate the text CNN built by ``pmh.get_text_convolutional_from_web``.

    The data frame is vectorized, the training part is split once more
    (the validation part is not consumed in this function), the model is
    run against the test split, and the metrics are displayed and saved.

    Args:
        data_frame: input data forwarded to ``ch.apply_df_vectorizer``.
        text_vectorizer: name of the text vectorizer (string).
        class_vectorizer: name of the class vectorizer (string).
        classif_level: forwarded to ``ch.save_results``.
        classif_type: forwarded to ``ch.save_results``.
        dataset_location: forwarded to ``ch.save_results``.
    """
    # NOTE: this function was adapted from a Keras imdb tutorial; the
    # original imdb loading / padding steps are replaced by the project
    # vectorizer below.

    # The value is not used below; the call is kept as-is because any side
    # effects of the helper are not visible from here.
    root_location = fh.get_root_location('data/convolutional_outcome/')

    model_name = '/'.join((text_vectorizer, class_vectorizer, 'NN'))
    (train_data, test_data, train_labels, test_labels,
     classes, n_classes, vocab_processor, len_vocabulary) = ch.apply_df_vectorizer(
         data_frame, text_vectorizer, class_vectorizer, model_name)

    # From here on train_data / train_labels are the reduced training part.
    train_data, val_data, train_labels, val_labels = ch.get_train_test_from_data(
        train_data, train_labels)

    model = pmh.get_text_convolutional_from_web(len_vocabulary, n_classes)
    metrics, predictions = pmh.run_text_cnn_model(
        model, train_data, train_labels, test_data, test_labels)

    classifier_name, layers = ch.get_sequential_classifier_information(model)

    first_metric, second_metric, third_metric = metrics[0], metrics[1], metrics[2]
    mh.display_convolutional_metrics(
        classifier_name, first_metric, second_metric, third_metric,
        test_labels, predictions)

    ch.save_results(
        classifier_name, metrics, layers, model_name,
        classif_level, classif_type, dataset_location)
Example #3
0
def fourth_attemp_from_web(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    """Train the "fourth attempt" CNN and report / persist its test metrics.

    Args:
        data_frame: input data forwarded to ``ch.apply_df_vectorizer``.
        text_vectorizer: name of the text vectorizer (string).
        class_vectorizer: name of the class vectorizer (string).
        classif_level: forwarded to ``ch.save_results``.
        classif_type: forwarded to ``ch.save_results``.
        dataset_location: forwarded to ``ch.save_results``.
    """
    model_name = text_vectorizer+'/'+class_vectorizer+'/CNN'
    standard_results = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, model_name)
    train_data, test_data, train_labels, test_labels, classes, n_classes, vocab_processor, len_vocabulary = standard_results

    # TODO: is it useful?
    # test_labels = np_utils.to_categorical(test_labels, n_classes)

    model = pmh.get_fourth_attempt_model_from_web(train_data, n_classes)

    (y_train_predclass, y_test_predclass, train_metrics, test_metrics,
     train_predictions, test_predictions) = pmh.run_fourth_attempt_model(
         model, train_data, train_labels, test_data, test_labels)

    # The classifier name must be known before the metrics are displayed;
    # previously it was looked up only after being used, which raised
    # UnboundLocalError.
    classifier_name, layers = ch.get_sequential_classifier_information(model)

    # mh.display_convolution_metrics_fourth_attempt(train_labels, test_labels, y_train_predclass, y_test_predclass)
    # mh.display_convolutional_metrics(classifier_name, train_metrics[0], train_metrics[1], train_metrics[2], train_labels, train_predictions)
    mh.display_convolutional_metrics(classifier_name, test_metrics[0], test_metrics[1], test_metrics[2], test_labels, test_predictions)

    ch.save_results(classifier_name, test_metrics, layers, model_name, classif_level, classif_type, dataset_location)
Example #4
0
def train_testing_convolution(data_frame, text_vectorizer, class_vectorizer):
    """Train the test CNN and display its metrics on the validation split.

    Args:
        data_frame: input data forwarded to ``ch.apply_df_vectorizer``.
        text_vectorizer: name of the text vectorizer (string).
        class_vectorizer: name of the class vectorizer (string).
    """
    save_standard_sets = True  # currently unused: the save call below is disabled
    root_location = fh.get_root_location('data/convolutional_outcome/')

    sets_location = fh.join_paths(root_location, "model_sets")
    checkpoint_path = fh.join_paths(root_location, "model_checkpoints")
    model_path = fh.link_paths(checkpoint_path, 'convolution_model')
    weights_path = fh.link_paths(checkpoint_path, 'convolution_weights')

    # get sets
    model_name = text_vectorizer+'/'+class_vectorizer+'/NN'
    standard_results = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, model_name)
    train_data, test_data, train_labels, test_labels, classes, n_classes, vocab_processor, len_vocabulary = standard_results
    train_data, val_data, train_labels, val_labels = ch.get_train_test_from_data(train_data, train_labels)

    # save sets
    # ch.save_sets(sets_location, train_data, test_data, val_data, train_labels, test_labels, val_labels,
    #           [classes, n_classes, vocab_processor, len_vocabulary])

    # this is for test: the freshly computed splits are replaced by the ones
    # stored on disk (n_classes / len_vocabulary still come from the
    # vectorizer call above)
    train_data, test_data, val_data, train_labels, test_labels, val_labels, _ = ch.load_sets(sets_location)

    # it could be that a label is only in the test/data data, might be a problem
    sequence_length = train_data.shape[1]
    # define the model
    model = pmh.get_cnn_test(len_vocabulary, n_classes, sequence_length)

    # calculates metrics with validating data (the validation split is passed
    # in both the validation and the evaluation positions)
    model, val_predictions = pmh.run_cnn_test(model,
                                   train_data, train_labels, val_data, val_labels, val_data, val_labels,
                                   model_path, weights_path, True)
    binary_val_predictions = mh.get_binary_0_5(val_predictions)
    print(val_labels.shape)
    print(val_predictions.shape)
    # display validation metrics
    # (fixed: previously referenced the undefined name `binary_predictions`)
    metrics = mh.get_sequential_metrics(val_labels, val_predictions, binary_val_predictions)
    mh.display_sequential_metrics('validation convolution sequence', metrics)
Example #5
0
def test_LSTM_from_web(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    """Evaluate a previously trained LSTM configuration on the test split.

    The configuration returned by ``pmh.get_lstm_testing_parameters`` is
    looked up in the pickled parameter-search results, the network is rebuilt
    and its stored ``'best_weights'`` are restored, predictions are generated
    for the test documents and the resulting metrics are displayed and
    (when ``save_results`` is set) persisted.

    Args:
        data_frame: input data forwarded to ``ch.apply_df_vectorizer``.
        text_vectorizer: name of the text vectorizer (string).
        class_vectorizer: name of the class vectorizer (string).
        classif_level: classification level, used for paths and result saving.
        classif_type: classification type, used for paths and result saving.
        dataset_location: forwarded to ``ch.save_results``.

    Raises:
        Exception: if the requested configuration is missing from the
            parameter-search results, or if test metrics for it were
            already stored.
    """
    print('### LSTM Doing Testing ###')

    root_location = fh.get_root_location('data/lstm_outcome/')

    nn_parameter_search_location = fh.join_paths(root_location, "nn_fhv_parameter_search")
    doc2vec_model_location = fh.link_paths(fh.join_paths(root_location, "doc2vec_model/vocab_model/"), "doc2vec_model")

    save_results = True

    sequence_size = 1
    EMBEDDING_SIZE = 150

    model_name = text_vectorizer+'/'+class_vectorizer+'/LSTM'
    results = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, model_name)
    X_train, X_test, y_train, y_test, classes, n_classes, vocab_processor, len_vocabulary = results

    X_train, X_val, y_train, y_val = ch.get_train_test_from_data(X_train, y_train)

    training_docs_list = X_train['patent_id']
    test_docs_list = X_test['patent_id']
    val_docs_list = X_val['patent_id']

    X_data, Xv_data, Xt_data = ch.get_df_data(3, training_docs_list, val_docs_list, test_docs_list, sequence_size, EMBEDDING_SIZE, doc2vec_model_location)
    GLOBAL_VARS.DOC2VEC_MODEL_NAME, GLOBAL_VARS.MODEL_NAME = wmh.set_parameters_lstm_doc2vec(nn_parameter_search_location, classif_level, classif_type)

    NN_INPUT_NEURONS, NN_SEQUENCE_SIZE, NN_OUTPUT_NEURONS = pmh.get_lstm_shapes(X_data, n_classes)
    NN_BATCH_SIZE, PARTS_LEVEL, NN_MAX_EPOCHS, QUEUE_SIZE = pmh.get_lstm_basic_parameters()
    params = pmh.get_lstm_testing_parameters()
    lstm_output_size,w_dropout_do,u_dropout_do, stack_layers, conv_size, conv_filter_length, conv_max_pooling_length = params
    EARLY_STOPPER_MIN_DELTA, EARLY_STOPPER_PATIENCE = pmh.get_early_stopping_parameters()

    TEST_METRICS_FILENAME = '{}_level_{}_standard_nn_test_metrics_dict.pkl'

    test_metrics_dict = dict()
    test_metrics_path = fh.link_paths(fh.link_paths(nn_parameter_search_location, GLOBAL_VARS.MODEL_NAME), TEST_METRICS_FILENAME.format(classif_type, PARTS_LEVEL))

    param_results_path = fh.link_paths(fh.link_paths(nn_parameter_search_location, GLOBAL_VARS.MODEL_NAME), NN_PARAMETER_SEARCH_PREFIX.format(classif_type, classif_level, NN_BATCH_SIZE))

    # Results written by the training run; the file handle is closed as soon
    # as the pickle has been read.
    with open(param_results_path, 'rb') as param_results_file:
        param_results_dict = pickle.load(param_results_file)

    GLOBAL_VARS.NN_MODEL_NAME = 'lstm_size_{}_w-drop_{}_u-drop_{}_stack_{}_conv_{}'.format(lstm_output_size,
                                                                                            w_dropout_do,
                                                                                            u_dropout_do,
                                                                                            stack_layers,
                                                                                            str(conv_size)
                                                                                            )
    if conv_size:
        GLOBAL_VARS.NN_MODEL_NAME += '_conv-filter-length_{}_max-pooling-size_{}'.format(conv_filter_length,
                                                                                         conv_max_pooling_length)
    if GLOBAL_VARS.NN_MODEL_NAME not in param_results_dict:
        message = "Can't find model: {}".format(GLOBAL_VARS.NN_MODEL_NAME)
        print(message)
        raise Exception(message)

    if fh.ensure_exists_path_location(test_metrics_path):
        with open(test_metrics_path, 'rb') as test_metrics_file:
            test_metrics_dict = pickle.load(test_metrics_file)
        if GLOBAL_VARS.NN_MODEL_NAME in test_metrics_dict:
            message = "Test metrics already exist for: {}".format(GLOBAL_VARS.NN_MODEL_NAME)
            print(message)
            test_metrics = test_metrics_dict[GLOBAL_VARS.NN_MODEL_NAME]

            print("** Test Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}".format(
                test_metrics['coverage_error'], test_metrics['average_num_of_labels'],
                test_metrics['top_1'], test_metrics['top_3'], test_metrics['top_5'],
                test_metrics['f1_micro'], test_metrics['f1_macro']))
            # Abort instead of re-evaluating a configuration that was already tested.
            raise Exception(message)

    print('***************************************************************************************')
    print(GLOBAL_VARS.NN_MODEL_NAME)

    model = pmh.get_keras_rnn_model(NN_INPUT_NEURONS, NN_SEQUENCE_SIZE, NN_OUTPUT_NEURONS,
                                   lstm_output_size, w_dropout_do, u_dropout_do, stack_layers, conv_size,
                                   conv_filter_length, conv_max_pooling_length)

    # get model best weights
    weights = param_results_dict[GLOBAL_VARS.NN_MODEL_NAME]['best_weights']
    model.set_weights(weights)

    print('Evaluating on Test Data using best weights')
    _, ytp, ytp_binary = pmh.predict_generator(None, model, Xt_data, y_test, NN_BATCH_SIZE, QUEUE_SIZE, test_docs_list)

    print('Generating Test Metrics')
    test_metrics = mh.get_sequential_metrics(y_test, ytp, ytp_binary)
    # NOTE(review): other call sites pass a display name as the first
    # argument (e.g. display_sequential_metrics(classifier_name, metrics));
    # confirm that the single-argument form is still supported.
    mh.display_sequential_metrics(test_metrics)

    if save_results:
        classifier_name, parameters = ch.get_sequential_classifier_information(model)
        ch.save_results(classifier_name+'_LSTM', test_metrics, parameters, model_name, classif_level, classif_type, dataset_location)

        test_metrics_dict[GLOBAL_VARS.NN_MODEL_NAME] = test_metrics
        with open(test_metrics_path, 'wb') as test_metrics_file:
            pickle.dump(test_metrics_dict, test_metrics_file)
Example #6
0
def train_LSTM_from_web(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    """Run the LSTM parameter search and store the per-configuration results.

    For every parameter set yielded by ``pmh.get_lstm_training_parameters``
    a Keras RNN model is built, fitted and validated; the validation metrics
    are saved through ``ch.save_results`` and the training artefacts (epochs,
    best weights, losses, duration) are collected in a dictionary that is
    pickled at the end. Configurations already present in a previously
    pickled results file are skipped.

    Args:
        data_frame: input data forwarded to ``ch.apply_df_vectorizer``.
        text_vectorizer: name of the text vectorizer (string).
        class_vectorizer: name of the class vectorizer (string).
        classif_level: classification level, used for paths and result saving.
        classif_type: classification type, used for paths and result saving.
        dataset_location: forwarded to ``ch.save_results``.
    """
    print('### LSTM Doing Training ###')

    root_location = fh.get_root_location('data/lstm_outcome/')

    exports_location = fh.join_paths(root_location, "exported_data/")
    matrices_save_location = fh.join_paths(root_location, "fhv_matrices/")
    nn_parameter_search_location = fh.join_paths(root_location, "nn_fhv_parameter_search")
    doc2vec_model_location = fh.link_paths(fh.join_paths(root_location, "doc2vec_model/vocab_model/"), "doc2vec_model")

    load_existing_results = True # it was True
    save_results = True

    sequence_size = 1
    EMBEDDING_SIZE = 150

    model_name = text_vectorizer+'/'+class_vectorizer+'/LSTM'
    results = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, model_name)
    # the second split returned by the vectorizer is used as validation data here
    X_train, X_val, y_train, y_val, classes, n_classes, vocab_processor, len_vocabulary = results

    training_docs_list = X_train['patent_id']
    val_docs_list = X_val['patent_id']

    X_data, Xv_data, _ = ch.get_df_data(2, training_docs_list, val_docs_list, None, sequence_size, EMBEDDING_SIZE, doc2vec_model_location)
    GLOBAL_VARS.DOC2VEC_MODEL_NAME, GLOBAL_VARS.MODEL_NAME = wmh.set_parameters_lstm_doc2vec(nn_parameter_search_location, classif_level, classif_type)

    NN_INPUT_NEURONS, NN_SEQUENCE_SIZE, NN_OUTPUT_NEURONS = pmh.get_lstm_shapes(X_data, n_classes)
    NN_BATCH_SIZE, PARTS_LEVEL, NN_MAX_EPOCHS, QUEUE_SIZE = pmh.get_lstm_basic_parameters()
    param_sampler = pmh.get_lstm_training_parameters()
    EARLY_STOPPER_MIN_DELTA, EARLY_STOPPER_PATIENCE = pmh.get_early_stopping_parameters()

    param_results_dict = dict()
    # Single location of the pickled search results; it is used for creating
    # the folder, for loading previous results and for the final dump.
    param_results_path = fh.link_paths(fh.join_paths(nn_parameter_search_location, GLOBAL_VARS.MODEL_NAME), NN_PARAMETER_SEARCH_PREFIX.format(classif_type, classif_level, NN_BATCH_SIZE))
    # everything before the last '/' is the folder that holds the results file
    fh.create_folder(param_results_path[:param_results_path.rfind('/')])

    # useful to skip all the already tested models
    if load_existing_results:
        if fh.ensure_exists_path_location(param_results_path):
            print('Loading Previous results in {}'.format(param_results_path))
            with open(param_results_path, 'rb') as previous_results_file:
                param_results_dict = pickle.load(previous_results_file)
        else:
            print('No Previous results exist in {}'.format(param_results_path))

    for params in param_sampler:
        start_time = time.time()
        lstm_output_size = params['lstm_output_size']
        w_dropout_do = params['w_dropout']
        u_dropout_do = params['u_dropout']
        stack_layers = params['stack_layers']
        conv_size = params['conv_size']
        conv_filter_length = params['conv_filter_length']
        conv_max_pooling_length = params['conv_max_pooling_length']

        GLOBAL_VARS.NN_MODEL_NAME = 'lstm_size_{}_w-drop_{}_u-drop_{}_stack_{}_conv_{}'.format(
            lstm_output_size, w_dropout_do, u_dropout_do, stack_layers, str(conv_size))

        if conv_size:
            GLOBAL_VARS.NN_MODEL_NAME += '_conv-filter-length_{}_max-pooling-size_{}'.format(conv_filter_length,
                                                                                             conv_max_pooling_length)
        if GLOBAL_VARS.NN_MODEL_NAME in param_results_dict:
            print("skipping: {}".format(GLOBAL_VARS.NN_MODEL_NAME))
            continue

        # creating the actual keras model
        model = pmh.get_keras_rnn_model(NN_INPUT_NEURONS, NN_SEQUENCE_SIZE, NN_OUTPUT_NEURONS,
                                       lstm_output_size, w_dropout_do, u_dropout_do, stack_layers, conv_size,
                                       conv_filter_length, conv_max_pooling_length)

        classifier_name, parameters = ch.get_sequential_classifier_information(model)
        # from here on model_name identifies the concrete classifier
        model_name = text_vectorizer+'/'+class_vectorizer+'/'+classifier_name

        input_matrix = fh.join_paths(matrices_save_location, GLOBAL_VARS.MODEL_NAME)
        early_stopper = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=EARLY_STOPPER_MIN_DELTA,
                                                      patience=EARLY_STOPPER_PATIENCE, verbose=1, mode='auto')
        metrics_callback = mh.MetricsCallback(input_matrix, classif_type, PARTS_LEVEL, NN_BATCH_SIZE, is_mlp=False)

        history, yvp, yvp_binary = pmh.fit_predict_generator(model, X_data, y_train, Xv_data, y_val, training_docs_list, val_docs_list, early_stopper, metrics_callback, NN_BATCH_SIZE, NN_MAX_EPOCHS, QUEUE_SIZE)

        print('\nGenerating Validation Metrics')
        validation_metrics = mh.get_sequential_metrics(y_val, yvp, yvp_binary)
        mh.display_sequential_metrics(classifier_name, validation_metrics)

        model_results = dict()
        # model_results['best_validation_metrics'] = best_validation_metrics
        model_results['epochs'] = len(history.history['val_loss'])
        model_results['best_weights'] = metrics_callback.best_weights
        model_results['best_val_loss'] = metrics_callback.best_val_loss
        model_results['training_loss'] = metrics_callback.losses
        model_results['validation_loss'] = metrics_callback.val_losses
        model_results['duration'] = time.time() - start_time
        param_results_dict[GLOBAL_VARS.NN_MODEL_NAME] = model_results

        ch.delete_variables(history, metrics_callback)

        ch.save_results(classifier_name+'_LSTM', validation_metrics, parameters, model_name, classif_level, classif_type, dataset_location)

    if save_results:
        # the context manager guarantees the pickle is flushed and closed
        with open(param_results_path, 'wb') as results_file:
            pickle.dump(param_results_dict, results_file)
def _write_fasttext_csv(frame, root_location, file_name, sep=' '):
    """Write ``frame`` to ``root_location/file_name`` without header, index or quoting."""
    frame.to_csv(fh.link_paths(root_location, file_name),
                 index=False,
                 sep=sep,
                 header=False,
                 quoting=csv.QUOTE_NONE,
                 quotechar="",
                 escapechar=" ")


def preprocessing_data_for_fasttext(data_frame, text_vectorizer,
                                    class_vectorizer):
    """Export the data frame and its train/validation/test splits as CSV files.

    Newlines and tabs in the ``'text'`` column of ``data_frame`` are replaced
    by spaces (the column is modified in place), the data is vectorized and
    split, and the resulting sets are written below the
    ``data/fasttext_outcome/`` root location. If the primary export fails,
    the splits are rebuilt as two-column (text, classification) frames and
    written again.

    Args:
        data_frame: data frame with a ``'text'`` column.
        text_vectorizer: name of the text vectorizer (string).
        class_vectorizer: name of the class vectorizer (string).
    """
    root_location = fh.get_root_location('data/fasttext_outcome/')

    data_frame['text'] = data_frame['text'].replace(
        '\n', ' ', regex=True).replace('\t', ' ', regex=True)

    model_name = text_vectorizer + '/' + class_vectorizer + '/FastText'
    try:
        X_train, X_test, Y_train, Y_test, _, _, _, _ = ch.apply_df_vectorizer(
            data_frame, text_vectorizer, class_vectorizer, model_name)

        X_train, X_val, Y_train, Y_val = ch.get_train_test_from_data(
            X_train, Y_train)

        # self.data_vectors = pd.DataFrame(columns=range(vectors_size), index=range(corpus_size))
        if not isinstance(X_train, pd.DataFrame):
            train = pd.DataFrame(data=X_train)
            test = pd.DataFrame(data=X_test)
            val = pd.DataFrame(data=X_val)
            # test_labels = pd.DataFrame(columns=[''])
        else:
            train = X_train
            test = X_test
            val = X_val

        # the labels are stored in a column named 1
        train.loc[:, 1] = Y_train
        test.loc[:, 1] = Y_test
        val.loc[:, 1] = Y_val

        train.drop(columns=['patent_id'], inplace=True)
        test.drop(columns=['patent_id'], inplace=True)
        val.drop(columns=['patent_id'], inplace=True)

        _write_fasttext_csv(data_frame, root_location, 'dataframe.csv')
        # The validation set was prepared above but never written on this
        # path; it is now exported exactly like in the fallback path.
        _write_fasttext_csv(val, root_location, 'validating set.csv')
        _write_fasttext_csv(train, root_location, 'training set.csv')
        # the testing set is comma separated, unlike the other files
        _write_fasttext_csv(test, root_location, 'testing set.csv', sep=',')
    except Exception as error:
        # `Exception` instead of a bare `except` so that KeyboardInterrupt
        # and SystemExit are not swallowed by the fallback.
        print('a problem occurred while trying to store the dataframes')
        print(repr(error))

        X_train, X_test, Y_train, Y_test, _, _, _, _ = ch.apply_df_vectorizer(
            data_frame, text_vectorizer, class_vectorizer, model_name)

        X_train, X_val, Y_train, Y_val = ch.get_train_test_from_data(
            X_train, Y_train)

        val = pd.DataFrame({'text': X_val, 'classification': Y_val})
        train = pd.DataFrame({'text': X_train, 'classification': Y_train})
        test = pd.DataFrame({'text': X_test, 'classification': Y_test})

        _write_fasttext_csv(data_frame, root_location, 'dataframe.csv')
        _write_fasttext_csv(val, root_location, 'validating set.csv')
        _write_fasttext_csv(train, root_location, 'training set.csv')
        _write_fasttext_csv(test, root_location, 'testing set.csv', sep=',')
Example #8
0
def apply_multi_label_classification_without_pipeline(
        data_frame, text_vectorizer, class_vectorizer, classif_level,
        classif_type, source_path):
    """Fit a one-vs-rest logistic classifier per class and save its metrics.

    The data frame is vectorized through ``ch.apply_df_vectorizer``; with the
    hard-coded flags below, the classifier is then fitted once per class
    column of ``y_train`` and the per-class test predictions are collected,
    transposed and scored with ``mh.calculate_manual_metrics`` and
    ``mh.calculate_metrics``. The latter metrics are stored through
    ``ch.save_results``.

    Args:
        data_frame: input data forwarded to ``ch.apply_df_vectorizer``.
        text_vectorizer: name of the text vectorizer (string).
        class_vectorizer: name of the class vectorizer (string).
        classif_level: forwarded to ``ch.save_results``.
        classif_type: forwarded to ``ch.save_results``.
        source_path: forwarded to ``ch.save_results``.
    """
    baseline_name = text_vectorizer + '/' + class_vectorizer + '/onevsrest/'
    vectorizer_results = ch.apply_df_vectorizer(data_frame, text_vectorizer,
                                                class_vectorizer,
                                                '[both]' + baseline_name)
    X_train, X_test, y_train, y_test, classes, n_classes, vocab_processor, len_vocabulary = vectorizer_results
    # len_vocabulary = 57335
    # len_vocabulary = 34736

    # debug output: first training sample and the problem dimensions
    print(X_train[0])

    print('len_vocabulary: ', len_vocabulary, ' num_classes: ', n_classes)

    # Run classifier
    classifier = OneVsRestClassifier(pmh.get_logistic())  # here
    # classifier = OneVsRestClassifier(pmh.get_SVC()) # here
    # classifier = OneVsRestClassifier(pmh.get_multinomialNB()) # ERROR WORD2VEC/DOC2VEC (X has negative value)
    # classifier = OneVsRestClassifier(pmh.get_decision_tree()) # here
    # classifier = OneVsRestClassifier(pmh.get_kneighbors()) # here
    # classifier = OneVsRestClassifier(pmh.get_linear_SVC())
    # classifier = OneVsRestClassifier(pmh.get_random_forest_classifier()) # here
    # classifier = OneVsRestClassifier(pmh.get_SGD_classifier())

    # One row per class, one column per sample; np.ndarray() does not
    # initialise the contents, every row is expected to be filled in the
    # per-class loop below.
    train_predictions = np.ndarray(shape=(n_classes, y_train.shape[0]),
                                   dtype=int)
    predictions = np.ndarray(shape=(n_classes, y_test.shape[0]), dtype=int)
    ###
    precision = dict()
    recall = dict()
    average_precision = dict()

    classifier_name, parameters = ch.get_complex_classifier_information(
        str(classifier), 1, 1, 2, 0)

    # Control flags. NOTE(review): `second_training` is only set to True
    # after the per-class loop, immediately before the outer loop is left, so
    # the `if second_training:` branch never executes and `just_once` stays
    # True; as a consequence the `if not just_once:` block at the end of the
    # function is never reached in the per-class path either.
    second_training = False
    just_once = True
    another_try = False  # single train and estimation instead of multi-train-estimation steps (it performs better with svm and logistic)

    if not another_try:
        # the outer loop runs exactly once (range(1) plus the trailing break)
        for _ in range(1):
            for i in range(n_classes):
                if second_training:
                    if classifier_name in [
                            'DecisionTreeClassifier', 'KNeighborsClassifier',
                            'MultinomialNB', 'RandomForestClassifier'
                    ]:
                        # do not provide the second metrics
                        break
                    elif just_once:
                        # fit again with the whole set of classes - should be better
                        classifier.fit(X_train, y_train)
                        y_score = classifier.decision_function(X_test)

                        precision[i], recall[i], average_precision[
                            i] = mh.calculate_recall_curve_precision_score(
                                y_test[:, i], y_score[:, i], None,
                                y_test[:, i], y_score[:, i])

                        just_once = False
                else:
                    # fit on the labels of class i only and keep its
                    # test / train predictions as row i
                    predictions[i] = pmh.fit_predict_functions(
                        classifier, X_train, y_train[:, i], X_test)
                    train_predictions[i] = classifier.predict(X_train)
                print('**Processing classes {0:0.2f} % ...**'.format(
                    ((i + 1) / n_classes) * 100))
            second_training = True
            break

        # switch to one row per sample; note that only the test predictions
        # are transposed, `train_predictions` keeps its per-class layout
        predictions = predictions.transpose()
        print('transposed')
    else:
        # single fit on the complete label matrix
        predictions = pmh.fit_predict_functions(classifier, X_train, y_train,
                                                X_test)
        train_predictions = classifier.predict(X_train)

    # metrics
    model_name = '[each class predictions]' + baseline_name + classifier_name
    manual_metrics = mh.calculate_manual_metrics(model_name, y_test,
                                                 predictions)
    none_average, binary_average, micro_average, macro_average = manual_metrics

    # metrics (the four averages unpacked above are overwritten here; only
    # `list_metrics` is saved)
    list_metrics = mh.calculate_metrics(model_name, y_test, predictions)
    none_average, binary_average, micro_average, macro_average = list_metrics

    ch.save_results(classifier_name, list_metrics, parameters, model_name,
                    classif_level, classif_type, source_path)

    # Second evaluation with the classifier fitted on all classes at once.
    # NOTE(review): `y_score` is only assigned inside the `second_training`
    # branch above; see the note next to the control flags.
    if not just_once:
        print('not just once')
        train_predictions = classifier.predict(X_train)
        predictions = classifier.predict(X_test)

        # metrics
        mh.calculate_metrics_with_recall_curve(y_score, y_train, y_test,
                                               train_predictions, predictions)

        # metrics
        model_name = '[all classes predictions]' + baseline_name + classifier_name
        list_metrics = mh.calculate_metrics(model_name, y_test, predictions)
        none_average, binary_average, micro_average, macro_average = list_metrics

        ch.save_results(classifier_name, list_metrics, parameters, model_name,
                        classif_level, classif_type, source_path)
Example #9
0
def first_attempt_based_on_text_classification_paper(data_frame, text_vectorizer, class_vectorizer):
    """Train the project's ``pmh.TextCNN`` on a vectorized data frame.

    The hyper-parameters are registered as ``tf.flags``, the data frame is
    vectorized through ``ch.apply_df_vectorizer`` (always with the
    ``'standard'`` / ``'multi_label'`` vectorizers), and the network is
    optimized with Adam (learning rate 1e-4).  Summaries and checkpoints are
    written below a timestamped directory under ``data/convolutional_runs/``.
    Every ``evaluate_every`` steps the model is evaluated on the dev split and
    every ``checkpoint_every`` steps a checkpoint is saved.

    Args:
        data_frame: input handed unchanged to ``ch.apply_df_vectorizer``.
        text_vectorizer: name used only to build the model name string.
        class_vectorizer: name used only to build the model name string.

    Returns:
        None. The results are the files written to the run directory and the
        progress lines printed to stdout.
    """
    # NOTE(review): the flags are registered on every call; calling this
    # function twice in one process presumably fails with a duplicate-flag
    # error -- confirm before reusing it in a loop.

    # Data loading params
    tf.flags.DEFINE_float("dev_sample_percentage", .2, "Percentage of the training data to use for validation")
    tf.flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.")
    tf.flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the negative data.")

    # Model Hyperparameters
    tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
    tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
    tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
    tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
    tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

    # Training parameters
    tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
    tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)")
    tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
    tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
    tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")

    # Misc Parameters
    tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
    tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

    FLAGS = tf.flags.FLAGS

    model_name = text_vectorizer+'/'+class_vectorizer+'/TextCNN'
    standard_results = ch.apply_df_vectorizer(data_frame, 'standard', 'multi_label', model_name)
    x_train, x_dev, y_train, y_dev, classes, n_classes, vocab_processor, len_vocabulary = standard_results

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        # Used as a context manager the session is installed as the default
        # session and is closed when the block exits, even on error.
        with tf.Session(config=session_conf) as sess:
            cnn = pmh.TextCNN(
                sequence_length=x_train.shape[1],
                num_classes=n_classes,
                vocab_size=len_vocabulary,
                embedding_size=FLAGS.embedding_dim,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters)

            # define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-4)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # output directory for models and summaries
            root_location = fh.get_root_location('data/convolutional_runs/')

            timestamp = datetime.datetime.now().isoformat()
            out_dir = fh.link_paths(root_location, timestamp)
            print("Writing to {}\n".format(out_dir))

            # summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # train summaries (the writer receives the graph itself rather
            # than the deprecated GraphDef form)
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = fh.join_paths(fh.link_paths(out_dir, 'summaries'), 'train')
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = fh.join_paths(fh.link_paths(out_dir, 'summaries'), 'dev')
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            def train_step(x_batch, y_batch):
                """
                A single training step
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """
                Evaluates model on a dev set
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    # dropout is disabled for evaluation
                    cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                if writer:
                    writer.add_summary(summaries, step)

            try:
                # checkpointing
                checkpoint_dir = fh.link_paths(out_dir, 'checkpoints')
                checkpoint_prefix = fh.join_paths(checkpoint_dir, 'model')

                saver = tf.train.Saver(tf.global_variables())

                # write vocabulary: best effort, training does not depend on
                # it, but the failure is reported instead of being hidden
                try:
                    vocab_processor.save(os.path.join(out_dir, "vocab"))
                except Exception as error:
                    print("Could not save vocabulary: {}".format(error))

                # initialize all variables
                sess.run(tf.global_variables_initializer())

                # generate batches
                batches = batch_iter(
                    list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
                # training loop. For each batch...
                for batch in batches:
                    x_batch, y_batch = zip(*batch)
                    train_step(x_batch, y_batch)
                    current_step = tf.train.global_step(sess, global_step)
                    if current_step % FLAGS.evaluate_every == 0:
                        print("\nEvaluation:")
                        dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    if current_step % FLAGS.checkpoint_every == 0:
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        print("Saved model checkpoint to {}\n".format(path))
            finally:
                # flush pending summaries and release the event files
                train_summary_writer.close()
                dev_summary_writer.close()
Example #10
0
def third_attempt_from_web(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    """Build and run one convolutional model per label column, then save results.

    The data frame is vectorized through ``ch.apply_df_vectorizer``; the
    train/test matrices are reshaped to ``(n_samples, 1, n_features)`` and,
    for every column of the label matrix, a fresh model obtained from
    ``pmh.get_image_convolutional_from_web`` is run through
    ``pmh.run_image_cnn_model`` and its metrics are displayed.

    Args:
        data_frame: input handed unchanged to ``ch.apply_df_vectorizer``.
        text_vectorizer: text vectorizer name (also part of the model name).
        class_vectorizer: class vectorizer name (also part of the model name).
        classif_level: forwarded to ``ch.save_results``.
        classif_type: forwarded to ``ch.save_results``.
        dataset_location: forwarded to ``ch.save_results``.

    Returns:
        None.
    """
    model_name = '/'.join([text_vectorizer, class_vectorizer, 'CNN'])
    vectorized = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, model_name)
    (train_data, test_data, train_labels, test_labels,
     classes, n_classes, vocab_processor, len_vocabulary) = vectorized

    # insert a singleton middle axis: (samples, features) -> (samples, 1, features)
    train_data = np.reshape(train_data, (train_data.shape[0], 1, train_data.shape[1]))
    test_data = np.reshape(test_data, (test_data.shape[0], 1, test_data.shape[1]))

    print(train_labels)
    print(train_labels.shape)

    label_column_count = train_labels.shape[1]
    for column in range(label_column_count):
        # labels of the single class handled in this iteration
        column_train_labels = train_labels[:, column]
        column_test_labels = test_labels[:, column]

        print(column_train_labels)

        model = pmh.get_image_convolutional_from_web(train_data, n_classes)

        metrics, predictions = pmh.run_image_cnn_model(
            model, train_data, column_train_labels, test_data, column_test_labels)

        print("predictions: ", predictions)

        classifier_name, layers = ch.get_sequential_classifier_information(model)
        first_metric = metrics[0]
        second_metric = metrics[1]
        third_metric = metrics[2]
        mh.display_convolutional_metrics(
            classifier_name, first_metric, second_metric, third_metric,
            column_test_labels, predictions)

    # NOTE(review): this runs once, after the loop, so only the values from
    # the final label column are saved (and the names are unbound when the
    # label matrix has no columns) -- confirm this is intended.
    ch.save_results(classifier_name, metrics, layers, model_name, classif_level, classif_type, dataset_location)