Example 1
def Text_Classification(x_train,
                        y_train,
                        x_test,
                        y_test,
                        batch_size=128,
                        EMBEDDING_DIM=50,
                        MAX_SEQUENCE_LENGTH=500,
                        MAX_NB_WORDS=75000,
                        GloVe_dir="",
                        GloVe_file="glove.6B.50d.txt",
                        sparse_categorical=True,
                        random_deep=[3, 3, 3],
                        epochs=[500, 500, 500],
                        plot=False,
                        min_hidden_layer_dnn=1,
                        max_hidden_layer_dnn=8,
                        min_nodes_dnn=128,
                        max_nodes_dnn=1024,
                        min_hidden_layer_rnn=1,
                        max_hidden_layer_rnn=5,
                        min_nodes_rnn=32,
                        max_nodes_rnn=128,
                        min_hidden_layer_cnn=3,
                        max_hidden_layer_cnn=10,
                        min_nodes_cnn=128,
                        max_nodes_cnn=512,
                        random_state=42,
                        random_optimizor=True,
                        dropout=0.5,
                        no_of_classes=0):

    np.random.seed(random_state)

    glove_directory = GloVe_dir

    GloVe_needed = random_deep[1] != 0 or random_deep[2] != 0

    # example_input  = [0,1,3]
    # example_output :
    #
    # [[1 0 0 0]
    #  [0 1 0 0]
    #  [0 0 0 1]]

    def one_hot_encoder(value, datal):

        datal[value] = 1

        return datal

    def _one_hot_values(labels_data):
        encoded = [0] * len(labels_data)

        for j, i in enumerate(labels_data):
            max_value = [0] * (np.max(labels_data) + 1)

            encoded[j] = one_hot_encoder(i, max_value)

        return np.array(encoded)

    if not isinstance(y_train[0], list) and not isinstance(
            y_train[0], np.ndarray) and not sparse_categorical:
        # check whether labels are already one-hot; otherwise the dense layer
        # will raise a shape error

        print("converted_into_one_hot")
        y_train = _one_hot_values(y_train)
        y_test = _one_hot_values(y_test)

    if GloVe_needed:
        if glove_directory == "":
            GloVe_directory = GloVe.download_and_extract()
            GloVe_DIR = os.path.join(GloVe_directory, GloVe_file)
        else:
            GloVe_DIR = os.path.join(glove_directory, GloVe_file)

        if not os.path.isfile(GloVe_DIR):
            print("Could not find %s; set the GloVe directory in Global.py" % GloVe_DIR)
            exit()
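        # For reference: a GloVe file is plain text with one token per line
        # followed by its space-separated vector (50 floats per line for
        # glove.6B.50d.txt), e.g. "the 0.418 0.24968 -0.41242 ...".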

    G.setup()
    if random_deep[0] != 0:
        x_train_tfidf, x_test_tfidf = txt.loadData(x_train,
                                                   x_test,
                                                   MAX_NB_WORDS=MAX_NB_WORDS)
    if GloVe_needed:
        print(GloVe_DIR)
        x_train_embedded, x_test_embedded, word_index, embeddings_index = txt.loadData_Tokenizer(
            x_train, x_test, GloVe_DIR, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH,
            EMBEDDING_DIM)

    del x_train
    del x_test
    gc.collect()
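    # From here on the raw text is no longer needed: the DNNs train on the
    # tf-idf matrix, while the RNNs and CNNs train on the padded token
    # sequences backed by the GloVe embedding index.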

    y_pr = []
    History = []
    score = []

    if no_of_classes == 0:

        # determine the number of classes
        # np.max(data) + 1 will not work for one-hot encoded labels
        if sparse_categorical:
            number_of_classes = np.max(y_train) + 1
        else:
            number_of_classes = len(y_train[0])
    else:
        number_of_classes = no_of_classes
    print(number_of_classes)

    i = 0
    while i < random_deep[0]:
        # model_DNN.append(Sequential())
        try:
            print("DNN " + str(i))
            filepath = os.path.join("weights", "weights_DNN_" + str(i) + ".hdf5")
            checkpoint = ModelCheckpoint(filepath,
                                         monitor='val_acc',
                                         verbose=1,
                                         save_best_only=True,
                                         mode='max')
            callbacks_list = [checkpoint]

            model_DNN, model_tmp = BuildModel.Build_Model_DNN_Text(
                x_train_tfidf.shape[1], number_of_classes, sparse_categorical,
                min_hidden_layer_dnn, max_hidden_layer_dnn, min_nodes_dnn,
                max_nodes_dnn, random_optimizor, dropout)
            model_history = model_DNN.fit(x_train_tfidf,
                                          y_train,
                                          validation_data=(x_test_tfidf,
                                                           y_test),
                                          epochs=epochs[0],
                                          batch_size=batch_size,
                                          callbacks=callbacks_list,
                                          verbose=2)
            History.append(model_history)

            model_tmp.load_weights(filepath)
            if sparse_categorical:
                model_tmp.compile(loss='sparse_categorical_crossentropy',
                                  optimizer='adam',
                                  metrics=['accuracy'])

                y_pr_ = model_tmp.predict_classes(x_test_tfidf,
                                                  batch_size=batch_size)
                y_pr.append(np.array(y_pr_))
                score.append(accuracy_score(y_test, y_pr_))
            else:
                model_tmp.compile(loss='categorical_crossentropy',
                                  optimizer='adam',
                                  metrics=['accuracy'])

                y_pr_ = model_tmp.predict(x_test_tfidf, batch_size=batch_size)

                y_pr_ = np.argmax(y_pr_, axis=1)
                y_pr.append(np.array(y_pr_))
                y_test_temp = np.argmax(y_test, axis=1)
                score.append(accuracy_score(y_test_temp, y_pr_))
            # print(y_proba)
            i += 1
            del model_tmp
            del model_DNN

        except Exception as e:

            print("Check the Error \n {} ".format(e))

            print("Error in model", i, "try to re-generate another model")
            if max_hidden_layer_dnn > 3:
                max_hidden_layer_dnn -= 1
            if max_nodes_dnn > 256:
                max_nodes_dnn -= 8

    try:
        del x_train_tfidf
        del x_test_tfidf
        gc.collect()
    except:
        pass

    i = 0
    while i < random_deep[1]:
        try:
            print("RNN " + str(i))
            filepath = os.path.join("weights", "weights_RNN_" + str(i) + ".hdf5")
            checkpoint = ModelCheckpoint(filepath,
                                         monitor='val_acc',
                                         verbose=1,
                                         save_best_only=True,
                                         mode='max')
            callbacks_list = [checkpoint]

            model_RNN, model_tmp = BuildModel.Build_Model_RNN_Text(
                word_index, embeddings_index, number_of_classes,
                MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, sparse_categorical,
                min_hidden_layer_rnn, max_hidden_layer_rnn, min_nodes_rnn,
                max_nodes_rnn, random_optimizor, dropout)

            model_history = model_RNN.fit(x_train_embedded,
                                          y_train,
                                          validation_data=(x_test_embedded,
                                                           y_test),
                                          epochs=epochs[1],
                                          batch_size=batch_size,
                                          callbacks=callbacks_list,
                                          verbose=2)
            History.append(model_history)

            if sparse_categorical:
                model_tmp.load_weights(filepath)
                model_tmp.compile(loss='sparse_categorical_crossentropy',
                                  optimizer='rmsprop',
                                  metrics=['accuracy'])

                y_pr_ = model_tmp.predict_classes(x_test_embedded,
                                                  batch_size=batch_size)
                y_pr.append(np.array(y_pr_))
                score.append(accuracy_score(y_test, y_pr_))
            else:
                model_tmp.load_weights(filepath)
                model_tmp.compile(loss='categorical_crossentropy',
                                  optimizer='rmsprop',
                                  metrics=['accuracy'])
                y_pr_ = model_tmp.predict(x_test_embedded,
                                          batch_size=batch_size)
                y_pr_ = np.argmax(y_pr_, axis=1)
                y_pr.append(np.array(y_pr_))
                y_test_temp = np.argmax(y_test, axis=1)
                score.append(accuracy_score(y_test_temp, y_pr_))
            i += 1
            del model_tmp
            del model_RNN
            gc.collect()
        except Exception as e:
            print("Check the Error \n {} ".format(e))
            print("Error in model", i, "try to re-generate another model")
            if max_hidden_layer_rnn > 3:
                max_hidden_layer_rnn -= 1
            if max_nodes_rnn > 64:
                max_nodes_rnn -= 2

    gc.collect()

    i = 0
    while i < random_deep[2]:
        try:
            print("CNN " + str(i))

            model_CNN, model_tmp = BuildModel.Build_Model_CNN_Text(
                word_index, embeddings_index, number_of_classes,
                MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, sparse_categorical,
                min_hidden_layer_cnn, max_hidden_layer_cnn, min_nodes_cnn,
                max_nodes_cnn, random_optimizor, dropout)

            filepath = os.path.join("weights", "weights_CNN_" + str(i) + ".hdf5")
            checkpoint = ModelCheckpoint(filepath,
                                         monitor='val_acc',
                                         verbose=1,
                                         save_best_only=True,
                                         mode='max')
            callbacks_list = [checkpoint]

            model_history = model_CNN.fit(x_train_embedded,
                                          y_train,
                                          validation_data=(x_test_embedded,
                                                           y_test),
                                          epochs=epochs[2],
                                          batch_size=batch_size,
                                          callbacks=callbacks_list,
                                          verbose=2)
            History.append(model_history)

            model_tmp.load_weights(filepath)
            if sparse_categorical:
                model_tmp.compile(loss='sparse_categorical_crossentropy',
                                  optimizer='rmsprop',
                                  metrics=['accuracy'])
            else:
                model_tmp.compile(loss='categorical_crossentropy',
                                  optimizer='rmsprop',
                                  metrics=['accuracy'])

            y_pr_ = model_tmp.predict(x_test_embedded, batch_size=batch_size)
            y_pr_ = np.argmax(y_pr_, axis=1)
            y_pr.append(np.array(y_pr_))

            if sparse_categorical:
                score.append(accuracy_score(y_test, y_pr_))
            else:
                y_test_temp = np.argmax(y_test, axis=1)
                score.append(accuracy_score(y_test_temp, y_pr_))
            i += 1

            del model_tmp
            del model_CNN
            gc.collect()
        except Exception as e:
            print("Check the Error \n {} ".format(e))
            print("Error in model", i, "try to re-generate another model")
            if max_hidden_layer_cnn > 5:
                max_hidden_layer_cnn -= 1
            if max_nodes_cnn > 128:
                max_nodes_cnn -= 2
                min_nodes_cnn -= 1

    gc.collect()

    y_proba = np.array(y_pr).transpose()
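    # Each model contributed one vector of predicted labels to y_pr; after the
    # transpose, row i of y_proba holds every model's vote for sample i, and
    # the most common label wins. For example, votes
    # [[0,1,2,1], [0,2,2,1], [0,1,2,0]] from three models give
    # final_y = [0, 1, 2, 1].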

    final_y = []

    for i in range(0, y_proba.shape[0]):
        a = np.array(y_proba[i, :])
        a = collections.Counter(a).most_common()[0][0]
        final_y.append(a)
    if sparse_categorical:
        F_score = accuracy_score(y_test, final_y)
        F1 = precision_recall_fscore_support(y_test, final_y, average='micro')
        F2 = precision_recall_fscore_support(y_test, final_y, average='macro')
        F3 = precision_recall_fscore_support(y_test,
                                             final_y,
                                             average='weighted')
        cnf_matrix = confusion_matrix(y_test, final_y)
        # Compute confusion matrix
        # Plot non-normalized confusion matrix

        if plot:
            classes = list(range(0, np.max(y_test) + 1))
            Plot.plot_confusion_matrix(
                cnf_matrix,
                classes=classes,
                title='Confusion matrix, without normalization')

            # Plot normalized confusion matrix

            Plot.plot_confusion_matrix(cnf_matrix,
                                       classes=classes,
                                       normalize=True,
                                       title='Normalized confusion matrix')
    else:
        y_test_temp = np.argmax(y_test, axis=1)
        F_score = accuracy_score(y_test_temp, final_y)
        F1 = precision_recall_fscore_support(y_test_temp,
                                             final_y,
                                             average='micro')
        F2 = precision_recall_fscore_support(y_test_temp,
                                             final_y,
                                             average='macro')
        F3 = precision_recall_fscore_support(y_test_temp,
                                             final_y,
                                             average='weighted')
    if plot:
        Plot.RMDL_epoch(History)
    print(y_proba.shape)
    print("Accuracy of", len(score), "models:", score)
    print("Accuracy:", F_score)
    print("F1_Micro:", F1)
    print("F1_Macro:", F2)
    print("F1_weighted:", F3)
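A minimal usage sketch for the function above, assuming the RMDL package layout (from RMDL import RMDL_Text) and that x_train/x_test are lists of raw document strings with integer labels; the ensemble sizes and epochs are shrunk here so a trial run finishes quickly:

from RMDL import RMDL_Text as RMDL

# x_train/x_test: lists of raw strings; y_train/y_test: integer class labels
RMDL.Text_Classification(x_train, y_train, x_test, y_test,
                         batch_size=64,
                         sparse_categorical=True,
                         random_deep=[1, 1, 1],  # one DNN, one RNN, one CNN
                         epochs=[10, 10, 10],
                         plot=False)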
Example 2
def Text_Classification(x_train, y_train, x_test,  y_test, batch_size=128,
                        EMBEDDING_DIM=50,MAX_SEQUENCE_LENGTH = 500, MAX_NB_WORDS = 75000,
                        GloVe_dir="", GloVe_file = "glove.6B.50d.txt",
                        sparse_categorical=True, random_deep=[3, 3, 3], epochs=[500, 500, 500],  plot=False,
                        min_hidden_layer_dnn=1, max_hidden_layer_dnn=8, min_nodes_dnn=128, max_nodes_dnn=1024,
                        min_hidden_layer_rnn=1, max_hidden_layer_rnn=5, min_nodes_rnn=32,  max_nodes_rnn=128,
                        min_hidden_layer_cnn=3, max_hidden_layer_cnn=10, min_nodes_cnn=128, max_nodes_cnn=512,
                        random_state=42, random_optimizor=True, dropout=0.5,no_of_classes=0):


    """
    Text_Classification(x_train, y_train, x_test,  y_test, batch_size=128,
                        EMBEDDING_DIM=50,MAX_SEQUENCE_LENGTH = 500, MAX_NB_WORDS = 75000,
                        GloVe_dir="", GloVe_file = "glove.6B.50d.txt",
                        sparse_categorical=True, random_deep=[3, 3, 3], epochs=[500, 500, 500],  plot=False,
                        min_hidden_layer_dnn=1, max_hidden_layer_dnn=8, min_nodes_dnn=128, max_nodes_dnn=1024,
                        min_hidden_layer_rnn=1, max_hidden_layer_rnn=5, min_nodes_rnn=32,  max_nodes_rnn=128,
                        min_hidden_layer_cnn=3, max_hidden_layer_cnn=10, min_nodes_cnn=128, max_nodes_cnn=512,
                        random_state=42, random_optimizor=True, dropout=0.5, no_of_classes=0)

        Parameters
        ----------
            batch_size : Integer, optional
                Number of samples per gradient update. It will default to 128.
            MAX_NB_WORDS: int, optional
                Maximum number of unique words in the datasets. It will default to 75000.
            GloVe_dir: String, optional
                Path to GloVe or any pre-trained embedding directory. It will default to "", in which case glove.6B.zip will be downloaded.
            GloVe_file: String, optional
                Which version of GloVe or pre-trained word embedding will be used. It will default to glove.6B.50d.txt.
                NOTE: if you use another version of GloVe, EMBEDDING_DIM must match its dimensions.
            sparse_categorical: bool
                Should be True when the target dataset has shape (n, 1). It will default to True.
            random_deep: array of int [3], optional
                Number of ensembled models used in RMDL: random_deep[0] is the number of DNNs, random_deep[1] the number of RNNs, random_deep[2] the number of CNNs. It will default to [3, 3, 3].
            epochs: array of int [3], optional
                Number of epochs for each ensembled model used in RMDL: epochs[0] is the number of epochs used in DNNs, epochs[1] in RNNs, epochs[2] in CNNs. It will default to [500, 500, 500].
            plot: bool, optional
                If True, plots the confusion matrix and the accuracy and loss curves.
            min_hidden_layer_dnn: Integer, optional
                Lower bound on hidden layers of the DNNs used in RMDL. It will default to 1.
            max_hidden_layer_dnn: Integer, optional
                Upper bound on hidden layers of the DNNs used in RMDL. It will default to 8.
            min_nodes_dnn: Integer, optional
                Lower bound on nodes in each layer of the DNNs used in RMDL. It will default to 128.
            max_nodes_dnn: Integer, optional
                Upper bound on nodes in each layer of the DNNs used in RMDL. It will default to 1024.
            min_hidden_layer_rnn: Integer, optional
                Lower bound on hidden layers of the RNNs used in RMDL. It will default to 1.
            max_hidden_layer_rnn: Integer, optional
                Upper bound on hidden layers of the RNNs used in RMDL. It will default to 5.
            min_nodes_rnn: Integer, optional
                Lower bound on nodes (LSTM or GRU) in each layer of the RNNs used in RMDL. It will default to 32.
            max_nodes_rnn: Integer, optional
                Upper bound on nodes (LSTM or GRU) in each layer of the RNNs used in RMDL. It will default to 128.
            min_hidden_layer_cnn: Integer, optional
                Lower bound on hidden layers of the CNNs used in RMDL. It will default to 3.
            max_hidden_layer_cnn: Integer, optional
                Upper bound on hidden layers of the CNNs used in RMDL. It will default to 10.
            min_nodes_cnn: Integer, optional
                Lower bound on nodes (2D convolution layers) in each layer of the CNNs used in RMDL. It will default to 128.
            max_nodes_cnn: Integer, optional
                Upper bound on nodes (2D convolution layers) in each layer of the CNNs used in RMDL. It will default to 512.
            random_state : Integer, optional
                Seed used by the random number generator. It will default to 42.
            random_optimizor : bool, optional
                If False, all models use the adam optimizer; if True, each model uses a randomly chosen optimizer. It will default to True.
            dropout: Float, optional
                Float between 0 and 1; fraction of the units to drop for the linear transformation of the inputs. It will default to 0.5.
            no_of_classes: Integer, optional
                Number of classes; if 0, it is inferred from the labels. It will default to 0.

    """
    np.random.seed(random_state)


    glove_directory = GloVe_dir

    GloVe_needed = random_deep[1] != 0 or random_deep[2] != 0
    
    # example_input  = [0,1,3]
    # example_output :
    # 
    # [[1 0 0 0]
    #  [0 1 0 0]
    #  [0 0 0 1]]
    
    def one_hot_encoder(value, label_data_):

        label_data_[value] = 1

        return label_data_

    def _one_hot_values(labels_data):
        encoded = [0] * len(labels_data)

        for index_no, value in enumerate(labels_data):
            max_value = [0] * (np.max(labels_data) + 1)

            encoded[index_no] = one_hot_encoder(value, max_value)

        return np.array(encoded)

    if not isinstance(y_train[0], list) and not isinstance(y_train[0], np.ndarray) and not sparse_categorical:
        # check whether labels are already one-hot; otherwise the dense layer will give a shape error
        
        print("converted_into_one_hot")
        y_train = _one_hot_values(y_train)
        y_test = _one_hot_values(y_test)
    if GloVe_needed:
        if glove_directory == "":
            GloVe_directory = GloVe.download_and_extract()
            GloVe_DIR = os.path.join(GloVe_directory, GloVe_file)
        else:
            GloVe_DIR = os.path.join(glove_directory, GloVe_file)

        if not os.path.isfile(GloVe_DIR):
            print("Could not find %s; set the GloVe directory in Global.py" % GloVe_DIR)
            exit()

    G.setup()
    if random_deep[0] != 0:
        x_train_tfidf, x_test_tfidf = txt.loadData(x_train, x_test,MAX_NB_WORDS=MAX_NB_WORDS)
    if GloVe_needed:
        print(GloVe_DIR)
        x_train_embedded, x_test_embedded, word_index, embeddings_index = txt.loadData_Tokenizer(x_train, x_test,GloVe_DIR,MAX_NB_WORDS,MAX_SEQUENCE_LENGTH,EMBEDDING_DIM)

    del x_train
    del x_test
    gc.collect()

    y_pr = []
    History = []
    score = []

    if no_of_classes==0:
        #checking no_of_classes
        #np.max(data)+1 will not work for one_hot encoding labels
        if sparse_categorical:
            number_of_classes = np.max(y_train) + 1
        else:
            number_of_classes = len(y_train[0])
    else:
        number_of_classes = no_of_classes
    print(number_of_classes)
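    # e.g. sparse labels  y_train = [0, 2, 1, 2] -> np.max(y_train) + 1 == 3
    #      one-hot labels y_train[0] = [0, 0, 1] -> len(y_train[0])    == 3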


    i = 0
    while i < random_deep[0]:
        # model_DNN.append(Sequential())
        try:
            print("DNN " + str(i))
            filepath = os.path.join("weights", "weights_DNN_" + str(i) + ".hdf5")
            checkpoint = ModelCheckpoint(filepath,
                                         monitor='val_acc',
                                         verbose=1,
                                         save_best_only=True,
                                         mode='max')
            callbacks_list = [checkpoint]

            model_DNN, model_tmp = BuildModel.Build_Model_DNN_Text(x_train_tfidf.shape[1],
                                                                   number_of_classes,
                                                                   sparse_categorical,
                                                                   min_hidden_layer_dnn,
                                                                   max_hidden_layer_dnn,
                                                                   min_nodes_dnn,
                                                                   max_nodes_dnn,
                                                                   random_optimizor,
                                                                   dropout)
            model_history = model_DNN.fit(x_train_tfidf, y_train,
                              validation_data=(x_test_tfidf, y_test),
                              epochs=epochs[0],
                              batch_size=batch_size,
                              callbacks=callbacks_list,
                              verbose=2)
            History.append(model_history)

            model_tmp.load_weights(filepath)
            if sparse_categorical:
                model_tmp.compile(loss='sparse_categorical_crossentropy',
                                  optimizer='adam',
                                  metrics=['accuracy'])

                y_pr_ = model_tmp.predict_classes(x_test_tfidf,
                                                  batch_size=batch_size)
                y_pr.append(np.array(y_pr_))
                score.append(accuracy_score(y_test, y_pr_))
            else:
                model_tmp.compile(loss='categorical_crossentropy',
                                  optimizer='adam',
                                  metrics=['accuracy'])

                y_pr_ = model_tmp.predict(x_test_tfidf,
                                          batch_size=batch_size)

                y_pr_ = np.argmax(y_pr_, axis=1)
                y_pr.append(np.array(y_pr_))
                y_test_temp = np.argmax(y_test, axis=1)
                score.append(accuracy_score(y_test_temp, y_pr_))
            # print(y_proba)
            i += 1
            del model_tmp
            del model_DNN

        except Exception as e:

            print("Check the Error \n {} ".format(e))

            print("Error in model", i, "try to re-generate another model")
            if max_hidden_layer_dnn > 3:
                max_hidden_layer_dnn -= 1
            if max_nodes_dnn > 256:
                max_nodes_dnn -= 8

    try:
        del x_train_tfidf
        del x_test_tfidf
        gc.collect()
    except:
        pass

    i = 0
    while i < random_deep[1]:
        try:
            print("RNN " + str(i))
            filepath = os.path.join("weights", "weights_RNN_" + str(i) + ".hdf5")
            checkpoint = ModelCheckpoint(filepath,
                                         monitor='val_acc',
                                         verbose=1,
                                         save_best_only=True,
                                         mode='max')
            callbacks_list = [checkpoint]

            model_RNN, model_tmp = BuildModel.Build_Model_RNN_Text(word_index,
                                                                   embeddings_index,
                                                                   number_of_classes,
                                                                   MAX_SEQUENCE_LENGTH,
                                                                   EMBEDDING_DIM,
                                                                   sparse_categorical,
                                                                   min_hidden_layer_rnn,
                                                                   max_hidden_layer_rnn,
                                                                   min_nodes_rnn,
                                                                   max_nodes_rnn,
                                                                   random_optimizor,
                                                                   dropout)

            model_history = model_RNN.fit(x_train_embedded, y_train,
                              validation_data=(x_test_embedded, y_test),
                              epochs=epochs[1],
                              batch_size=batch_size,
                              callbacks=callbacks_list,
                              verbose=2)
            History.append(model_history)

            if sparse_categorical:
                model_tmp.load_weights(filepath)
                model_tmp.compile(loss='sparse_categorical_crossentropy',
                                  optimizer='rmsprop',
                                  metrics=['accuracy'])

                y_pr_ = model_tmp.predict_classes(x_test_embedded, batch_size=batch_size)
                y_pr.append(np.array(y_pr_))
                score.append(accuracy_score(y_test, y_pr_))
            else:
                model_tmp.load_weights(filepath)
                model_tmp.compile(loss='categorical_crossentropy',
                                  optimizer='rmsprop',
                                  metrics=['accuracy'])
                y_pr_ = model_tmp.predict(x_test_embedded, batch_size=batch_size)
                y_pr_ = np.argmax(y_pr_, axis=1)
                y_pr.append(np.array(y_pr_))
                y_test_temp = np.argmax(y_test, axis=1)
                score.append(accuracy_score(y_test_temp, y_pr_))
            i += 1
            del model_tmp
            del model_RNN
            gc.collect()
        except Exception as e:
            print("Check the Error \n {} ".format(e))
            print("Error in model", i, "try to re-generate another model")
            if max_hidden_layer_rnn > 3:
                max_hidden_layer_rnn -= 1
            if max_nodes_rnn > 64:
                max_nodes_rnn -= 2

    gc.collect()

    i = 0
    while i < random_deep[2]:
        try:
            print("CNN " + str(i))

            model_CNN, model_tmp = BuildModel.Build_Model_CNN_Text(word_index,
                                                                   embeddings_index,
                                                                   number_of_classes,
                                                                   MAX_SEQUENCE_LENGTH,
                                                                   EMBEDDING_DIM,
                                                                   sparse_categorical,
                                                                   min_hidden_layer_cnn,
                                                                   max_hidden_layer_cnn,
                                                                   min_nodes_cnn,
                                                                   max_nodes_cnn,
                                                                   random_optimizor,
                                                                   dropout)



            filepath = os.path.join("weights", "weights_CNN_" + str(i) + ".hdf5")
            checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True,
                                         mode='max')
            callbacks_list = [checkpoint]

            model_history = model_CNN.fit(x_train_embedded, y_train,
                                          validation_data=(x_test_embedded, y_test),
                                          epochs=epochs[2],
                                          batch_size=batch_size,
                                          callbacks=callbacks_list,
                                          verbose=2)
            History.append(model_history)

            model_tmp.load_weights(filepath)
            if sparse_categorical:
                model_tmp.compile(loss='sparse_categorical_crossentropy',
                                  optimizer='rmsprop',
                                  metrics=['accuracy'])
            else:
                model_tmp.compile(loss='categorical_crossentropy',
                                  optimizer='rmsprop',
                                  metrics=['accuracy'])

            y_pr_ = model_tmp.predict(x_test_embedded, batch_size=batch_size)
            y_pr_ = np.argmax(y_pr_, axis=1)
            y_pr.append(np.array(y_pr_))

            if sparse_categorical:
                score.append(accuracy_score(y_test, y_pr_))
            else:
                y_test_temp = np.argmax(y_test, axis=1)
                score.append(accuracy_score(y_test_temp, y_pr_))
            i += 1

            del model_tmp
            del model_CNN
            gc.collect()
        except Exception as e:
            print("Check the Error \n {} ".format(e))
            print("Error in model", i, "try to re-generate another model")
            if max_hidden_layer_cnn > 5:
                max_hidden_layer_cnn -= 1
            if max_nodes_cnn > 128:
                max_nodes_cnn -= 2
                min_nodes_cnn -= 1

    gc.collect()


    y_proba = np.array(y_pr).transpose()

    final_y = []

    for i in range(0, y_proba.shape[0]):
        a = np.array(y_proba[i, :])
        a = collections.Counter(a).most_common()[0][0]
        final_y.append(a)
    if sparse_categorical:
        F_score = accuracy_score(y_test, final_y)
        F1 = precision_recall_fscore_support(y_test, final_y, average='micro')
        F2 = precision_recall_fscore_support(y_test, final_y, average='macro')
        F3 = precision_recall_fscore_support(y_test, final_y, average='weighted')
        cnf_matrix = confusion_matrix(y_test, final_y)
        # Compute confusion matrix
        # Plot non-normalized confusion matrix

        if plot:
            classes = list(range(0, np.max(y_test)+1))
            Plot.plot_confusion_matrix(cnf_matrix, classes=classes,
                                       title='Confusion matrix, without normalization')

            # Plot normalized confusion matrix

            Plot.plot_confusion_matrix(cnf_matrix, classes=classes, normalize=True,
                                       title='Normalized confusion matrix')
    else:
        y_test_temp = np.argmax(y_test, axis=1)
        F_score = accuracy_score(y_test_temp, final_y)
        F1 = precision_recall_fscore_support(y_test_temp, final_y, average='micro')
        F2 = precision_recall_fscore_support(y_test_temp, final_y, average='macro')
        F3 = precision_recall_fscore_support(y_test_temp, final_y, average='weighted')
    if plot:
        Plot.RMDL_epoch(History)
    print(y_proba.shape)
    print("Accuracy of",len(score),"models:",score)
    print("Accuracy:",F_score)
    print("F1_Micro:",F1)
    print("F1_Macro:",F2)
    print("F1_weighted:",F3)
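For reference, a standalone sketch of the conversion _one_hot_values performs above, using NumPy only (the column count is np.max(labels) + 1, exactly as in the helper):

import numpy as np

labels = np.array([0, 1, 3])
one_hot = np.eye(np.max(labels) + 1, dtype=int)[labels]
print(one_hot)
# [[1 0 0 0]
#  [0 1 0 0]
#  [0 0 0 1]]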
Example 3
def train(x_train,
          y_train,
          x_val,
          y_val,
          class_weight=None,
          batch_size=128,
          embedding_dim=50,
          max_seq_len=500,
          max_num_words=75000,
          glove_dir="",
          glove_file="glove.6B.50d.txt",
          sparse_categorical=True,
          random_deep=[3, 3, 3],
          epochs=[500, 500, 500],
          plot=False,
          min_hidden_layer_dnn=1,
          max_hidden_layer_dnn=6,
          min_nodes_dnn=128,
          max_nodes_dnn=1024,
          min_hidden_layer_rnn=1,
          max_hidden_layer_rnn=5,
          min_nodes_rnn=128,
          max_nodes_rnn=512,
          min_hidden_layer_cnn=3,
          max_hidden_layer_cnn=10,
          min_nodes_cnn=128,
          max_nodes_cnn=512,
          random_state=42,
          random_optimizor=True,
          dropout=0.5,
          dnn_l2=0,
          rnn_l2=0.01,
          cnn_l2=0.01,
          use_cuda=True,
          use_bidirectional=True,
          lr=1e-3):
    """
    train(x_train, y_train, x_val, y_val, class_weight=None, batch_size=128,
            embedding_dim=50, max_seq_len=500, max_num_words=75000,
            glove_dir="", glove_file="glove.6B.50d.txt",
            sparse_categorical=True, random_deep=[3, 3, 3], epochs=[500, 500, 500], plot=False,
            min_hidden_layer_dnn=1, max_hidden_layer_dnn=6, min_nodes_dnn=128, max_nodes_dnn=1024,
            min_hidden_layer_rnn=1, max_hidden_layer_rnn=5, min_nodes_rnn=128, max_nodes_rnn=512,
            min_hidden_layer_cnn=3, max_hidden_layer_cnn=10, min_nodes_cnn=128, max_nodes_cnn=512,
            random_state=42, random_optimizor=True, dropout=0.5)

        Parameters
        ----------
            class_weight: dict, optional
                Dictionary mapping class indices (integers) to a weight (float) value, used for weighting the loss function (during training only).
                This can be useful to tell the model to "pay more attention" to samples from an under-represented class.
            batch_size: int, optional
                Number of samples per gradient update. It will default to 128.
            embedding_dim: int, optional
                Dimensionality of the vector representation (word embedding) of each token in the corpus.
                It will default to 50.
            max_seq_len: int, optional
                Maximum number of words in a text to consider. It will default to 500.
            max_num_words: int, optional
                Maximum number of unique words in datasets. It will default to 75000.
            glove_dir: string, optional
                Path to GloVe or any pre-trained word embedding directory. It will default to the current
                directory where glove.6B.zip should be downloaded.
            glove_file: string, optional
                Which version of GloVe or any pre-trained word embedding will be used. It will default to glove.6B.50d.txt.
                NOTE: If you use other version of GloVe embedding_dim must be the same dimensions.
            sparse_categorical: bool
                When target's dataset is (n,1) should be True. It will default to True.
            random_deep: array of int [3], optional
                Number of ensembled models used in RMDL random_deep[0] is number of DNNs,
                random_deep[1] is number of RNNs, random_deep[2] is number of CNNs. It will default to [3, 3, 3].
            epochs: array of int [3], optional
                Number of epochs for each ensembled model used in RMDL: epochs[0] is the number of epochs used in DNNs,
                epochs[1] in RNNs, epochs[2] in CNNs. It will default to [500, 500, 500].
            plot: bool, optional
                Plot accuracies and losses of training and validation.
            min_hidden_layer_dnn: int, optional
                Lower Bounds of hidden layers of DNN used in RMDL. It will default to 1.
            max_hidden_layer_dnn: int, optional
                Upper bounds of hidden layers of DNN used in RMDL. It will default to 6.
            min_nodes_dnn: int, optional
                Lower bounds of nodes in each layer of DNN used in RMDL. It will default to 128.
            max_nodes_dnn: int, optional
                Upper bounds of nodes in each layer of DNN used in RMDL. It will default to 1024.
            min_hidden_layer_rnn: int, optional
                Lower Bounds of hidden layers of RNN used in RMDL. It will default to 1.
            max_hidden_layer_rnn: int, optional
                Upper Bounds of hidden layers of RNN used in RMDL. It will default to 5.
            min_nodes_rnn: int, optional
                Lower bounds of nodes (LSTM or GRU) in each layer of RNN used in RMDL. It will default to 128.
            max_nodes_rnn: int, optional
                Upper bounds of nodes (LSTM or GRU) in each layer of RNN used in RMDL. It will default to 512.
            min_hidden_layer_cnn: int, optional
                Lower Bounds of hidden layers of CNN used in RMDL. It will default to 3.
            max_hidden_layer_cnn: int, optional
                Upper Bounds of hidden layers of CNN used in RMDL. It will default to 10.
            min_nodes_cnn: int, optional
                Lower bounds of nodes (2D convolution layer) in each layer of CNN used in RMDL. It will default to 128.
            max_nodes_cnn: int, optional
                Upper bounds of nodes (2D convolution layer) in each layer of CNN used in RMDL. It will default to 512.
            random_state: int, optional
                Seed used by the random number generator. It will default to 42.
            random_optimizor: bool, optional
                If False, all models use the adam optimizer; if True, each model uses a randomly chosen optimizer. It will default to True.
            dropout: float, optional
                Float between 0 and 1; fraction of the units to drop for the linear transformation of the inputs. It will default to 0.5.
            dnn_l2, rnn_l2, cnn_l2: float, optional
                L2 regularization factor passed to the DNN, RNN and CNN model builders. They will default to 0, 0.01 and 0.01.
            use_cuda: bool, optional
                Use the CUDA-accelerated RNN implementation when available. It will default to True.
            use_bidirectional: bool, optional
                Wrap the recurrent layers in a bidirectional wrapper. It will default to True.
            lr: float, optional
                Learning rate passed to the optimizers. It will default to 1e-3.

        Returns
        -------
            history: list
                List of training history dictionaries for models used.
    """
    np.random.seed(random_state)
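    # A minimal (hypothetical) call, up-weighting a rare class and shrinking
    # the ensemble for a quick run:
    #   history = train(x_train, y_train, x_val, y_val,
    #                   class_weight={0: 1.0, 1: 2.5},
    #                   random_deep=[1, 1, 1], epochs=[5, 5, 5])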

    models_dir = "models"

    util.setup()

    history = []

    if isinstance(y_train, list):
        number_of_classes = len(set(y_train))
    else:
        # np.unique also covers ndarrays and other array-likes, so
        # number_of_classes is always defined
        number_of_classes = np.unique(y_train).shape[0]

    if not isinstance(y_train[0], list) and not isinstance(y_train[0], np.ndarray) \
            and not sparse_categorical and number_of_classes != 2:
        # check whether labels are already one-hot; otherwise the dense layer will give a shape error
        print("convert labels into one hot encoded representation")
        y_train = txt.get_one_hot_values(y_train)
        y_val = txt.get_one_hot_values(y_val)

    glove_needed = random_deep[1] != 0 or random_deep[2] != 0
    if glove_needed:
        if glove_dir == "":
            glove_dir = GloVe.download_and_extract()
        glove_filepath = os.path.join(glove_dir, glove_file)

        if not os.path.isfile(glove_filepath):
            print(f"Could not find {glove_filepath}; set the GloVe directory in Global.py")
            exit()

    all_text = np.concatenate((x_train, x_val))
    if random_deep[0] != 0:
        all_text_tf_idf = txt.get_tf_idf_vectors(all_text,
                                                 max_num_words=max_num_words)
        x_train_tf_idf = all_text_tf_idf[:len(x_train), ]
        x_val_tf_idf = all_text_tf_idf[len(x_train):, ]
    if random_deep[1] != 0 or random_deep[2] != 0:
        print(glove_filepath)
        all_text_tokenized, word_index = txt.tokenize(
            all_text, max_num_words=max_num_words, max_seq_len=max_seq_len)
        x_train_tokenized = all_text_tokenized[:len(x_train), ]
        x_val_tokenized = all_text_tokenized[len(x_train):, ]
        embeddings_index = txt.get_word_embedding_index(
            glove_filepath, word_index)

    del x_train
    del x_val
    gc.collect()

    i = 0
    while i < random_deep[0]:
        try:
            print(f"\nBuilding and Training DNN-{i}")
            model_DNN, model_tmp_DNN = BuildModel.Build_Model_DNN_Text(
                x_train_tf_idf.shape[1],
                number_of_classes,
                sparse_categorical,
                min_hidden_layer_dnn,
                max_hidden_layer_dnn,
                min_nodes_dnn,
                max_nodes_dnn,
                random_optimizor,
                dropout,
                _l2=dnn_l2,
                lr=lr)
            model_arch_file = f"DNN_{i}.json"
            model_weights_file = f"DNN_{i}.hdf5"
            model_json = model_tmp_DNN.to_json()
            with open(os.path.join(models_dir, model_arch_file),
                      "w") as model_json_file:
                model_json_file.write(model_json)
            checkpoint = ModelCheckpoint(os.path.join(models_dir,
                                                      model_weights_file),
                                         monitor='val_loss',
                                         verbose=1,
                                         save_best_only=True,
                                         save_weights_only=True,
                                         mode='min')
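            # Unlike the first two examples, this checkpoint tracks validation
            # loss (mode='min') and saves only the weights; the architecture
            # is kept in the JSON file written above.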
            model_history = model_DNN.fit(x_train_tf_idf,
                                          y_train,
                                          validation_data=(x_val_tf_idf,
                                                           y_val),
                                          epochs=epochs[0],
                                          batch_size=batch_size,
                                          callbacks=[checkpoint],
                                          verbose=2,
                                          class_weight=class_weight)
            history.append(model_history)
            i += 1
            del model_DNN
            gc.collect()
        except Exception as e:
            print(f"\nCheck the Error \n {e}")
            print(
                f"Error in DNN-{i} model trying to re-generate another model")
            if max_hidden_layer_dnn > 3:
                max_hidden_layer_dnn -= 1
            if max_nodes_dnn > 256:
                max_nodes_dnn -= 8

    del x_train_tf_idf
    del x_val_tf_idf
    gc.collect()

    i = 0
    while i < random_deep[1]:
        try:
            print(f"\nBuilding and Training RNN-{i}")
            model_RNN, model_tmp_RNN = BuildModel.Build_Model_RNN_Text(
                word_index,
                embeddings_index,
                number_of_classes,
                max_seq_len,
                embedding_dim,
                sparse_categorical,
                min_hidden_layer_rnn,
                max_hidden_layer_rnn,
                min_nodes_rnn,
                max_nodes_rnn,
                random_optimizor,
                dropout,
                _l2=rnn_l2,
                use_cuda=use_cuda,
                use_bidirectional=use_bidirectional,
                lr=lr)
            model_arch_file = f"RNN_{i}.json"
            model_weights_file = f"RNN_{i}.hdf5"
            model_json = model_tmp_RNN.to_json()
            with open(os.path.join(models_dir, model_arch_file),
                      "w") as model_json_file:
                model_json_file.write(model_json)
            checkpoint = ModelCheckpoint(os.path.join(models_dir,
                                                      model_weights_file),
                                         monitor='val_loss',
                                         verbose=1,
                                         save_best_only=True,
                                         save_weights_only=True,
                                         mode='min')
            model_history = model_RNN.fit(x_train_tokenized,
                                          y_train,
                                          validation_data=(x_val_tokenized,
                                                           y_val),
                                          epochs=epochs[1],
                                          batch_size=batch_size,
                                          callbacks=[checkpoint],
                                          verbose=2,
                                          class_weight=class_weight)
            history.append(model_history)
            i += 1
            del model_RNN
            gc.collect()
        except Exception as e:
            print(f"\nCheck the Error \n {e}")
            print(
                f"Error in RNN-{i} model trying to re-generate another model")
            if max_hidden_layer_rnn > 3:
                max_hidden_layer_rnn -= 1
            if max_nodes_rnn > 64:
                max_nodes_rnn -= 2

    gc.collect()

    i = 0
    while i < random_deep[2]:
        try:
            print(f"\nBuilding and Training CNN-{i}")
            model_CNN, model_tmp_CNN = BuildModel.Build_Model_CNN_Text(
                word_index,
                embeddings_index,
                number_of_classes,
                max_seq_len,
                embedding_dim,
                sparse_categorical,
                min_hidden_layer_cnn,
                max_hidden_layer_cnn,
                min_nodes_cnn,
                max_nodes_cnn,
                random_optimizor,
                dropout,
                _l2=cnn_l2,
                lr=lr)
            model_arch_file = f"CNN_{i}.json"
            model_weights_file = f"CNN_{i}.hdf5"
            model_json = model_tmp_CNN.to_json()
            with open(os.path.join(models_dir, model_arch_file),
                      "w") as model_json_file:
                model_json_file.write(model_json)
            checkpoint = ModelCheckpoint(os.path.join(models_dir,
                                                      model_weights_file),
                                         monitor='val_loss',
                                         verbose=1,
                                         save_best_only=True,
                                         save_weights_only=True,
                                         mode='min')
            model_history = model_CNN.fit(x_train_tokenized,
                                          y_train,
                                          validation_data=(x_val_tokenized,
                                                           y_val),
                                          epochs=epochs[2],
                                          batch_size=batch_size,
                                          callbacks=[checkpoint],
                                          verbose=2,
                                          class_weight=class_weight)
            history.append(model_history)
            i += 1
            del model_CNN
            gc.collect()
        except Exception as e:
            print(f"\nCheck the Error \n {e}")
            print(
                f"Error in CNN-{i} model trying to re-generate another model")
            if max_hidden_layer_cnn > 5:
                max_hidden_layer_cnn -= 1
            if max_nodes_cnn > 128:
                max_nodes_cnn -= 2
                min_nodes_cnn -= 1

    if plot:
        plt.plot_history(history)
    return history
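Since this version saves each model as an architecture JSON plus a weights HDF5 under models/, a trained model can be restored later. A minimal sketch, assuming the standard Keras API and the DNN_0 file names written above (the compile settings assume sparse integer labels):

import os
from keras.models import model_from_json

models_dir = "models"
with open(os.path.join(models_dir, "DNN_0.json")) as model_json_file:
    model = model_from_json(model_json_file.read())
model.load_weights(os.path.join(models_dir, "DNN_0.hdf5"))
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])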