# Shared imports for the train() examples below. These assume an older Keras
# API (1.x-style nb_words / nb_epoch arguments, as used throughout); helpers
# such as read_input_data, read_glove_vectors, score_list, train_data and the
# model_selector* functions come from the surrounding project.
import os
import time

import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn import metrics

from reader.filereader import read_glove_vectors, read_input_data


def train(args):
    print('Loading saved model .....................')
    model = load_model('model/renew.en.msd.weights.best.hdf5')

    #print('Reading word vectors.')
    #embeddings_index = read_glove_vectors(args.embedding_file_path)
    #print('Found {} word vectors.'.format(len(embeddings_index)))

    print('Processing input data')
    texts, labels = read_input_data(args.data_dir)
    # texts - list of text samples
    # labels - list of label ids
    print('Found {} texts.'.format(len(texts)))

    # Vectorize the text sample into 2D integer tensor
    tokenizer = Tokenizer(nb_words=args.nb_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found {} unique tokens.'.format(len(word_index)))

    data = pad_sequences(sequences, maxlen=args.max_sequence_len)

    # Store each padded sequence as a single hashable value (a tuple) so it can
    # serve as the groupby key below; assigning df.columns = ['data'] to a
    # DataFrame with one column per timestep would fail.
    mono, swap, disc = score_list('data/en_msd_score')
    df = pd.DataFrame({'data': [tuple(row) for row in data.tolist()]})
    df['mono'] = mono
    df['swap'] = swap
    df['disc'] = disc

    rules = df.groupby(['data'], as_index=False).mean()
    df.to_csv('dataframe', sep='\t', encoding='utf-8', index=False)
    print(rules)

# Example #2

def train(args):
    print('Reading word vectors.')
    embeddings_index = read_glove_vectors(args.embedding_file_path)
    print('Found {} word vectors.'.format(len(embeddings_index)))

    print('Processing input data')
    texts, labels = read_input_data(args.data_dir)
    # texts - list of text samples
    # labels - list of label ids
    print('Found {} texts.'.format(len(texts)))

    # Vectorize the text sample into 2D integer tensor
    tokenizer = Tokenizer(nb_words=args.nb_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index

    print('Found {} unique tokens.'.format(len(word_index)))

    data = pad_sequences(sequences, maxlen=args.max_sequence_len)
    x_train, y_train = train_data(data, labels)
    print(type(data))
    print(x_train[100])
    x_train = np.array(x_train).astype('int32')
    print(x_train[100])

    # Transform labels to be categorical variables
    labels = to_categorical(np.asarray(labels))
    y_train = to_categorical(np.asarray(y_train))
    print('Shape of total data tensor:', data.shape)
    print('Shape of total label tensor:', labels.shape)

    # Shuffle the training pairs; the (shuffled) full dataset is then reused
    # below as the validation set rather than holding out a separate split.
    indices = np.arange(x_train.shape[0])
    np.random.shuffle(indices)
    x_train = x_train[indices]
    y_train = y_train[indices]

    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    x_val = data[indices]
    y_val = labels[indices]

    print('Preparing embedding matrix.')

    # initiate embedding matrix with zero vectors.
    nb_words = min(args.nb_words, len(word_index))
    embedding_matrix = np.zeros((nb_words + 1, args.embedding_dim))
    for word, i in word_index.items():
        if i > nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    args.nb_words = nb_words
    args.len_labels_index = 3

    model = model_selector(args, embedding_matrix)
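    # Note (assumption, not part of the original): model_selector() is a
    # project helper; a typical way it would consume embedding_matrix is a
    # frozen Keras Embedding layer, e.g.
    #   Embedding(input_dim=nb_words + 1, output_dim=args.embedding_dim,
    #             weights=[embedding_matrix],
    #             input_length=args.max_sequence_len, trainable=False)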

    checkpoint_filepath = os.path.join(args.model_dir, "new.en.msd.weights.best.hdf5")
    checkpoint = ModelCheckpoint(checkpoint_filepath, monitor='val_acc', verbose=1, save_best_only=True)
    callbacks_list = [checkpoint]
    model_json = model.to_json()
    with open(os.path.join(args.model_dir, "new.en.msd.model.json"), "w") as json_file:
        json_file.write(model_json)

    model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=args.num_epochs, batch_size=args.batch_size, callbacks=callbacks_list, verbose=1)
    proba = model.predict_proba(data, batch_size=300)
    np.savetxt('new_en_msd', proba, delimiter='\t', fmt='%.6f')
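

# A minimal sketch (not part of the original example) of how the architecture
# and best weights saved above could be restored later for inference; the
# compile arguments are assumptions and should match whatever model_selector()
# used during training.
def load_trained_model(model_dir):
    from keras.models import model_from_json
    with open(os.path.join(model_dir, "new.en.msd.model.json")) as json_file:
        model = model_from_json(json_file.read())
    model.load_weights(os.path.join(model_dir, "new.en.msd.weights.best.hdf5"))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model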

# Example #3

def train(args):
    print('Reading word vectors.')
    #embeddings_index = read_glove_vectors(args.embedding_file_path)

    embeddings_index = read_glove_vectors(
        "/home/duong/Desktop/CNN-Sentence-Classifier/app/GoogleNews-vectors-negative300.txt"
    )
    embeddings_index2 = read_glove_vectors(
        "/home/duong/Desktop/CNN-Sentence-Classifier/app/glove2.txt")
    print('Found {} word vectors in embedding2.'.format(
        len(embeddings_index2)))

    print('Processing input data')
    #texts, labels_index, labels = read_input_data(args.data_dir)

    input_name = [
        "input_CR_prccd.txt", "input_Sub_prccd.txt", "input_MPQA_prccd.txt",
        "inputPCQM_prccd.txt", "input_flood_phi_prccd.txt",
        "input_flood_colorado_prccd.txt", "input_flood_qeen_prccd.txt",
        "input_flood_manila_prccd.txt", "input_fire_australia_prccd.txt",
        "input_earthquake_chile_prccd.txt"
    ]
    label_name = [
        "label_CR.txt", "label_input_Sub.txt", "label_MPQA.txt",
        "labelPCQM.txt", "label_flood_phi.txt", "label_flood_colorado.txt",
        "label_flood_qeen.txt", "label_flood_manila.txt",
        "label_fire_australia.txt", "label_earthquake_chile.txt"
    ]

    with open("11Janlan1_Train3_CV50_w2v_Glove2_cnn3xStatic.txt",
              'wb') as result_CV:
        for list in range(0, 10):
            texts, labels_index, labels, textsPCQ, labels_indexPCQ, labelsPCQ, \
            textsVali, labels_indexVali, labelsVali = read_input_data(args.data_dir,input_name[list],label_name[list])

            # texts - list of text samples
            # labels_index - dictionary mapping label name to numeric id
            # labels - list of label ids
            print('Found {} texts.'.format(len(textsPCQ)))

            # Vectorize the text sample into 2D integer tensor
            tokenizer = Tokenizer(nb_words=args.nb_words)
            tokenizer.fit_on_texts(textsPCQ)
            sequences = tokenizer.texts_to_sequences(textsPCQ)
            word_index = tokenizer.word_index
            print('Found {} unique tokens.'.format(len(word_index)))

            data = pad_sequences(sequences, maxlen=args.max_sequence_len)

            # Transform labels to be categorical variables
            labelsPCQ = to_categorical(np.asarray(labelsPCQ))
            print('Shape of data tensor:', data.shape)
            print('Shape of label tensor:', labelsPCQ.shape)

            # Info of the validation dataset
            print('Found {} validation texts.'.format(len(textsVali)))

            # Vectorize the text sample into 2D integer tensor
            tokenizerVali = Tokenizer(nb_words=args.nb_words)
            tokenizerVali.fit_on_texts(textsVali)
            sequencesVali = tokenizerVali.texts_to_sequences(textsVali)
            word_indexVali = tokenizerVali.word_index
            print('Found {} unique tokens in Vali.'.format(
                len(word_indexVali)))

            dataVali = pad_sequences(sequencesVali,
                                     maxlen=args.max_sequence_len)

            # Transform labels to be categorical variables
            labelsVali = to_categorical(np.asarray(labelsVali))
            print('Shape of data tensor in Vali:', dataVali.shape)
            print('Shape of label tensor in Vali:', labelsVali.shape)

            #split the input data into training set and validation set
            indices = np.arange(data.shape[0])
            np.random.shuffle(indices)
            data = data[indices]
            labelsPCQ = labelsPCQ[indices]
            # nb_validation_samples = int(args.validation_split * data.shape[0])
            #
            # x_train = data[:-nb_validation_samples]
            # y_train = labelsPCQ[:-nb_validation_samples]
            # x_val = data[-nb_validation_samples:]
            # y_val = labelsPCQ[-nb_validation_samples:]

            # indices_train = np.arange(data.shape[0])
            # np.random.shuffle(indices_train)
            # data = data[indices_train]
            # labelsPCQ = labelsPCQ[indices_train]
            #
            # indicesVali = np.arange(dataVali.shape[0])
            # np.random.shuffle(indicesVali)
            # dataVali = dataVali[indicesVali]
            # labelsVali = labelsVali[indicesVali]
            #
            # x_train = data
            # y_train = labelsPCQ
            # x_val = dataVali
            # y_val = labelsVali

            print('Preparing embedding matrix.')

            # initiate embedding matrix with zero vectors for embedding1.
            nb_words = min(args.nb_words, len(word_index))
            embedding_matrix = np.zeros((nb_words + 1, args.embedding_dim))
            for word, i in word_index.items():
                if i > nb_words:
                    continue
                embedding_vector = embeddings_index.get(word)
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
            args.nb_words = nb_words
            args.len_labels_index = len(labels_index)

            # initiate embedding matrix with zero vectors for embedding2.
            nb_words2 = min(args.nb_words, len(word_index))
            embedding_matrix2 = np.zeros(
                (nb_words2 + 1, args.embedding_dim2))  #+100
            for word, i in word_index.items():
                if i > nb_words2:
                    continue
                embedding_vector2 = embeddings_index2.get(word)
                if embedding_vector2 is not None:
                    embedding_matrix2[i] = embedding_vector2
            args.nb_words = nb_words
            args.len_labels_index = len(labels_index)
            # Remember to uncomment the model variant that matches the
            # model.fit call below.
            #model = model_selector(args, embedding_matrix)
            #model = model_selector2(args, embedding_matrix, embedding_matrix2)
            model = model_selectorBoth(args, embedding_matrix,
                                       embedding_matrix2)

            print(args)
            cv_scores = []
            ROC_scores = []
            fold = 10

            for i in range(0, fold):
                print("\n")
                print("\n")
                print("\n")
                print("-------------FOLD :", (i + 1))
                # Use floor division so the fold boundaries are valid integer
                # slice indices.
                window_data = data.shape[0] // fold
                #   Generate batches from indices
                x_train1 = data[:i * window_data]
                x_train2 = data[(i + 1) * window_data:]

                y_train1 = labelsPCQ[:i * window_data]
                y_train2 = labelsPCQ[(i + 1) * window_data:]

                if i == 0:
                    x_trainAll = x_train2
                    y_trainAll = y_train2
                else:
                    x_trainAll = np.concatenate((x_train1, x_train2), axis=0)
                    y_trainAll = np.concatenate((y_train1, y_train2), axis=0)

                x_val = data[i * window_data:(i + 1) * window_data]
                y_val = labelsPCQ[i * window_data:(i + 1) * window_data]

                indices_ = np.arange(x_trainAll.shape[0])
                np.random.shuffle(indices_)
                x_train = x_trainAll[indices_]
                y_train = y_trainAll[indices_]
                nb_validation_samples = int(args.validation_split *
                                            x_train.shape[0])

                # Carve out the dev split before truncating the training set,
                # otherwise x_dev/y_dev would overlap the remaining x_train.
                x_dev = x_train[-nb_validation_samples:]
                y_dev = y_train[-nb_validation_samples:]
                x_train = x_train[:-nb_validation_samples]
                y_train = y_train[:-nb_validation_samples]

                # Re-initialize the model from scratch for each fold.
                model = None
                model = model_selectorBoth(args, embedding_matrix,
                                           embedding_matrix2)
                # checkpoint_filepath = os.path.join(args.model_dir, "weights.best.hdf5")
                # # checkpoint = ModelCheckpoint(checkpoint_filepath, monitor='val_loss',
                # #                              verbose=1, save_best_only=True)
                # # callbacks_list = [checkpoint]
                #
                # earlystopper = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
                # checkpointer = ModelCheckpoint(checkpoint_filepath, monitor='val_loss', verbose=1, save_best_only=True)
                # callbacks_list = [earlystopper, checkpointer]
                # model_json = model.to_json()
                # with open(os.path.join(args.model_dir, "model.json"), "w") as json_file:
                #     json_file.write(model_json)
                #
                #
                # #model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=args.num_epochs,
                # #          batch_size=args.batch_size, callbacks=callbacks_list)
                # model.fit([x_train, x_train], y_train, validation_data=([x_val, x_val], y_val), nb_epoch=args.num_epochs,
                #           batch_size=args.batch_size, callbacks=callbacks_list)
                model.fit([x_train, x_train, x_train, x_train],
                          y_train,
                          epochs=args.num_epochs,
                          batch_size=args.batch_size,
                          verbose=0)
                # print("Test model ...")
                # print("Loading ...", checkpoint_filepath)
                # model.load_weights(checkpoint_filepath)
                y_prob = model.predict([x_val, x_val, x_val, x_val])

                roc = metrics.roc_auc_score(y_val, y_prob)
                print("ROC Prediction (binary classification):", roc)
                result_CV.write("roc: %.2f%%" % roc)
                scores = model.evaluate([x_val, x_val, x_val, x_val],
                                        y_val,
                                        verbose=0)
                print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
                result_CV.write("acc: %.2f%%" % (scores[1] * 100))
                cv_scores.append(scores[1] * 100)
                ROC_scores.append(roc * 100)

            result_CV.write(time.asctime(time.localtime(time.time())) + '\n')
            print(input_name[dataset_idx])
            print("ACC: %.2f%% (+/- %.2f%%)" %
                  (np.mean(cv_scores), np.std(cv_scores)))
            print("ROC: %.2f%% (+/- %.2f%%)" %
                  (np.mean(ROC_scores), np.std(ROC_scores)))
            result_CV.write(input_name[dataset_idx] + " ACC: %.2f%% (+/- %.2f%%)" %
                            (np.mean(cv_scores), np.std(cv_scores)) +
                            " ROC: %.2f%% (+/- %.2f%%)" %
                            (np.mean(ROC_scores), np.std(ROC_scores)) + '\n')
            result_CV.write(time.asctime(time.localtime(time.time())) + '\n')
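

# A minimal sketch (an assumption, not part of the original) of how the manual
# window slicing in the fold loop above could be expressed with scikit-learn's
# KFold; `data` and `labelsPCQ` stand for the padded sequences and one-hot
# labels prepared earlier in the loop body.
from sklearn.model_selection import KFold

def iterate_folds(data, labelsPCQ, n_splits=10):
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for train_idx, val_idx in kfold.split(data):
        # Each iteration yields one training fold and one held-out fold.
        yield data[train_idx], labelsPCQ[train_idx], data[val_idx], labelsPCQ[val_idx]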

# Example #4

def train(args):
    print('Reading word vectors.')
    #embeddings_index = read_glove_vectors(args.embedding_file_path)

    embeddings_index = read_glove_vectors("/home/duong/Desktop/CNN-Sentence-Classifier/app/GoogleNews-vectors-negative300.txt")
    embeddings_index2 = read_glove_vectors("/home/duong/Desktop/CNN-Sentence-Classifier/app/glove.txt")
    print('Found {} word vectors in embedding2.'.format(len(embeddings_index2)))

    print('Processing input data')
    #texts, labels_index, labels = read_input_data(args.data_dir)
    texts, labels_index, labels, textsPCQ, labels_indexPCQ, labelsPCQ, \
    textsVali, labels_indexVali, labelsVali = read_input_data(args.data_dir)

    # texts - list of text samples
    # labels_index - dictionary mapping label name to numeric id
    # labels - list of label ids
    print('Found {} texts.'.format(len(textsPCQ)))

    # Vectorize the text sample into 2D integer tensor
    tokenizer = Tokenizer(nb_words=args.nb_words)
    tokenizer.fit_on_texts(textsPCQ)
    sequences = tokenizer.texts_to_sequences(textsPCQ)
    word_index = tokenizer.word_index
    print('Found {} unique tokens.'.format(len(word_index)))

    data = pad_sequences(sequences, maxlen=args.max_sequence_len)

    # Transform labels to be categorical variables
    labelsPCQ = to_categorical(np.asarray(labelsPCQ))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labelsPCQ.shape)

    # Info of the validation dataset
    print('Found {} validation texts.'.format(len(textsVali)))

    # Vectorize the text sample into 2D integer tensor
    tokenizerVali = Tokenizer(nb_words=args.nb_words)
    tokenizerVali.fit_on_texts(textsVali)
    sequencesVali = tokenizerVali.texts_to_sequences(textsVali)
    word_indexVali = tokenizerVali.word_index
    print('Found {} unique tokens in Vali.'.format(len(word_indexVali)))

    dataVali = pad_sequences(sequencesVali, maxlen=args.max_sequence_len)

    # Transform labels to be categorical variables
    labelsVali = to_categorical(np.asarray(labelsVali))
    print('Shape of data tensor in Vali:', dataVali.shape)
    print('Shape of label tensor in Vali:', labelsVali.shape)

    #split the input data into training set and validation set
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labelsPCQ = labelsPCQ[indices]
    nb_validation_samples = int(args.validation_split * data.shape[0])

    x_train = data[:-nb_validation_samples]
    y_train = labelsPCQ[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labelsPCQ[-nb_validation_samples:]

    # indices_train = np.arange(data.shape[0])
    # np.random.shuffle(indices_train)
    # data = data[indices_train]
    # labelsPCQ = labelsPCQ[indices_train]
    #
    # indicesVali = np.arange(dataVali.shape[0])
    # np.random.shuffle(indicesVali)
    # dataVali = dataVali[indicesVali]
    # labelsVali = labelsVali[indicesVali]
    #
    # x_train = data
    # y_train = labelsPCQ
    # x_val = dataVali
    # y_val = labelsVali

    print('Preparing embedding matrix.')

    # initiate embedding matrix with zero vectors for embedding1.
    nb_words = min(args.nb_words, len(word_index))
    embedding_matrix = np.zeros((nb_words + 1, args.embedding_dim))
    for word, i in word_index.items():
        if i > nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    args.nb_words = nb_words
    args.len_labels_index = len(labels_index)

    # initiate embedding matrix with zero vectors for embedding2.
    nb_words2 = min(args.nb_words, len(word_index))
    embedding_matrix2 = np.zeros((nb_words2 + 1, args.embedding_dim2))#+100
    for word, i in word_index.items():
        if i > nb_words2:
            continue
        embedding_vector2 = embeddings_index2.get(word)
        if embedding_vector2 is not None:
            embedding_matrix2[i] = embedding_vector2
    args.nb_words = nb_words
    args.len_labels_index = len(labels_index)

    # Remember to uncomment the model variant that matches the model.fit call below.
    #model = model_selector(args, embedding_matrix)
    #model = model_selector2(args, embedding_matrix, embedding_matrix2)
    model = model_selectorBoth(args, embedding_matrix, embedding_matrix2)

    checkpoint_filepath = os.path.join(args.model_dir, "weights.best.hdf5")
    # checkpoint = ModelCheckpoint(checkpoint_filepath, monitor='val_loss',
    #                              verbose=1, save_best_only=True)
    # callbacks_list = [checkpoint]

    earlystopper = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
    checkpointer = ModelCheckpoint(checkpoint_filepath, monitor='val_loss', verbose=1, save_best_only=True)
    callbacks_list = [earlystopper, checkpointer]
    model_json = model.to_json()
    with open(os.path.join(args.model_dir, "model.json"), "w") as json_file:
        json_file.write(model_json)


    #model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=args.num_epochs,
    #          batch_size=args.batch_size, callbacks=callbacks_list)
    model.fit([x_train, x_train], y_train, validation_data=([x_val, x_val], y_val), nb_epoch=args.num_epochs,
              batch_size=args.batch_size, callbacks=callbacks_list)
    #model.fit([x_train, x_train, x_train], y_train, validation_data=([x_val, x_val, x_val], y_val), nb_epoch=args.num_epochs,
    #          batch_size=args.batch_size, callbacks=callbacks_list)
    print("Test model ...")
    print("Loading ...", checkpoint_filepath)
    model.load_weights(checkpoint_filepath)
    y_prob = model.predict([x_val, x_val])
    roc = metrics.roc_auc_score(y_val, y_prob)
    print("ROC Prediction (binary classification):", roc)

# Example #5

from reader.filereader import read_glove_vectors, read_input_data

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

import numpy as np

# load data
texts, labels_index, labels = read_input_data("../data")
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2)

tfidf_vec = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), norm='l2')  # token_pattern=r'\b\w+\b'

text_clf = Pipeline([
    ('tfvec', tfidf_vec),
    #('clf', LinearSVC(C=0.9))
    #('clf', KNeighborsClassifier(n_neighbors=7))
    #('clf', MultinomialNB(alpha=1.8)),
    #('clf', LogisticRegression(C=3.1, class_weight='balanced')),
    #('clf', RandomForestClassifier(n_estimators=100, class_weight='balanced_subsample'))
    ('clf', AdaBoostClassifier(n_estimators=100)),
])

# Comment or uncomment the parameters you want to tune; not all of them are
# listed here yet. A hedged sketch of such a grid follows below.
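# A minimal sketch (an assumption; the original grid was not included in this
# listing) of what a parameter grid for the pipeline above might look like.
# The specific values are illustrative only.
param_grid = {
    'tfvec__ngram_range': [(1, 1), (1, 2)],
    'clf__n_estimators': [50, 100, 200],
    'clf__learning_rate': [0.5, 1.0],
}

grid = GridSearchCV(text_clf, param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
print("Best params:", grid.best_params_)
print("Test accuracy: {:.3f}".format(grid.score(X_test, y_test)))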