Example #1
import os

from keras.callbacks import CSVLogger, EarlyStopping
from keras.preprocessing import sequence
from sklearn.model_selection import StratifiedShuffleSplit

# load_data, load_label, load_topn_words, filter_labeled_data, load_embedding,
# EmbeddingType, get_wordvectors_from_keyedvectors, transform_sequence,
# transform_sequence_using_topn, transform_label, transform_labeled_data_listform,
# random_oversampling, ModelType, SIMPLE_RNN, LSTM_RNN, GRU_RNN and writer are
# project-local helpers assumed to be importable from the surrounding package.


def read_train_eval(testid, preprocess, maxseq, modelType, encodeTime, dropout,
                    earlyStop, seedNum, batchSize, maxEpoch, topn):
    '''Train a single-input RNN on labeled session data and write the
    evaluation results to disk.

    :param testid: identifier for this training run
    :param preprocess: whether the sequences are stemmed or not
    :param maxseq: maximum sequence length (longer sequences are truncated,
        shorter ones padded)
    :param modelType: one of SIMPLE_RNN | LSTM_RNN | GRU_RNN
    :param encodeTime: whether time information is encoded into the sequences
    :param dropout: dropout rate applied in the RNN model
    :param earlyStop: whether training stops early once validation loss
        stops improving
    :param seedNum: random seed used for splitting, oversampling, and
        embedding initialization
    :param batchSize: mini-batch size for training and prediction
    :param maxEpoch: maximum number of training epochs
    :param topn: whether only the top-N most frequent word tokens are used
        when transforming sequences
    :return: None; evaluation results are written to ``results.txt`` in the
        run's log directory
    '''
    N = 1000  # size of the top-N vocabulary used when topn is enabled
    TRAIN_INSTANCE_DIR = os.path.join(
        'log',
        '{}_{}_{}_{}_{}_{}_{}_{}_{}'.format(testid, preprocess, maxseq,
                                            modelType, dropout, earlyStop,
                                            seedNum, batchSize, maxEpoch))

    if not os.path.isdir(TRAIN_INSTANCE_DIR):
        os.makedirs(TRAIN_INSTANCE_DIR)  # also create 'log/' if it is missing
    log_csvfile = os.path.join(TRAIN_INSTANCE_DIR, 'log.csv')
    result_file = os.path.join(TRAIN_INSTANCE_DIR, 'results.txt')

    print('Load data')
    session_data = load_data(preprocess=preprocess,
                             maxseq=maxseq,
                             encodeTime=encodeTime)
    label_data = load_label()
    topN_words = load_topn_words(session_data, N)

    sequences, labels = filter_labeled_data(session_data, label_data)

    print('Load embedding')
    if preprocess:
        w2v_model = load_embedding(embeddingType=EmbeddingType.PRE_ALL)
    else:
        w2v_model = load_embedding(embeddingType=EmbeddingType.NOPRE_ALL)

    print('Pre-processing sequences')
    print(' - Get word vectors')
    vocab_size, embedding_dim, word_indices, embedding_matrix = \
        get_wordvectors_from_keyedvectors(w2v_model, seed=seedNum)

    print(' - Transform sequences')
    if not topn:
        transformed_seq = transform_sequence(sequences,
                                             word_indices=word_indices)
    else:
        transformed_seq = transform_sequence_using_topn(
            sequences, word_indices, w2v_model, topN_words)

    print(' - Transform labels')
    transformed_labels = transform_label(label_data)
    print(' - Transform seq data to list')
    X, y = transform_labeled_data_listform(transformed_seq, transformed_labels)

    # stratified 80/20 split that preserves the label distribution
    sss = StratifiedShuffleSplit(n_splits=1,
                                 test_size=0.2,
                                 random_state=seedNum)
    train_index, test_index = next(sss.split(X, y))

    X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
    y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]

    # balance classes by random oversampling; note that the held-out split is
    # resampled as well, so both sets share a balanced class distribution
    X_train, y_train = random_oversampling(X_train, y_train, seed=seedNum)
    X_test, y_test = random_oversampling(X_test, y_test, seed=seedNum)

    X_train = sequence.pad_sequences(X_train, maxlen=maxseq)
    X_test = sequence.pad_sequences(X_test, maxlen=maxseq)

    # per-epoch metrics go to log.csv; optionally stop once val_loss plateaus
    list_callbacks = [CSVLogger(log_csvfile, separator=',', append=False)]
    if earlyStop:
        earlyStopping = EarlyStopping(monitor='val_loss',
                                      patience=10,
                                      verbose=1,
                                      mode='auto')
        list_callbacks.append(earlyStopping)

    # the three architectures share the same fit/predict code path,
    # so dispatch on the model type instead of duplicating it
    model_classes = {
        ModelType.SIMPLE_RNN: SIMPLE_RNN,
        ModelType.LSTM_RNN: LSTM_RNN,
        ModelType.GRU_RNN: GRU_RNN,
    }
    if modelType not in model_classes:
        print('This function only supports the single-input model types '
              '(SIMPLE_RNN | LSTM_RNN | GRU_RNN).')
        exit()

    model = model_classes[modelType](vocab_size=vocab_size,
                                     maxlen=maxseq,
                                     dropout=dropout,
                                     embedding=embedding_matrix,
                                     embedding_dim=embedding_dim)()
    model.fit({'text': X_train},
              y_train,
              validation_data=({'text': X_test}, y_test),
              batch_size=batchSize,
              epochs=maxEpoch,
              verbose=1,
              callbacks=list_callbacks)
    y_pred = model.predict({'text': X_test},
                           batch_size=batchSize,
                           verbose=1)

    print('Evaluation...')
    with open(result_file, 'wt') as f:
        writer.eval(y_pred, y_test, file=f)
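
A minimal sketch of how read_train_eval might be invoked. Every argument value below (including the choice of ModelType.LSTM_RNN) is an illustrative assumption, not a setting taken from the original project:

# hypothetical invocation -- all values are assumptions for illustration
read_train_eval(testid='run01',
                preprocess=True,
                maxseq=200,
                modelType=ModelType.LSTM_RNN,
                encodeTime=False,
                dropout=0.5,
                earlyStop=True,
                seedNum=42,
                batchSize=32,
                maxEpoch=100,
                topn=False)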
Example #2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# "preprocessing" below is the project-local module that provides the
# random_oversampling / random_undersampling helpers (not sklearn.preprocessing);
# datasets, q_fasttext, q_tfidf and documentation_file_modelopt are defined
# earlier in the original script.

# split the data into training and test sets
d_train, d_test, q_train, q_test = train_test_split(datasets,
                                                    q_fasttext,
                                                    test_size=0.2)

# encode labels with LabelEncoder (imblearn does not support multi-class, multi-label classification)
label_encoder = LabelEncoder()
label_encoder.fit(datasets)
d_train_encoded = label_encoder.transform(d_train)

sampling_strategies = []
q_train_notsampled, d_train_notsampled = q_train, d_train_encoded
triple_notsampled = (q_train_notsampled, d_train_notsampled, "No Sampling")
sampling_strategies.append(triple_notsampled)
q_train_oversampled, d_train_oversampled = preprocessing.random_oversampling(
    q_train, d_train_encoded)
triple_oversampled = (q_train_oversampled, d_train_oversampled,
                      "Random Oversampling")
sampling_strategies.append(triple_oversampled)
q_train_undersampled, d_train_undersampled = preprocessing.random_undersampling(
    q_train, d_train_encoded)
triple_undersampled = (q_train_undersampled, d_train_undersampled,
                       "Random Undersampling")
sampling_strategies.append(triple_undersampled)

for q_train_sample, d_train_sample, name in sampling_strategies:
    documentation_file_modelopt.write(str(name) + "\n")
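
The loop body in this excerpt ends after logging the strategy name; presumably each resampled training set is then used to fit and evaluate a classifier. A minimal sketch of that step, where LinearSVC and macro-F1 are illustrative assumptions rather than the project's actual model and metric:

from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

for q_train_sample, d_train_sample, name in sampling_strategies:
    documentation_file_modelopt.write(str(name) + "\n")
    clf = LinearSVC()  # illustrative classifier, not the project's model
    clf.fit(q_train_sample, d_train_sample)
    d_pred = clf.predict(q_test)
    # d_test holds raw labels, so encode them before scoring
    score = f1_score(label_encoder.transform(d_test), d_pred, average='macro')
    documentation_file_modelopt.write("macro-F1: {:.3f}\n".format(score))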
# split the data into training and test sets (this time with tf-idf features)
d_train, d_test, q_train, q_test = train_test_split(datasets,
                                                    q_tfidf,
                                                    test_size=0.2)

# encode labels with LabelEncoder (imblearn does not support multi-class, multi-label classification)
label_encoder = LabelEncoder()
label_encoder.fit(datasets)
d_train_binarized = label_encoder.transform(d_train)

sampling_strategies = []
q_train_notsampled, d_train_notsampled = q_train, d_train_binarized
triple_notsampled = (q_train_notsampled, d_train_notsampled, "No Sampling")
sampling_strategies.append(triple_notsampled)
q_train_oversampled, d_train_oversampled = preprocessing.random_oversampling(
    q_train, d_train_binarized)
triple_oversampled = (q_train_oversampled, d_train_oversampled,
                      "Random Oversampling")
sampling_strategies.append(triple_oversampled)
q_train_undersampled, d_train_undersampled = preprocessing.random_undersampling(
    q_train, d_train_binarized)
triple_undersampled = (q_train_undersampled, d_train_undersampled,
                       "Random Undersampling")
sampling_strategies.append(triple_undersampled)

for q_train_sample, d_train_sample, name in sampling_strategies:
    documentation_file_modelopt.write(str(name) + "\n")
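
preprocessing.random_oversampling and preprocessing.random_undersampling are project-local helpers. A minimal sketch of what they plausibly wrap, assuming they delegate to imbalanced-learn's samplers (the actual project may implement resampling differently):

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

def random_oversampling(X, y, seed=None):
    # duplicate minority-class samples until every class matches the majority
    return RandomOverSampler(random_state=seed).fit_resample(X, y)

def random_undersampling(X, y, seed=None):
    # drop majority-class samples until every class matches the minority
    return RandomUnderSampler(random_state=seed).fit_resample(X, y)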