Beispiel #1
0
def asap_cv_cnn_multi():
    """10-fold cross-validation of the multi-input CNN on ASAP-2.

    Each fold feeds three parallel channels (word tokens, POS tags and
    dependency labels) into cnn_multi_selfembd, collects the per-fold
    quadratic weighted kappa, and prints the mean kappa at the end.
    """
    maxlen = 75      # tokens kept per essay
    nb_words = 4500  # word-channel vocabulary size
    embd_dim = 50    # word-channel embedding width
    nb_pos = 15      # POS-channel vocabulary size

    fold_ids = range(1, 11)
    word_tr = ['data/asap2/train%d.csv' % i for i in fold_ids]
    word_te = ['data/asap2/test%d.csv' % i for i in fold_ids]
    pos_tr = ['data/asap2/pos/train%d_pos.csv' % i for i in fold_ids]
    pos_te = ['data/asap2/pos/test%d_pos.csv' % i for i in fold_ids]
    dp_tr = ['data/asap2/dp/train%d_dp.csv' % i for i in fold_ids]
    dp_te = ['data/asap2/dp/test%d_dp.csv' % i for i in fold_ids]

    kappas = []
    for train, test, pos_a, pos_b, dp_a, dp_b in zip(word_tr, word_te,
                                                     pos_tr, pos_te,
                                                     dp_tr, dp_te):
        print(train + '=>' + test)
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(
            train, test, nb_words, maxlen, embd_type='self', w2v=None)
        pos_train, _, pos_test, _, _ = load_csvs(
            pos_a, pos_b, nb_pos, maxlen, embd_type='self', w2v=None)
        # NOTE(review): the dependency channel is capped with nb_words
        # (4500) rather than a dp-specific vocabulary size -- confirm
        # this is intended.
        dp_train, _, dp_test, _, _ = load_csvs(
            dp_a, dp_b, nb_words, maxlen, embd_type='self', w2v=None)

        kappas.append(cnn_multi_selfembd(X_train, Y_train, X_test, Y_test,
                                         nb_classes, maxlen, nb_words,
                                         embd_dim,
                                         pos_train, pos_test, 10,
                                         dp_train, dp_test, 40,
                                         50, 32, 30, 'rmsprop'))

    kappa_cv = metrics.mean_quadratic_weighted_kappa(kappas)
    print('after 10-fold cv:' + str(kappa_cv))
Beispiel #2
0
def argu_cv():
    """10-fold cross-validation of the 1-D CNN on the Argu corpus.

    Fold ids are the corpus' VC-prefixed split names; the mean test
    accuracy over all folds is printed.
    """
    maxlen = 40      # tokens kept per document
    nb_words = 8000  # vocabulary size
    embd_dim = 100   # embedding width

    fold_ids = ('VC048263', 'VC048408', 'VC084849', 'VC084851', 'VC084853',
                'VC101537', 'VC101541', 'VC140094', 'VC207640', 'VC248479')

    accs = []
    for fid in fold_ids:
        train = 'data/Argu/csv/generic_' + str(fid) + '_training.csv'
        test = 'data/Argu/csv/generic_' + str(fid) + '_testing.csv'
        print(train + '=>' + test)
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(
            train, test, nb_words, maxlen, embd_type='self', w2v=None)
        accs.append(cnn1d_selfembd(X_train, Y_train, X_test, Y_test,
                                   nb_classes, maxlen, nb_words, embd_dim,
                                   100, 5, 50, 20, 'rmsprop'))

    acc_cv = np.mean(accs)
    print('after 10-fold cv:' + str(acc_cv))
Beispiel #3
0
def tpo_cv_cnn_other():
    """10-fold cross-validation of cnn_other on the TPO v4 corpus.

    The model consumes only the auxiliary features from the *_other.csv
    files (labels still come from the main csvs); the mean test accuracy
    over all folds is printed.
    """
    maxlen = 200
    nb_words = 6500
    filter_size = 20
    k = 4  # column count of the _other.csv feature files

    accs = []
    for fold in range(1, 11):
        train = 'data/tpov4/train_' + str(fold) + '.csv'
        test = 'data/tpov4/test_' + str(fold) + '.csv'
        ta_other = 'data/tpov4/train_' + str(fold) + '_other.csv'
        ts_other = 'data/tpov4/test_' + str(fold) + '_other.csv'
        print(train + '=>' + test)
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(
            train, test, nb_words, maxlen, embd_type='self', w2v=None)
        other_tr = load_other(ta_other, maxlen, k)
        other_te = load_other(ts_other, maxlen, k)
        accs.append(cnn_other(Y_train, Y_test, nb_classes,
                              other_tr, other_te, k,
                              maxlen,
                              50, filter_size, 32, 25, 'rmsprop'))

    acc_cv = np.mean(accs)
    print('after 10-fold cv:' + str(acc_cv))
def keras_model():
    """Train a 1-D CNN on the TPO fold-1 auxiliary ("other") features.

    NOTE(review): this appears to be a hyperas hyper-parameter search
    template, not plain Python -- the ``{{choice([...])}}`` expression
    below is hyperas template syntax and ``STATUS_OK`` comes from
    hyperopt; the function is meant to be executed through hyperas'
    optimizer, which rewrites the template before running it.
    """

    import pandas as pd
    import numpy as np

    from keras.preprocessing import sequence
    from keras.models import Sequential
    from keras.layers.core import Dense, Dropout, Activation, Flatten
    from keras.layers.convolutional import Convolution1D, MaxPooling1D
    from keras.callbacks import EarlyStopping
    from keras.utils import np_utils

    from data_util import load_csvs, load_other
    import ml_metrics as metrics

    nb_words = 6500      # vocabulary cap passed to load_csvs
    maxlen = 175         # sequence length (rows of the feature tensor)
    filter_length = 10   # 1-D convolution window
    other_col_dim = 4    # columns per row in the _other.csv files

    # Only the labels (Y_*) of this call are used below; the model itself
    # trains on the auxiliary feature tensors.
    X_train, Y_train, X_test, Y_test, nb_classes = load_csvs('data/tpov4/train_1.csv',
                                                             'data/tpov4/test_1.csv',
                                                              nb_words, maxlen, 'self', w2v=None)

    # read _other.csv
    other_train = load_other('data/tpov4/train_1_other.csv', maxlen, other_col_dim)
    other_test = load_other('data/tpov4/test_1_other.csv', maxlen, other_col_dim)

    print('other tensor:', other_train.shape)

    # Pool over the full post-convolution length => one value per filter.
    pool_length = maxlen - filter_length + 1

    model = Sequential()
    model.add(Convolution1D(nb_filter=50,
                            filter_length=filter_length,
                            border_mode="valid", activation="relu",
                            input_shape=(maxlen, other_col_dim)))
    model.add(MaxPooling1D(pool_length=pool_length))
    model.add(Flatten())
    model.add(Dropout(0.05))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    # hyperas template expression: the optimizer is chosen by the search.
    model.compile(loss='categorical_crossentropy',
                  optimizer={{choice(['rmsprop', 'adam', 'adadelta', 'adagrad'])}})

    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)

    model.fit(other_train, Y_train, batch_size=32, nb_epoch=25,
              validation_split=0.1, show_accuracy=True, callbacks=[earlystop])

    # Predict with the early-stopped model on the held-out fold.
    classes = earlystop.model.predict_classes(other_test, batch_size=32)
    org_classes = np_utils.categorical_probas_to_classes(Y_test)

    acc = np_utils.accuracy(classes, org_classes)  # accuracy only supports classes
    print('Test accuracy:', acc)
    kappa = metrics.quadratic_weighted_kappa(classes, org_classes)
    print('Test Kappa:', kappa)
    # hyperopt minimizes the loss, so return negative accuracy.
    return {'loss': -acc, 'status': STATUS_OK}
Beispiel #5
0
def ted_cv_w2v():
    """10-fold cross-validation of the word2vec 1-D CNN on the TED data.

    Prints the mean test accuracy over all folds.
    """
    maxlen = 20  # tokens kept per sentence

    accs = []
    for fold in range(1, 11):
        train = 'data/TED/train' + str(fold) + '.csv'
        test = 'data/TED/test' + str(fold) + '.csv'
        print(train + '=>' + test)
        # nb_words is 0 here -- presumably ignored for the 'w2v'
        # embedding type; verify against load_csvs.
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(
            train, test, 0, maxlen, embd_type='w2v')
        accs.append(cnn1d_w2vembd(X_train, Y_train, X_test, Y_test,
                                  nb_classes, maxlen,
                                  100, 5, 50, 20, 'rmsprop'))

    acc_cv = np.mean(accs)
    print('after 10-fold cv:' + str(acc_cv))
Beispiel #6
0
def pun_cv():
    """10-fold cross-validation of the 1-D CNN on the pun-of-the-day data.

    Prints the mean test accuracy over all folds.
    """
    maxlen = 20      # tokens kept per sentence
    nb_words = 8000  # vocabulary size
    embd_dim = 100   # embedding width

    accs = []
    for fold in range(1, 11):
        train = 'data/pun_of_day/train' + str(fold) + '.csv'
        test = 'data/pun_of_day/test' + str(fold) + '.csv'
        print(train + '=>' + test)
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(
            train, test, nb_words, maxlen, embd_type='self', w2v=None)
        accs.append(cnn1d_selfembd(X_train, Y_train, X_test, Y_test,
                                   nb_classes, maxlen, nb_words, embd_dim,
                                   100, 5, 50, 20, 'rmsprop'))

    acc_cv = np.mean(accs)
    print('after 10-fold cv:' + str(acc_cv))
Beispiel #7
0
def tpo_cv_cnnvar():
    """10-fold CV of the variable-filter self-embedding CNN on TPO v4.

    Prints the mean test accuracy over all folds.
    """
    maxlen = 200     # tokens kept per document
    nb_words = 6500  # vocabulary size
    embd_dim = 100   # embedding width

    accs = []
    for fold in range(1, 11):
        train = 'data/tpov4/train_' + str(fold) + '.csv'
        test = 'data/tpov4/test_' + str(fold) + '.csv'
        print(train + '=>' + test)
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(
            train, test, nb_words, maxlen, embd_type='self', w2v=None)
        accs.append(cnn_var_selfembd(X_train, Y_train, X_test, Y_test,
                                     nb_classes, maxlen, nb_words, embd_dim,
                                     50, 32, 25, 'rmsprop'))

    acc_cv = np.mean(accs)
    print('after 10-fold cv:' + str(acc_cv))
Beispiel #8
0
def tpo_cv_w2v_cnnvar():
    """10-fold CV of the variable-filter word2vec CNN on TPO v4.

    Loads the Google word2vec binary once up front and reuses it for
    every fold; prints the mean test accuracy.
    """
    maxlen = 175  # tokens kept per document

    # One expensive load, shared across all folds.
    w2v = load_w2v('data/Google_w2v.bin')
    print("loaded Google word2vec")

    accs = []
    for fold in range(1, 11):
        train = 'data/tpov4/train_' + str(fold) + '.csv'
        test = 'data/tpov4/test_' + str(fold) + '.csv'
        print(train + '=>' + test)
        # nb_words is 0 -- presumably ignored for the 'w2v' embedding
        # type; verify against load_csvs.
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(
            train, test, 0, maxlen, embd_type='w2v', w2v=w2v)
        accs.append(cnn_var_w2vembd(X_train, Y_train, X_test, Y_test,
                                    nb_classes, maxlen,
                                    100, 50, 25, 'rmsprop'))

    acc_cv = np.mean(accs)
    print('after 10-fold cv:' + str(acc_cv))
Beispiel #9
0
def asap_cv_cnnvar():
    """10-fold CV of the variable-filter self-embedding CNN on ASAP-2.

    Collects one quadratic weighted kappa per fold and prints the mean.
    """
    maxlen = 75      # tokens kept per essay
    nb_words = 4500  # vocabulary size
    embd_dim = 50    # embedding width

    kappas = []
    for fold in range(1, 11):
        train = 'data/asap2/train' + str(fold) + '.csv'
        test = 'data/asap2/test' + str(fold) + '.csv'
        print(train + '=>' + test)
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(
            train, test, nb_words, maxlen, embd_type='self', w2v=None)
        kappas.append(cnn_var_selfembd(X_train, Y_train, X_test, Y_test,
                                       nb_classes, maxlen, nb_words,
                                       embd_dim,
                                       50, 32, 30, 'rmsprop'))

    kappa_cv = metrics.mean_quadratic_weighted_kappa(kappas)
    print('after 10-fold cv:' + str(kappa_cv))
Beispiel #10
0
def asap_cv_w2v():
    """10-fold CV of the word2vec 1-D CNN on ASAP-2.

    Loads the Google word2vec binary once, scores each fold with the
    model's per-fold kappa, and prints the plain mean of the kappas.
    """
    maxlen = 40  # tokens kept per essay

    # One expensive load, shared across all folds.
    w2v = load_w2v('data/Google_w2v.bin')
    print("loaded Google word2vec")

    kappas = []
    for fold in range(1, 11):
        train = 'data/asap2/train' + str(fold) + '.csv'
        test = 'data/asap2/test' + str(fold) + '.csv'
        print(train + '=>' + test)
        # nb_words is 0 -- presumably ignored for the 'w2v' embedding
        # type; verify against load_csvs.
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(
            train, test, 0, maxlen, embd_type='w2v', w2v=w2v)
        kappas.append(cnn1d_w2vembd(X_train, Y_train, X_test, Y_test,
                                    nb_classes, maxlen,
                                    100, 3, 50, 20, 'rmsprop'))

    kappa_cv = np.mean(kappas)
    print('after 10-fold cv:' + str(kappa_cv))
Beispiel #11
0
def load_asap(nb_words=10000, maxlen=200, embd_type='self'):
    """Load the ASAP-SAS set-1 train/test split.

    Thin wrapper over load_csvs; returns
    (X_train, Y_train, X_test, Y_test, nb_classes).
    """
    return load_csvs('../asap_sas/set1_train.csv',
                     '../asap_sas/set1_test.csv',
                     nb_words, maxlen, embd_type)
Beispiel #12
0
def load_ted(nb_words=8000, maxlen=150, embd_type='self'):
    """Load the TED fold-1 train/test split.

    Thin wrapper over load_csvs; returns
    (X_train, Y_train, X_test, Y_test, nb_classes).
    """
    return load_csvs('data/TED/train1.csv',
                     'data/TED/test1.csv',
                     nb_words, maxlen, embd_type)
Beispiel #13
0
def load_sg15(nb_words=8000, maxlen=150, embd_type='self'):
    """Load the sg15 train/test split.

    Thin wrapper over load_csvs; returns
    (X_train, Y_train, X_test, Y_test, nb_classes).
    NOTE(review): the test path is the generic 'data/test.csv' while the
    train path is sg15-specific -- confirm this pairing is intended.
    """
    return load_csvs('data/sg15_train.csv',
                     'data/test.csv',
                     nb_words, maxlen, embd_type)
"""
np.random.seed(75513)  # for reproducibility

print
"loading data..."

train_df = pd.read_csv('data/asap2/train1.csv')
test_df = pd.read_csv('data/asap2/test1.csv')

nb_words = 2900
maxlen = 75
embd_dim = 50

X_train, Y_train, X_test, Y_test, nb_classes = load_csvs('data/asap2/train1.csv',
                                                         'data/asap2/test1.csv',
                                                         nb_words, maxlen, 'self', w2v=None)

nb_pos = 15
pos_train, foo1, pos_test, foo2, foo3 = load_csvs('../sentproc_spaCy/test/train1_pos.csv',
                                                  '../sentproc_spaCy/test/test1_pos.csv',
                                                  nb_pos, maxlen, 'self', w2v=None)
nb_dp = 2500
dp_train,  foo1, dp_test, foo2,  foo3        = load_csvs('../sentproc_spaCy/test/train1_dp.csv',
                                                         '../sentproc_spaCy/test/test1_dp.csv',
                                                         nb_dp, maxlen, 'self', w2v=None)

# get #char to be a feature.
len_char_train = np.array([len(x.split()) for x in train_df.text.values.tolist()], dtype='float32')
len_char_test = np.array([len(x.split()) for x in test_df.text.values.tolist()], dtype='float32')
Beispiel #15
0
"""
np.random.seed(75513)  # for reproducibility

print
"loading data..."

train_df = pd.read_csv('data/asap2/train1.csv')
test_df = pd.read_csv('data/asap2/test1.csv')

nb_words = 2900
maxlen = 75
embd_dim = 50

X_train, Y_train, X_test, Y_test, nb_classes = load_csvs('data/asap2/train1.csv',
                                                         'data/asap2/test1.csv',
                                                         nb_words, maxlen, 'self', w2v=None)

nb_filter = 50
nb_epoch = 30
batch_size = 32

print('Build model...')
ngram_filters = [2, 5, 8]
nd_convs = ['conv_'+str(n) for n in ngram_filters]
nd_pools = ['pool_'+str(n) for n in ngram_filters]
nd_flats = ['flat_'+str(n) for n in ngram_filters]

model = Graph()
model.add_input(name='input', input_shape=(maxlen,), dtype=int)
"""
np.random.seed(75513)  # for reproducibility

print
"loading data..."

train_df = pd.read_csv('data/tpov4/train_1.csv')
test_df  = pd.read_csv('data/tpov4/test_1.csv')

nb_words = 6500
maxlen = 175
embd_dim = 100
other_col_dim = 4

X_train, Y_train, X_test, Y_test, nb_classes = load_csvs('data/tpov4/train_1.csv',
                                                         'data/tpov4/test_1.csv',
                                                         nb_words, maxlen, 'self', w2v=None)

# read _other.csv
pos_train = load_other('data/tpov4/train_1_other.csv', maxlen, other_col_dim)
pos_test = load_other('data/tpov4/test_1_other.csv', maxlen, other_col_dim)

print('other tensor:', pos_train.shape)

# get #char to be a feature.
len_char_train = np.array([len(x.split()) for x in train_df.text.values.tolist()], dtype='float32')
len_char_test = np.array([len(x.split()) for x in test_df.text.values.tolist()], dtype='float32')

# normalize
len_max = np.max(len_char_train)
len_char_train /= len_max