def asap_cv_cnn_multi():
    maxlen = 75
    nb_words = 4500
    embd_dim = 50
    nb_pos = 15

    folds = range(1, 11)
    trains = ['data/asap2/train' + str(fold) + '.csv' for fold in folds]
    tests = ['data/asap2/test' + str(fold) + '.csv' for fold in folds]
    pos_tas = ['data/asap2/pos/train' + str(fold) + '_pos.csv' for fold in folds]
    pos_tss = ['data/asap2/pos/test' + str(fold) + '_pos.csv' for fold in folds]
    dp_tas = ['data/asap2/dp/train' + str(fold) + '_dp.csv' for fold in folds]
    dp_tss = ['data/asap2/dp/test' + str(fold) + '_dp.csv' for fold in folds]
    pairs = zip(trains, tests, pos_tas, pos_tss, dp_tas, dp_tss)

    kappas = []
    for (train, test, pos_ta, pos_ts, dp_ta, dp_ts) in pairs:
        print(train + '=>' + test)
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(train, test,
                                                                 nb_words, maxlen,
                                                                 embd_type='self', w2v=None)
        # POS and dependency-parse sequences follow the same fold split as the text
        pos_train, foo1, pos_test, foo2, foo3 = load_csvs(pos_ta, pos_ts,
                                                          nb_pos, maxlen,
                                                          embd_type='self', w2v=None)
        dp_train, foo1, dp_test, foo2, foo3 = load_csvs(dp_ta, dp_ts,
                                                        nb_words, maxlen,
                                                        embd_type='self', w2v=None)
        # trailing scalars (assumed): pos embd dim=10, dp embd dim=40,
        # nb_filter=50, batch_size=32, nb_epoch=30
        kappa = cnn_multi_selfembd(X_train, Y_train, X_test, Y_test, nb_classes,
                                   maxlen, nb_words, embd_dim,
                                   pos_train, pos_test, 10,
                                   dp_train, dp_test, 40,
                                   50, 32, 30, 'rmsprop')
        kappas.append(kappa)
    kappa_cv = metrics.mean_quadratic_weighted_kappa(kappas)
    print('after 10-fold cv: ' + str(kappa_cv))
def argu_cv():
    maxlen = 40
    nb_words = 8000
    embd_dim = 100

    folds = ['VC048263', 'VC048408', 'VC084849', 'VC084851', 'VC084853',
             'VC101537', 'VC101541', 'VC140094', 'VC207640', 'VC248479']
    trains = ['data/Argu/csv/generic_' + str(fold) + '_training.csv' for fold in folds]
    tests = ['data/Argu/csv/generic_' + str(fold) + '_testing.csv' for fold in folds]
    pairs = zip(trains, tests)

    accs = []
    for (train, test) in pairs:
        print(train + '=>' + test)
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(train, test,
                                                                 nb_words, maxlen,
                                                                 embd_type='self', w2v=None)
        acc = cnn1d_selfembd(X_train, Y_train, X_test, Y_test, nb_classes,
                             maxlen, nb_words, embd_dim,
                             100, 5, 50, 20, 'rmsprop')
        accs.append(acc)
    acc_cv = np.mean(accs)
    print('after 10-fold cv: ' + str(acc_cv))
def tpo_cv_cnn_other():
    maxlen = 200
    nb_words = 6500
    filter_size = 20
    k = 4  # number of columns in the *_other.csv feature files

    folds = range(1, 11)
    trains = ['data/tpov4/train_' + str(fold) + '.csv' for fold in folds]
    tests = ['data/tpov4/test_' + str(fold) + '.csv' for fold in folds]
    tas_other = ['data/tpov4/train_' + str(fold) + '_other.csv' for fold in folds]
    tss_other = ['data/tpov4/test_' + str(fold) + '_other.csv' for fold in folds]
    pairs = zip(trains, tests, tas_other, tss_other)

    accs = []
    for (train, test, ta_other, ts_other) in pairs:
        print(train + '=>' + test)
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(train, test,
                                                                 nb_words, maxlen,
                                                                 embd_type='self', w2v=None)
        Other_train = load_other(ta_other, maxlen, k)
        Other_test = load_other(ts_other, maxlen, k)
        acc = cnn_other(Y_train, Y_test, nb_classes,
                        Other_train, Other_test, k,
                        maxlen, 50, filter_size, 32, 25, 'rmsprop')
        accs.append(acc)
    acc_cv = np.mean(accs)
    print('after 10-fold cv: ' + str(acc_cv))
def keras_model():
    import pandas as pd
    import numpy as np
    from keras.preprocessing import sequence
    from keras.models import Sequential
    from keras.layers.core import Dense, Dropout, Activation, Flatten
    from keras.layers.convolutional import Convolution1D, MaxPooling1D
    from keras.callbacks import EarlyStopping
    from keras.utils import np_utils
    from hyperopt import STATUS_OK  # needed for the return dict below
    from data_util import load_csvs, load_other
    import ml_metrics as metrics

    nb_words = 6500
    maxlen = 175
    filter_length = 10
    other_col_dim = 4

    X_train, Y_train, X_test, Y_test, nb_classes = load_csvs('data/tpov4/train_1.csv',
                                                             'data/tpov4/test_1.csv',
                                                             nb_words, maxlen, 'self', w2v=None)

    # read _other.csv
    other_train = load_other('data/tpov4/train_1_other.csv', maxlen, other_col_dim)
    other_test = load_other('data/tpov4/test_1_other.csv', maxlen, other_col_dim)
    print('other tensor:', other_train.shape)

    pool_length = maxlen - filter_length + 1

    model = Sequential()
    model.add(Convolution1D(nb_filter=50,
                            filter_length=filter_length,
                            border_mode='valid',
                            activation='relu',
                            input_shape=(maxlen, other_col_dim)))
    model.add(MaxPooling1D(pool_length=pool_length))
    model.add(Flatten())
    model.add(Dropout(0.05))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    # {{choice(...)}} is a hyperas template placeholder; this function is only
    # valid Python after hyperas rewrites it during optimization.
    model.compile(loss='categorical_crossentropy',
                  optimizer={{choice(['rmsprop', 'adam', 'adadelta', 'adagrad'])}})

    earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
    model.fit(other_train, Y_train, batch_size=32, nb_epoch=25,
              validation_split=0.1, show_accuracy=True, callbacks=[earlystop])

    classes = earlystop.model.predict_classes(other_test, batch_size=32)
    org_classes = np_utils.categorical_probas_to_classes(Y_test)

    acc = np_utils.accuracy(classes, org_classes)  # expects class labels, not one-hot
    print('Test accuracy:', acc)
    kappa = metrics.quadratic_weighted_kappa(classes, org_classes)
    print('Test Kappa:', kappa)
    return {'loss': -acc, 'status': STATUS_OK}
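# A minimal sketch (hypothetical, not part of this repo) of how the hyperas
# template above would be driven. Note that hyperas normally passes data()'s
# outputs into the model function, whereas keras_model() loads its own data,
# so the data() stub below exists only to satisfy the interface; max_evals is
# illustrative.
"""
from hyperopt import Trials, tpe
from hyperas import optim
from hyperas.distributions import choice  # lets hyperas resolve {{choice(...)}}

def data():
    # stub: the model function loads its own data
    return None, None, None, None

best_run, best_model = optim.minimize(model=keras_model, data=data,
                                      algo=tpe.suggest, max_evals=10,
                                      trials=Trials())
print('best hyperparameters:', best_run)
"""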
def ted_cv_w2v():
    maxlen = 20

    folds = range(1, 11)
    trains = ['data/TED/train' + str(fold) + '.csv' for fold in folds]
    tests = ['data/TED/test' + str(fold) + '.csv' for fold in folds]
    pairs = zip(trains, tests)

    # Load the pretrained vectors once, as the other w2v CV helpers do; the
    # original passed no w2v model to load_csvs despite embd_type='w2v'.
    w2v = load_w2v('data/Google_w2v.bin')
    print("loaded Google word2vec")

    accs = []
    for (train, test) in pairs:
        print(train + '=>' + test)
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(train, test,
                                                                 0, maxlen,
                                                                 embd_type='w2v', w2v=w2v)
        acc = cnn1d_w2vembd(X_train, Y_train, X_test, Y_test, nb_classes,
                            maxlen, 100, 5, 50, 20, 'rmsprop')
        accs.append(acc)
    acc_cv = np.mean(accs)
    print('after 10-fold cv: ' + str(acc_cv))
def pun_cv():
    maxlen = 20
    nb_words = 8000
    embd_dim = 100

    folds = range(1, 11)
    trains = ['data/pun_of_day/train' + str(fold) + '.csv' for fold in folds]
    tests = ['data/pun_of_day/test' + str(fold) + '.csv' for fold in folds]
    pairs = zip(trains, tests)

    accs = []
    for (train, test) in pairs:
        print(train + '=>' + test)
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(train, test,
                                                                 nb_words, maxlen,
                                                                 embd_type='self', w2v=None)
        acc = cnn1d_selfembd(X_train, Y_train, X_test, Y_test, nb_classes,
                             maxlen, nb_words, embd_dim,
                             100, 5, 50, 20, 'rmsprop')
        accs.append(acc)
    acc_cv = np.mean(accs)
    print('after 10-fold cv: ' + str(acc_cv))
def tpo_cv_cnnvar():
    maxlen = 200
    nb_words = 6500
    embd_dim = 100

    folds = range(1, 11)
    trains = ['data/tpov4/train_' + str(fold) + '.csv' for fold in folds]
    tests = ['data/tpov4/test_' + str(fold) + '.csv' for fold in folds]
    pairs = zip(trains, tests)

    accs = []
    for (train, test) in pairs:
        print(train + '=>' + test)
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(train, test,
                                                                 nb_words, maxlen,
                                                                 embd_type='self', w2v=None)
        acc = cnn_var_selfembd(X_train, Y_train, X_test, Y_test, nb_classes,
                               maxlen, nb_words, embd_dim,
                               50, 32, 25, 'rmsprop')
        accs.append(acc)
    acc_cv = np.mean(accs)
    print('after 10-fold cv: ' + str(acc_cv))
def tpo_cv_w2v_cnnvar():
    maxlen = 175

    folds = range(1, 11)
    trains = ['data/tpov4/train_' + str(fold) + '.csv' for fold in folds]
    tests = ['data/tpov4/test_' + str(fold) + '.csv' for fold in folds]
    pairs = zip(trains, tests)

    w2v = load_w2v('data/Google_w2v.bin')
    print("loaded Google word2vec")

    accs = []
    for (train, test) in pairs:
        print(train + '=>' + test)
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(train, test,
                                                                 0, maxlen,
                                                                 embd_type='w2v', w2v=w2v)
        acc = cnn_var_w2vembd(X_train, Y_train, X_test, Y_test, nb_classes,
                              maxlen, 100, 50, 25, 'rmsprop')
        accs.append(acc)
    acc_cv = np.mean(accs)
    print('after 10-fold cv: ' + str(acc_cv))
def asap_cv_cnnvar():
    maxlen = 75
    nb_words = 4500
    embd_dim = 50

    folds = range(1, 11)
    trains = ['data/asap2/train' + str(fold) + '.csv' for fold in folds]
    tests = ['data/asap2/test' + str(fold) + '.csv' for fold in folds]
    pairs = zip(trains, tests)

    kappas = []
    for (train, test) in pairs:
        print(train + '=>' + test)
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(train, test,
                                                                 nb_words, maxlen,
                                                                 embd_type='self', w2v=None)
        kappa = cnn_var_selfembd(X_train, Y_train, X_test, Y_test, nb_classes,
                                 maxlen, nb_words, embd_dim,
                                 50, 32, 30, 'rmsprop')
        kappas.append(kappa)
    kappa_cv = metrics.mean_quadratic_weighted_kappa(kappas)
    print('after 10-fold cv: ' + str(kappa_cv))
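# The asap CV helpers aggregate fold kappas with
# metrics.mean_quadratic_weighted_kappa, which averages on the Fisher z-scale
# rather than taking a plain arithmetic mean. A minimal sketch of that
# averaging, assuming ml_metrics' behavior (values are clipped so arctanh
# stays finite):
"""
import numpy as np

def mean_qwk(kappas):
    k = np.clip(np.asarray(kappas, dtype='float64'), -0.999, 0.999)
    z = 0.5 * np.log((1.0 + k) / (1.0 - k))  # Fisher z-transform of each fold kappa
    return np.tanh(np.mean(z))               # mean in z-space, transformed back
"""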
def asap_cv_w2v():
    maxlen = 40

    folds = range(1, 11)
    trains = ['data/asap2/train' + str(fold) + '.csv' for fold in folds]
    tests = ['data/asap2/test' + str(fold) + '.csv' for fold in folds]
    pairs = zip(trains, tests)

    w2v = load_w2v('data/Google_w2v.bin')
    print("loaded Google word2vec")

    kappas = []
    for (train, test) in pairs:
        print(train + '=>' + test)
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(train, test,
                                                                 0, maxlen,
                                                                 embd_type='w2v', w2v=w2v)
        kappa = cnn1d_w2vembd(X_train, Y_train, X_test, Y_test, nb_classes,
                              maxlen, 100, 3, 50, 20, 'rmsprop')
        kappas.append(kappa)
    # note: plain arithmetic mean here, unlike the Fisher z-transform average
    # (metrics.mean_quadratic_weighted_kappa) used by the other asap CV helpers
    kappa_cv = np.mean(kappas)
    print('after 10-fold cv: ' + str(kappa_cv))
def load_asap(nb_words=10000, maxlen=200, embd_type='self'):
    X_train, Y_train, X_test, Y_test, nb_classes = load_csvs('../asap_sas/set1_train.csv',
                                                             '../asap_sas/set1_test.csv',
                                                             nb_words, maxlen, embd_type)
    return X_train, Y_train, X_test, Y_test, nb_classes
def load_ted(nb_words=8000, maxlen=150, embd_type='self'):
    X_train, Y_train, X_test, Y_test, nb_classes = load_csvs('data/TED/train1.csv',
                                                             'data/TED/test1.csv',
                                                             nb_words, maxlen, embd_type)
    return X_train, Y_train, X_test, Y_test, nb_classes
def load_sg15(nb_words=8000, maxlen=150, embd_type='self'):
    X_train, Y_train, X_test, Y_test, nb_classes = load_csvs('data/sg15_train.csv',
                                                             'data/test.csv',
                                                             nb_words, maxlen, embd_type)
    return X_train, Y_train, X_test, Y_test, nb_classes
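# Example wiring (hypothetical): feed one of the loaders above into the
# self-embedding CNN used by the CV helpers. The trailing scalars mirror
# pun_cv's call and are assumed to be nb_filter, filter_length, batch_size,
# and nb_epoch.
"""
X_train, Y_train, X_test, Y_test, nb_classes = load_ted(nb_words=8000, maxlen=150)
acc = cnn1d_selfembd(X_train, Y_train, X_test, Y_test, nb_classes,
                     150, 8000, 100,            # maxlen, nb_words, embd_dim
                     100, 5, 50, 20, 'rmsprop')
print('TED accuracy: ' + str(acc))
"""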
""" np.random.seed(75513) # for reproducibility print "loading data..." train_df = pd.read_csv('data/asap2/train1.csv') test_df = pd.read_csv('data/asap2/test1.csv') nb_words = 2900 maxlen = 75 embd_dim = 50 X_train, Y_train, X_test, Y_test, nb_classes = load_csvs('data/asap2/train1.csv', 'data/asap2/test1.csv', nb_words, maxlen, 'self', w2v=None) nb_pos = 15 pos_train, foo1, pos_test, foo2, foo3 = load_csvs('../sentproc_spaCy/test/train1_pos.csv', '../sentproc_spaCy/test/test1_pos.csv', nb_pos, maxlen, 'self', w2v=None) nb_dp = 2500 dp_train, foo1, dp_test, foo2, foo3 = load_csvs('../sentproc_spaCy/test/train1_dp.csv', '../sentproc_spaCy/test/test1_dp.csv', nb_dp, maxlen, 'self', w2v=None) # get #char to be a feature. len_char_train = np.array([len(x.split()) for x in train_df.text.values.tolist()], dtype='float32') len_char_test = np.array([len(x.split()) for x in test_df.text.values.tolist()], dtype='float32')
""" np.random.seed(75513) # for reproducibility print "loading data..." train_df = pd.read_csv('data/asap2/train1.csv') test_df = pd.read_csv('data/asap2/test1.csv') nb_words = 2900 maxlen = 75 embd_dim = 50 X_train, Y_train, X_test, Y_test, nb_classes = load_csvs('data/asap2/train1.csv', 'data/asap2/test1.csv', nb_words, maxlen, 'self', w2v=None) nb_filter = 50 nb_epoch = 30 batch_size = 32 print('Build model...') ngram_filters = [2, 5, 8] nd_convs = ['conv_'+str(n) for n in ngram_filters] nd_pools = ['pool_'+str(n) for n in ngram_filters] nd_flats = ['flat_'+str(n) for n in ngram_filters] model = Graph() model.add_input(name='input', input_shape=(maxlen,), dtype=int)
""" np.random.seed(75513) # for reproducibility print "loading data..." train_df = pd.read_csv('data/tpov4/train_1.csv') test_df = pd.read_csv('data/tpov4/test_1.csv') nb_words = 6500 maxlen = 175 embd_dim = 100 other_col_dim = 4 X_train, Y_train, X_test, Y_test, nb_classes = load_csvs('data/tpov4/train_1.csv', 'data/tpov4/test_1.csv', nb_words, maxlen, 'self', w2v=None) # read _other.csv pos_train = load_other('data/tpov4/train_1_other.csv', maxlen, other_col_dim) pos_test = load_other('data/tpov4/test_1_other.csv', maxlen, other_col_dim) print('other tensor:', pos_train.shape) # get #char to be a feature. len_char_train = np.array([len(x.split()) for x in train_df.text.values.tolist()], dtype='float32') len_char_test = np.array([len(x.split()) for x in test_df.text.values.tolist()], dtype='float32') # normalize len_max = np.max(len_char_train) len_char_train /= len_max