Example 1
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from sklearn.model_selection import train_test_split

def load_mr(nb_words=20000, maxlen=64, embd_type='self'):
    """Load the MR (movie review) dataset and build train/test tensors.

    :param nb_words: vocabulary size kept by the tokenizer
    :param maxlen: sequences are padded/truncated to this length
    :param embd_type: 'self' (word indices for a trainable embedding layer)
                      vs. 'w2v' (pre-trained Google word2vec vectors)
    :return: (X_train, Y_train, X_test, Y_test, nb_classes)
    """
    # pickled2df, xcol_nninput_embd, load_w2v and sents_3dtensor are
    # repo-local helpers; hedged sketches of them follow Examples 1 and 2.

    train_size = 0.8

    df = pickled2df('data/mr.p')
    print(df.head())

    train_X, test_X, train_y, test_y = train_test_split(df.text.values.tolist(),
                                                        df.label.values,
                                                        train_size=train_size, random_state=1)
    # keep the raw text around for the 'w2v' branch below
    train_X_wds = train_X
    test_X_wds = test_X

    nb_classes = len(np.unique(train_y))
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test  = np_utils.to_categorical(test_y, nb_classes)

    # the tokenizer must be fit on train+test jointly so word indices are consistent
    n_ta = len(train_X)
    n_ts = len(test_X)
    print('train len vs. test len', n_ta, n_ts)

    textraw = [line.encode('utf-8') for line in train_X + test_X]  # Python 2: the Tokenizer expects plain (byte) strings
    # map each text to a sequence of integer word indices
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    textseq = token.texts_to_sequences(textraw)

    # stat about textlist
    print('nb_words: ', len(token.word_counts))
    print('mean len: ', np.mean([len(x) for x in textseq]))

    train_X = textseq[0:n_ta]
    test_X = textseq[n_ta:]

    if embd_type == 'self':
        X_train = xcol_nninput_embd(train_X, nb_words, maxlen)
        X_test  = xcol_nninput_embd(test_X,  nb_words, maxlen)
    elif embd_type == 'w2v':
        w2v = load_w2v('data/Google_w2v.bin')
        print("loaded Google word2vec")
        X_train = sents_3dtensor(train_X_wds, maxlen, w2v)
        X_test  = sents_3dtensor(test_X_wds, maxlen, w2v)
    else:
        raise ValueError('unknown embd_type: %s' % embd_type)

    print('X tensor shape: ', X_train.shape)
    print('Y tensor shape: ', Y_train.shape)
    return (X_train, Y_train, X_test, Y_test, nb_classes)
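
xcol_nninput_embd is not defined in this snippet. Judging from the 'self' branch of load_csvs in Example 2, which pads the index sequences directly, it most likely wraps Keras' pad_sequences; the sketch below rests on that assumption (nb_words is accepted only to mirror the call site, since Tokenizer(nb_words=...) already caps the vocabulary).

from keras.preprocessing import sequence

def xcol_nninput_embd(seqs, nb_words, maxlen):
    # Pad/truncate each integer sequence to a fixed length, as the
    # 'self' branch of load_csvs (Example 2) does explicitly.
    return sequence.pad_sequences(seqs, maxlen=maxlen,
                                  padding='post', truncating='post')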
Example 2
import numpy as np
import pandas as pd
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils

def load_csvs(traincsv, testcsv, nb_words, maxlen, embd_type, w2v=None):
    # w2v may be supplied by the caller (see Examples 3 and 4) so the
    # Google vectors are loaded once rather than once per CV fold.

    train_df = pd.read_csv(traincsv)
    test_df = pd.read_csv(testcsv)
    print(train_df.head())

    train_X = train_df.text.values.tolist()
    test_X = test_df.text.values.tolist()

    # keep the raw text around for the 'w2v' branch below
    train_X_wds = train_X
    test_X_wds = test_X

    train_y = train_df.label.values
    test_y  = test_df.label.values
    nb_classes = len(np.unique(train_y))
    Y_train = np_utils.to_categorical(train_y, nb_classes)
    Y_test = np_utils.to_categorical(test_y, nb_classes)

    # the tokenizer must be fit on train+test jointly so word indices are consistent
    n_ta = len(train_X)
    n_ts = len(test_X)
    print('train len vs. test len', n_ta, n_ts)

    textraw = [line.encode('utf-8') for line in train_X + test_X]  # Python 2: the Tokenizer expects plain (byte) strings
    # map each text to a sequence of integer word indices
    token = Tokenizer(nb_words=nb_words)
    token.fit_on_texts(textraw)
    textseq = token.texts_to_sequences(textraw)

    # stat about textlist
    print('nb_words: ', len(token.word_counts))
    print('mean len: ', np.mean([len(x) for x in textseq]))

    train_X = textseq[0:n_ta]
    test_X = textseq[n_ta:]

    if embd_type == 'self':
        X_train = sequence.pad_sequences(train_X, maxlen=maxlen, padding='post', truncating='post')
        X_test = sequence.pad_sequences(test_X, maxlen=maxlen, padding='post', truncating='post')
    elif embd_type == 'w2v':
        if w2v is None:
            w2v = load_w2v('data/Google_w2v.bin')
            print("loaded Google word2vec")
        X_train = sents_3dtensor(train_X_wds, maxlen, w2v)
        X_test = sents_3dtensor(test_X_wds, maxlen, w2v)
    else:
        raise ValueError('unknown embd_type: %s' % embd_type)

    print('X tensor shape: ', X_train.shape)
    print('Y tensor shape: ', Y_train.shape)
    return (X_train, Y_train, X_test, Y_test, nb_classes)
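
load_w2v and sents_3dtensor are also repo-local helpers. A plausible reading: load_w2v opens the binary GoogleNews vectors (gensim's KeyedVectors handles that format), and sents_3dtensor stacks one 300-d vector per token into a fixed-length 3-D tensor. The sketch below is an assumption-laden reconstruction, with whitespace tokenization standing in for whatever the repo actually uses; out-of-vocabulary words are left as zero rows.

import numpy as np
from gensim.models import KeyedVectors

def load_w2v(path):
    # GoogleNews vectors ship in the binary word2vec format
    return KeyedVectors.load_word2vec_format(path, binary=True)

def sents_3dtensor(sents, maxlen, w2v, dim=300):
    # (n_samples, maxlen, dim) tensor: one word vector per token,
    # truncated/zero-padded to maxlen.
    X = np.zeros((len(sents), maxlen, dim), dtype='float32')
    for i, sent in enumerate(sents):
        for j, word in enumerate(sent.split()[:maxlen]):
            if word in w2v:
                X[i, j] = w2v[word]
    return X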
Example 3
def tpo_cv_w2v_cnnvar():
    """10-fold CV on the TPO data; reports mean accuracy across folds."""
    maxlen = 175

    folds = range(1,11)
    trains = ['data/tpov4/train_'+str(fold)+'.csv' for fold in folds]
    tests = ['data/tpov4/test_'+str(fold)+'.csv' for fold in folds]
    pairs = zip(trains, tests)

    w2v = load_w2v('data/Google_w2v.bin')
    print("loaded Google word2vec")

    accs = []
    for (train, test) in pairs:
        print(train + '=>' + test)
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(train, test,
                                                             0, maxlen, embd_type='w2v', w2v=w2v)

        acc = cnn_var_w2vembd(X_train, Y_train, X_test, Y_test, nb_classes,
                             maxlen,
                             100, 50, 25, 'rmsprop')
        accs.append(acc)
    acc_cv = np.mean(accs)
    print('after 10-fold cv: ' + str(acc_cv))
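
cnn_var_w2vembd is not shown in this listing; the name suggests a Kim (2014)-style CNN with several filter widths ('var') applied in parallel to the 3-D word2vec tensor. The sketch below is a guess at that architecture in the same Keras 1 idiom as the rest of the code, with assumed meanings for the positional arguments (nb_filter=100, batch_size=50, nb_epoch=25); it returns test accuracy, matching how acc is used above.

from keras.layers import Input, Convolution1D, MaxPooling1D, Flatten, Dense, merge
from keras.models import Model

def cnn_var_w2vembd(X_train, Y_train, X_test, Y_test, nb_classes,
                    maxlen, nb_filter, batch_size, nb_epoch, optimizer):
    inp = Input(shape=(maxlen, 300))
    convs = []
    for fl in (3, 4, 5):  # the filter widths themselves are an assumption
        c = Convolution1D(nb_filter, fl, activation='relu')(inp)
        c = MaxPooling1D(pool_length=maxlen - fl + 1)(c)  # max over time
        convs.append(Flatten()(c))
    h = merge(convs, mode='concat')
    out = Dense(nb_classes, activation='softmax')(h)
    model = Model(input=inp, output=out)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer,
                  metrics=['accuracy'])
    model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              validation_data=(X_test, Y_test))
    return model.evaluate(X_test, Y_test, verbose=0)[1]  # test accuracy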
Example 4
def asap_cv_w2v():
    """10-fold CV on the ASAP essay data; reports mean kappa across folds."""
    maxlen = 40

    folds = range(1,11)
    trains = ['data/asap2/train'+str(fold)+'.csv' for fold in folds]
    tests = ['data/asap2/test'+str(fold)+'.csv' for fold in folds]
    pairs = zip(trains, tests)

    w2v = load_w2v('data/Google_w2v.bin')
    print("loaded Google word2vec")

    kappas = []
    for (train, test) in pairs:
        print(train + '=>' + test)
        X_train, Y_train, X_test, Y_test, nb_classes = load_csvs(train, test,
                                                             0, maxlen, embd_type='w2v', w2v=w2v)

        kappa = cnn1d_w2vembd(X_train, Y_train, X_test, Y_test, nb_classes,
                             maxlen,
                             100, 3, 50, 20, 'rmsprop')
        kappas.append(kappa)
    kappa_cv = np.mean(kappas)
    print('after 10-fold cv: ' + str(kappa_cv))
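
cnn1d_w2vembd is likewise not shown, but the kappa variables point at the standard ASAP metric, quadratic weighted kappa. If the model's class predictions are available, the per-fold score can be computed directly with scikit-learn; the helper name below is illustrative, not from the repo.

from sklearn.metrics import cohen_kappa_score

def quadratic_kappa(y_true, y_pred):
    # disagreements are penalised by the squared distance between scores
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')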