Beispiel #1
0
def split_train_test(train_dir, vocab_dir, test_size, n_labeled, wordslength):
    #train_dir = './data/labeled1.txt'
    #vocab_dir = './data/vocab_yinan_test_rnn4.txt'
    if not os.path.exists(vocab_dir):
        build_vocab(train_dir, vocab_dir, 1000)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)

    x, y = process_file(train_dir, word_to_id, cat_to_id, wordslength)
    # x_rnn, y_rnn = process_file_rnn(train_dir, word_to_id, cat_to_id, 600)

    listy = []
    for i in range(np.shape(y)[0]):
        for j in range(np.shape(y)[1]):
            if y[i][j] == 1:
                listy.append(j)
    listy = np.array(listy)


    X_train, X_test, y_train, y_test = \
        train_test_split(x, listy, test_size=test_size)

    # X_train = X_train[:(n_labeled + 24)]
    trn_ds = Dataset(
        X_train,
        np.concatenate(
            [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))
    # trn_ds = Dataset(X_train, np.concatenate(
    #     [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))
    fully_tst_ds = Dataset(X_test, y_test)

    X_val, X_real_test, y_val, y_real_test = \
            train_test_split(X_test, y_test, test_size=0.5)

    tst_ds = Dataset(X_real_test, y_real_test)
    val_ds = Dataset(X_val, y_val)

    fully_labeled_trn_ds = Dataset(X_train, y_train)
    #    print (fully_labeled_trn_ds.get_entries()[0])
    return trn_ds, tst_ds, y_train, fully_labeled_trn_ds, fully_tst_ds, val_ds
Beispiel #2
0
def split_train_test_rnn(train_dir, vocab_dir, vocab_size, test_size, val_size,
                         n_labeled, wordslength, categories_class):
    if not os.path.exists(vocab_dir):
        build_vocab(train_dir, vocab_dir, vocab_size)
    categories, cat_to_id = read_category(categories_class)
    words, word_to_id = read_vocab(vocab_dir)

    data_id, label_id = process_file_rnn(train_dir, word_to_id, cat_to_id,
                                         wordslength)
    # x_rnn, y_rnn = process_file_rnn(train_dir, word_to_id, cat_to_id, 600)

    y = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    listy = []
    for i in range(np.shape(y)[0]):
        for j in range(np.shape(y)[1]):
            if y[i][j] == 1:
                listy.append(j)
    listy = np.array(listy)


    X_train, X_test, y_train, y_test = \
        train_test_split(data_id, listy, test_size=test_size)

    X_train_al = []
    X_test_al = []
    res = []
    for i in X_train:
        for j in range(wordslength):
            a = i.count(j)
            if a > 0:
                res.append(a)
            else:
                res.append(0)
        X_train_al.append(res)
        res = []

    for i in X_test:
        for j in range(wordslength):
            a = i.count(j)
            if a > 0:
                res.append(a)
            else:
                res.append(0)
        X_test_al.append(res)
        res = []

    X_train_al = np.array(X_train_al)
    X_test_al = np.array(X_test_al)

    trn_ds_al = Dataset(
        X_train_al,
        np.concatenate(
            [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))

    tst_ds_al = Dataset(X_test_al, y_test)

    X_train_rnn = kr.preprocessing.sequence.pad_sequences(X_train, wordslength)
    X_test_rnn = kr.preprocessing.sequence.pad_sequences(X_test, wordslength)

    X_train_rnn, X_val_rnn, y_train_rnn, y_val_rnn = \
            train_test_split(X_train_rnn, y_train, test_size=val_size)

    trn_ds_rnn = Dataset(
        X_train_rnn,
        np.concatenate(
            [y_train_rnn[:n_labeled],
             [None] * (len(y_train_rnn) - n_labeled)]))

    val_ds_rnn = Dataset(X_val_rnn, y_val_rnn)

    tst_ds_rnn = Dataset(X_test_rnn, y_test)

    fully_labeled_trn_ds_al = Dataset(X_train_al, y_train)
    fully_labeled_trn_ds_rnn = Dataset(X_train_rnn, y_train_rnn)

    return trn_ds_al, tst_ds_al, y_train_rnn, fully_labeled_trn_ds_al, \
        trn_ds_rnn, tst_ds_rnn, fully_labeled_trn_ds_rnn, val_ds_rnn