Python load_testdata Examples

Programming Language: Python

Namespace/Package Name: CIKM.datautils.datahelper

Method/Function: load_testdata

Examples at hotexamples.com: 3

Python load_testdata - 3 examples found. These are the top-rated real-world Python examples of CIKM.datautils.datahelper.load_testdata, extracted from open-source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

def _encode_lines(lines, word2index):
    """Turn each space-separated line into a list of vocabulary ids.

    Tokens that are missing from ``word2index`` are mapped to ``UNK_ID``.
    """
    return [[word2index.get(token, UNK_ID) for token in line.split(" ")]
            for line in lines]


def process():
    """Load, index, pad, shuffle and split the train/test sentence pairs.

    Training data is read via ``datahelper.load_data`` from the paths in
    ``FLAGS.en_train`` / ``FLAGS.sp_train``; test data is read via
    ``datahelper.load_testdata`` from ``filepath_test``.  Every sentence is
    converted to vocabulary ids and padded with ``pad_sequences`` to the
    token count of the longest entry in the vocabulary corpus.

    The training pairs are shuffled with a fixed NumPy seed (10), which
    also reseeds NumPy's global random state, and the trailing
    ``FLAGS.dev_sample_percentage`` fraction is held out as the dev split.

    Returns:
        An 11-tuple ``(x_shuffled1, x_shuffled2, y_shuffled, x_dev2,
        y_train, y_dev, word_embedding, max_len, vocab_size, x_test1,
        x_test2)``.

    NOTE(review): the first three items are the *full* shuffled arrays (not
    the post-split training portion) and the dev split of the first
    sentence is not returned at all.  This is kept as-is because callers
    unpack the tuple positionally -- confirm whether it is intended.
    """
    x_text1, x_text2, _, y_train, _, x_train_reshape = datahelper.load_data(
        FLAGS.en_train, FLAGS.sp_train)
    word2index, index2word = datahelper.create_vocabulary(x_train_reshape)
    vocab_size = len(index2word)
    word_embedding = datahelper.asign_pretrained_word_embedding(
        index2word, vocab_size, FLAGS.word2vec_model_path)
    max_len = max(len(x.split(" ")) for x in x_train_reshape)
    test1, test2 = datahelper.load_testdata(filepath_test)

    x_train1 = pad_sequences(_encode_lines(x_text1, word2index), max_len)
    x_train2 = pad_sequences(_encode_lines(x_text2, word2index), max_len)
    x_test1 = pad_sequences(_encode_lines(test1, word2index), max_len)
    x_test2 = pad_sequences(_encode_lines(test2, word2index), max_len)

    # Fixed seed so the shuffle (and therefore the split) is reproducible.
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_shuffled1 = x_train1[shuffle_indices]
    x_shuffled2 = x_train2[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]

    # Slice at an explicit position instead of a negative offset: with a
    # negative offset a dev size of 0 becomes ``[:0]`` / ``[0:]``, which
    # empties the training split and puts every sample in the dev split.
    dev_count = int(FLAGS.dev_sample_percentage * float(len(y_train)))
    dev_count = min(max(dev_count, 0), len(y_shuffled))
    split_at = len(y_shuffled) - dev_count
    x_dev2 = x_shuffled2[split_at:]
    y_train, y_dev = y_shuffled[:split_at], y_shuffled[split_at:]

    return x_shuffled1, x_shuffled2, y_shuffled, x_dev2, y_train, y_dev, word_embedding, max_len, vocab_size, x_test1, x_test2

Example #2

Show file

File: test_unique.py Project: SFKevin/nlp_semantics

# Hard-coded absolute Windows paths (drive I:) to local resources.
# NOTE(review): what each file contains is inferred from the variable and
# file names only -- confirm against the code that reads them.
# Presumably the unlabeled Spanish training text.
filepath_unlabel = "I:\\CIKM\\cikm_unlabel_spanish_train_20180516\\cikm_unlabel_spanish_train_20180516.txt"
# Presumably a binary word2vec model ("w2v_pah" looks like a typo of "w2v_path").
w2v_pah = "I:\\CIKM\\w2v.model.bin"
# Presumably fastText vectors for Spanish Wikipedia.
fast_path = "I:\\CIKM\\fast_text_vectors_wiki.es.vec\\wiki.es.vec"
# Presumably a Spanish stop-word list.
file_stop_word = "I:\\CIKM\\spanish_stop_word.txt"
from CIKM.datautils import datahelper
import pandas as pd
import numpy as np
from gensim.models.tfidfmodel import TfidfModel
from gensim.similarities import MatrixSimilarity
from scipy import spatial
import datetime
from scipy.stats import skew, kurtosis
from gensim.corpora.dictionary import Dictionary

# Load the question pairs; only the first two values returned by load_data
# are used here.
x_train1, x_train2, _, _, _, _ = datahelper.load_data(
    filepath_en_train, filepath_sp_train)
x_test1, x_test2 = datahelper.load_testdata(filepath_test)

# One frame per data set, with one column per side of the question pair.
train = pd.DataFrame({'question1': x_train1, 'question2': x_train2})
test = pd.DataFrame({'question1': x_test1, 'question2': x_test2})

# Gather every question into one corpus, in the order: train question1,
# train question2, test question1, test question2.
tfidf_txt = [
    question
    for frame in (train, test)
    for column in ('question1', 'question2')
    for question in frame[column].tolist()
]
train_qs = pd.Series(tfidf_txt).astype(str)
# Build the gensim dictionary from the space-tokenised corpus.
dictionary = Dictionary(question.split(" ") for question in tfidf_txt)

Example #3

Show file

File: doc2vec_inference.py Project: SFKevin/nlp_semantics

def makeFeature():
    """Build doc2vec similarity features for the train and test pairs.

    Loads the training pairs with ``datahelper.load_data`` and the test
    pairs with ``datahelper.load_testdata``, passes every sentence list
    through ``process_data``, and converts each sentence to a vector with
    ``doc2vec_model.get_question_vector(x, model)``.

    Six pairwise coefficients are then added as columns to each frame, in
    this order: ``cosine1``, ``manhatton1``, ``euclidean1``, ``pearson1``,
    ``spearman1``, ``kendall1``.  The frames, including the vector columns,
    are written to ``train_doc2vec1.csv`` and ``test_doc2vec1.csv``
    (relative paths) without the index.

    Returns:
        None.  The results are the two CSV files and progress lines
        printed to stdout.
    """
    x_train1, x_train2, _, _, _, _ = datahelper.load_data(
        filepath_en_train, filepath_sp_train)
    x_test1, x_test2 = datahelper.load_testdata(filepath_test)

    x_train1 = process_data(x_train1)
    x_train2 = process_data(x_train2)
    x_test1 = process_data(x_test1)
    x_test2 = process_data(x_test2)

    # The original had a Python 2 ``print`` statement split over two lines,
    # which under Python 3 printed nothing; print the timestamp explicitly.
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('get sentence vector')

    train = pd.DataFrame()
    test = pd.DataFrame()
    train['doc2vec_train1'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_train1
    ]
    train['doc2vec_train2'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_train2
    ]
    test['doc2vec_test1'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_test1
    ]
    test['doc2vec_test2'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_test2
    ]
    print('get six kinds of coefficient about vector')

    # (output column, pairwise function); the order fixes the CSV column
    # order, so keep it stable.
    metrics = (
        ('cosine1', Cosine),
        ('manhatton1', Manhatton),
        ('euclidean1', Euclidean),
        ('pearson1', PearsonSimilar),
        ('spearman1', SpearmanSimilar),
        ('kendall1', KendallSimilar),
    )

    for column, metric in metrics:
        # ``metric=metric`` binds the current function to the lambda.
        train[column] = train.apply(
            lambda row, metric=metric: metric(
                row['doc2vec_train1'], row['doc2vec_train2']),
            axis=1)
    train.to_csv('train_doc2vec1.csv', index=False)

    for column, metric in metrics:
        test[column] = test.apply(
            lambda row, metric=metric: metric(
                row['doc2vec_test1'], row['doc2vec_test2']),
            axis=1)

    test.to_csv('test_doc2vec1.csv', index=False)