Example 1
# Assumed context for this snippet: import datetime, time, gensim;
# from collections import namedtuple; plus datahelper and the filepath_* constants.
def process():
    _, _, _, _, w2v_list, _ = datahelper.load_data(filepath_en_train, filepath_sp_train)

    # Read the stop-word list; strip punctuation and newlines, lowercase each entry.
    with open(file_stop_word, "r", encoding="UTF-8") as f:
        stop_word = f.readlines()
    remove_punct = str.maketrans("", "", ",.?¿!¡")
    stop_word_list = [
        line.replace("\n", "").translate(remove_punct).lower()
        for line in stop_word
    ]
    # Stop-word filtering is disabled (commented out); documents pass through as-is.
    d2c_list = []
    for line in w2v_list:
        # line_list = [x for x in line if x not in stop_word_list]
        d2c_list.append(line)

    alldocuments = []
    analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    for idx, record in enumerate(d2c_list):
        qid = 'SENT_%s' % idx  # unique tag per document
        words = gensim.utils.simple_preprocess(" ".join(record))
        alldocuments.append(analyzedDocument(words, [qid]))
    print("Start Training Doc2Vec Time : %s" % (str(datetime.datetime.now())))
    saved_model_name = "doc_2_vec_" + str(int(time.time()))
    model_4 = gensim.models.Doc2Vec(alldocuments, dm=1, dm_concat=1, vector_size=300, window=5,
                                    min_count=2, epochs=100)
    model_4.save(saved_model_name)
    print("model training completed : %s" % (saved_model_name))
Example 2
def process():
    # FLAGS, UNK_ID, pad_sequences and np are module-level names in the source.
    x_text1, x_text2, _, y_train, _, x_train_reshape = datahelper.load_data(
        FLAGS.en_train, FLAGS.sp_train)
    word2index, index2word = datahelper.create_vocabulary(x_train_reshape)
    vocab_size = len(index2word)
    word_embedding = datahelper.asign_pretrained_word_embedding(
        index2word, vocab_size, FLAGS.word2vec_model_path)
    max_len = max([len(x.split(" ")) for x in x_train_reshape])
    test1, test2 = datahelper.load_testdata(filepath_test)
    def to_ids(lines):
        # Map each space-separated token to its vocabulary id (UNK_ID for unknowns).
        return [[word2index.get(x, UNK_ID) for x in line.split(" ")]
                for line in lines]

    x_text1_int = to_ids(x_text1)
    x_text2_int = to_ids(x_text2)
    test1_int = to_ids(test1)
    test2_int = to_ids(test2)

    x_train1 = pad_sequences(x_text1_int, max_len)
    x_train2 = pad_sequences(x_text2_int, max_len)
    x_test1 = pad_sequences(test1_int, max_len)
    x_test2 = pad_sequences(test2_int, max_len)

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_shuffled1 = x_train1[shuffle_indices]
    x_shuffled2 = x_train2[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]

    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y_train)))
    x_train1, x_dev1 = x_shuffled1[:dev_sample_index], x_shuffled1[dev_sample_index:]
    x_train2, x_dev2 = x_shuffled2[:dev_sample_index], x_shuffled2[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    del x_text1, x_text2, x_text1_int, x_text2_int

    return x_shuffled1, x_shuffled2, y_shuffled, x_dev2, y_train, y_dev, word_embedding, max_len, vocab_size, x_test1, x_test2
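The id-mapping and padding above depend on the module-level UNK_ID and pad_sequences (presumably the Keras helper). A self-contained toy version of the same two steps, with an illustrative vocabulary:

import numpy as np

UNK_ID = 0
word2index = {"hola": 1, "mundo": 2}

def to_ids(line):
    # Unknown tokens fall back to UNK_ID, as in process().
    return [word2index.get(w, UNK_ID) for w in line.split(" ")]

def pad(seqs, max_len):
    # Left-pad with zeros and left-truncate, matching Keras pad_sequences defaults.
    out = np.zeros((len(seqs), max_len), dtype=np.int32)
    for i, s in enumerate(seqs):
        s = s[-max_len:]
        out[i, max_len - len(s):] = s
    return out

print(pad([to_ids("hola mundo"), to_ids("hola qué tal")], 4))
# [[0 0 1 2]
#  [0 1 0 0]]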
Example 3
def process():
    x_text1, x_text2, _, y_train, _, x_train_reshape = datahelper.load_data(
        FLAGS.en_train, FLAGS.sp_train)
    word2index, index2word = datahelper.create_vocabulary(x_train_reshape)
    vocab_size = len(index2word)
    word_embedding = datahelper.asign_pretrained_word_embedding(
        index2word, vocab_size, FLAGS.word2vec_model_path)
    max_len = max([len(x.split(" ")) for x in x_train_reshape])

    # Load stop words and strip punctuation/newlines from each entry.
    with open(FLAGS.stop_word, "r", encoding="UTF-8") as f:
        stop_word = f.readlines()
    remove_punct = str.maketrans("", "", ",.?¿!¡")
    stop_word_list = [
        line.replace("\n", "").translate(remove_punct).lower()
        for line in stop_word
    ]

    def to_ids(lines):
        # Drop stop words, then map the remaining tokens to vocabulary ids.
        ids = []
        for line in lines:
            tokens = [x for x in line.split(" ") if x not in stop_word_list]
            ids.append([word2index.get(x, UNK_ID) for x in tokens])
        return ids

    x_text1_int = to_ids(x_text1)
    x_text2_int = to_ids(x_text2)

    x_train1 = pad_sequences(x_text1_int, max_len)
    x_train2 = pad_sequences(x_text2_int, max_len)

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_shuffled1 = x_train1[shuffle_indices]
    x_shuffled2 = x_train2[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]

    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y_train)))
    x_train1, x_dev1 = x_shuffled1[:dev_sample_index], x_shuffled1[dev_sample_index:]
    x_train2, x_dev2 = x_shuffled2[:dev_sample_index], x_shuffled2[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    del x_text1, x_text2, x_text1_int, x_text2_int

    return x_train1, x_dev1, x_train2, x_dev2, y_train, y_dev, word_embedding, max_len, vocab_size
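The train/dev split in both process() variants hinges on dev_sample_index being negative, so the slice [:dev_sample_index] keeps all but the last dev fraction of rows. A quick illustration with toy numbers:

import numpy as np

y = np.arange(10)                          # pretend there are 10 labels
dev_sample_index = -1 * int(0.2 * len(y))  # -2: last 20% becomes the dev set
train_part, dev_part = y[:dev_sample_index], y[dev_sample_index:]
print(train_part)  # [0 1 2 3 4 5 6 7]
print(dev_part)    # [8 9]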
Example 4
# The excerpt uses train_doc2vec below; presumably it is loaded like the test file:
train_doc2vec = pd.read_csv(
    "I:\\temp\\CNNimpl_static\\CIKM\\features\\doc2vec\\train_doc2vec1.csv")
test_doc2vec = pd.read_csv(
    "I:\\temp\\CNNimpl_static\\CIKM\\features\\doc2vec\\test_doc2vec1.csv")
# Keep only the derived similarity columns; drop the raw vector columns.
train_doc2vec.drop(['doc2vec_train1', 'doc2vec_train2'], axis=1, inplace=True)
test_doc2vec.drop(['doc2vec_test1', 'doc2vec_test2'], axis=1, inplace=True)

train = pd.concat([
    train_bag, train_magic1, train_magic2, train_freq, train_ngram,
    train_simple, train_weight, train_page, train_w2v, train_doc2vec
], axis=1)
test = pd.concat([
    test_bag, test_magic1, test_magic2, test_freq, test_ngram, test_simple,
    test_weight, test_page, test_w2v, test_doc2vec
], axis=1)
_, _, _, y_train, _, _ = datahelper.load_data(filepath_en_train,
                                              filepath_sp_train)
import numpy as np

# Sanity check: share of positive labels in the training set.
sums = np.sum(y_train, axis=0)
print(sums / len(y_train))

from sklearn.model_selection import train_test_split

x_train, x_dev, y_train, y_dev = train_test_split(train.values,
                                                  y_train,
                                                  test_size=0.1,
                                                  random_state=0)
import lightgbm as lgb

train_input = lgb.Dataset(x_train, y_train)
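The example stops after building the LightGBM Dataset. A sketch of how training might continue; the hyperparameter values are illustrative, not taken from the original:

# Train a binary classifier on the stacked features and score the test pairs.
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 31,
}
dev_input = lgb.Dataset(x_dev, y_dev, reference=train_input)
gbm = lgb.train(params, train_input, num_boost_round=1000,
                valid_sets=[dev_input])
pred = gbm.predict(test.values)  # probability of "duplicate" for each test pair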
Example 5
filepath_test = "I:\\CIKM\\cikm_test_a_20180516.txt"
filepath_unlabel = "I:\\CIKM\\cikm_unlabel_spanish_train_20180516\\cikm_unlabel_spanish_train_20180516.txt"
w2v_path = "I:\\CIKM\\w2v.model.bin"
fast_path = "I:\\CIKM\\fast_text_vectors_wiki.es.vec\\wiki.es.vec"
file_stop_word = "I:\\CIKM\\spanish_stop_word.txt"
from CIKM.datautils import datahelper
import pandas as pd
import numpy as np
from gensim.models.tfidfmodel import TfidfModel
from gensim.similarities import MatrixSimilarity
from scipy import spatial
import datetime
from scipy.stats import skew, kurtosis
from gensim.corpora.dictionary import Dictionary

x_train1, x_train2, _, _, _, _ = datahelper.load_data(filepath_en_train, filepath_sp_train)
x_test1, x_test2 = datahelper.load_testdata(filepath_test)
train = pd.DataFrame()
test = pd.DataFrame()

train['question1'] = x_train1
train['question2'] = x_train2

test['question1'] = x_test1
test['question2'] = x_test2

# clean
tfidf_txt = (train['question1'].tolist() + train['question2'].tolist()
             + test['question1'].tolist() + test['question2'].tolist())
train_qs = pd.Series(tfidf_txt).astype(str)
dictionary = Dictionary(x.split(" ") for x in tfidf_txt)
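TfidfModel and MatrixSimilarity are imported but the excerpt ends before they are used. A sketch of the usual gensim flow from the dictionary built above:

# Bag-of-words corpus, TF-IDF weighting, and a dense similarity index.
corpus = [dictionary.doc2bow(x.split(" ")) for x in tfidf_txt]
tfidf = TfidfModel(corpus)
index = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))

# Similarity of the first question1 against every sentence in the corpus.
query = tfidf[dictionary.doc2bow(train['question1'][0].split(" "))]
print(index[query][:5])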
Example 6
def train_w2v():
    _, _, _, _, data, _ = datahelper.load_data(filepath_en_train,
                                               filepath_sp_train)
    # gensim < 4.0 API; in gensim >= 4.0 this argument is vector_size=300.
    model = gensim.models.Word2Vec(data, size=300, min_count=1)
    model.wv.save_word2vec_format('w2v.model.bin', binary=True)
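The saved vectors can be reloaded without the full model. A minimal sketch using gensim's KeyedVectors; the query words are illustrative and must exist in the vocabulary:

from gensim.models import KeyedVectors

# Load the binary vectors written by train_w2v() and query them.
wv = KeyedVectors.load_word2vec_format('w2v.model.bin', binary=True)
print(wv.most_similar('hola', topn=3))  # nearest neighbours of a word
print(wv.similarity('hola', 'buenos'))  # cosine similarity of two words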
Example 7
def makeFeature():
    # doc2vec_model and model are defined at module level in the original source.
    x_train1, x_train2, _, _, _, _ = datahelper.load_data(
        filepath_en_train, filepath_sp_train)
    x_test1, x_test2 = datahelper.load_testdata(filepath_test)

    x_train1 = process_data(x_train1)
    x_train2 = process_data(x_train2)
    x_test1 = process_data(x_test1)
    x_test2 = process_data(x_test2)
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('get sentence vector')
    train = pd.DataFrame()
    test = pd.DataFrame()
    train['doc2vec_train1'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_train1
    ]
    train['doc2vec_train2'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_train2
    ]
    test['doc2vec_test1'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_test1
    ]
    test['doc2vec_test2'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_test2
    ]
    print('get six similarity coefficients between the vectors')

    # Apply each similarity/distance measure to the paired sentence vectors.
    similarity_funcs = [('cosine1', Cosine), ('manhatton1', Manhatton),
                        ('euclidean1', Euclidean), ('pearson1', PearsonSimilar),
                        ('spearman1', SpearmanSimilar),
                        ('kendall1', KendallSimilar)]
    for name, func in similarity_funcs:
        train[name] = train.apply(
            lambda x, f=func: f(x['doc2vec_train1'], x['doc2vec_train2']),
            axis=1)
    train.to_csv('train_doc2vec1.csv', index=False)

    for name, func in similarity_funcs:
        test[name] = test.apply(
            lambda x, f=func: f(x['doc2vec_test1'], x['doc2vec_test2']),
            axis=1)
    test.to_csv('test_doc2vec1.csv', index=False)
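Cosine, Manhatton, Euclidean, PearsonSimilar, SpearmanSimilar and KendallSimilar are defined elsewhere in the source. Plausible scipy-based stand-ins (assumptions, not the original definitions):

from scipy.spatial.distance import cityblock, cosine, euclidean
from scipy.stats import kendalltau, pearsonr, spearmanr

def Cosine(v1, v2):
    return cosine(v1, v2)         # cosine distance (1 - cosine similarity)

def Manhatton(v1, v2):
    return cityblock(v1, v2)      # L1 distance

def Euclidean(v1, v2):
    return euclidean(v1, v2)      # L2 distance

def PearsonSimilar(v1, v2):
    return pearsonr(v1, v2)[0]    # linear correlation

def SpearmanSimilar(v1, v2):
    return spearmanr(v1, v2)[0]   # rank correlation

def KendallSimilar(v1, v2):
    return kendalltau(v1, v2)[0]  # Kendall's tau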