Example 1
# Assumed context for this snippet: import datetime, time, gensim;
# from collections import namedtuple; plus datahelper and the filepath_* constants.
def process():
    _, _, _, _, w2v_list, _ = datahelper.load_data(filepath_en_train, filepath_sp_train)

    # Read the stop-word list; strip punctuation and newlines, lowercase each entry.
    with open(file_stop_word, "r", encoding="UTF-8") as f:
        stop_word = f.readlines()
    remove_punct = str.maketrans("", "", ",.?¿!¡")
    stop_word_list = [
        line.replace("\n", "").translate(remove_punct).lower()
        for line in stop_word
    ]
    # Stop-word filtering is disabled (commented out); documents pass through as-is.
    d2c_list = []
    for line in w2v_list:
        # line_list = [x for x in line if x not in stop_word_list]
        d2c_list.append(line)

    alldocuments = []
    analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    for idx, record in enumerate(d2c_list):
        qid = 'SENT_%s' % idx  # unique tag per document
        words = gensim.utils.simple_preprocess(" ".join(record))
        alldocuments.append(analyzedDocument(words, [qid]))
    print("Start Training Doc2Vec Time : %s" % (str(datetime.datetime.now())))
    saved_model_name = "doc_2_vec_" + str(int(time.time()))
    model_4 = gensim.models.Doc2Vec(alldocuments, dm=1, dm_concat=1, vector_size=300, window=5,
                                    min_count=2, epochs=100)
    model_4.save(saved_model_name)
    print("model training completed : %s" % (saved_model_name))
Example 2
def process():
    # FLAGS, UNK_ID, pad_sequences and np are module-level names in the source.
    x_text1, x_text2, _, y_train, _, x_train_reshape = datahelper.load_data(
        FLAGS.en_train, FLAGS.sp_train)
    word2index, index2word = datahelper.create_vocabulary(x_train_reshape)
    vocab_size = len(index2word)
    word_embedding = datahelper.asign_pretrained_word_embedding(
        index2word, vocab_size, FLAGS.word2vec_model_path)
    max_len = max([len(x.split(" ")) for x in x_train_reshape])
    test1, test2 = datahelper.load_testdata(filepath_test)
    def to_ids(lines):
        # Map each space-separated token to its vocabulary id (UNK_ID for unknowns).
        return [[word2index.get(x, UNK_ID) for x in line.split(" ")]
                for line in lines]

    x_text1_int = to_ids(x_text1)
    x_text2_int = to_ids(x_text2)
    test1_int = to_ids(test1)
    test2_int = to_ids(test2)

    x_train1 = pad_sequences(x_text1_int, max_len)
    x_train2 = pad_sequences(x_text2_int, max_len)
    x_test1 = pad_sequences(test1_int, max_len)
    x_test2 = pad_sequences(test2_int, max_len)

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_shuffled1 = x_train1[shuffle_indices]
    x_shuffled2 = x_train2[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]

    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y_train)))
    x_train1, x_dev1 = x_shuffled1[:dev_sample_index], x_shuffled1[dev_sample_index:]
    x_train2, x_dev2 = x_shuffled2[:dev_sample_index], x_shuffled2[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    del x_text1, x_text2, x_text1_int, x_text2_int

    return x_shuffled1, x_shuffled2, y_shuffled, x_dev2, y_train, y_dev, word_embedding, max_len, vocab_size, x_test1, x_test2
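The id-mapping and padding above depend on the module-level UNK_ID and pad_sequences (presumably the Keras helper). A self-contained toy version of the same two steps, with an illustrative vocabulary:

import numpy as np

UNK_ID = 0
word2index = {"hola": 1, "mundo": 2}

def to_ids(line):
    # Unknown tokens fall back to UNK_ID, as in process().
    return [word2index.get(w, UNK_ID) for w in line.split(" ")]

def pad(seqs, max_len):
    # Left-pad with zeros and left-truncate, matching Keras pad_sequences defaults.
    out = np.zeros((len(seqs), max_len), dtype=np.int32)
    for i, s in enumerate(seqs):
        s = s[-max_len:]
        out[i, max_len - len(s):] = s
    return out

print(pad([to_ids("hola mundo"), to_ids("hola qué tal")], 4))
# [[0 0 1 2]
#  [0 1 0 0]]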
Example 3
def process():
    x_text1, x_text2, _, y_train, _, x_train_reshape = datahelper.load_data(
        FLAGS.en_train, FLAGS.sp_train)
    word2index, index2word = datahelper.create_vocabulary(x_train_reshape)
    vocab_size = len(index2word)
    word_embedding = datahelper.asign_pretrained_word_embedding(
        index2word, vocab_size, FLAGS.word2vec_model_path)
    max_len = max([len(x.split(" ")) for x in x_train_reshape])

    # Load stop words and strip punctuation/newlines from each entry.
    with open(FLAGS.stop_word, "r", encoding="UTF-8") as f:
        stop_word = f.readlines()
    remove_punct = str.maketrans("", "", ",.?¿!¡")
    stop_word_list = [
        line.replace("\n", "").translate(remove_punct).lower()
        for line in stop_word
    ]

    def to_ids(lines):
        # Drop stop words, then map the remaining tokens to vocabulary ids.
        ids = []
        for line in lines:
            tokens = [x for x in line.split(" ") if x not in stop_word_list]
            ids.append([word2index.get(x, UNK_ID) for x in tokens])
        return ids

    x_text1_int = to_ids(x_text1)
    x_text2_int = to_ids(x_text2)

    x_train1 = pad_sequences(x_text1_int, max_len)
    x_train2 = pad_sequences(x_text2_int, max_len)

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_shuffled1 = x_train1[shuffle_indices]
    x_shuffled2 = x_train2[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]

    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y_train)))
    x_train1, x_dev1 = x_shuffled1[:dev_sample_index], x_shuffled1[dev_sample_index:]
    x_train2, x_dev2 = x_shuffled2[:dev_sample_index], x_shuffled2[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    del x_text1, x_text2, x_text1_int, x_text2_int

    return x_train1, x_dev1, x_train2, x_dev2, y_train, y_dev, word_embedding, max_len, vocab_size
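The train/dev split in both process() variants hinges on dev_sample_index being negative, so the slice [:dev_sample_index] keeps all but the last dev fraction of rows. A quick illustration with toy numbers:

import numpy as np

y = np.arange(10)                          # pretend there are 10 labels
dev_sample_index = -1 * int(0.2 * len(y))  # -2: last 20% becomes the dev set
train_part, dev_part = y[:dev_sample_index], y[dev_sample_index:]
print(train_part)  # [0 1 2 3 4 5 6 7]
print(dev_part)    # [8 9]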
Example 4
# The excerpt uses train_doc2vec below; presumably it is loaded like the test file:
train_doc2vec = pd.read_csv(
    "I:\\temp\\CNNimpl_static\\CIKM\\features\\doc2vec\\train_doc2vec1.csv")
test_doc2vec = pd.read_csv(
    "I:\\temp\\CNNimpl_static\\CIKM\\features\\doc2vec\\test_doc2vec1.csv")
# Keep only the derived similarity columns; drop the raw vector columns.
train_doc2vec.drop(['doc2vec_train1', 'doc2vec_train2'], axis=1, inplace=True)
test_doc2vec.drop(['doc2vec_test1', 'doc2vec_test2'], axis=1, inplace=True)

train = pd.concat([
    train_bag, train_magic1, train_magic2, train_freq, train_ngram,
    train_simple, train_weight, train_page, train_w2v, train_doc2vec
], axis=1)
test = pd.concat([
    test_bag, test_magic1, test_magic2, test_freq, test_ngram, test_simple,
    test_weight, test_page, test_w2v, test_doc2vec
], axis=1)
_, _, _, y_train, _, _ = datahelper.load_data(filepath_en_train,
                                              filepath_sp_train)
import numpy as np

# Sanity check: share of positive labels in the training set.
sums = np.sum(y_train, axis=0)
print(sums / len(y_train))

from sklearn.model_selection import train_test_split

x_train, x_dev, y_train, y_dev = train_test_split(train.values,
                                                  y_train,
                                                  test_size=0.1,
                                                  random_state=0)
import lightgbm as lgb

train_input = lgb.Dataset(x_train, y_train)
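The example stops after building the LightGBM Dataset. A sketch of how training might continue; the hyperparameter values are illustrative, not taken from the original:

# Train a binary classifier on the stacked features and score the test pairs.
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 31,
}
dev_input = lgb.Dataset(x_dev, y_dev, reference=train_input)
gbm = lgb.train(params, train_input, num_boost_round=1000,
                valid_sets=[dev_input])
pred = gbm.predict(test.values)  # probability of "duplicate" for each test pair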
Example 5
filepath_test = "I:\\CIKM\\cikm_test_a_20180516.txt"
filepath_unlabel = "I:\\CIKM\\cikm_unlabel_spanish_train_20180516\\cikm_unlabel_spanish_train_20180516.txt"
w2v_path = "I:\\CIKM\\w2v.model.bin"
fast_path = "I:\\CIKM\\fast_text_vectors_wiki.es.vec\\wiki.es.vec"
file_stop_word = "I:\\CIKM\\spanish_stop_word.txt"
from CIKM.datautils import datahelper
import pandas as pd
import numpy as np
from gensim.models.tfidfmodel import TfidfModel
from gensim.similarities import MatrixSimilarity
from scipy import spatial
import datetime
from scipy.stats import skew, kurtosis
from gensim.corpora.dictionary import Dictionary

x_train1, x_train2, _, _, _, _ = datahelper.load_data(filepath_en_train, filepath_sp_train)
x_test1, x_test2 = datahelper.load_testdata(filepath_test)
train = pd.DataFrame()
test = pd.DataFrame()

train['question1'] = x_train1
train['question2'] = x_train2

test['question1'] = x_test1
test['question2'] = x_test2

# clean
tfidf_txt = (train['question1'].tolist() + train['question2'].tolist()
             + test['question1'].tolist() + test['question2'].tolist())
train_qs = pd.Series(tfidf_txt).astype(str)
dictionary = Dictionary(x.split(" ") for x in tfidf_txt)
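TfidfModel and MatrixSimilarity are imported but the excerpt ends before they are used. A sketch of the usual gensim flow from the dictionary built above:

# Bag-of-words corpus, TF-IDF weighting, and a dense similarity index.
corpus = [dictionary.doc2bow(x.split(" ")) for x in tfidf_txt]
tfidf = TfidfModel(corpus)
index = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))

# Similarity of the first question1 against every sentence in the corpus.
query = tfidf[dictionary.doc2bow(train['question1'][0].split(" "))]
print(index[query][:5])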
Example 6
def train_w2v():
    _, _, _, _, data, _ = datahelper.load_data(filepath_en_train,
                                               filepath_sp_train)
    # gensim < 4.0 API; in gensim >= 4.0 this argument is vector_size=300.
    model = gensim.models.Word2Vec(data, size=300, min_count=1)
    model.wv.save_word2vec_format('w2v.model.bin', binary=True)
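The saved vectors can be reloaded without the full model. A minimal sketch using gensim's KeyedVectors; the query words are illustrative and must exist in the vocabulary:

from gensim.models import KeyedVectors

# Load the binary vectors written by train_w2v() and query them.
wv = KeyedVectors.load_word2vec_format('w2v.model.bin', binary=True)
print(wv.most_similar('hola', topn=3))  # nearest neighbours of a word
print(wv.similarity('hola', 'buenos'))  # cosine similarity of two words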
Example 7
def makeFeature():
    # doc2vec_model and model are defined at module level in the original source.
    x_train1, x_train2, _, _, _, _ = datahelper.load_data(
        filepath_en_train, filepath_sp_train)
    x_test1, x_test2 = datahelper.load_testdata(filepath_test)

    x_train1 = process_data(x_train1)
    x_train2 = process_data(x_train2)
    x_test1 = process_data(x_test1)
    x_test2 = process_data(x_test2)
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('get sentence vector')
    train = pd.DataFrame()
    test = pd.DataFrame()
    train['doc2vec_train1'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_train1
    ]
    train['doc2vec_train2'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_train2
    ]
    test['doc2vec_test1'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_test1
    ]
    test['doc2vec_test2'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_test2
    ]
    print('get six similarity coefficients between the vectors')

    # Apply each similarity/distance measure to the paired sentence vectors.
    similarity_funcs = [('cosine1', Cosine), ('manhatton1', Manhatton),
                        ('euclidean1', Euclidean), ('pearson1', PearsonSimilar),
                        ('spearman1', SpearmanSimilar),
                        ('kendall1', KendallSimilar)]
    for name, func in similarity_funcs:
        train[name] = train.apply(
            lambda x, f=func: f(x['doc2vec_train1'], x['doc2vec_train2']),
            axis=1)
    train.to_csv('train_doc2vec1.csv', index=False)

    for name, func in similarity_funcs:
        test[name] = test.apply(
            lambda x, f=func: f(x['doc2vec_test1'], x['doc2vec_test2']),
            axis=1)
    test.to_csv('test_doc2vec1.csv', index=False)
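Cosine, Manhatton, Euclidean, PearsonSimilar, SpearmanSimilar and KendallSimilar are defined elsewhere in the source. Plausible scipy-based stand-ins (assumptions, not the original definitions):

from scipy.spatial.distance import cityblock, cosine, euclidean
from scipy.stats import kendalltau, pearsonr, spearmanr

def Cosine(v1, v2):
    return cosine(v1, v2)         # cosine distance (1 - cosine similarity)

def Manhatton(v1, v2):
    return cityblock(v1, v2)      # L1 distance

def Euclidean(v1, v2):
    return euclidean(v1, v2)      # L2 distance

def PearsonSimilar(v1, v2):
    return pearsonr(v1, v2)[0]    # linear correlation

def SpearmanSimilar(v1, v2):
    return spearmanr(v1, v2)[0]   # rank correlation

def KendallSimilar(v1, v2):
    return kendalltau(v1, v2)[0]  # Kendall's tau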