def _texts_to_ids(lines, word2index):
    """Map each space-separated sentence in *lines* to a list of vocabulary ids.

    Tokens missing from *word2index* are replaced by ``UNK_ID``.
    """
    return [[word2index.get(token, UNK_ID) for token in line.split(" ")]
            for line in lines]


def process():
    """Load, index, pad, shuffle and split the training and test data.

    Training sentences come from ``datahelper.load_data`` and test sentences
    from ``datahelper.load_testdata``. Every sentence is converted to
    vocabulary ids and padded to the length of the longest entry of
    ``x_train_reshape``. The training rows are then shuffled with a fixed
    seed and the trailing ``FLAGS.dev_sample_percentage`` share is held out
    as the dev set.

    Returns:
        An 11-tuple, in this order: ``x_shuffled1``, ``x_shuffled2``,
        ``y_shuffled``, ``x_dev2``, ``y_train``, ``y_dev``,
        ``word_embedding``, ``max_len``, ``vocab_size``, ``x_test1``,
        ``x_test2``.
    """
    x_text1, x_text2, _, y_train, _, x_train_reshape = datahelper.load_data(
        FLAGS.en_train, FLAGS.sp_train)

    word2index, index2word = datahelper.create_vocabulary(x_train_reshape)
    vocab_size = len(index2word)
    word_embedding = datahelper.asign_pretrained_word_embedding(
        index2word, vocab_size, FLAGS.word2vec_model_path)

    max_len = max(len(sentence.split(" ")) for sentence in x_train_reshape)

    test1, test2 = datahelper.load_testdata(filepath_test)

    x_train1 = pad_sequences(_texts_to_ids(x_text1, word2index), max_len)
    x_train2 = pad_sequences(_texts_to_ids(x_text2, word2index), max_len)
    x_test1 = pad_sequences(_texts_to_ids(test1, word2index), max_len)
    x_test2 = pad_sequences(_texts_to_ids(test2, word2index), max_len)

    # Fixed seed keeps the shuffle (and therefore the dev split) reproducible.
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_shuffled1 = x_train1[shuffle_indices]
    x_shuffled2 = x_train2[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]

    # Slice at an explicit non-negative position. A negated count breaks when
    # the count is 0: ``-0 == 0`` would empty the training part and put every
    # sample into the dev part.
    total = len(y_shuffled)
    dev_count = int(FLAGS.dev_sample_percentage * float(total))
    split_at = min(max(total - dev_count, 0), total)

    x_dev2 = x_shuffled2[split_at:]
    y_train, y_dev = y_shuffled[:split_at], y_shuffled[split_at:]

    # NOTE(review): the first three returned items are the full shuffled
    # arrays (train + dev), not the train split, and no dev slice of the first
    # sentence array is returned. Kept as-is for callers; confirm it is
    # intended.
    return (x_shuffled1, x_shuffled2, y_shuffled, x_dev2, y_train, y_dev,
            word_embedding, max_len, vocab_size, x_test1, x_test2)
import datetime

import numpy as np
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.similarities import MatrixSimilarity
from scipy import spatial
from scipy.stats import skew, kurtosis

from CIKM.datautils import datahelper

# Locations of the unlabeled corpus, the embedding files and the stop-word list.
filepath_unlabel = "I:\\CIKM\\cikm_unlabel_spanish_train_20180516\\cikm_unlabel_spanish_train_20180516.txt"
w2v_pah = "I:\\CIKM\\w2v.model.bin"
fast_path = "I:\\CIKM\\fast_text_vectors_wiki.es.vec\\wiki.es.vec"
file_stop_word = "I:\\CIKM\\spanish_stop_word.txt"

# Load the sentence pairs of the training and test sets.
x_train1, x_train2, _, _, _, _ = datahelper.load_data(
    filepath_en_train, filepath_sp_train)
x_test1, x_test2 = datahelper.load_testdata(filepath_test)

# One frame per data set, with one column per side of the sentence pair.
train = pd.DataFrame()
train['question1'] = x_train1
train['question2'] = x_train2

test = pd.DataFrame()
test['question1'] = x_test1
test['question2'] = x_test2

# Pool every question (train first, then test) into one corpus and build the
# gensim dictionary from its space-separated tokens.
tfidf_txt = (
    train['question1'].tolist()
    + train['question2'].tolist()
    + test['question1'].tolist()
    + test['question2'].tolist()
)
train_qs = pd.Series(tfidf_txt).astype(str)
dictionary = Dictionary(sentence.split(" ") for sentence in tfidf_txt)
def _add_similarity_features(frame, left_col, right_col):
    """Add the six vector-similarity columns to *frame* in place.

    Each metric is applied row-wise to the pair of vectors stored in
    *left_col* and *right_col*. Columns are added in a fixed order so the
    CSV layout stays stable.
    """
    # Built at call time so the metric functions only need to exist when this
    # helper actually runs, not when the module is first parsed.
    metrics = (
        ('cosine1', Cosine),
        ('manhatton1', Manhatton),
        ('euclidean1', Euclidean),
        ('pearson1', PearsonSimilar),
        ('spearman1', SpearmanSimilar),
        ('kendall1', KendallSimilar),
    )
    for column, metric in metrics:
        # ``fn=metric`` pins the current metric to this lambda.
        frame[column] = frame.apply(
            lambda row, fn=metric: fn(row[left_col], row[right_col]), axis=1)


def makeFeature():
    """Build doc2vec similarity features for the train and test pairs.

    Loads both data sets, runs every sentence through ``process_data``,
    turns each one into a vector with ``doc2vec_model.get_question_vector``
    and computes six similarity/distance coefficients per sentence pair.
    The results are written to ``train_doc2vec1.csv`` and
    ``test_doc2vec1.csv`` in the current working directory.
    """
    x_train1, x_train2, _, _, _, _ = datahelper.load_data(
        filepath_en_train, filepath_sp_train)
    x_test1, x_test2 = datahelper.load_testdata(filepath_test)

    x_train1 = process_data(x_train1)
    x_train2 = process_data(x_train2)
    x_test1 = process_data(x_test1)
    x_test2 = process_data(x_test2)

    now = datetime.datetime.now()
    # Call form instead of the Python 2 print statement: same output on
    # Python 2, and no SyntaxError on Python 3.
    print(now.strftime('%Y-%m-%d %H:%M:%S'))

    print('get sentence vector')
    train = pd.DataFrame()
    test = pd.DataFrame()
    train['doc2vec_train1'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_train1
    ]
    train['doc2vec_train2'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_train2
    ]
    test['doc2vec_test1'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_test1
    ]
    test['doc2vec_test2'] = [
        doc2vec_model.get_question_vector(x, model) for x in x_test2
    ]

    print('get six kinds of coefficient about vector')
    # Train features are written before the test features are computed, so a
    # failure on the test set still leaves the train CSV on disk.
    _add_similarity_features(train, 'doc2vec_train1', 'doc2vec_train2')
    train.to_csv('train_doc2vec1.csv', index=False)

    _add_similarity_features(test, 'doc2vec_test1', 'doc2vec_test2')
    test.to_csv('test_doc2vec1.csv', index=False)