import numpy as np
# pad_sequences is assumed to be Keras' keras.preprocessing.sequence.pad_sequences;
# FLAGS, UNK_ID and datahelper come from the surrounding project.
from keras.preprocessing.sequence import pad_sequences


def process():
    x_text1, x_text2, y_train = datahelper.load_data(FLAGS.en_train, FLAGS.sp_train)
    x_text = np.concatenate([x_text1, x_text2], axis=0)

    # Build the vocabulary over both languages' sentences and load the
    # pretrained embedding matrix that goes with it.
    word2index, index2word = datahelper.create_vocabulary(x_text)
    vocab_size = len(index2word)
    word_embedding = datahelper.asign_pretrained_word_embedding()
    max_len = max([len(x.split(" ")) for x in x_text])

    # Convert each sentence to a list of vocabulary indices; unknown words
    # fall back to UNK_ID.
    x_text1_int = []
    x_text2_int = []
    for line in x_text1:
        line_list = datahelper.text_to_wordlist(line).split(" ")
        x_text1_int.append([word2index.get(x, UNK_ID) for x in line_list])
    for line in x_text2:
        line_list = datahelper.text_to_wordlist(line).split(" ")
        x_text2_int.append([word2index.get(x, UNK_ID) for x in line_list])

    # Pad both sides to the longest sentence in the corpus.
    x_train1 = pad_sequences(x_text1_int, max_len)
    x_train2 = pad_sequences(x_text2_int, max_len)

    # Shuffle with a fixed seed, then split off the last
    # dev_sample_percentage of the data as the dev set.
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_shuffled1 = x_train1[shuffle_indices]
    x_shuffled2 = x_train2[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y_train)))
    x_train1, x_dev1 = x_shuffled1[:dev_sample_index], x_shuffled1[dev_sample_index:]
    x_train2, x_dev2 = x_shuffled2[:dev_sample_index], x_shuffled2[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    del x_text1, x_text2, x_text1_int, x_text2_int
    return x_train1, x_dev1, x_train2, x_dev2, y_train, y_dev, word_embedding, max_len, vocab_size
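# A minimal usage sketch (not part of the original file), assuming FLAGS has
# already been parsed and datahelper is importable: call process() and
# inspect the resulting splits.
if __name__ == '__main__':
    (x_train1, x_dev1, x_train2, x_dev2,
     y_train, y_dev, word_embedding, max_len, vocab_size) = process()
    print("train pairs: %d, dev pairs: %d" % (len(y_train), len(y_dev)))
    print("vocab size: %d, max sentence length: %d" % (vocab_size, max_len))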
import datetime
import time
from collections import namedtuple

import gensim
import numpy as np


def process():
    x_train1, x_train2, _ = datahelper.load_data(filepath_en_train, filepath_sp_train)
    train_data = np.concatenate([x_train1, x_train2], axis=0)

    # Tokenise every sentence; stop-word filtering is handled inside
    # text_to_wordlist (remove_stop_words=True), so no separate stop list is needed.
    d2c_list = []
    for line in train_data:
        line = datahelper.text_to_wordlist(line, remove_stop_words=True,
                                           stem_words=False).split(" ")
        d2c_list.append(line)

    # Wrap each sentence as a (words, tags) record: gensim's Doc2Vec expects
    # tagged documents, and the tag 'SENT_<i>' later identifies sentence i.
    alldocuments = []
    analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    for idx, record in enumerate(d2c_list):
        qid = str('SENT_%s' % idx)
        words_text = " ".join(record)
        words = gensim.utils.simple_preprocess(words_text)
        alldocuments.append(analyzedDocument(words, [qid]))

    print("Start Training Doc2Vec Time : %s" % (str(datetime.datetime.now())))
    saved_model_name = "doc_2_vec_" + str(int(time.time()))
    # PV-DM with concatenated context windows (dm=1, dm_concat=1),
    # 300-dimensional vectors, 100 epochs.
    model_4 = gensim.models.Doc2Vec(alldocuments, dm=1, dm_concat=1,
                                    vector_size=300, window=5,
                                    min_count=2, epochs=100)
    model_4.save("%s" % (saved_model_name))
    print("model training completed : %s" % (saved_model_name))
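# A sketch (not in the original) of reusing the trained model: load the file
# written by process() above and infer a vector for an unseen sentence.
# model.dv is the gensim 4.x accessor; older releases call it model.docvecs.
import gensim

model = gensim.models.Doc2Vec.load("doc_2_vec_1527000000")  # hypothetical file name
tokens = gensim.utils.simple_preprocess("donde esta mi pedido")
vec = model.infer_vector(tokens)              # 300-dim sentence vector
print(model.dv.most_similar([vec], topn=5))   # nearest training sentences by tag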
import datetime

import pandas as pd


def makeFeature():
    x_train1, x_train2, _ = datahelper.load_data(filepath_en_train, filepath_sp_train)
    x_train1 = process_data(x_train1)
    x_train2 = process_data(x_train2)

    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))

    print('get sentence vectors')
    train = pd.DataFrame()
    train['doc2vec_train1'] = [doc2vec_model.get_question_vector(x, model) for x in x_train1]
    train['doc2vec_train2'] = [doc2vec_model.get_question_vector(x, model) for x in x_train2]

    # Six pairwise similarity measures between the two sentence vectors.
    print('get six similarity coefficients between the vectors')
    train['cosine1'] = train.apply(
        lambda x: Cosine(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['manhatton1'] = train.apply(
        lambda x: Manhatton(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['euclidean1'] = train.apply(
        lambda x: Euclidean(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['pearson1'] = train.apply(
        lambda x: PearsonSimilar(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['spearman1'] = train.apply(
        lambda x: SpearmanSimilar(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['kendall1'] = train.apply(
        lambda x: KendallSimilar(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)

    train.to_csv('train_doc2vec1.csv', index=False)
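# The six similarity helpers above live elsewhere in the repo; a plausible
# minimal implementation in terms of scipy/numpy could look like this
# (an assumption for illustration, not the repo's actual code):
from scipy.spatial.distance import cityblock, cosine, euclidean
from scipy.stats import kendalltau, pearsonr, spearmanr


def Cosine(v1, v2):
    return 1.0 - cosine(v1, v2)       # scipy returns a distance; flip to similarity


def Manhatton(v1, v2):
    return cityblock(v1, v2)          # L1 distance


def Euclidean(v1, v2):
    return euclidean(v1, v2)          # L2 distance


def PearsonSimilar(v1, v2):
    return pearsonr(v1, v2)[0]        # correlation coefficient only


def SpearmanSimilar(v1, v2):
    return spearmanr(v1, v2)[0]


def KendallSimilar(v1, v2):
    return kendalltau(v1, v2)[0]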
filepath_en_train = "I:\\CIKM\\cikm_english_train_20180516\\cikm_english_train_20180516.txt"
filepath_sp_train = "I:\\CIKM\\cikm_spanish_train_20180516.txt"
filepath_test = "I:\\CIKM\\cikm_test_a_20180516.txt"
filepath_unlabel = "I:\\CIKM\\cikm_unlabel_spanish_train_20180516\\cikm_unlabel_spanish_train_20180516.txt"
w2v_pah = "I:\\CIKM\\w2v.model.bin"
fast_path = "I:\\CIKM\\fast_text_vectors_wiki.es.vec\\wiki.es.vec"
file_stop_word = "I:\\CIKM\\spanish_stop_word.txt"

import hashlib

import pandas as pd

from text_match.en.data_utils import datahelper

x_train1, x_train2, _ = datahelper.load_data(filepath_en_train, filepath_sp_train)

train = pd.DataFrame()
test = pd.DataFrame()
train['question1'] = x_train1
train['question2'] = x_train2

# Build a graph over questions: each question (keyed by the MD5 of its text)
# points at every question it was paired with.
qid_graph = {}


def generate_qid_graph_table(row):
    hash_key1 = hashlib.md5(row["question1"].encode('utf-8')).hexdigest()
    hash_key2 = hashlib.md5(row["question2"].encode('utf-8')).hexdigest()
    qid_graph.setdefault(hash_key1, []).append(hash_key2)
    qid_graph.setdefault(hash_key2, []).append(hash_key1)
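# How the graph is typically consumed (an assumption: this mirrors the
# Quora-style "magic" feature rather than code shown here): fill the graph
# over every pair, then score a pair by how many neighbours its two
# questions share.
train.apply(generate_qid_graph_table, axis=1)


def q_neighbor_intersect(row):
    h1 = hashlib.md5(row["question1"].encode('utf-8')).hexdigest()
    h2 = hashlib.md5(row["question2"].encode('utf-8')).hexdigest()
    return len(set(qid_graph[h1]) & set(qid_graph[h2]))


train['qid_intersect'] = train.apply(q_neighbor_intersect, axis=1)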
"I:\\nlp_semantics\\text_match\\en\\features\\word2ve\\train_weight_tfidf.csv" ) train_w2v.drop(['question1', 'question2', 'q1_unique', 'q2_unique'], axis=1, inplace=True) train_doc2vec = pd.read_csv( "I:\\nlp_semantics\\text_match\\en\\features\\doc2vec\\train_doc2vec1.csv") train_doc2vec.drop(['doc2vec_train1', 'doc2vec_train2'], axis=1, inplace=True) train = pd.concat([ train_bag, train_magic1, train_magic2, train_freq, train_ngram, train_simple, train_weight, train_page, train_w2v, train_doc2vec ], axis=1) _, _, y_train = datahelper.load_data(filepath_en_train, filepath_sp_train) import numpy as np sums = np.sum(y_train, axis=0) print(sums / len(y_train)) from sklearn.model_selection import train_test_split x_train, x_dev, y_train, y_dev = train_test_split(train.values, y_train, test_size=0.1, random_state=0) import lightgbm as lgb train_input = lgb.Dataset(x_train, y_train)