import math
import pickle

import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Seq2seqCore, data_indexer, and dictionary_generator are project-local helpers
# assumed to be defined or imported elsewhere in this package.


def s2s_end2end(filename):
    # Load the dictionaries and pretrained embeddings produced by s2s_split.
    pkl_name = "data/split/20180405.pkl"
    (token_idx_dict, idx_token_dict, ontology_idx_dict, idx_ontology_dict,
     pretrained_dict, train_token, test_token, train_ontology,
     test_ontology) = pickle.load(open(pkl_name, "rb"))

    s2s_core = Seq2seqCore(gpu_device=0,
                           encoder_vocab_size=len(token_idx_dict),
                           decoder_vocab_size=len(ontology_idx_dict))

    # Tokenize the raw sentences; feeder_generator does the indexing and padding.
    raw_inputs = list()
    for one_line in open(filename):
        one_line = one_line.strip().lower()
        assert len(one_line.split("\t")) == 2
        one_line = one_line.split("\t")[0]
        raw_inputs.append(word_tokenize(one_line))
    input_dict = feeder_generator(raw_inputs, token_idx_dict, pretrained_dict)

    # Decode with every stored checkpoint and write the predicted triples.
    for epoch_number in range(1, 300):
        print(epoch_number)
        s2s_core.load("models/20180329/pso/%05d.tfmodel" % epoch_number)
        predict_result = s2s_core.predict(input_dict)
        predict_list = predict_result.sample_id[:, 0:3].tolist()
        f_w = open("data/output/pso_attention_%03d.txt" % epoch_number, "w")
        for one_predict_result in predict_list:
            one_predict_tuple = data_indexer(one_predict_result, idx_ontology_dict)
            f_w.write("%s\n" % (" ".join(one_predict_tuple)))
        f_w.close()
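# The end-to-end input file is tab-separated, one example per line. This is an
# illustrative sketch of the expected layout (the content shown is made up):
#
#     the eiffel tower is located in paris<TAB>subject predicate object
#
# Column 0 is the raw sentence that is tokenized and fed to the encoder;
# column 1 holds the space-separated ontology triple. Only column 0 is used at
# prediction time.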
def feeder_generator(raw_inputs, token_idx_dict, pretrained_dict):
    result_dict = dict()

    # Longest tokenized sentence in the batch, used for padding.
    max_length = 0
    for one_entry in raw_inputs:
        if len(one_entry) > max_length:
            max_length = len(one_entry)

    x_data = list()
    pretrained_x_data = list()
    x_length = list()
    y_length = list()
    for one_entry in raw_inputs:
        # Look up the pretrained vector for every token; unknown tokens get zeros.
        one_pretrained_x = list()
        for one_original_token in one_entry:
            try:
                one_pretrained_x.append(pretrained_dict[one_original_token])
            except KeyError:
                one_pretrained_x.append(np.zeros([200], dtype=np.float32))
        x_length.append(len(one_entry))
        # Index the tokens and pad every sequence to max_length + 2.
        x_data.append(data_indexer(one_entry, token_idx_dict) +
                      [0] * (max_length + 2 - len(one_entry)))
        pretrained_x_data.append(
            np.concatenate((np.array(one_pretrained_x, dtype=np.float32),
                            np.zeros([max_length + 2 - len(one_entry), 200],
                                     dtype=np.float32)),
                           axis=0))
        # A fixed decoder length is used for every example.
        y_length.append(5)

    result_dict["encoder_length"] = np.array(x_length, dtype=np.int32)
    result_dict["decoder_length"] = np.array(y_length, dtype=np.int32)
    result_dict["encoder_input"] = np.array(x_data, dtype=np.int32)
    result_dict["encoder_pretrained"] = np.array(pretrained_x_data, dtype=np.float32)
    return result_dict
def s2s_split(file_name):
    # Load the output of s2s_preprocess.
    (_, _, ontology_idx_dict, _, pretrained_dict, _, _, raw_sentences,
     ontology_results) = pickle.load(open(file_name, "rb"))

    test_ratio = 0.0014
    test_size = int(math.floor(len(raw_sentences) * test_ratio))

    # Re-sample the train/test split until the training portion covers the full
    # ontology vocabulary, so that no test label is out of vocabulary.
    get_out_flag = False
    while not get_out_flag:
        randomized_idx = np.random.permutation(len(raw_sentences))
        train_split_idx = randomized_idx[test_size:]
        train_raw_ontology = list()
        for one_idx in train_split_idx:
            train_raw_ontology.append(ontology_results[one_idx])
        ontology_voc_list = list()
        for one_ontology_result in train_raw_ontology:
            for one_ontology in one_ontology_result:
                ontology_voc_list.append(one_ontology)
        ontology_voc_list = list(set(ontology_voc_list))
        new_ontology_idx_dict, _ = dictionary_generator(ontology_voc_list, oov_flag=False)
        print(len(new_ontology_idx_dict))
        print(len(ontology_idx_dict))
        if len(new_ontology_idx_dict) == len(ontology_idx_dict):
            get_out_flag = True

    # Rebuild the token and ontology dictionaries from the training split only.
    train_raw_token = list()
    for one_idx in train_split_idx:
        train_raw_token.append(raw_sentences[one_idx])
    token_voc_list = list()
    for one_raw_sentence in train_raw_token:
        for one_token in one_raw_sentence:
            token_voc_list.append(one_token)
    token_voc_list = list(set(token_voc_list))
    token_idx_dict, idx_token_dict = dictionary_generator(token_voc_list, eos_flag=False)
    ontology_idx_dict, idx_ontology_dict = dictionary_generator(
        ontology_voc_list, oov_flag=False)

    # Index the training and test sentences.
    train_token = list()
    for one_raw_sentence in train_raw_token:
        train_token.append(data_indexer(one_raw_sentence, token_idx_dict))
    test_raw_token = list()
    for one_idx in randomized_idx[:test_size]:
        test_raw_token.append(raw_sentences[one_idx])
    test_token = list()
    for one_raw_sentence in test_raw_token:
        test_token.append(data_indexer(one_raw_sentence, token_idx_dict))

    # Index the training and test ontology triples.
    train_ontology = list()
    for one_raw_ontology in train_raw_ontology:
        train_ontology.append(data_indexer(one_raw_ontology, ontology_idx_dict))
    test_raw_ontology = list()
    for one_idx in randomized_idx[:test_size]:
        test_raw_ontology.append(ontology_results[one_idx])
    test_ontology = list()
    for one_raw_ontology in test_raw_ontology:
        test_ontology.append(data_indexer(one_raw_ontology, ontology_idx_dict))

    split_data = (token_idx_dict, idx_token_dict, ontology_idx_dict,
                  idx_ontology_dict, pretrained_dict, train_token, test_token,
                  train_ontology, test_ontology)
    pickle.dump(split_data, open("data/split/20180331.pkl", "wb"))
def s2s_preprocess(train_file_name, test_file_name):
    raw_sentences = list()
    ontology_results = list()
    max_length = 0

    # Parse the training file: each line is "<sentence>\t<three ontology tokens>".
    for one_line in open(train_file_name):
        one_line = one_line.strip()
        print(one_line)
        if len(one_line.split("\t")) != 2:
            continue
        raw_sentence = one_line.split("\t")[0]
        ontology_string = one_line.split("\t")[1]
        tokenized_list = word_tokenize(raw_sentence)
        if len(tokenized_list) > max_length:
            max_length = len(tokenized_list)
        ontology_tuple = ontology_string.split()
        if len(ontology_tuple) != 3:
            continue
        raw_sentences.append(tokenized_list)
        ontology_results.append(ontology_tuple)

    # Build the token and ontology vocabularies and their index dictionaries.
    token_voc_list = list()
    ontology_voc_list = list()
    for one_raw_sentence in raw_sentences:
        for one_token in one_raw_sentence:
            token_voc_list.append(one_token)
    token_voc_list = list(set(token_voc_list))
    for one_ontology_result in ontology_results:
        for one_ontology in one_ontology_result:
            ontology_voc_list.append(one_ontology)
    ontology_voc_list = list(set(ontology_voc_list))
    token_idx_dict, idx_token_dict = dictionary_generator(token_voc_list, eos_flag=False)
    ontology_idx_dict, idx_ontology_dict = dictionary_generator(ontology_voc_list,
                                                                oov_flag=False)

    # Index the sentences and ontology triples.
    token_store_data = list()
    for one_raw_sentence in raw_sentences:
        token_store_data.append(data_indexer(one_raw_sentence, token_idx_dict))
    ontology_store_data = list()
    for one_ontology_result in ontology_results:
        ontology_store_data.append(data_indexer(one_ontology_result, ontology_idx_dict))

    # Collect pretrained Word2Vec vectors for every token that appears in the
    # training or test file.
    pretrained_dict = dict()
    print("Loading pretrained Word2Vec model ...")
    w2v_embedding_path = "data/w2v/wiki20170101"
    w2v_model = Word2Vec.load(w2v_embedding_path)
    for one_file_name in (train_file_name, test_file_name):
        for one_line in open(one_file_name):
            one_line = one_line.strip()
            if len(one_line.split("\t")) != 2:
                continue
            raw_sentence = one_line.split("\t")[0]
            tokenized_list = word_tokenize(raw_sentence)
            for one_token in tokenized_list:
                if one_token not in w2v_model.wv.vocab:
                    continue
                pretrained_dict[one_token] = w2v_model.wv[one_token]

    processed_data = (token_idx_dict, idx_token_dict, ontology_idx_dict,
                      idx_ontology_dict, pretrained_dict, token_store_data,
                      ontology_store_data, raw_sentences, ontology_results)
    pickle.dump(processed_data, open("data/preprocessed/20180405.pkl", "wb"))
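
# Hypothetical driver sketch, not part of the original pipeline: it simply runs
# the three functions above in order, with placeholder file names, and assumes
# the hard-coded pickle and checkpoint paths inside those functions point at
# the intended files.
if __name__ == "__main__":
    # 1. Tokenize, index, and collect pretrained vectors for the raw data.
    s2s_preprocess("data/raw/train.txt", "data/raw/test.txt")
    # 2. Split the preprocessed data into train/test and rebuild the dictionaries.
    s2s_split("data/preprocessed/20180405.pkl")
    # 3. Decode the test file with every stored checkpoint.
    s2s_end2end("data/raw/test.txt")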