def generate_dic_and_corpus(filepath, stop_words):  # stop_words is a list of words, not a file path
    raw_questions, raw_answers, raw_questions_passages = load_data_text(filepath)
    q_length = len(raw_questions)  # number of questions
    knowledge_texts = []  # knowledge = question + passages
    questions_str = tokenizer(raw_questions, stop_words)
    # answers_str = tokenizer(raw_answers, stop_words)
    for idx in range(len(raw_questions_passages)):
        # pass_text = []  # for each passage of a question, put all the passages' words together
        # for passage in raw_questions_passages[idx]:
        #     pass_text += passage
        temp = []
        q_kb = tokenizer(raw_questions_passages[idx], stop_words,
                         remove_stopwords=True, remove_single_word=False)
        for kp in q_kb:
            temp += kp
        temp += questions_str[idx]
        knowledge_texts.append(temp)
    # dictionary over the knowledge (questions + their passages)
    dictionary = corpora.Dictionary(knowledge_texts)
    dictionary.save(os.path.join(CUR_PATH, 'tmp/dictionary.dict'))
    corpus = [dictionary.doc2bow(text) for text in knowledge_texts]  # bag-of-words corpus of the knowledge
    corpora.MmCorpus.serialize(os.path.join(CUR_PATH, 'tmp/knowledge_corpus.mm'), corpus)
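# The tokenizer helper called above is not shown in this file. Below is a minimal
# jieba-based sketch that matches the call signature used here; the jieba
# dependency, the body, and the exact filtering rules are assumptions, not the
# project's actual implementation.
import jieba


def tokenizer(texts, stop_words, remove_stopwords=True, remove_single_word=False):
    """Hypothetical sketch: segment each text and filter its tokens."""
    results = []
    for text in texts:
        words = jieba.lcut(text)  # segment the (Chinese) text into words
        if remove_stopwords:
            words = [w for w in words if w not in stop_words]
        if remove_single_word:
            words = [w for w in words if len(w) > 1]  # drop single-character tokens
        results.append(words)
    return results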
def generate_dic_and_corpus(knowledge_file, file_name, stop_words):
    knowledge_texts = tokenizer(knowledge_file, stop_words)
    train_texts = tokenizer(file_name, stop_words)
    # dictionary over the knowledge and the training data
    dictionary = corpora.Dictionary(knowledge_texts + train_texts)
    dictionary.save(os.path.join('tmp/dictionary.dict'))
    corpus = [dictionary.doc2bow(text) for text in knowledge_texts]  # bag-of-words corpus of the knowledge
    corpora.MmCorpus.serialize('tmp/knowledge_corpus.mm', corpus)
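# Loading the artifacts that generate_dic_and_corpus saves back from disk
# (a minimal usage sketch; the 'tmp/' paths simply mirror the ones used above):
from gensim import corpora

dictionary = corpora.Dictionary.load('tmp/dictionary.dict')  # id <-> token mapping
knowledge_corpus = corpora.MmCorpus('tmp/knowledge_corpus.mm')  # streamed bag-of-words corpus
print(len(dictionary), len(knowledge_corpus))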
def generate_dic_and_corpus(knowledge_file, file_name, stop_words):
    knowledge_texts = tokenizer(knowledge_file, stop_words)
    train_texts = tokenizer(file_name, stop_words)
    # save the dictionary
    if not os.path.exists('./tmp'):
        os.makedirs('./tmp')
    dictionary = corpora.Dictionary(knowledge_texts + train_texts)
    dictionary.save(os.path.join('./tmp/dictionary.dict'))
    corpus = [dictionary.doc2bow(text) for text in knowledge_texts]  # corpus of knowledge
    corpora.MmCorpus.serialize('./tmp/knowledge_corpus.mm', corpus)  # TODO: what does this method do?
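# One common way to rank knowledge entries against a tokenized query using the
# saved dictionary/corpus is gensim's TF-IDF model plus a similarity index.
# This is only a sketch of that approach; it is not necessarily how the
# project's topk_sim_ix (see below) is implemented.
from gensim import corpora, models, similarities

dictionary = corpora.Dictionary.load('./tmp/dictionary.dict')
corpus = corpora.MmCorpus('./tmp/knowledge_corpus.mm')
tfidf = models.TfidfModel(corpus)  # reweight raw counts by TF-IDF
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary))

query_bow = dictionary.doc2bow(['地球', '行星'])  # an already-tokenized example query
sims = index[tfidf[query_bow]]  # cosine similarity against every knowledge document
top5 = sorted(enumerate(sims), key=lambda item: -item[1])[:5]  # top-5 (index, score) pairs
print(top5)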
import sys
import numpy as np
import tensorflow as tf  # needed below for tf.io / tf.train / tf.data
import get_config
import data_util
import gru

gConfig = get_config.get_config()
train_data = gConfig['train_data']
test_data = gConfig['test_data']
epochs = gConfig['epochs']
batch_size = gConfig['batch_size']

x_array, y_array = data_util.create_data(train_data)
a_array, b_array = data_util.create_data(test_data)
x_array, lang_tokenizer = data_util.tokenizer(x_array, 'UNK', 0)
y_array = data_util.padding_target(y_array, gConfig['max_inp'])
y_array = np.expand_dims(y_array, 2)
print(x_array.shape)
print(y_array.shape)


def train():
    print('Training data in %s' % gConfig['train_data'])
    checkpoint_dir = gConfig['model_data']
    steps_per_epoch = len(x_array) // gConfig['batch_size']
    ckpt = tf.io.gfile.exists(checkpoint_dir)
    if ckpt:
        # resume from the latest checkpoint if one exists
        gru.checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
    BUFFER_SIZE = len(x_array)
    dataset = tf.data.Dataset.from_tensor_slices(
            key=lambda item: -item[1])[:k]
        ]  # indices of the top-k most similar knowledge entries
        sim_ixs.append(sim_ix)
        tmp.clear()
    with open(sim_path, "wb") as f:
        pickle.dump(sim_ixs, f)
    return sim_ixs


# module test
if __name__ == '__main__':
    stop_words_ = codecs.open("data/stop_words.txt", 'r', encoding='utf8').readlines()
    stop_words_ = [w.strip() for w in stop_words_]
    generate_dic_and_corpus("data/knowledge.txt", "data/train.txt", stop_words_)
    res = topk_sim_ix("data/train.txt", stop_words_, 5)
    print(len(res))

# Interactive exploration of the tokenized knowledge and the dictionary:
knowledge_file = "data/knowledge.txt"
file_name = "data/train.txt"
knowledge_texts = tokenizer(knowledge_file, stop_words_)
knowledge_texts[0]
# ['地球', '宇宙', '中', '一颗', '行星', '运动', '规律']
dictionary.doc2bow(knowledge_texts[0])
# doc2bow converts a document into the bag-of-words (BoW) format: a list of (token_id, token_count) tuples.
# [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]
dictionary.doc2bow(knowledge_texts[10])
# ['蟹', '状', '星云', '金牛座', '一团', '膨胀', '气体']
# [(27, 1), (51, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1)]
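# The top-k selection pattern used in the topk_sim_ix fragment above, shown on
# a tiny concrete example (the scores here are made up for illustration):
scores = [0.10, 0.75, 0.30, 0.92, 0.05]
k = 3
topk = [i for i, _ in sorted(enumerate(scores), key=lambda item: -item[1])[:k]]
print(topk)  # [3, 1, 2] -- indices of the three highest-scoring documents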