def _fill_np_matrix(self, texts, keyword=True):
    max_time = max(map(len, texts))
    if keyword:
        # One keyword each, 64 keywords in total for 16 poems.
        matrix = np.zeros([_BATCH_SIZE, max_time, CHAR_VEC_DIM],
                          dtype=np.float32)
        # Pre-fill every slot with the <END> vector, then overwrite the
        # prefix of each row with the actual character vectors.
        for i in range(_BATCH_SIZE):
            for j in range(max_time):
                matrix[i, j, :] = self.planner.get_vect(end_of_sentence())
        for i, text in enumerate(texts):
            matrix[i, :len(text)] = self.planner.get_vects(text)
    else:
        matrix = np.zeros([_BATCH_SIZE, max_time, CHARPIN_VEC_DIM],
                          dtype=np.float32)
        for i in range(_BATCH_SIZE):
            for j in range(max_time):
                matrix[i, j, :] = self.char2vec.get_vect(end_of_sentence())
        for i, text in enumerate(texts):
            matrix[i, :len(text)] = self.char2vec.get_vects(text)
    # Unused batch slots get sequence length 0.
    seq_length = [len(texts[i]) if i < len(texts) else 0
                  for i in range(_BATCH_SIZE)]
    return matrix, seq_length
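
# The padding scheme above is easiest to see on toy data: every slot is
# pre-filled with the <END> vector, then the prefix of each row is overwritten
# with the real character vectors. A minimal, self-contained sketch; the
# 4-dim vectors, texts, and batch size below are invented for illustration:
import numpy as np

def _demo_fill_matrix():
    batch_size, vec_dim = 4, 4
    texts = ['ab', 'a']                              # hypothetical inputs
    end_vect = np.full(vec_dim, -1.0)                # stand-in <END> vector
    char_vects = {'a': np.ones(vec_dim), 'b': 2 * np.ones(vec_dim)}
    max_time = max(map(len, texts))
    matrix = np.tile(end_vect, (batch_size, max_time, 1)).astype(np.float32)
    for i, text in enumerate(texts):
        matrix[i, :len(text)] = [char_vects[ch] for ch in text]
    seq_length = [len(texts[i]) if i < len(texts) else 0
                  for i in range(batch_size)]
    return matrix, seq_length                        # (4, 2, 4), [2, 1, 0, 0]
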
def gen_train_data(): """获取每一句的keywords,拼起来写入文件""" print("Generating training data ...") segmenter = Segmenter() poems = Poems() ranked_words = RankedWords() gen_data = list() plan_data = list() valid = True counter_line = 0 print('len(poems)==>', len(poems)) for poem in poems: # print(len(poem)) if len(poem) != 4: # print(poem) valid = False continue context = start_of_sentence() keywords = list() for sentence in poem: counter_line += 1 keyword = '' if len(sentence) != 7: valid = False break filterwords = list( filter(lambda x: x in ranked_words, segmenter.segment(sentence))) if filterwords: keyword = filterwords[0] for word in filterwords: # print('word==>',word) if ranked_words.get_rank(word) < ranked_words.get_rank( keyword): keyword = word if keyword: gen_line = sentence + end_of_sentence() + \ '\t' + keyword + '\t' + context + '\n' keywords.append(keyword) gen_data.append(gen_line) context += sentence + end_of_sentence() plan_data.append(' '.join(keywords)) with open(plan_data_path, 'w') as fw: for data_iter in gen_data: fw.write(data_iter + '\n') with open(gen_data_path, 'w') as fw: for data_iter in gen_data: fw.write(data_iter) print('counter_line==>', counter_line) del segmenter, poems, ranked_words
def generate(funcutils, generator, keywords):
    assert NUM_OF_SENTENCES == len(keywords)
    pron_dict = PronDict()
    saver = tf.train.Saver()
    context = start_of_sentence()
    with tf.Session() as session:
        flag_trained = generator.initialize_session(session, saver)
        if not flag_trained:
            print("Please train the model first! (./train.py -g)")
            sys.exit(1)
        for keyword in keywords:
            # The keyword is tiled across the batch: every row of
            # keyword_data is identical, so any row of the output works.
            keyword_data, keyword_length = funcutils.fill_np_matrix(
                [keyword] * _BATCH_SIZE)
            context_data, context_length = funcutils.fill_np_matrix(
                [context] * _BATCH_SIZE)
            char = start_of_sentence()
            for _ in range(7):
                decoder_input, decoder_input_length = \
                    funcutils.fill_np_matrix([char])
                encoder_feed_dict = {
                    generator.keywords: keyword_data,
                    generator.length_keywords: keyword_length,
                    generator.context: context_data,
                    generator.context_length: context_length,
                    generator.sequence_decoder: decoder_input,
                    generator.length_decoder: decoder_input_length
                }
                # Feed the previous decoder state for every step after the first.
                if char != start_of_sentence():
                    encoder_feed_dict[generator.initial_decode_state] = state
                probs, state = session.run(
                    [generator.logits, generator.decoder_final_state],
                    feed_dict=encoder_feed_dict)
                prob_list = _gen_prob_list(funcutils, probs, context, pron_dict)
                # Greedy decoding: always pick the most probable character.
                id_probmax = np.argmax(prob_list, axis=0)
                char = funcutils.char_dict.id2char(id_probmax)
                context += char
            context += end_of_sentence()
    return context[1:].split(end_of_sentence())
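
# Greedy decoding, as used above, takes the argmax of the adjusted probability
# list at every step, so a given keyword and context always produce the same
# characters. A minimal sketch; the probabilities and id2char mapping are
# invented:
import numpy as np

def _demo_greedy_pick():
    prob_list = np.array([0.2, 0.5, 0.3])
    id2char = lambda i: 'abc'[i]                 # stand-in for char_dict.id2char
    return id2char(int(np.argmax(prob_list)))    # -> 'b', deterministically
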
def gen_train_data():
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        if len(poem) != 4:  # Only consider quatrains.
            continue
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            if len(sentence) != 7:  # Only consider seven-character lines.
                valid = False
                break
            # Collect the words of this sentence that appear in the ranked
            # words list; ignore everything else.
            words = list(
                filter(lambda seg: seg in ranked_words,
                       segmenter.segment(sentence)))
            if len(words) == 0:
                valid = False
                break
            # Of all ranked words in this sentence, keep the one with the
            # highest TextRank score (i.e. the lowest rank value).
            keyword = words[0]
            for word in words[1:]:
                if ranked_words.get_rank(word) < ranked_words.get_rank(keyword):
                    keyword = word
            gen_line = sentence + end_of_sentence() + \
                    '\t' + keyword + '\t' + context + '\n'
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        if valid:
            # Plan data: each line holds the four keywords of one poem.
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    with open(plan_data_path, 'w') as fout:
        for line in plan_data:
            fout.write(line)
    with open(gen_data_path, 'w') as fout:
        for line in gen_data:
            fout.write(line)
def generate(self, keywords):
    if NUM_OF_SENTENCES < len(keywords):
        keywords = keywords[:NUM_OF_SENTENCES]
    pron_dict = PronDict()
    context = start_of_sentence()
    with tf.Session() as session:
        self._initialize_session(session)
        if not self.trained:
            print("Please train the model first! (./train.py -g)")
            sys.exit(1)
        for keyword in keywords:
            keyword_data, keyword_length = self._fill_np_matrix(
                [keyword] * _BATCH_SIZE)
            context_data, context_length = self._fill_np_matrix(
                [context] * _BATCH_SIZE)
            char = start_of_sentence()
            for _ in range(7):
                decoder_input, decoder_input_length = \
                    self._fill_np_matrix([char])
                encoder_feed_dict = {
                    self.keyword: keyword_data,
                    self.keyword_length: keyword_length,
                    self.context: context_data,
                    self.context_length: context_length,
                    self.decoder_inputs: decoder_input,
                    self.decoder_input_length: decoder_input_length
                }
                # Feed the previous decoder state for every step after the first.
                if char != start_of_sentence():
                    encoder_feed_dict[self.decoder_init_state] = state
                probs, state = session.run(
                    [self.probs, self.decoder_final_state],
                    feed_dict=encoder_feed_dict)
                prob_list = self._gen_prob_list(probs, context, pron_dict)
                # Sample the next character from the cumulative distribution.
                prob_sums = np.cumsum(prob_list)
                rand_val = prob_sums[-1] * random()
                for i, prob_sum in enumerate(prob_sums):
                    if rand_val < prob_sum:
                        char = self.char_dict.int2char(i)
                        break
                context += char
            context += end_of_sentence()
    return context[1:].split(end_of_sentence())
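
# Unlike the greedy variant, the loop above draws the next character from the
# cumulative distribution, so repeated runs can produce different poems for
# the same keywords. The trick also works when prob_list is not normalized,
# because rand_val is scaled by the total mass. A minimal sketch with invented
# probabilities:
import numpy as np
from random import random

def _demo_sample_char():
    prob_list = [0.2, 0.5, 0.3]
    int2char = lambda i: 'abc'[i]     # stand-in for char_dict.int2char
    prob_sums = np.cumsum(prob_list)
    rand_val = prob_sums[-1] * random()
    for i, prob_sum in enumerate(prob_sums):
        if rand_val < prob_sum:
            return int2char(i)
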
def generate(self, keywords):
    if not tf.train.get_checkpoint_state(save_dir):
        print("Please train the model first! (./train.py -g)")
        sys.exit(1)
    self.checkpoint.restore(self.manager.latest_checkpoint)
    print("Checkpoint is loaded successfully !")
    assert NUM_OF_SENTENCES == len(keywords)
    context = start_of_sentence()
    pron_dict = PronDict()
    for keyword in keywords:
        keyword_data, keyword_length = self._fill_np_matrix(
            [keyword] * _BATCH_SIZE)
        context_data, context_length = self._fill_np_matrix(
            [context] * _BATCH_SIZE)
        keyword_state, context_output, final_output, final_state, \
            context_state = self.encoder(keyword_data, context_data)
        char = start_of_sentence()
        for _ in range(7):
            decoder_input, decoder_input_length = \
                self._fill_np_matrix([char])
            # After the first step, carry the decoder state forward.
            if char != start_of_sentence():
                keyword_state = final_state
            probs, final_state, _ = self.decoder(
                keyword_state, context_output, decoder_input,
                decoder_input_length, final_output, final_state,
                context_state)
            prob_list = self._gen_prob_list(probs, context, pron_dict)
            # Sample the next character from the cumulative distribution.
            prob_sums = np.cumsum(prob_list)
            rand_val = prob_sums[-1] * random()
            for i, prob_sum in enumerate(prob_sums):
                if rand_val < prob_sum:
                    char = self.char_dict.int2char(i)
                    break
            context += char
        context += end_of_sentence()
    return context[1:].split(end_of_sentence())
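
# The restore call above follows TF2's object-based checkpointing. A minimal
# sketch of how such a checkpoint/manager pair is typically wired up; the
# `encoder`, `decoder`, and `save_dir` names here are assumptions for
# illustration, not necessarily this repository's attributes:
import tensorflow as tf

def _demo_build_checkpoint(encoder, decoder, save_dir):
    checkpoint = tf.train.Checkpoint(encoder=encoder, decoder=decoder)
    manager = tf.train.CheckpointManager(checkpoint, save_dir, max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
    return checkpoint, manager
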
def _fill_np_matrix(self, texts):
    max_time = max(map(len, texts))  # Length of the longest text.
    matrix = np.zeros([_BATCH_SIZE, max_time, CHAR_VEC_DIM],
                      dtype=np.float32)
    # Pre-fill with the <END> vector, then overwrite each row's prefix.
    for i in range(_BATCH_SIZE):
        for j in range(max_time):
            matrix[i, j, :] = self.char2vec.get_vect(end_of_sentence())
    for i, text in enumerate(texts):
        matrix[i, :len(text)] = self.char2vec.get_vects(text)
    # Unused batch slots get sequence length 0.
    seq_length = [len(texts[i]) if i < len(texts) else 0
                  for i in range(_BATCH_SIZE)]
    return matrix, seq_length
def fill_np_matrix(self, texts):
    maxlen = max(map(len, texts))
    matrix = np.zeros(shape=[_BATCH_SIZE, maxlen, _NUM_UNITS])
    # Pre-fill with the <END> vector, then overwrite each row's prefix.
    for i in range(_BATCH_SIZE):
        for j in range(maxlen):
            matrix[i, j, :] = self.char2vec.get_vect(end_of_sentence())
    for index, text in enumerate(texts):
        matrix[index, :len(text)] = self.char2vec.get_vects(text)
    # Unused batch slots get sequence length 0.
    lens_seq = [
        len(texts[index]) if index < len(texts) else 0
        for index in range(_BATCH_SIZE)
    ]
    return matrix, np.array(lens_seq)
def generate(self, keywords):
    # One extra keyword is passed in: it is popped off as a random hint below.
    assert NUM_OF_SENTENCES + 1 == len(keywords)
    pron_dict = PronDict()
    context = start_of_sentence()
    with tf.Session() as session:
        self._initialize_session(session)
        if not self.trained:
            print("Please train the model first! (./train.py -g)")
            sys.exit(1)
        # Iterate through the keywords, one per sentence. A random hint is
        # prepended to the first sentence's context so that the same keywords
        # do not always generate the same poem.
        hint = keywords.pop(randrange(len(keywords)))
        first_line = True
        for keyword in keywords:
            if first_line:
                context += hint
                first_line = False
            keyword_data, keyword_length = self._fill_np_matrix(
                [keyword] * _BATCH_SIZE)
            context_data, context_length = self._fill_np_matrix(
                [context] * _BATCH_SIZE)
            char = start_of_sentence()
            word_count = 0
            state = ''  # Placeholder before the first decoder step.
            while word_count < 7:
                prob_list, state = self._compute_prob_list(
                    char, keyword_data, keyword_length,
                    context_data, context_length, context,
                    state, session, pron_dict)
                if word_count == 0:
                    # First character: randomly sample BEAM_SIZE candidates
                    # and keep the one with the best score, so different runs
                    # generate different poems for the same keywords.
                    prob_sums = np.cumsum(prob_list)
                    char_array = ['' for _ in range(BEAM_SIZE)]
                    score_array = [1 for _ in range(BEAM_SIZE)]
                    for i in range(BEAM_SIZE):
                        rand_val = prob_sums[-1] * random()
                        for j, prob_sum in enumerate(prob_sums):
                            if rand_val < prob_sum:
                                char_array[i] = self.char_dict.int2char(j)
                                score_array[i] *= -math.log(prob_list[j])
                                break
                    # Scores are built from negative log probabilities, so the
                    # minimum score marks the most probable candidate.
                    min_value = 1000
                    min_index = 0
                    for k in range(len(score_array)):
                        if score_array[k] < min_value:
                            min_index = k
                            min_value = score_array[k]
                    char = char_array[min_index]
                    context += char
                    word_count += 1
                    # End of the first character.
                else:
                    # Beam search over pairs of characters.
                    char_array = ['' for _ in range(BEAM_SIZE)]
                    second_char_array = ['' for _ in range(BEAM_SIZE)]
                    score_array = [1 for _ in range(BEAM_SIZE)]
                    # Choose the BEAM_SIZE most probable first characters.
                    for i in range(BEAM_SIZE):
                        char_array[i], score, used_index = \
                            self._return_n_most_likely(prob_list, i + 1)
                        score_array[i] *= score
                        # Make sure the same character is not selected again.
                        prob_list[used_index] = 0
                    # For each candidate, choose the most probable follow-up.
                    for i in range(BEAM_SIZE):
                        current_context = context + char_array[i]
                        prob_list, state = self._compute_prob_list(
                            char_array[i], keyword_data, keyword_length,
                            context_data, context_length, current_context,
                            state, session, pron_dict)
                        second_char_array[i], score, used_index = \
                            self._return_n_most_likely(prob_list, 1)
                        # Re-pick the second character until it neither repeats
                        # the first one nor any character already in context.
                        random_sample = second_char_array[i]
                        used_chars = set(ch for ch in context)
                        tmp = 2
                        while (random_sample == char_array[i]
                               or random_sample in used_chars):
                            second_char_array[i], score, used_index = \
                                self._return_n_most_likely(prob_list, tmp)
                            random_sample = second_char_array[i]
                            tmp += 1
                        score_array[i] *= score
                    # Negative log probabilities again: minimum score is best.
                    min_value = 1000
                    min_index = 0
                    for i in range(len(score_array)):
                        if score_array[i] < min_value:
                            min_index = i
                            min_value = score_array[i]
                    # Skip candidates whose first character was already used,
                    # to avoid generating the same character again and again.
                    used_chars = set(ch for ch in context)
                    first_char = char_array[min_index]
                    in_loop = 0
                    while first_char in used_chars and in_loop < len(char_array):
                        score_array[min_index] = 1000
                        min_value = 1000
                        for i in range(len(score_array)):
                            # Find the minimum among the remaining candidates.
                            if score_array[i] < min_value:
                                min_index = i
                                min_value = score_array[i]
                        first_char = char_array[min_index]
                        in_loop += 1
                    first_char = char_array[min_index]
                    second_char = second_char_array[min_index]
                    context += first_char
                    context += second_char
                    char = second_char
                    word_count += 2
            # Append the <END> label.
            context += end_of_sentence()
        # Remove the extra hint from the front of the context.
        context = context[0] + context[len(hint) + 1:]
        return context[1:].split(end_of_sentence())
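
# The beam scoring above multiplies negative log probabilities per candidate
# pair and keeps the minimum, i.e. the most probable pair. A minimal sketch
# mirroring that arithmetic; the pairs and probabilities are invented:
import math

def _demo_beam_score():
    pairs = {('一', '片'): (0.4, 0.5), ('山', '水'): (0.1, 0.2)}
    scores = {pair: -math.log(p1) * -math.log(p2)
              for pair, (p1, p2) in pairs.items()}
    return min(scores, key=scores.get)   # -> ('一', '片'), the likelier pair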