def gen_train_data():
    """Generate training data for the planner and the generator.

    For every 7-character quatrain (4 sentences x 7 chars) in the corpus:
      * per sentence, pick the segmented word with the best (lowest)
        TextRank rank that also appears in ``ranked_words`` as the
        sentence's keyword;
      * append one planner record per poem — the four keywords,
        tab-separated — destined for ``plan_data_path``;
      * append one generator record per sentence, formatted as
        ``sentence<eos>\\tkeyword\\tcontext``, destined for
        ``gen_data_path`` (``context`` is the concatenation of the
        poem's previous sentences, each followed by <eos>).

    Poems that are not quatrains, contain a non-7-char sentence, or have
    a sentence with no ranked word are skipped entirely.
    """
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        if len(poem) != 4:
            continue  # Only consider quatrains.
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            if len(sentence) != 7:  # Only consider 7-character lines.
                valid = False
                break
            # Keep only the segmented words present in the ranked list;
            # a sentence with none cannot yield a keyword.
            words = [seg for seg in segmenter.segment(sentence)
                     if seg in ranked_words]
            if not words:
                valid = False
                break
            # Keyword = word with the highest TextRank score, i.e. the
            # lowest rank number; min() keeps the first on ties, matching
            # the original strict-< scan.
            keyword = min(words, key=ranked_words.get_rank)
            gen_lines.append(sentence + end_of_sentence()
                             + '\t' + keyword + '\t' + context + '\n')
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        if valid:
            # Planner sample: the four keywords of this poem.
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    # BUGFIX: write UTF-8 explicitly — the records contain Chinese text,
    # and the platform default locale encoding (e.g. cp1252) can raise
    # UnicodeEncodeError; the corpus is read elsewhere with utf-8.
    with open(plan_data_path, 'w', encoding='utf-8') as fout:
        fout.writelines(plan_data)
    with open(gen_data_path, 'w', encoding='utf-8') as fout:
        fout.writelines(gen_data)
def gen_train_data():
    """Build planner and generator training files from 7-char quatrains.

    Each valid poem (4 sentences, 7 characters each, every sentence
    containing at least one ranked word) contributes one tab-joined
    keyword line to ``plan_data_path`` and four generator lines of the
    form ``sentence<eos>\\tkeyword\\tcontext`` to ``gen_data_path``.
    """
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        # Quatrains only.
        if len(poem) != 4:
            continue
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            # Require exactly seven characters per line.
            if len(sentence) != 7:
                break
            # Candidate keywords: segmented words that appear in the
            # ranked-word list.
            candidates = [seg for seg in segmenter.segment(sentence)
                          if seg in ranked_words]
            if not candidates:
                break
            # Best TextRank score == smallest rank number; min() keeps
            # the earliest candidate on ties, as the manual scan did.
            keyword = min(candidates, key=ranked_words.get_rank)
            record = (sentence + end_of_sentence()
                      + '\t' + keyword + '\t' + context + '\n')
            gen_lines.append(record)
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        else:
            # No break fired: every sentence passed, keep the poem.
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    with open(plan_data_path, 'w') as fout:
        fout.writelines(plan_data)
    with open(gen_data_path, 'w') as fout:
        fout.writelines(gen_data)
def gen_train_data():
    # Build training files for 5-character quatrains (jueju):
    #   * plan_data_path          — 4 tab-joined keywords per poem
    #   * keyword{1..4}.txt       — generator samples bucketed by line position
    #   * pair.txt                — (previous line -> current line) couplet pairs
    #   * nokeyword{3,4}.txt      — samples for lines 3/4 with '^' as keyword
    print("Generating training data ...")
    '''with open(r'raw/corpus.txt', 'r',encoding='utf-8') as fin:
        for line in fin.readlines()[0 : 6]:
            for sentence in split_sentences(line):
                print(' '.join(sentence))'''
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    # One bucket per sentence position (1st..4th line of each quatrain).
    gen_data = [[], [], [], []]
    gen_data_for_pair_train = []
    sentence2_without_keyword = []
    sentence4_without_keyword = []
    for poem in poems:
        if len(poem) != 4:
            continue  # Only consider quatrains.
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for i, sentence in enumerate(poem):
            if len(sentence) != 5:  # Only 5-character lines.
                valid = False
                break
            temp = ''.join(sentence)
            # Candidate keywords: every adjacent two-character pair plus
            # each single character of the sentence.
            tempList = [temp[:2], temp[1:3], temp[2:4], temp[3:5]
                        ] + sentence
            # Keep only candidates that appear in the ranked-word list
            # (in practice only the single-character ones match).
            words = list(filter(lambda seg: seg in ranked_words, tempList))
            if len(words) == 0:
                # The sentence must contain at least one ranked word.
                valid = False
                break
            keyword = words[0]
            for word in words[1:]:
                if ranked_words.get_rank(word) < ranked_words.get_rank(
                        keyword):
                    keyword = word  # Keep the best-ranked (lowest-rank) word.
            # NOTE(review): said to be dead code — ranked_words reportedly
            # holds single characters only, so a 2-char keyword never wins;
            # kept for safety. It space-separates the pair to match the
            # character-tokenized output format.
            if len(keyword) == 2:
                keyword = keyword[0] + ' ' + keyword[1]
            # Record format: sentence <eos> \t <sos> sentence \t keyword \t context
            gen_line = ' '.join(sentence) + ' ' + end_of_sentence() + \
                '\t' + start_of_sentence() + ' ' + ' '.join(sentence) + \
                '\t' + keyword + '\t' + context + '\n'
            if i == 2:
                sentence2_without_keyword.append(' '.join(sentence) + ' ' + end_of_sentence() + '\t' + start_of_sentence() + ' ' + ' '.join(sentence) +
                                                 '\t' + '^' + '\t' + context + '\n')
            if i == 3:
                sentence4_without_keyword.append(' '.join(sentence) + ' ' + end_of_sentence() + '\t' + start_of_sentence() + ' ' + ' '.join(sentence) +
                                                 '\t' + '^' + '\t' + context + '\n')
            # No-keyword record format: sentence <eos> \t <sos> sentence \t ^ \t context
            if i == 1 or i == 3:
                # Pair sample: current line conditioned only on the previous
                # line ('^' placeholders for keyword and context), for
                # couplet/parallelism training.  last_sentence is always
                # bound here because i >= 1.
                gen_line_ = ' '.join(sentence) + ' ' + end_of_sentence() + \
                    '\t' + start_of_sentence() + ' ' + ' '.join(sentence) + \
                    '\t' + '^' + '\t' + '^' + ' ' + ' '.join(last_sentence) + '\n'
                gen_data_for_pair_train.append(gen_line_)
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += ' ' + ' '.join(sentence) + ' ' + end_of_sentence()
            last_sentence = sentence
        if valid:
            # Planner sample: extract the four keywords of this poem.
            plan_data.append('\t'.join(keywords) + '\n')
            # Bucket each generator line by its position in the poem.
            for i, line in enumerate(gen_lines):
                gen_data[i].append(line)
    with open(plan_data_path, 'w') as fout:
        for line in plan_data:
            fout.write(line)
    '''with open(gen_data_path, 'w') as fout:
        pass'''
    # One keyword-conditioned file per sentence position (1-based names).
    for i in range(1, 5):
        with open(train_data_path[i] + '/' + f'keyword{i}' + '.txt', 'w') as f:
            for line in gen_data[i - 1]:
                f.write(line)
    # Previous/current sentence pairs, possibly needed for couplet training;
    # duplicated into several training directories.
    with open(train_data_path[0] + '/pair.txt', 'w') as f:
        for line in gen_data_for_pair_train:
            f.write(line)
    with open(train_data_path[2] + '/pair.txt', 'w') as f:
        for line in gen_data_for_pair_train:
            f.write(line)
    with open(train_data_path[4] + '/pair.txt', 'w') as f:
        for line in gen_data_for_pair_train:
            f.write(line)
    # Keyword-free variants of lines 3 and 4.
    with open(train_data_path[3] + '/nokeyword3.txt', 'w') as f:
        for line in sentence2_without_keyword:
            f.write(line)
    with open(train_data_path[4] + '/nokeyword4.txt', 'w') as f:
        for line in sentence4_without_keyword:
            f.write(line)