def _gen_train_data():
    """Build keyword-annotated training data from popular quatrains.

    For every 4-line poem, segment each sentence and pick the segment with
    the lowest (best) TextRank value as that line's keyword.  Poems where
    any line yields no ranked segment are skipped entirely.  Writes
    tab-separated ``sentence\\tkeyword`` rows to ``train_path`` and one
    keyword row per poem to ``kw_train_path``.
    """
    segmenter = Segmenter()
    poems = get_pop_quatrains()
    random.shuffle(poems)
    ranks = get_word_ranks()
    print("Generating training data ...")
    data = []
    kw_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            quatrain_rows = []
            keywords = []
            usable = True
            for sentence in sentences:
                # Keep only segments that actually have a rank entry.
                candidates = [w for w in segmenter.segment(sentence) if w in ranks]
                if not candidates:
                    # One unusable line disqualifies the whole poem.
                    usable = False
                    break
                # Lowest rank value wins; min() keeps the first on ties,
                # matching a left-fold comparison.
                best = min(candidates, key=lambda w: ranks[w])
                keywords.append(best)
                quatrain_rows.append([sentence, best])
            if usable:
                data.extend(quatrain_rows)
                kw_data.append(keywords)
        if 0 == (idx + 1) % 2000:
            print("[Training Data] %d/%d poems are processed." % (idx + 1, len(poems)))
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        fout.writelines('\t'.join(row) + '\n' for row in data)
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        fout.writelines('\t'.join(kw_row) + '\n' for kw_row in kw_data)
    print("Training data is generated.")
def _gen_cangtou_train_data():
    """Write acrostic ("cangtou") training pairs to ``cangtou_train_path``.

    Each output line is ``sentence\\tfirst_character`` — the model learns to
    expand a leading character into a full line.  Poems are shuffled first.
    """
    poems = get_pop_quatrains()
    random.shuffle(poems)
    with codecs.open(cangtou_train_path, 'w', 'utf-8') as fout:
        for idx, poem in enumerate(poems):
            # Pair every sentence with its own first character.
            fout.writelines(line + "\t" + line[0] + "\n" for line in poem['sentences'])
            if 0 == (idx + 1) % 2000:
                print("[Training Data] %d/%d poems are processed." % (idx + 1, len(poems)))
    print("Cangtou training data is generated.")
def _gen_train_data():
    """Generate keyword-annotated training data plus a held-out test split.

    Popular quatrains are shuffled; for each 4-line poem every sentence is
    segmented and the segment with the lowest TextRank value becomes the
    line's keyword.  Poems whose index falls in the sampled set go to the
    test split (sentences only); all other usable poems go to the training
    split (sentence + keyword) and the keyword file.  A poem where any line
    has no ranked segment is dropped from both splits.

    Writes ``train_path``, ``kw_train_path`` and ``test_path``.
    """
    # Indices reserved for the test split.  A set gives O(1) membership
    # instead of an O(n) scan of an np.array for every poem.
    # NOTE(review): random_int_list may produce duplicates, and the
    # 1..70000 range is assumed to cover len(poems) — confirm against
    # get_pop_quatrains().
    sampled_poems = set(random_int_list(1, 70000, 4000))
    segmenter = Segmenter()
    poems = get_pop_quatrains()          # popular quatrains
    random.shuffle(poems)                # randomize ordering
    ranks = get_word_ranks()             # TextRank: word -> rank number
    print("Generating training data ...")
    data = []
    kw_data = []
    test_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            flag = True
            is_test = idx in sampled_poems
            rows = []
            kw_row = []
            test_row = []
            for sentence in sentences:
                rows.append([sentence])
                test_row.append([sentence])
                segs = [w for w in segmenter.segment(sentence) if w in ranks]
                if not segs:
                    # No ranked word in this line: discard the whole poem.
                    flag = False
                    break
                # Left fold keeping the segment with the smallest rank
                # (first one wins on ties).
                keyword = reduce(lambda x, y: x if ranks[x] < ranks[y] else y, segs)
                kw_row.append(keyword)
                rows[-1].append(keyword)
            if flag:
                if is_test:
                    # Test poems contribute sentences only, no keywords.
                    test_data.extend(test_row)
                else:
                    data.extend(rows)
                    kw_data.append(kw_row)
        if 0 == (idx + 1) % 2000:
            print("[Training Data] %d/%d poems are processed." % (idx + 1, len(poems)))
    # (Removed a leftover debug print that dumped the whole test set.)
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        for row in data:
            fout.write('\t'.join(row) + '\n')
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        for kw_row in kw_data:
            fout.write('\t'.join(kw_row) + '\n')
    with codecs.open(test_path, 'w', 'utf-8') as fout:
        for test_row in test_data:
            fout.write('\t'.join(test_row) + '\n')
    print("Training data is generated.")
def main():
    """Entry point: reuse or create sampled poems, then emit samples.

    If ``human_samples_path`` already exists the previously sampled poems
    are loaded; otherwise a fresh sample of popular quatrains is drawn and
    reduced to their sentence lists.  Human and RNN sample files are then
    generated from the same cleaned poems.

    Fix: converted py2-only ``print`` statements to the py3 function form
    (identical output on py2) to match the other functions in this file,
    and wrapped ``map`` in ``list`` so the result stays a list under py3.
    """
    if os.path.exists(human_samples_path):
        print('Poems already sampled, use the same human samples.')
        cleaned_poems = load_human_samples()
    else:
        print('Poems not yet sampled, use new human samples.')
        poems = get_pop_quatrains()
        sampled_poems = sample_poems(poems)
        # list(...) preserves py2 semantics where map returned a list.
        cleaned_poems = list(map(lambda poem: poem['sentences'], sampled_poems))
    print('Generating human samples.')
    generate_human_samples(cleaned_poems)
    print('Generating model samples')
    generate_rnn_samples(cleaned_poems)
def main():
    """Entry point: reuse or create sampled poems, then emit samples.

    If ``human_samples_path`` already exists the previously sampled poems
    are loaded; otherwise a fresh sample of popular quatrains is drawn and
    reduced to their sentence lists (content only).  Human and RNN sample
    files are then generated from the same cleaned poems.

    Fix: removed a leftover debug ``print(sampled_poems)`` that dumped the
    entire sampled-poem list to stdout.
    """
    if os.path.exists(human_samples_path):
        print('Poems already sampled, use the same human samples.')
        cleaned_poems = load_human_samples()
    else:
        print('Poems not yet sampled, use new human samples.')
        poems = get_pop_quatrains()  # fetch the popular quatrains
        sampled_poems = sample_poems(poems)
        # Keep only the poem content (sentence lists).
        cleaned_poems = list(map(lambda poem: poem['sentences'], sampled_poems))
    print('Generating human samples.')
    generate_human_samples(cleaned_poems)
    print('Generating model samples')
    generate_rnn_samples(cleaned_poems)
def _gen_train_data():
    """Build keyword-annotated training data from popular quatrains.

    For every 4-line poem, segment each sentence and keep the ranked
    segment with the lowest TextRank value as that line's keyword.  A poem
    is dropped entirely if any of its lines has no ranked segment.  Writes
    tab-separated ``sentence\\tkeyword`` rows to ``train_path`` and one
    keyword row per poem to ``kw_train_path``.

    Fixes: py2-only ``print`` statements converted to the function form
    (same output on py2, valid on py3, consistent with the rest of this
    file); ``filter(...)`` materialized with ``list(...)`` so ``len()``
    works under py3's lazy filter; removed the unused local ``lines``.
    """
    segmenter = Segmenter()
    poems = get_pop_quatrains()
    random.shuffle(poems)
    ranks = get_word_ranks()
    print("Generating training data ...")
    data = []
    kw_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            flag = True
            rows = []
            kw_row = []
            for sentence in sentences:
                rows.append([sentence])
                # list(...) so len() works on py3 where filter is lazy.
                segs = list(filter(lambda seg: seg in ranks,
                                   segmenter.segment(sentence)))
                if 0 == len(segs):
                    # Any line without a ranked word disqualifies the poem.
                    flag = False
                    break
                # Left fold keeping the lowest-ranked (best) segment.
                keyword = reduce(lambda x, y: x if ranks[x] < ranks[y] else y, segs)
                kw_row.append(keyword)
                # Each row becomes [sentence, keyword].
                rows[-1].append(keyword)
            if flag:
                data.extend(rows)      # extend: one element per sentence
                kw_data.append(kw_row) # append: one element per poem
        if 0 == (idx + 1) % 2000:
            print("[Training Data] %d/%d poems are processed." % (idx + 1, len(poems)))
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        for row in data:
            # One tab-separated line per sentence/keyword pair.
            fout.write('\t'.join(row) + '\n')
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        for kw_row in kw_data:
            fout.write('\t'.join(kw_row) + '\n')
    print("Training data is generated.")