def get_quatrains():
    """Return, as a list, the quatrains made up only of known characters.

    A poem from ``get_all_corpus()`` is kept when ``is_quatrain(poem)`` is
    truthy and every character of every entry in ``poem['sentences']`` is
    contained in ``ch2int``, the second value returned by ``get_vocab()``.
    """
    _, ch2int = get_vocab()

    def _all_chars_known(poem):
        # True only if no character of any sentence is missing from ch2int.
        return all(ch in ch2int
                   for line in poem['sentences']
                   for ch in line)

    # is_quatrain is checked first so that non-quatrains are rejected
    # without their sentences being scanned.
    return [poem for poem in get_all_corpus()
            if is_quatrain(poem) and _all_chars_known(poem)]
def get_quatrains():
    """Return the quatrains in which every character is in the vocabulary.

    A poem is kept when ``is_quatrain(poem)`` is truthy and every character
    of every entry in ``poem['sentences']`` is contained in ``ch2int``, the
    second value returned by ``get_vocab()``.

    Returns:
        A list of the matching poem records, in corpus order.
    """
    _, ch2int = get_vocab()

    def quatrain_filter(poem):
        if not is_quatrain(poem):
            return False
        return all(ch in ch2int
                   for sentence in poem['sentences']
                   for ch in sentence)

    # Per the original note: get_all_corpus() returns the poem records from
    # all of the poem data files; each record holds one poem's title, author,
    # dynasty and sentences.
    # The result is materialised as a list so callers can take its length or
    # iterate it more than once even where filter() is lazy (Python 3).
    return list(filter(quatrain_filter, get_all_corpus()))
def _gen_vocab():
    """Build the character vocabulary from the corpus and save it as JSON.

    Counts how often each character occurs in ``poem['sentences']`` across
    every poem returned by ``get_all_corpus()``, keeps the
    ``VOCAB_SIZE - 2`` most frequent characters (most frequent first) and
    writes that list to ``_vocab_path`` as UTF-8 encoded JSON. Progress is
    reported on standard output every 10000 poems.
    """
    # Single-argument print(...) calls behave the same as the print
    # statement on Python 2 and are also valid syntax on Python 3.
    print("Generating the vocabulary ...")
    corpus = get_all_corpus()
    char_cnts = dict()
    for idx, poem in enumerate(corpus):
        for sentence in poem['sentences']:
            for ch in sentence:
                char_cnts[ch] = char_cnts.get(ch, 0) + 1
        if (idx + 1) % 10000 == 0:
            print("[Vocabulary] %d/%d poems have been processed."
                  % (idx + 1, len(corpus)))
    # Sort by descending count; the sort is stable, so ties keep the dict's
    # iteration order.
    # NOTE(review): two slots of VOCAB_SIZE are left out here, presumably
    # reserved for special tokens added elsewhere -- confirm.
    vocab = sorted(char_cnts, key=lambda ch: -char_cnts[ch])[:VOCAB_SIZE - 2]
    with codecs.open(_vocab_path, 'w', 'utf-8') as fout:
        json.dump(vocab, fout)
    print("The vocabulary has been built.")