def _gen_embedding(ndim, alignment=False):
    """Train a character-level Word2Vec model on the quatrain corpus and
    save both the model and a VOCAB_SIZE x ndim embedding matrix.

    When `alignment` is True, vertical character groups (the i-th character
    of every sentence in a poem) are added as extra training "sentences" to
    boost Dui Zhang (parallelism), and the alignment-specific output paths
    are used instead of the plain ones.
    """
    print("Generating %d-dim word embedding ..." % ndim)
    int2ch, ch2int = get_vocab()
    ch_lists = []
    quatrains = get_quatrains()
    for idx, poem in enumerate(quatrains):
        for sentence in poem['sentences']:
            # Keep only characters present in the vocabulary. A real list is
            # used (not a lazy filter object) so the corpus can be iterated
            # multiple times by Word2Vec on Python 3 as well.
            ch_lists.append([ch for ch in sentence if ch in ch2int])
        if alignment:
            # Column-wise groups: the i-th characters across all sentences,
            # used to boost Dui Zhang.
            # NOTE(review): assumes every sentence is as long as the first
            # one -- confirm upstream filtering guarantees equal lengths.
            for j in range(len(poem['sentences'][0])):
                column = [sentence[j] for sentence in poem['sentences']]
                ch_lists.append([ch for ch in column if ch in ch2int])
        if 0 == (idx + 1) % 10000:
            print("[Word2Vec] %d/%d poems have been processed." % (idx + 1, len(quatrains)))
    print("Hold on. This may take some time ...")
    model = models.Word2Vec(ch_lists, size=ndim, min_count=5)
    # Characters unseen by the model keep a random uniform row in [-1, 1).
    embedding = uniform(-1.0, 1.0, [VOCAB_SIZE, ndim])
    for idx, ch in enumerate(int2ch):
        if ch in model.wv:
            embedding[idx, :] = model.wv[ch]
    # Pick output paths once instead of duplicating the save sequence in
    # both branches, as the original did.
    if alignment:
        model_path, emb_path = _w2v_with_alignment_model_path, _w2v_with_alignment_path
    else:
        model_path, emb_path = _w2v_model_path, _w2v_path
    model.save(model_path)
    print("Word2Vec model is saved.")
    np.save(emb_path, embedding)
    print("Word embedding is saved.")
def _rank_all_words():
    """Build a word co-occurrence graph over the quatrain corpus and run
    TextRank on it via _text_rank.

    Edge weights count how often two segmented (non-stopword) words appear
    in the same sentence; each node's outgoing weights are then normalized
    to sum to 1.
    """
    segmenter = Segmenter()  # generation sxhy dict
    stopwords = get_stopwords()
    print("Start TextRank over the selected quatrains ...")
    quatrains = get_quatrains()
    adjlist = dict()
    for idx, poem in enumerate(quatrains):
        if 0 == (idx + 1) % 10000:
            print("[TextRank] Scanning %d/%d poems ..." % (idx + 1, len(quatrains)))
        for sentence in poem['sentences']:
            # Materialize as a list: the original filter(...) returns a lazy
            # iterator on Python 3, which is exhausted by the first loop and
            # is not subscriptable for the segs[i + 1:] slice below.
            segs = [word for word in segmenter.segment(sentence)
                    if word not in stopwords]
            for seg in segs:
                if seg not in adjlist:
                    adjlist[seg] = dict()
            # Every unordered pair of distinct words in the sentence adds 1
            # to the edge in both directions.
            for i, seg in enumerate(segs):
                for other in segs[i + 1:]:
                    if seg != other:
                        adjlist[seg][other] = adjlist[seg][other] + 1 \
                            if other in adjlist[seg] else 1.0
                        adjlist[other][seg] = adjlist[other][seg] + 1 \
                            if seg in adjlist[other] else 1.0
    # Normalize each word's outgoing edge weights to sum to 1.
    for word in adjlist:
        w_sum = sum(adjlist[word].values())
        for other in adjlist[word]:
            adjlist[word][other] /= w_sum
    print("[TextRank] Weighted graph has been built.")
    _text_rank(adjlist)
def _rank_all_words():
    """Construct the weighted word co-occurrence graph over all selected
    quatrains, then hand it to _text_rank for ranking."""
    segmenter = Segmenter()        # splits a verse into word segments
    stopwords = get_stopwords()    # words dropped before counting
    print("Start TextRank over the selected quatrains ...")
    quatrains = get_quatrains()    # the selected four-line poems
    adjlist = {}
    for idx, poem in enumerate(quatrains):
        if (idx + 1) % 10000 == 0:
            print("[TextRank] Scanning %d/%d poems ..." % (idx + 1, len(quatrains)))
        for sentence in poem['sentences']:
            # word segments of this verse, with stopwords removed
            segs = [w for w in segmenter.segment(sentence) if w not in stopwords]
            for seg in segs:
                adjlist.setdefault(seg, {})
            # connect every pair of distinct words in the verse, both ways;
            # a fresh edge starts at 1.0 so all weights are floats
            for i, seg in enumerate(segs):
                for other in segs[i + 1:]:
                    if seg != other:
                        adjlist[seg][other] = adjlist[seg].get(other, 0.0) + 1.0
                        adjlist[other][seg] = adjlist[other].get(seg, 0.0) + 1.0
    # normalize each word's outgoing weights so they sum to one
    for word in adjlist:
        total = sum(adjlist[word].values())
        for other in adjlist[word]:
            adjlist[word][other] /= total
    print("[TextRank] Weighted graph has been built.")
    _text_rank(adjlist)
def get_pop_quatrains(num = 100000):
    """Return up to `num` quatrains, most "popular" first.

    A quatrain's popularity is the corpus count of its rarest word
    (_min_word_cnt); quatrains are sorted by that score descending.

    Args:
        num: maximum number of quatrains to return.

    Returns:
        list of quatrain dicts, best-scoring first.
    """
    cnts = get_word_cnts()
    segmenter = Segmenter()
    quatrains = get_quatrains()
    # Score each quatrain by its least-frequent word's corpus count.
    # (The original enumerated the list but never used the index.)
    min_word_cnts = [_min_word_cnt(cnts, quatrain, segmenter)
                     for quatrain in quatrains]
    # Indices sorted by score, highest first. Slicing already clamps to the
    # list length, so the original min(num, len(indexes)) was redundant.
    indexes = sorted(range(len(quatrains)), key=lambda i: -min_word_cnts[i])
    return [quatrains[index] for index in indexes[:num]]
def eval_train_data():
    """Score every quatrain in the corpus with the rhyme evaluator."""
    evaluator = RhymeEvaluator()
    quatrains = get_quatrains()
    # keep only the four verses of each poem, dropping the metadata
    poems = [quatrain['sentences'] for quatrain in quatrains]
    print("Testing {} quatrains from the corpus.".format(len(poems)))
    eval_poems(evaluator, poems)
def _gen_word_cnts():
    """Count occurrences of every segmented word across all quatrains and
    dump the counts as JSON to _wc_path."""
    counters = dict()
    segmenter = Segmenter()
    quatrains = get_quatrains()
    for idx, poem in enumerate(quatrains):
        for sentence in poem['sentences']:
            for seg in segmenter.segment(sentence):
                # dict.get avoids the double lookup of the original
                # membership-test-then-index idiom
                counters[seg] = counters.get(seg, 0) + 1
        if 0 == (idx + 1) % 10000:
            print("[Word Count] %d/%d quatrains has been processed." % (idx + 1, len(quatrains)))
    with codecs.open(_wc_path, 'w', 'utf-8') as fout:
        json.dump(counters, fout)
def _train(self):
    """Train and persist the planner's 512-dim Word2Vec model over
    per-poem lists of ranked word segments."""
    print("Start training Word2Vec for planner ...")
    quatrains = get_quatrains()
    segmenter = Segmenter()
    seg_lists = []
    total = len(quatrains)
    for idx, quatrain in enumerate(quatrains):
        # one training "sentence" per poem: all ranked segments, in order
        segments = [seg
                    for sentence in quatrain['sentences']
                    for seg in segmenter.segment(sentence)
                    if seg in self.ranks]
        seg_lists.append(segments)
        if (idx + 1) % 10000 == 0:
            print("[Plan Word2Vec] %d/%d quatrains has been processed." % (idx + 1, total))
    print("Hold on. This may take some time ...")
    self.model = models.Word2Vec(seg_lists, size=512, min_count=5)
    self.model.save(_model_path)
def _gen_embedding(ndim):
    """Train a character-level Word2Vec model over the quatrain corpus and
    save a VOCAB_SIZE x ndim embedding matrix to _w2v_path."""
    print("Generating %d-dim word embedding ..." % ndim)
    int2ch, ch2int = get_vocab()
    quatrains = get_quatrains()
    ch_lists = []
    for idx, poem in enumerate(quatrains):
        for sentence in poem['sentences']:
            # drop characters missing from the vocabulary
            ch_lists.append([ch for ch in sentence if ch in ch2int])
        if (idx + 1) % 10000 == 0:
            print("[Word2Vec] %d/%d poems have been processed." % (idx + 1, len(quatrains)))
    print("Hold on. This may take some time ...")
    model = models.Word2Vec(ch_lists, size=ndim, min_count=5)
    # start from random uniform rows; overwrite rows for trained characters
    embedding = uniform(-1.0, 1.0, [VOCAB_SIZE, ndim])
    for row, ch in enumerate(int2ch):
        if ch in model.wv:
            embedding[row, :] = model.wv[ch]
    np.save(_w2v_path, embedding)
    print("Word embedding is saved.")
def _train(self):
    """Train the planner's Word2Vec model (512 dimensions) on ranked word
    segments and save it to _model_path."""
    print("Start training Word2Vec for planner ...")
    quatrains = get_quatrains()
    segmenter = Segmenter()  # segments whole verses, not single characters
    seg_lists = []
    for idx, quatrain in enumerate(quatrains):
        seg_list = []
        for sentence in quatrain['sentences']:
            # keep only segments that appear in the rank table
            seg_list.extend(seg for seg in segmenter.segment(sentence)
                            if seg in self.ranks)
        seg_lists.append(seg_list)
        if (idx + 1) % 10000 == 0:
            print("[Plan Word2Vec] %d/%d quatrains has been processed." % (idx + 1, len(quatrains)))
    print("Hold on. This may take some time ...")
    # one word-vector model trained over all poems' segment lists
    self.model = models.Word2Vec(seg_lists, size=512, min_count=5)
    self.model.save(_model_path)
def _gen_embedding(ndim):
    """Build an ndim-dimensional character embedding.

    Trains Word2Vec on the in-vocabulary characters of every quatrain
    verse and writes a VOCAB_SIZE x ndim matrix to _w2v_path; rows for
    characters the model did not learn stay random uniform in [-1, 1).
    """
    print("Generating %d-dim word embedding ..." % ndim)
    int2ch, ch2int = get_vocab()   # vocabulary, both directions
    quatrains = get_quatrains()    # all quatrains matching the selection rules
    total = len(quatrains)
    ch_lists = []
    for idx, poem in enumerate(quatrains):
        # one training sample per verse: its characters found in ch2int
        for sentence in poem['sentences']:
            ch_lists.append([ch for ch in sentence if ch in ch2int])
        if (idx + 1) % 10000 == 0:
            print("[Word2Vec] %d/%d poems have been processed." % (idx + 1, total))
    print("Hold on. This may take some time ...")
    model = models.Word2Vec(ch_lists, size=ndim, min_count=5)  # character vectors
    embedding = uniform(-1.0, 1.0, [VOCAB_SIZE, ndim])  # random fallback rows
    for idx, ch in enumerate(int2ch):
        if ch in model.wv:
            # trained characters overwrite their random row
            embedding[idx, :] = model.wv[ch]
    np.save(_w2v_path, embedding)
    print("Word embedding is saved.")
def extract_couplets_with_tag(self, tag_dir, tag_name):
    """Collect couplets whose first or second verse contains at least one
    word from the tag file, write them to tag_couplets.txt, and return them.

    Args:
        tag_dir: directory holding the tag file.
        tag_name: tag file name; one tag per line.

    Returns:
        list of matching couplets (each a list of sentences).

    Raises:
        ValueError: if the tag file does not exist.
    """
    tag_path = os.path.join(tag_dir, tag_name)
    if not os.path.exists(tag_path):
        raise ValueError('There is no valid tags')
    tag_sets = set()
    with codecs.open(tag_path, 'r', 'utf-8') as fin:
        # NOTE(review): reading stops at the first blank line -- confirm
        # whether tags after an empty line should also be kept.
        line = fin.readline().strip()
        while line:
            tag_sets.add(line)
            line = fin.readline().strip()
    couplets_dict = quatrains.get_quatrains()
    couplets = [couplet['sentences'] for couplet in couplets_dict]
    seg = Segmenter()
    tag_couplet_path = os.path.join(DATA_PROCESSED_DIR, 'tag_couplets.txt')
    data = []
    # Open the output once in write mode; the original opened the same path
    # twice (plain open 'w' just to truncate, then codecs.open 'a').
    with codecs.open(tag_couplet_path, 'w', 'utf-8') as fout:
        print('create tag_couplets.txt')
        for couplet in couplets:
            # a couplet matches if either verse shares a word with the tags
            matched = (set(seg.segment(couplet[0])) & tag_sets or
                       set(seg.segment(couplet[1])) & tag_sets)
            if matched:
                fout.write(couplet[0])
                fout.write('\n')
                fout.write(couplet[1])
                fout.write('\n')
                fout.write('\n')
                data.append(couplet)
    return data
from generate import Generator from plan import Planner if __name__ == '__main__': evaluator = RhymeEvaluator() ''' print "Evaluating rule-based method ..." scores = [] with codecs.open('results.txt', 'r', 'utf-8') as fin: line = fin.readline() while line: scores.append(evaluator.eval(split_sentences(line.strip()))) line = fin.readline() print "Mean score = %f, standard deviation = %f" % (np.mean(scores), np.std(scores)) ''' quatrains = get_quatrains() print "Testing %d quatrains from the corpus." % len(quatrains) scores = [] for quatrain in quatrains: score = evaluator.eval(quatrain['sentences']) scores.append(score) print "Mean score = %f, standard deviation = %f" % (np.mean(scores), np.std(scores)) num = 100 print "Testing %d poems generated by RNN ..." % num scores = [] planner = Planner() generator = Generator() for _ in range(num): keywords = planner.plan(u'') assert 4 == len(keywords)