def gen_train_data():
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        if len(poem) != 4:
            continue  # Only consider quatrains.
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            if len(sentence) != 7:  # Only consider seven-character lines.
                valid = False
                break
            # Keep only the words of this sentence that appear in the
            # ranked word list; ignore everything else.
            words = list(filter(lambda seg: seg in ranked_words,
                                segmenter.segment(sentence)))
            if len(words) == 0:
                valid = False
                break
            # Of all words in this sentence, pick the one with the highest
            # TextRank score (i.e. the lowest rank index).
            keyword = words[0]
            for word in words[1:]:
                if ranked_words.get_rank(word) < ranked_words.get_rank(keyword):
                    keyword = word
            gen_line = sentence + end_of_sentence() + \
                '\t' + keyword + '\t' + context + '\n'
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        if valid:
            # Plan data: each line holds the four keywords of one quatrain.
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    with open(plan_data_path, 'w') as fout:
        for line in plan_data:
            fout.write(line)
    with open(gen_data_path, 'w') as fout:
        for line in gen_data:
            fout.write(line)
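# A minimal sketch of how a consumer of gen_data_path might parse one line.
# This helper is illustrative only (it is not part of the source); the
# three-field tab-separated layout follows gen_line above.
def parse_gen_line(line):
    sentence, keyword, context = line.rstrip('\n').split('\t')
    return sentence, keyword, context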
def _get_adjlists(self):
    print("[TextRank] Generating word graph ...")
    segmenter = Segmenter()
    poems = Poems()
    adjlists = dict()
    # Count co-occurrences: two words are linked whenever they appear in
    # the same sentence.
    for poem in poems:
        for sentence in poem:
            words = []
            for word in segmenter.segment(sentence):
                if word not in self.stopwords:
                    words.append(word)
            for word in words:
                if word not in adjlists:
                    adjlists[word] = dict()
            for i in range(len(words)):
                for j in range(i + 1, len(words)):
                    if words[j] not in adjlists[words[i]]:
                        adjlists[words[i]][words[j]] = 1.0
                    else:
                        adjlists[words[i]][words[j]] += 1.0
                    if words[i] not in adjlists[words[j]]:
                        adjlists[words[j]][words[i]] = 1.0
                    else:
                        adjlists[words[j]][words[i]] += 1.0
    # Normalize weights so that each word's outgoing edges sum to 1.
    for a in adjlists:
        sum_w = sum(w for _, w in adjlists[a].items())
        for b in adjlists[a]:
            adjlists[a][b] /= sum_w
    return adjlists
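# A minimal sketch of the TextRank iteration that would consume the
# normalized adjacency lists above. This is not the repository's actual
# scoring code; the damping factor d = 0.85, the tolerance, and the
# iteration cap are conventional assumptions made for illustration.
def text_rank_scores(adjlists, d=0.85, tol=1e-4, max_iter=100):
    scores = {word: 1.0 for word in adjlists}
    for _ in range(max_iter):
        new_scores = {}
        delta = 0.0
        for word in adjlists:
            # Rank flows along incoming normalized edges:
            # S(V_i) = (1 - d) + d * sum_j r_{ji} * S(V_j)
            rank = (1.0 - d) + d * sum(
                adjlists[neighbor].get(word, 0.0) * scores[neighbor]
                for neighbor in adjlists[word])
            new_scores[word] = rank
            delta = max(delta, abs(rank - scores[word]))
        scores = new_scores
        if delta < tol:
            break
    return scores  # higher score = more important keyword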
def _gen_char2vec():
    print("Generating char2vec model ...")
    char_dict = CharDict()
    poems = Poems()
    model = models.Word2Vec(poems, size=CHAR_VEC_DIM, min_count=5)
    # Characters the model never saw (below min_count) keep a random
    # uniform embedding.
    embedding = uniform(-1.0, 1.0, [len(char_dict), CHAR_VEC_DIM])
    for i, ch in enumerate(char_dict):
        if ch in model.wv:
            embedding[i, :] = model.wv[ch]
    np.save(char2vec_path, embedding)
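# A minimal usage sketch for the saved embedding matrix. `char2int` is an
# assumed lookup method: this section only shows that iterating CharDict
# yields characters in index order, not the real CharDict API.
import numpy as np

def lookup_char_vec(ch, char_dict):
    embedding = np.load(char2vec_path)
    return embedding[char_dict.char2int(ch)]  # hypothetical index lookup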
def gen_train_data():
    """Extract the keyword of each line, then write the joined keywords
    and the per-line training examples to file."""
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    ranked_words = RankedWords()
    gen_data = list()
    plan_data = list()
    valid = True
    counter_line = 0
    print('len(poems)==>', len(poems))
    for poem in poems:
        if len(poem) != 4:
            valid = False
            continue
        context = start_of_sentence()
        keywords = list()
        for sentence in poem:
            counter_line += 1
            keyword = ''
            if len(sentence) != 7:
                valid = False
                break
            filterwords = list(filter(lambda x: x in ranked_words,
                                      segmenter.segment(sentence)))
            if filterwords:
                # Pick the candidate with the best (lowest) rank.
                keyword = filterwords[0]
                for word in filterwords:
                    if ranked_words.get_rank(word) < ranked_words.get_rank(keyword):
                        keyword = word
            if keyword:
                gen_line = sentence + end_of_sentence() + \
                    '\t' + keyword + '\t' + context + '\n'
                keywords.append(keyword)
                gen_data.append(gen_line)
            context += sentence + end_of_sentence()
        plan_data.append(' '.join(keywords))
    with open(plan_data_path, 'w') as fw:
        for data_iter in plan_data:
            fw.write(data_iter + '\n')
    with open(gen_data_path, 'w') as fw:
        for data_iter in gen_data:
            fw.write(data_iter)
    print('counter_line==>', counter_line)
    del segmenter, poems, ranked_words
def _gen_word2vec():
    print("Generating word2vec model ...")
    word_dict = wordDict()
    poems = Poems()
    poems = [poem[0] + poem[1] + poem[2] + poem[3] for poem in poems]
    model = models.Word2Vec(poems, size=WORD_VEC_DIM,
                            min_count=1)  # min_count=1: many low-frequency words
    embedding = uniform(-1.0, 1.0, [len(word_dict), WORD_VEC_DIM])
    for i, ch in enumerate(word_dict):
        if ch in model.wv:
            embedding[i, :] = model.wv[ch]
    np.save(word2vec_path, embedding)
def _get_adjlists(self) -> Dict[str, Dict[str, float]]:
    print("[TextRank] Generating word graph ...")
    poems = Poems()
    # Build the co-occurrence matrix (as adjacency lists): how many times
    # two characters appear in the same poem. Stored in adjlists, e.g.:
    # adjlists = dict {
    #     "word1": dict {
    #         "word2": 1.0,
    #         "word3": 1.0,
    #         ...
    #     },
    #     "word2": dict {
    #         "word1": 1.0,
    #         "word3": 1.0,
    #         ...
    #     },
    #     ...
    # }
    # Here "word*" denotes the vertices V_i, V_j, and the values are the
    # edge weights w_{ij}. At this stage the matrix is symmetric:
    # w_{ij} = w_{ji}.
    adjlists: Dict[str, Dict[str, float]] = dict()
    for poem in poems:
        sentence = poem[0] + poem[1] + poem[2] + poem[3]
        words: List[str] = []
        for word in sentence:
            if word not in self.stopwords:
                words.append(word)
        for word in words:
            if word not in adjlists:
                adjlists[word] = dict()
        for idx, i in enumerate(words):
            for j in words[idx + 1:]:
                if j not in adjlists[i]:
                    adjlists[i][j] = 1.0
                else:
                    adjlists[i][j] += 1.0
                if i not in adjlists[j]:
                    adjlists[j][i] = 1.0
                else:
                    adjlists[j][i] += 1.0
    # Normalize each row of W:
    # $$r_{ji} = \frac{w_{ji}}{\displaystyle\sum_{k=1}^{|V|} w_{jk}}$$
    for j in adjlists:
        sum_k = sum(w for w in adjlists[j].values())
        for i in adjlists[j]:
            adjlists[j][i] /= sum_k
    return adjlists
def _get_adjlists(self):
    poems = Poems()
    segmenter = Segmenter()
    adjlists = collections.defaultdict(dict)
    for poem_set in poems:
        for poem in poem_set:
            words = segmenter.segment(poem)
            for i in range(len(words) - 1):
                for j in range(i + 1, len(words)):
                    if words[j] not in adjlists[words[i]]:
                        adjlists[words[i]][words[j]] = 1.0
                    else:
                        adjlists[words[i]][words[j]] += 1.0
                    if words[i] not in adjlists[words[j]]:
                        adjlists[words[j]][words[i]] = 1.0
                    else:
                        adjlists[words[j]][words[i]] += 1.0
    return adjlists
def _build_adjlists_from_tencent_embeddings(self):
    print("[TextRank] Generating word graph ...")
    segmenter = Segmenter()
    poems = Poems()
    # 2D dict: adjlists[word1][word2] = prob(going from word1 to word2).
    adjlists = dict()
    wv = get_tencent_embedding_keyedVectors(_tencent_embedding_path)
    # Build a cosine-similarity matrix over all non-stopword words.
    words = set()
    for poem in poems:
        for sentence in poem:
            for word in segmenter.segment(sentence):
                if word not in self.stopwords:  # keep only non-stopwords
                    words.add(word)
    for word in words:
        if word not in adjlists:  # initialize every word to a new dict()
            adjlists[word] = dict()
    for word in words:
        for other in words:
            if word == other:
                continue
            if other in adjlists[word] or word in adjlists[other]:
                continue
            sim = wv.similarity(word, other)
            adjlists[word][other] = sim
            adjlists[other][word] = sim
    # Normalize weights.
    for a in adjlists:
        sum_w = sum(w for _, w in adjlists[a].items())
        for b in adjlists[a]:
            adjlists[a][b] /= sum_w
    return adjlists
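# A possible body for get_tencent_embedding_keyedVectors, assuming the
# Tencent AI Lab embedding file is stored in word2vec text format; the
# helper's real implementation is not shown in this section.
from gensim.models import KeyedVectors

def get_tencent_embedding_keyedVectors(path):
    # load_word2vec_format returns a KeyedVectors with .similarity()
    # support. Note that .similarity() raises KeyError for words missing
    # from the embedding file.
    return KeyedVectors.load_word2vec_format(path, binary=False)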
def _gen_char2vec():
    print("Generating char2vec model ...")
    char_dict = CharDict()
    cpu_count = multiprocessing.cpu_count()
    poems = Poems()
    # Flatten the poems into a list of character lists, one per line.
    poems_str = [
        list(line) for line in list(itertools.chain.from_iterable(poems))
    ]
    model = models.Word2Vec(sentences=poems_str,
                            size=CHAR_VEC_DIM,
                            alpha=0.025,
                            window=2,
                            min_count=2,
                            workers=cpu_count,
                            min_alpha=0.0001,
                            sg=0,
                            hs=1,
                            negative=5,
                            cbow_mean=1,
                            hashfxn=hash,
                            iter=30,
                            null_word=0,
                            trim_rule=None,
                            sorted_vocab=1)
    embedding = uniform(-1.0, 1.0, size=[len(char_dict), CHAR_VEC_DIM])
    counter_yes, counter_no = 0, 0
    for index, word in char_dict:
        if word in model.wv:
            embedding[index] = model.wv[word]
            counter_yes += 1
        else:
            counter_no += 1
            print('{} has no word vector'.format(word))
    print('{} chars with word vectors, {} without'.format(counter_yes, counter_no))
    np.save(char2vec_path, embedding)
def _get_adjlists(self):
    print("[TextRank] Generating word graph ...")
    segmenter = Segmenter()
    poems = Poems()
    # 2D dict: adjlists[word1][word2] = prob(going from word1 to word2).
    adjlists = dict()
    # Count co-occurrences within each sentence.
    for poem in poems:
        for sentence in poem:
            words = []
            for word in segmenter.segment(sentence):
                if word not in self.stopwords:  # keep only non-stopwords
                    words.append(word)
            for word in words:
                if word not in adjlists:  # initialize every word to a new dict()
                    adjlists[word] = dict()
            for i in range(len(words)):
                for j in range(i + 1, len(words)):
                    # If two words appear in the same sentence, add 1 to
                    # the weight of the edge between them (both directions).
                    if words[j] not in adjlists[words[i]]:
                        adjlists[words[i]][words[j]] = 1.0
                    else:
                        adjlists[words[i]][words[j]] += 1.0
                    if words[i] not in adjlists[words[j]]:
                        adjlists[words[j]][words[i]] = 1.0
                    else:
                        adjlists[words[j]][words[i]] += 1.0
    # Normalize weights.
    for a in adjlists:
        sum_w = sum(w for _, w in adjlists[a].items())
        for b in adjlists[a]:
            adjlists[a][b] /= sum_w
    return adjlists
def gen_train_data():
    print("Generating training data ...")
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = [[], [], [], []]
    gen_data_for_pair_train = []
    sentence2_without_keyword = []
    sentence4_without_keyword = []
    for poem in poems:
        if len(poem) != 4:
            continue  # Only consider quatrains.
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for i, sentence in enumerate(poem):
            if len(sentence) != 5:  # Five-character lines only.
                valid = False
                break
            # List every adjacent two-character pair as well as the single
            # characters themselves as keyword candidates (sentence is a
            # list of characters here).
            temp = ''.join(sentence)
            tempList = [temp[:2], temp[1:3], temp[2:4], temp[3:5]] + sentence
            words = list(filter(lambda seg: seg in ranked_words, tempList))
            if len(words) == 0:  # Require at least one ranked candidate.
                valid = False
                break
            # Pick the top-ranked keyword.
            keyword = words[0]
            for word in words[1:]:
                if ranked_words.get_rank(word) < ranked_words.get_rank(keyword):
                    keyword = word
            # Never triggers in practice: ranked_words holds single characters only.
            if len(keyword) == 2:
                keyword = keyword[0] + ' ' + keyword[1]
            # Format: current line \t current line \t keyword \t context
            gen_line = ' '.join(sentence) + ' ' + end_of_sentence() + '\t' + start_of_sentence() + ' ' + ' '.join(sentence) + \
                '\t' + keyword + '\t' + context + '\n'
            if i == 2:
                sentence2_without_keyword.append(' '.join(sentence) + ' ' + end_of_sentence() + '\t' + start_of_sentence() + ' ' + ' '.join(sentence) +
                                                 '\t' + '^' + '\t' + context + '\n')
            if i == 3:
                sentence4_without_keyword.append(' '.join(sentence) + ' ' + end_of_sentence() + '\t' + start_of_sentence() + ' ' + ' '.join(sentence) +
                                                 '\t' + '^' + '\t' + context + '\n')
            # No-keyword format: current line \t current line \t previous line
            if i == 1 or i == 3:
                gen_line_ = ' '.join(sentence) + ' ' + end_of_sentence() + '\t' + start_of_sentence() + ' ' + ' '.join(sentence) + \
                    '\t' + '^' + '\t' + '^' + ' ' + ' '.join(last_sentence) + '\n'
                gen_data_for_pair_train.append(gen_line_)
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += ' ' + ' '.join(sentence) + ' ' + end_of_sentence()
            last_sentence = sentence
        if valid:
            # Four keywords extracted per poem.
            plan_data.append('\t'.join(keywords) + '\n')
            for i, line in enumerate(gen_lines):
                gen_data[i].append(line)
    with open(plan_data_path, 'w') as fout:
        for line in plan_data:
            fout.write(line)
    for i in range(1, 5):
        with open(train_data_path[i] + '/' + f'keyword{i}' + '.txt', 'w') as f:
            for line in gen_data[i - 1]:
                f.write(line)
    # Adjacent-line pairs, potentially useful for learning parallel couplets.
    with open(train_data_path[0] + '/pair.txt', 'w') as f:
        for line in gen_data_for_pair_train:
            f.write(line)
    with open(train_data_path[2] + '/pair.txt', 'w') as f:
        for line in gen_data_for_pair_train:
            f.write(line)
    with open(train_data_path[4] + '/pair.txt', 'w') as f:
        for line in gen_data_for_pair_train:
            f.write(line)
    with open(train_data_path[3] + '/nokeyword3.txt', 'w') as f:
        for line in sentence2_without_keyword:
            f.write(line)
    with open(train_data_path[4] + '/nokeyword4.txt', 'w') as f:
        for line in sentence4_without_keyword:
            f.write(line)
import argparse

from train import train
from infer import generate_control
from char2vec import Char2Vec
from char_dict import CharDict
from poems import Poems
from data_utils import batch_train_data
from rank_words import RankedWords

if __name__ == '__main__':
    argument_parser = argparse.ArgumentParser(description='chinese poem generation')
    argument_parser.add_argument('-t', action='store_true', dest='train', default=False)
    argument_parser.add_argument('-p', action='store_true', dest='pretrain', default=False)
    argument_parser.add_argument('-i', action='store_true', dest='infer', default=False)
    args = argument_parser.parse_args()
    if args.train:
        print('Entering training stage')
        train(n_epochs=1000)
    elif args.pretrain:
        print('Entering pretraining stage')
        CharDict()
        RankedWords()
        Char2Vec()
        Poems()
        batch_train_data(32)
    elif args.infer:
        print('Entering inference stage')
        generate_control()
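# Usage sketch, assuming this entry point is saved as main.py (the actual
# filename is not given in this section):
#   python main.py -p   # pretrain: build dicts, ranks, embeddings, batches
#   python main.py -t   # train for 1000 epochs
#   python main.py -i   # run generation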