def __init__(self):
    if not check_uptodate(poems_path):
        _gen_poems()
    self.poems = []
    with open(poems_path, 'r') as fin:
        for line in fin:
            self.poems.append(line.strip().split())
def __init__(self):
    if not check_uptodate(sxhy_path):
        _gen_sxhy_dict()
    with open(sxhy_path, 'r') as fr:
        self.sxhy_dict = set(fr.read().split())
    # Register every Shixuehanying entry with jieba so segmentation
    # keeps these terms intact.
    for word in self.sxhy_dict:
        jieba.add_word(word)
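# Usage sketch (not from the original module): registering the Shixuehanying
# entries keeps multi-character poetic terms intact during segmentation.
# The sample term and line below are illustrative.
import jieba

jieba.add_word('青山')              # force the term to stay one token
print(jieba.lcut('两岸青山相对出'))  # segment with the augmented dictionary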
def __init__(self):
    self.stopwords = _get_stopwords()
    if not check_uptodate(wordrank_path):
        self._do_text_rank()
    with open(wordrank_path, 'r') as fin:
        self.word_scores = json.load(fin)
    # word_scores is a list of (word, score) pairs sorted by score;
    # map each word to its rank for fast lookup.
    self.word2rank = dict(
        (word_score[0], rank)
        for rank, word_score in enumerate(self.word_scores))
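# _do_text_rank is referenced above but not shown. Below is a minimal,
# self-contained sketch of word-level TextRank under common assumptions
# (undirected co-occurrence graph, damping factor 0.85); the function name,
# window size, and iteration count are illustrative, not the original code.
from collections import defaultdict


def text_rank_sketch(word_lists, window=2, damping=0.85, iterations=30):
    """Score words by TextRank over a co-occurrence graph."""
    graph = defaultdict(set)
    for words in word_lists:
        for i in range(len(words)):
            for j in range(i + 1, min(i + window + 1, len(words))):
                if words[i] != words[j]:
                    graph[words[i]].add(words[j])
                    graph[words[j]].add(words[i])
    scores = {word: 1.0 for word in graph}
    for _ in range(iterations):
        scores = {
            word: (1 - damping) + damping * sum(
                scores[nb] / len(graph[nb]) for nb in graph[word])
            for word in graph
        }
    return scores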
def __init__(self):
    self.poems = list()
    # Regenerate the corpora and the merged poem file if stale.
    for corpus_name in _corpus_list:
        corpuspath = os.path.join(raw_dir, corpus_name)
        if not check_uptodate(corpuspath):
            _gen_poems()
    if not check_uptodate(poems_path):
        _gen_poems()
    for corpus_name in _corpus_list:
        corpuspath = os.path.join(raw_dir, corpus_name)
        with open(corpuspath, 'r') as fr:
            for line in fr:
                # The last whitespace-separated field holds the poem body.
                sentences = split_sentences(
                    line.strip('\r\n ').split()[-1])
                self.poems.append(sentences)
    print('self.poems==>', len(self.poems))
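# split_sentences is defined elsewhere in the repo. A plausible sketch, under
# the assumption that it splits a poem body on Chinese clause-ending
# punctuation; the name and punctuation set are illustrative.
import re


def split_sentences_sketch(text):
    """Split a poem body into clauses on Chinese punctuation."""
    parts = re.split(r'[,。!?;:、]', text)
    return [part for part in parts if part]  # drop empty fragments

# e.g. split_sentences_sketch('白日依山尽,黄河入海流。')
#      -> ['白日依山尽', '黄河入海流']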
def train_planner():
    print("Training Word2Vec-based planner ...")
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not check_uptodate(plan_data_path):
        gen_train_data()
    word_lists = []
    with open(plan_data_path, 'r') as fin:
        for line in fin:
            word_lists.append(line.strip().split('\t'))
    # gensim < 4.0 API; in gensim >= 4.0 the keyword is vector_size.
    model = models.Word2Vec(word_lists, size=512, min_count=5)
    model.save(_plan_model_path)
def train_planner():
    """Vectorize the extracted keywords with gensim's Word2Vec."""
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not check_uptodate(plan_data_path):
        gen_train_data()
    # Format: [['keyword1', 'keyword2', 'keyword3', 'keyword4'], ...]
    keywords_list = []
    with open(plan_data_path, 'r') as infile:
        for line in infile:
            keywords_list.append(line.strip().split('\t'))
    # Train the keyword embeddings with Word2Vec.
    model = models.Word2Vec(keywords_list, size=512, window=4, min_count=1)
    model.save(_plan_model_path)
def __init__(self):
    if not check_uptodate(wordrank_path):
        self._do_text_rank()
    self.word2rank = dict()
    self.rank2word = dict()
    with open(wordrank_path, 'r') as fr:
        word2score = json.load(fr)
    # Each entry is a (word, score) pair; build both directions of the map.
    for rank, word in enumerate(word2score):
        self.word2rank[word[0]] = rank
        self.rank2word[rank] = word[0]
    # Debug check: rank values should never appear as word2rank keys.
    if 1 in self.word2rank or 28 in self.word2rank or '28' in self.word2rank:
        print('1 is inside the scored dict')
    print('self.word2rank', len(self.word2rank))
    print('self.rank2word', len(self.rank2word))
def train_planner():
    # TODO: try other keyword-expansion models.
    print("Training Word2Vec-based planner ...")
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not check_uptodate(plan_data_path):
        gen_train_data()
    word_lists = []
    with open(plan_data_path, 'r') as fin:
        for line in fin:
            word_lists.append(line.strip().split('\t'))
    # model = models.FastText(word_lists, size=512, min_count=5)
    model = models.Word2Vec(word_lists, size=300, min_count=3)
    model.save(_plan_model_path)
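# Once trained, the planner model can be loaded back and queried for related
# keywords. A usage sketch against the gensim < 4.0 API (matching the size=
# keyword used above); the query word is illustrative.
from gensim import models

model = models.Word2Vec.load(_plan_model_path)
if '梅花' in model.wv:
    for word, similarity in model.wv.most_similar('梅花', topn=4):
        print(word, similarity)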
def __init__(self):
    if not check_uptodate(char_dict_path):
        _gen_char_dict()
    self._int2char = []
    self._char2int = dict()
    # Add start-of-sentence symbol.
    self._int2char.append(start_of_sentence())
    self._char2int[start_of_sentence()] = 0
    with open(char_dict_path, 'r') as fin:
        idx = 1
        for ch in fin.read():
            self._int2char.append(ch)
            self._char2int[ch] = idx
            idx += 1
    # Add end-of-sentence symbol.
    self._int2char.append(end_of_sentence())
    self._char2int[end_of_sentence()] = len(self._int2char) - 1
def __init__(self):
    print('CharDict initialized once')
    self._char2id = dict()
    self._id2char = dict()
    if not check_uptodate(char_dict_path):
        _gen_char_dict()
    with open(char_dict_path, 'r') as fr:
        chrs = list(filter(is_cn_char, fr.read()))
    # Indices 0 and len(chrs) + 1 are reserved for the
    # start/end-of-sentence symbols.
    self._char2id[start_of_sentence()] = 0
    self._id2char[0] = start_of_sentence()
    for idx in range(len(chrs)):
        self._char2id[chrs[idx]] = idx + 1
        self._id2char[idx + 1] = chrs[idx]
    self._char2id[end_of_sentence()] = len(chrs) + 1
    self._id2char[len(chrs) + 1] = end_of_sentence()
    print('len(self._char2id)==>', len(self._char2id))
    print('len(self._id2char)==>', len(self._id2char))
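# Round-trip sketch for the mapping built above: encode a line to ids and
# decode it back. Assumes this __init__ belongs to a class named CharDict;
# the sample line is illustrative.
char_dict = CharDict()
line = '春眠不觉晓'
ids = [char_dict._char2id[ch] for ch in line]
assert ''.join(char_dict._id2char[i] for i in ids) == line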
def __init__(self):
    # Load the stopword list.
    self.stopwords = _get_stopwords()
    # Load the segmented Shixuehanying (《诗学含英》) dictionary.
    # if not check_uptodate(sxhy_path):
    #     _gen_sxhy_dict()
    with open(sxhy_path, 'r', encoding='utf-8') as fin:
        self.sxhy_dict: Set[str] = set(fin.read().split())
    # Run TextRank if the cached scores are stale.
    # (Requires: from typing import Dict, List, Set, Tuple at module level.)
    if not check_uptodate(wordrank_path):
        self._do_text_rank()
    with open(wordrank_path, 'r', encoding='utf-8') as fin:
        self.word_scores: List[Tuple[str, float]] = json.load(fin)
    self.rank: Dict[str, int] = dict(
        (ws[0], i) for i, ws in enumerate(self.word_scores))
def __init__(self):
    if not check_uptodate(char2vec_path):
        _gen_char2vec()
    # Embedding matrix: one row per character, indexed by CharDict ids.
    self.embedding = np.load(char2vec_path)
    self.char_dict = CharDict()
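# Lookup sketch for the loaded matrix: fetch the embedding row of one
# character through the CharDict id mapping. The helper and the _char2id
# attribute name (one of the CharDict variants above) are assumptions.
import numpy as np


def char_vector(embedding: np.ndarray, char_dict, ch: str) -> np.ndarray:
    """Return the embedding row for ch via char_dict's _char2id mapping."""
    return embedding[char_dict._char2id[ch]]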
            fout.write(line)
    # Tail of gen_train_data: flush the generated poem data to disk.
    with open(gen_data_path, 'w') as fout:
        for line in gen_data:
            fout.write(line)


def batch_train_data(batch_size):
    """Training data generator for the poem generator."""
    gen_train_data()  # Shuffle data order and cool down CPU.
    keywords = []
    contexts = []
    sentences = []
    with open(gen_data_path, 'r') as fin:
        for line in fin:
            toks = line.strip().split('\t')
            sentences.append(toks[0])
            keywords.append(toks[1])
            contexts.append(toks[2])
            if len(keywords) == batch_size:
                # NOTE: the same list objects are reused; callers must not
                # keep references to a batch across iterations.
                yield keywords, contexts, sentences
                keywords.clear()
                contexts.clear()
                sentences.clear()
    # For simplicity, only full batches are yielded for now.


if __name__ == '__main__':
    if not check_uptodate(plan_data_path) or \
            not check_uptodate(gen_data_path):
        gen_train_data()
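# Consumption sketch for batch_train_data: iterate batch by batch without
# holding references across iterations, since the generator clears and reuses
# the same list objects. The batch size and process_batch are hypothetical.
for keywords, contexts, sentences in batch_train_data(64):
    process_batch(keywords, contexts, sentences)  # process_batch: user-supplied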
def __init__(self):
    if not check_uptodate(sxhy_path):
        _gen_sxhy_dict()
    with open(sxhy_path, 'r') as fin:
        self.sxhy_dict = set(fin.read().split())
def __init__(self):
    # If the Shixuehanying dictionary has not been processed before,
    # build it and write the result to the data/ directory.
    if not check_uptodate(data_sxhy_path):
        generate_sxhy_word_set()
    with open(data_sxhy_path, 'r') as fin:
        self.sxhy_dict = set(fin.read().split())