Example #1
 def __init__(self):
     if not check_uptodate(poems_path):
         _gen_poems()
     self.poems = []
     with open(poems_path, 'r') as fin:
         for line in fin.readlines():
             self.poems.append(line.strip().split())
Example #2
 def __init__(self):
     if not check_uptodate(sxhy_path):
         _gen_sxhy_dict()
     with open(sxhy_path, 'r') as fr:
         self.sxhy_dict = set(fr.read().split())
     for word in self.sxhy_dict:
         jieba.add_word(word)
Example #3
 def __init__(self):
     self.stopwords = _get_stopwords()
     if not check_uptodate(wordrank_path):
         self._do_text_rank()
     with open(wordrank_path, 'r') as fin:
         self.word_scores = json.load(fin)
     # Map each word to its position in the score-ordered ranking.
     self.word2rank = dict(
         (word_score[0], rank)
         for rank, word_score in enumerate(self.word_scores))
Example #4
 def __init__(self):
     self.poems = list()
     for corpus_name in _corpus_list:
         corpuspath = os.path.join(raw_dir, corpus_name)
         if not check_uptodate(corpuspath):
             _gen_poems()
     if not check_uptodate(poems_path):
         _gen_poems()
     for corpus_name in _corpus_list:
         corpuspath = os.path.join(raw_dir, corpus_name)
         with open(corpuspath, 'r') as fr:
             for line in fr:
                 # print(line)
                 sentences = split_sentences(
                     line.strip('\r\n ').split()[-1])
                 # print(sentences)
                 self.poems.append(sentences)
         print('self.poems==>', len(self.poems))
Example #5
def train_planner():
    print("Training Word2Vec-based planner ...")
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not check_uptodate(plan_data_path):
        gen_train_data()
    word_lists = []
    with open(plan_data_path, 'r') as fin:
        for line in fin.readlines():
            word_lists.append(line.strip().split('\t'))
    model = models.Word2Vec(word_lists, size=512, min_count=5)
    model.save(_plan_model_path)
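A minimal usage sketch for the planner model saved above, assuming the same _plan_model_path from the snippet; the query keyword is only a placeholder and has to exist in the model's vocabulary:

from gensim import models

# Load the Word2Vec planner model saved by train_planner() and expand one keyword.
model = models.Word2Vec.load(_plan_model_path)
# most_similar() returns (word, similarity) pairs, here limited to the top 4.
for word, score in model.wv.most_similar('月', topn=4):
    print(word, score)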
Example #6
def train_planner():
    """利用gensim,将提取的关键词向量化"""
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not check_uptodate(plan_data_path):
        gen_train_data()
    keywords_list = []  # format: [ ['keyword1', 'keyword2', 'keyword3', 'keyword4'] ]
    with open(plan_data_path, 'r') as infile:
        for line in infile.readlines():
            keywords_list.append(line.strip().split('\t'))
    # Train the keyword vectors with word2vec.
    model = models.Word2Vec(keywords_list, size=512, window=4, min_count=1)
    model.save(_plan_model_path)
Example #7
 def __init__(self):
     if not check_uptodate(wordrank_path):
         self._do_text_rank()
     self.word2rank = dict()
     self.rank2word = dict()
     with open(wordrank_path, 'r') as fr:
         word2score = json.load(fr)
         for rank, word in enumerate(word2score):
             self.word2rank[word[0]] = rank
             self.rank2word[rank] = word[0]
     # Leftover debug check: word2rank is keyed by words, not ranks.
     if 1 in self.word2rank or 28 in self.word2rank or '28' in self.word2rank:
         print('1 is inside the scored dict')
     print('self.word2rank', len(self.word2rank))
     print('self.rank2word', len(self.rank2word))
Example #8
def train_planner():
    # TODO: try other keyword-expansion models.
    print("Training Word2Vec-based planner ...")
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not check_uptodate(plan_data_path):
        gen_train_data()
    word_lists = []
    with open(plan_data_path, 'r') as fin:
        for line in fin.readlines():
            word_lists.append(line.strip().split('\t'))

    # model = models.FastText(word_lists, size = 512, min_count = 5)
    model = models.Word2Vec(word_lists, size=300, min_count=3)
    # print(model.wv.vocab)
    model.save(_plan_model_path)
Example #9
 def __init__(self):
     if not check_uptodate(char_dict_path):
         _gen_char_dict()
     self._int2char = []
     self._char2int = dict()
     # Add start-of-sentence symbol.
     self._int2char.append(start_of_sentence())
     self._char2int[start_of_sentence()] = 0
     with open(char_dict_path, 'r') as fin:
         idx = 1
         for ch in fin.read():
             self._int2char.append(ch)
             self._char2int[ch] = idx
             idx += 1
     # Add end-of-sentence symbol.
     self._int2char.append(end_of_sentence())
     self._char2int[end_of_sentence()] = len(self._int2char) - 1
Example #10
 def __init__(self):
     print('CharDict initialized once')
     self._char2id = dict()
     self._id2char = dict()
     if not check_uptodate(char_dict_path):
         _gen_char_dict()
     with open(char_dict_path, 'r') as fr:
         chrs = list(filter(is_cn_char, fr.read()))
         self._char2id[start_of_sentence()] = 0
         self._id2char[0] = start_of_sentence()
         for idx in range(len(chrs)):  # leave room for the start_of_sentence and end_of_sentence symbols
             self._char2id[chrs[idx]] = idx + 1
             self._id2char[idx + 1] = chrs[idx]
         self._char2id[end_of_sentence()] = len(chrs) + 1
         self._id2char[len(chrs) + 1] = end_of_sentence()
         print('len(self._char2id)==>', len(self._char2id))
         print('len(self._id2char)==>', len(self._id2char))
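As a rough illustration only, the two mappings built above could convert between text and id sequences; the encode/decode helpers below are hypothetical and not part of the original class:

# Hypothetical helpers over the _char2id / _id2char mappings built in __init__ above.
def encode(char_dict, text):
    # Characters missing from the dictionary are simply skipped in this sketch.
    return [char_dict._char2id[ch] for ch in text if ch in char_dict._char2id]

def decode(char_dict, ids):
    return ''.join(char_dict._id2char[i] for i in ids)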
Example #11
    def __init__(self):
        # Load the stopwords.
        self.stopwords = _get_stopwords()

        # Build the segmented Shixuehanying (《诗学含英》) dictionary.
        '''if not check_uptodate(sxhy_path):
            _gen_sxhy_dict()'''
        with open(sxhy_path, 'r', encoding='utf-8') as fin:
            self.sxhy_dict: Set[str] = set(fin.read().split())

        # Build the TextRank scores.
        if not check_uptodate(wordrank_path):
            self._do_text_rank()
        with open(wordrank_path, 'r', encoding='utf-8') as fin:
            self.word_scores: List[Tuple[str, float]] = json.load(fin)
        self.rank: Dict[str, int] = dict(
            (ws[0], i) for i, ws in enumerate(self.word_scores))
Example #12
 def __init__(self):
     if not check_uptodate(char2vec_path):
         _gen_char2vec()
     self.embedding = np.load(char2vec_path)
     self.char_dict = CharDict()
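A hedged lookup sketch for the loaded matrix, assuming row i holds the vector of the character with id i and that CharDict exposes the _char2id mapping from Example #10 (both are assumptions, not confirmed by the snippet):

# Hypothetical per-character lookup for the Char2Vec embedding above.
def char_vector(char2vec, ch):
    idx = char2vec.char_dict._char2id[ch]
    return char2vec.embedding[idx]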
Example #13
            fout.write(line)
    with open(gen_data_path, 'w') as fout:
        for line in gen_data:
            fout.write(line)


def batch_train_data(batch_size):
    """ Training data generator for the poem generator."""
    gen_train_data()  # Shuffle data order and cool down CPU.
    keywords = []
    contexts = []
    sentences = []
    with open(gen_data_path, 'r') as fin:
        for line in fin.readlines():
            toks = line.strip().split('\t')
            sentences.append(toks[0])
            keywords.append(toks[1])
            contexts.append(toks[2])
            if len(keywords) == batch_size:
                yield keywords, contexts, sentences
                keywords.clear()
                contexts.clear()
                sentences.clear()
        # For simplicity, only return full batches for now.


if __name__ == '__main__':
    if not check_uptodate(plan_data_path) or \
            not check_uptodate(gen_data_path):
        gen_train_data()
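For reference, a short sketch of consuming the generator above; the batch size of 64 is arbitrary:

# Iterate over the full batches produced by batch_train_data().
for keywords, contexts, sentences in batch_train_data(64):
    # Each list holds exactly batch_size aligned items per yield.
    print(len(keywords), keywords[0], contexts[0], sentences[0])
    break  # stop after the first batch in this sketch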
Example #14
 def __init__(self):
     if not check_uptodate(sxhy_path):
         _gen_sxhy_dict()
     with open(sxhy_path, 'r') as fin:
         self.sxhy_dict = set(fin.read().split())
Example #15
 def __init__(self):
     if not check_uptodate(data_sxhy_path):
         # Check whether the Shixuehanying word set has already been processed; if not, build it and write it to the data/ directory.
         generate_sxhy_word_set()
     with open(data_sxhy_path, 'r') as fin:
         self.sxhy_dict = set(fin.read().split())