Example #1
def gen_train_data():
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        if len(poem) != 4:
            continue  # Only consider quatrains.
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            if len(sentence) != 7:
                # Only consider seven-character lines.
                valid = False
                break
            # Get the segmented words of this sentence, ignoring any word
            # that is not in the ranked words list.
            words = list(
                filter(lambda seg: seg in ranked_words,
                       segmenter.segment(sentence)))
            if len(words) == 0:
                valid = False
                break
            keyword = words[0]

            # Among the words in this sentence, keep the one with the highest
            # TextRank score (i.e. the lowest rank index).
            for word in words[1:]:
                if ranked_words.get_rank(word) < ranked_words.get_rank(
                        keyword):
                    keyword = word

            gen_line = sentence + end_of_sentence() + \
                       '\t' + keyword + '\t' + context + '\n'
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        if valid:
            # plan data: each line is four keywords from the 4 sentences
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    with open(plan_data_path, 'w') as fout:
        for line in plan_data:
            fout.write(line)
    with open(gen_data_path, 'w') as fout:
        for line in gen_data:
            fout.write(line)
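For reference, a minimal sketch of what one record in each output file looks like (the poem text and the '^'/'$' markers below are made-up placeholders standing in for start_of_sentence() and end_of_sentence()):

# gen_data_path: one record per sentence -> "<sentence><eos>\t<keyword>\t<context>\n"
example_gen_line = "两个黄鹂鸣翠柳$\t黄鹂\t^\n"
# plan_data_path: one record per quatrain -> four tab-separated keywords
example_plan_line = "黄鹂\t白鹭\t西岭\t东吴\n"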
Example #2
    def _get_adjlists(self):
        print("[TextRank] Generating word graph ...")
        segmenter = Segmenter()
        poems = Poems()
        adjlists = dict()
        # Count number of co-occurrences.
        for poem in poems:
            for sentence in poem:
                words = []
                for word in segmenter.segment(sentence):
                    if word not in self.stopwords:
                        words.append(word)
                for word in words:
                    if word not in adjlists:
                        adjlists[word] = dict()
                for i in range(len(words)):
                    for j in range(i + 1, len(words)):
                        if words[j] not in adjlists[words[i]]:
                            adjlists[words[i]][words[j]] = 1.0
                        else:
                            adjlists[words[i]][words[j]] += 1.0
                        if words[i] not in adjlists[words[j]]:
                            adjlists[words[j]][words[i]] = 1.0
                        else:
                            adjlists[words[j]][words[i]] += 1.0
        # Normalize weights.
        for a in adjlists:
            sum_w = sum(w for _, w in adjlists[a].items())
            for b in adjlists[a]:
                adjlists[a][b] /= sum_w
        return adjlists
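The method above only builds and normalizes the word graph; the rank iteration itself is not shown here. A minimal sketch of the usual damped TextRank update over these adjacency lists (a hypothetical helper, not code from this project):

def run_text_rank(adjlists, damp=0.85, n_iter=100):
    # Iterate r_i = (1 - d) + d * sum_j r_j * w_{ji} / sum_k w_{jk}, where
    # adjlists[j][i] already holds the normalized weight w_{ji} / sum_k w_{jk}.
    scores = {word: 1.0 for word in adjlists}
    for _ in range(n_iter):
        new_scores = {}
        for word in adjlists:
            rank = 1.0 - damp
            for neighbor in adjlists[word]:
                # The graph is symmetric in structure, so `word` appears in
                # every neighbor's adjacency dict.
                rank += damp * adjlists[neighbor].get(word, 0.0) * scores[neighbor]
            new_scores[word] = rank
        scores = new_scores
    return scores  # higher score = more important word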
Example #3
def gen_train_data():
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        # Only process four-line poems with seven characters per line.
        if len(poem) != 4:
            continue
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            if len(sentence) != 7:
                valid = False
                break
            words = list(
                filter(lambda seg: seg in ranked_words,
                       segmenter.segment(sentence)))
            if len(words) == 0:
                valid = False
                break
            keyword = words[0]
            for word in words[1:]:
                if ranked_words.get_rank(word) < ranked_words.get_rank(
                        keyword):
                    keyword = word
            gen_line = sentence + end_of_sentence() + \
                    '\t' + keyword + '\t' + context + '\n'
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        if valid:
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    with open(plan_data_path, 'w') as fout:
        for line in plan_data:
            fout.write(line)
    with open(gen_data_path, 'w') as fout:
        for line in gen_data:
            fout.write(line)
Example #4
def _gen_char2vec():
    print("Generating char2vec model ...")
    char_dict = CharDict()
    poems = Poems()
    model = models.Word2Vec(poems, size=CHAR_VEC_DIM, min_count=5)
    embedding = uniform(-1.0, 1.0, [len(char_dict), CHAR_VEC_DIM])
    for i, ch in enumerate(char_dict):
        if ch in model.wv:
            embedding[i, :] = model.wv[ch]
    np.save(char2vec_path, embedding)
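A short companion sketch (assuming the char2vec_path and CharDict used above are importable) showing how the saved matrix can be read back; row i corresponds to the i-th character in CharDict's iteration order:

import numpy as np

embedding = np.load(char2vec_path)  # shape: [len(char_dict), CHAR_VEC_DIM]
char_dict = CharDict()
char_vectors = {ch: embedding[i] for i, ch in enumerate(char_dict)}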
Example #5
def gen_train_data():
    """获取每一句的keywords,拼起来写入文件"""
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    ranked_words = RankedWords()

    gen_data = list()
    plan_data = list()

    valid = True
    counter_line = 0
    print('len(poems)==>', len(poems))
    for poem in poems:
        # print(len(poem))
        if len(poem) != 4:
            # print(poem)
            valid = False
            continue
        context = start_of_sentence()
        keywords = list()
        for sentence in poem:
            counter_line += 1
            keyword = ''
            if len(sentence) != 7:
                valid = False
                break
            filterwords = list(
                filter(lambda x: x in ranked_words,
                       segmenter.segment(sentence)))
            if filterwords:
                keyword = filterwords[0]
            for word in filterwords:
                # print('word==>',word)
                if ranked_words.get_rank(word) < ranked_words.get_rank(
                        keyword):
                    keyword = word
            if keyword:
                gen_line = sentence + end_of_sentence() + \
                           '\t' + keyword + '\t' + context + '\n'
                keywords.append(keyword)
                gen_data.append(gen_line)
                context += sentence + end_of_sentence()
        plan_data.append(' '.join(keywords))
    with open(plan_data_path, 'w') as fw:
        for data_iter in plan_data:
            fw.write(data_iter + '\n')
    with open(gen_data_path, 'w') as fw:
        for data_iter in gen_data:
            fw.write(data_iter)

    print('counter_line==>', counter_line)
    del segmenter, poems, ranked_words
Example #6
def _gen_word2vec():
    print("Generating word2vec model ...")
    word_dict = wordDict()
    poems = Poems()
    poems = [poem[0] + poem[1] + poem[2] + poem[3] for poem in poems]
    print(poems[1])
    model = models.Word2Vec(poems, size=WORD_VEC_DIM, min_count=1)  # min_count=1 since low-frequency words are common
    embedding = uniform(-1.0, 1.0, [len(word_dict), WORD_VEC_DIM])
    for i, ch in enumerate(word_dict):
        if ch in model.wv:
            embedding[i, :] = model.wv[ch]
    np.save(word2vec_path, embedding)
Example #7
    def _get_adjlists(self) -> Dict[str, Dict[str, float]]:
        print("[TextRank] Generating word graph ...")
        poems = Poems()

        # Build the co-occurrence matrix (stored as adjacency lists): the
        # number of times two words appear together in the same sentence
        # (here the four lines of each poem are concatenated into one).
        # It is stored in adjlists, for example:
        # adjlists = dict {
        #     "word1" : dict {
        #         "word2" : 1.0,
        #         "word3" : 1.0,
        #         ...
        #     },
        #     "word2" : dict {
        #         "word1" : 1.0,
        #         "word3" : 1.0,
        #         ...
        #     }
        #     ...
        # }
        # Each "word*" denotes a vertex V_i or V_j; the 1.0 values are the
        # edge weights w_{ij}. At this stage the matrix is symmetric,
        # i.e. w_{ij} = w_{ji}.

        adjlists: Dict[str, Dict[str, float]] = dict()
        for poem in poems:
            sentence = poem[0] + poem[1] + poem[2] + poem[3]
            words: List[str] = []
            for word in sentence:
                if word not in self.stopwords:
                    words.append(word)
            for word in words:
                if word not in adjlists:
                    adjlists[word] = dict()
            for idx, i in enumerate(words):
                for j in words[idx + 1:]:
                    if j not in adjlists[i]:
                        adjlists[i][j] = 1.0
                    else:
                        adjlists[i][j] += 1.0
                    if i not in adjlists[j]:
                        adjlists[j][i] = 1.0
                    else:
                        adjlists[j][i] += 1.0

        # Normalize the matrix W so that each row sums to 1:
        # $$r_{ji} = \frac{w_{ji}}{\displaystyle\sum_{k=1}^{|V|} w_{jk}}$$

        for j in adjlists:
            sum_k = sum(adjlists[j].values())
            for i in adjlists[j]:
                adjlists[j][i] /= sum_k
        return adjlists
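A tiny illustrative check (not part of the project) that the normalization above leaves every node's outgoing weights summing to 1, which is what the rank update relies on:

def rows_are_normalized(adjlists, tol=1e-6):
    # Every node with at least one neighbor should have outgoing weights summing to ~1.
    return all(abs(sum(neighbors.values()) - 1.0) < tol
               for neighbors in adjlists.values() if neighbors)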
Example #8
    def _get_adjlists(self):
        poems = Poems()
        segmenter = Segmenter()
        adjlists = collections.defaultdict(dict)
        for poem in poems:
            for sentence in poem:
                words = segmenter.segment(sentence)
                for i in range(len(words) - 1):
                    for j in range(i + 1, len(words)):
                        if words[j] not in adjlists[words[i]]:
                            adjlists[words[i]][words[j]] = 1.0
                        else:
                            adjlists[words[i]][words[j]] += 1.0
                        if words[i] not in adjlists[words[j]]:
                            adjlists[words[j]][words[i]] = 1.0
                        else:
                            adjlists[words[j]][words[i]] += 1.0
        return adjlists
Example #9
    def _build_adjlists_from_tencent_embeddings(self):
        print("[TextRank] Generating word graph ...")
        segmenter = Segmenter()
        poems = Poems()
        adjlists = dict()  # 2D dict: adjlists[word1][word2] = prob(going from word1 to word2)
        wv = get_tencent_embedding_keyedVectors(_tencent_embedding_path)

        # Count number of co-occurrence.

        ######################## get a 2D cos sim matrix for all words ###################
        words = set()
        for poem in poems:
            for sentence in poem:
                for word in segmenter.segment(sentence):
                    # for each word selected from the sentence
                    if word not in self.stopwords:
                        # keep only non-stopword words
                        words.add(word)
        for word in words:
            if word not in adjlists:
                # initialize each word's adjacency dict
                adjlists[word] = dict()

        for word in words:
            for other in words:

                if word == other:
                    continue

                if other in adjlists[word] or word in adjlists[other]:
                    continue

                sim = wv.similarity(word, other)
                adjlists[word][other] = sim
                adjlists[other][word] = sim

        # Normalize weights.
        for a in adjlists:
            sum_w = sum(w for _, w in adjlists[a].items())
            for b in adjlists[a]:
                adjlists[a][b] /= sum_w
        return adjlists
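Calling wv.similarity() once per pair costs O(|V|^2) KeyedVectors lookups; a hedged alternative sketch (an assumption, not project code) that computes the same cosine similarities with a single matrix product over the words set and wv built above:

import numpy as np

word_list = sorted(words)
vectors = np.stack([wv[w] for w in word_list])  # KeyedVectors item lookup returns the raw vector
normed = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
sim_matrix = normed @ normed.T  # sim_matrix[i, j] == cosine similarity of word_list[i] and word_list[j]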
Example #10
def _gen_char2vec():
    print("Generating char2vec model ...")
    char_dict = CharDict()
    cpu_count = multiprocessing.cpu_count()
    poems = Poems()
    poems_str = [
        list(line) for line in list(itertools.chain.from_iterable(poems))
    ]
    model = models.Word2Vec(sentences=poems_str,
                            size=CHAR_VEC_DIM,
                            alpha=0.025,
                            window=2,
                            min_count=2,
                            workers=cpu_count,
                            min_alpha=0.0001,
                            sg=0,
                            hs=1,
                            negative=5,
                            cbow_mean=1,
                            hashfxn=hash,
                            iter=30,
                            null_word=0,
                            trim_rule=None,
                            sorted_vocab=1)
    embedding = uniform(-1.0, 1.0, size=[len(char_dict), CHAR_VEC_DIM])
    counter_yes, counter_no = 0, 0
    for index, word in char_dict:
        if word in model.wv:
            embedding[index] = model.wv[word]
            counter_yes += 1
        else:
            counter_no += 1
            print('{} is not in the word vectors'.format(word))
    print('{} chars have word vectors, {} chars do not'.format(counter_yes, counter_no))
    np.save(char2vec_path, embedding)
Example #11
    def _get_adjlists(self):
        print("[TextRank] Generating word graph ...")
        segmenter = Segmenter()
        poems = Poems()
        adjlists = dict()  # 2D dict: adjlists[word1][word2] = prob(going from word1 to word2)
        # Count number of co-occurrence.
        """
        ######################## count relationship per sentence ###################
        for poem in poems:
            for sentence in poem:
                words = []
                for word in segmenter.segment(sentence):
                    # for each word selected from the sentence
                    if word not in self.stopwords:
                        #keep only non-stopwords words
                        words.append(word)
                for word in words:
                    if word not in adjlists:
                        #initialize all words to a new dict()
                        adjlists[word] = dict()
                for i in range(len(words)):
                    for j in range(i + 1, len(words)):
                        #### if two words present in the same sentence, their score +=1 #####
                        if words[j] not in adjlists[words[i]]:
                            adjlists[words[i]][words[j]] = 1.0
                        else:
                            adjlists[words[i]][words[j]] += 1.0
                        if words[i] not in adjlists[words[j]]:
                            adjlists[words[j]][words[i]] = 1.0
                        else:
                            adjlists[words[j]][words[i]] += 1.0

        ######################## end count relationship per sentence ###################
        """

        ######################## count relationship per poem ###################
        for poem in poems:
            # Collect the non-stopword words of the whole poem so that
            # co-occurrence is counted per poem, as the banner above states.
            words = []
            for sentence in poem:
                for word in segmenter.segment(sentence):
                    # for each word selected from the sentence
                    if word not in self.stopwords:
                        # keep only non-stopword words
                        words.append(word)
            for word in words:
                if word not in adjlists:
                    # initialize each word's adjacency dict
                    adjlists[word] = dict()
            for i in range(len(words)):
                for j in range(i + 1, len(words)):
                    #### if two words appear in the same poem, their edge weight += 1 #####
                    if words[j] not in adjlists[words[i]]:
                        adjlists[words[i]][words[j]] = 1.0
                    else:
                        adjlists[words[i]][words[j]] += 1.0
                    if words[i] not in adjlists[words[j]]:
                        adjlists[words[j]][words[i]] = 1.0
                    else:
                        adjlists[words[j]][words[i]] += 1.0

        ######################## end count relationship per poem ###################

        # Normalize weights.
        for a in adjlists:
            sum_w = sum(w for _, w in adjlists[a].items())
            for b in adjlists[a]:
                adjlists[a][b] /= sum_w
        return adjlists
Example #12
def gen_train_data():
    print("Generating training data ...")
    '''with open(r'raw/corpus.txt', 'r',encoding='utf-8') as fin:
        for line in fin.readlines()[0 : 6]:
            for sentence in split_sentences(line):
                print(' '.join(sentence))'''
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = [[], [], [], []]
    gen_data_for_pair_train = []
    sentence2_without_keyword = []
    sentence4_without_keyword = []
    for poem in poems:
        if len(poem) != 4:
            continue  # Only consider quatrains (four-line poems).
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for i, sentence in enumerate(poem):
            if len(sentence) != 5:  # only keep five-character lines
                valid = False
                break
            temp = ''.join(sentence)
            tempList = [temp[:2], temp[1:3], temp[2:4], temp[3:5]
                        ] + sentence  # list the adjacent two-character pairs as well as the single characters
            words = list(filter(lambda seg: seg in ranked_words,
                                tempList))  # extract keyword candidates; only single characters are actually considered

            if len(words) == 0:  # require the sentence to contain at least one ranked word
                valid = False
                break
            keyword = words[0]
            for word in words[1:]:
                if ranked_words.get_rank(word) < ranked_words.get_rank(
                        keyword):
                    keyword = word  # keep the top-ranked keyword
            if len(keyword) == 2:  # never actually happens: ranked_words contains only single characters
                keyword = keyword[0] + ' ' + keyword[1]
            gen_line = ' '.join(sentence) + ' ' + end_of_sentence() + '\t' + start_of_sentence() + ' '+ ' '.join(sentence) + \
                    '\t' + keyword + '\t' + context + '\n'  # data format: current line, current line, keyword, context
            if i == 2:
                sentence2_without_keyword.append(' '.join(sentence) + ' ' + end_of_sentence() + '\t' + start_of_sentence() + ' ' + ' '.join(sentence) +\
                    '\t' + '^' + '\t' + context + '\n')
            if i == 3:
                sentence4_without_keyword.append(' '.join(sentence) + ' ' + end_of_sentence() + '\t' + start_of_sentence() + ' ' + ' '.join(sentence) +\
                    '\t' + '^' + '\t' + context + '\n')   # no-keyword data format: current line, current line, context
            if i == 1 or i == 3:
                gen_line_ = ' '.join(sentence) + ' ' + end_of_sentence() + '\t' + start_of_sentence() + ' ' + ' '.join(sentence) +\
                    '\t' + '^' + '\t' + '^' + ' ' +  ' '.join(last_sentence) + '\n'
                gen_data_for_pair_train.append(gen_line_)
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += ' ' + ' '.join(sentence) + ' ' + end_of_sentence()
            last_sentence = sentence

        if valid:
            plan_data.append('\t'.join(keywords) + '\n')  # extract four keywords per poem
            for i, line in enumerate(gen_lines):
                gen_data[i].append(line)
    with open(plan_data_path, 'w') as fout:
        for line in plan_data:
            fout.write(line)
    '''with open(gen_data_path, 'w') as fout:
        pass'''
    for i in range(1, 5):
        with open(train_data_path[i] + '/' + f'keyword{i}' + '.txt', 'w') as f:
            for line in gen_data[i - 1]:
                f.write(line)

    # adjacent-line pairs that may be needed for training parallel couplets (duizhang)
    with open(train_data_path[0] + '/pair.txt', 'w') as f:
        for line in gen_data_for_pair_train:
            f.write(line)
    with open(train_data_path[2] + '/pair.txt', 'w') as f:
        for line in gen_data_for_pair_train:
            f.write(line)
    with open(train_data_path[4] + '/pair.txt', 'w') as f:
        for line in gen_data_for_pair_train:
            f.write(line)

    with open(train_data_path[3] + '/nokeyword3.txt', 'w') as f:
        for line in sentence2_without_keyword:
            f.write(line)
    with open(train_data_path[4] + '/nokeyword4.txt', 'w') as f:
        for line in sentence4_without_keyword:
            f.write(line)
import argparse
from train import train
from infer import generate_control
from char2vec import Char2Vec
from char_dict import CharDict
from poems import Poems
from data_utils import batch_train_data
from rank_words import RankedWords
if __name__ == '__main__':
    argument_parser = argparse.ArgumentParser(description='chinese poem generation')
    argument_parser.add_argument('-t', action='store_true', dest='train', default=False)
    argument_parser.add_argument('-p', action='store_true', dest='pretrain', default=False)
    argument_parser.add_argument('-i', action='store_true', dest='infer', default=False)
    # argument_parser.add_argument('-p', dest='planner', default=False, action='store_true',
    #                              help='train planning model')
    args = argument_parser.parse_args()
    # print('args==>',args)
    if args.train:
        print('Entering training stage')
        train(n_epochs=1000)
    elif args.pretrain:
        print('Entering pre-training stage')
        CharDict()
        RankedWords()
        Char2Vec()
        Poems()
        batch_train_data(32)
    elif args.infer:
        print('Entering inference stage')
        generate_control()
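A usage sketch for this entry point (the file name main.py is an assumption; the flags are the ones defined by the parser above):

# python main.py -p    # pre-training: build CharDict, RankedWords, Char2Vec, Poems and training batches
# python main.py -t    # training: train(n_epochs=1000)
# python main.py -i    # inference: generate_control()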