Example #1
0
def gen_train_data():
    """Generate plan/generation training data from 7-character quatrains.

    Keeps only 4-line poems whose lines are 7 characters long, and whose
    every line contains at least one word from the ranked-word list.  The
    best-ranked word of each line becomes that line's keyword.

    Writes two tab-separated text files:
      * plan_data_path: one line per poem -- the four keywords.
      * gen_data_path:  one line per sentence -- sentence + end marker,
        keyword, and the accumulated preceding context.
    """
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        if len(poem) != 4:
            continue  # Only consider quatrains.
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            if len(sentence) != 7:
                # Only consider 7-character lines.
                valid = False
                break
            # Words of this sentence that appear in the ranked-word list;
            # everything else is ignored.
            words = [seg for seg in segmenter.segment(sentence)
                     if seg in ranked_words]
            if not words:
                valid = False
                break
            # Keyword = word with the lowest rank value (highest TextRank
            # score).  min() keeps the first word on ties, matching the
            # original left-to-right scan.
            keyword = min(words, key=ranked_words.get_rank)
            gen_line = sentence + end_of_sentence() + \
                       '\t' + keyword + '\t' + context + '\n'
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        if valid:
            # Plan data: the four keywords of the quatrain, tab-joined.
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    # Explicit UTF-8: the data is Chinese text and must not depend on the
    # platform's default encoding.
    with open(plan_data_path, 'w', encoding='utf-8') as fout:
        fout.writelines(plan_data)
    with open(gen_data_path, 'w', encoding='utf-8') as fout:
        fout.writelines(gen_data)
Example #2
0
def gen_train_data():
    """Build keyword-plan and line-generation training files.

    Only 4-line poems with 7-character lines are kept; each kept line must
    contain at least one ranked word, and its best-ranked word becomes the
    line's keyword.

    Writes:
      * plan_data_path -- four tab-separated keywords per poem.
      * gen_data_path  -- per line: sentence with end marker, keyword, and
        the accumulated preceding context, tab-separated.
    """
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        # Keep only quatrains: exactly four sentences.
        if len(poem) != 4:
            continue
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            if len(sentence) != 7:  # seven-character lines only
                valid = False
                break
            # Segment the sentence and keep only words that are ranked.
            words = [w for w in segmenter.segment(sentence)
                     if w in ranked_words]
            if not words:
                valid = False
                break
            # Lowest rank value == highest TextRank score; min() keeps the
            # first word on ties, matching the original scan order.
            keyword = min(words, key=ranked_words.get_rank)
            gen_lines.append(sentence + end_of_sentence() +
                             '\t' + keyword + '\t' + context + '\n')
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        if valid:
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    # encoding='utf-8': the corpus is Chinese text; don't rely on the
    # platform default encoding.
    with open(plan_data_path, 'w', encoding='utf-8') as fout:
        fout.writelines(plan_data)
    with open(gen_data_path, 'w', encoding='utf-8') as fout:
        fout.writelines(gen_data)
Example #3
0
def gen_train_data():
    """Generate training files from 5-character quatrains.

    For every 4-line poem whose lines are 5 characters long, one keyword
    per line is extracted: the best-ranked candidate among the line's
    single characters and its four adjacent bigrams.  Several tab-separated
    data sets are emitted:

      * plan_data_path: the four keywords of each poem.
      * train_data_path[1..4]/keyword{i}.txt: per-line-position samples,
        formatted as "line <eos>", "<sos> line", keyword, context.
      * train_data_path[0/2/4]/pair.txt: previous-line/current-line pairs
        for lines 2 and 4 (couplet/parallelism training).
      * train_data_path[3]/nokeyword3.txt, train_data_path[4]/nokeyword4.txt:
        line-3 / line-4 samples with the keyword slot masked by '^'.

    NOTE(review): train_data_path is assumed to be an indexable collection
    of directory paths -- confirm against its definition.
    """
    print("Generating training data ...")
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = [[], [], [], []]       # one bucket per line position (0..3)
    gen_data_for_pair_train = []      # previous-line -> current-line pairs
    sentence2_without_keyword = []    # line 3 with keyword masked by '^'
    sentence4_without_keyword = []    # line 4 with keyword masked by '^'
    for poem in poems:
        if len(poem) != 4:
            continue  # Only consider quatrains.
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        last_sentence = None  # set at the end of every iteration below
        for i, sentence in enumerate(poem):
            if len(sentence) != 5:  # five-character lines only
                valid = False
                break
            # Keyword candidates: the four adjacent bigrams plus every
            # single character of the line.
            joined = ''.join(sentence)
            candidates = [joined[:2], joined[1:3], joined[2:4],
                          joined[3:5]] + sentence
            words = [w for w in candidates if w in ranked_words]
            if not words:
                valid = False  # the line must contain a ranked word
                break
            # Lowest rank value wins; min() keeps the first candidate on
            # ties, the same choice the original scan made.
            keyword = min(words, key=ranked_words.get_rank)
            if len(keyword) == 2:
                # Space-separate a bigram keyword.  NOTE(review): the
                # original author states ranked_words only holds single
                # characters, so this branch may be dead -- confirm.
                keyword = keyword[0] + ' ' + keyword[1]
            spaced = ' '.join(sentence)
            # Shared sample prefix: "line <eos>" TAB "<sos> line".
            prefix = (spaced + ' ' + end_of_sentence() + '\t' +
                      start_of_sentence() + ' ' + spaced)
            # Sample format: current line, current line, keyword, context.
            gen_line = prefix + '\t' + keyword + '\t' + context + '\n'
            if i == 2:
                sentence2_without_keyword.append(
                    prefix + '\t' + '^' + '\t' + context + '\n')
            if i == 3:
                sentence4_without_keyword.append(
                    prefix + '\t' + '^' + '\t' + context + '\n')
            if i == 1 or i == 3:
                # Pair sample: the previous line stands in for the context.
                gen_data_for_pair_train.append(
                    prefix + '\t' + '^' + '\t' + '^' + ' ' +
                    ' '.join(last_sentence) + '\n')
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += ' ' + spaced + ' ' + end_of_sentence()
            last_sentence = sentence

        if valid:
            # Four keywords per poem.
            plan_data.append('\t'.join(keywords) + '\n')
            for i, line in enumerate(gen_lines):
                gen_data[i].append(line)
    # encoding='utf-8' on every file: the data is Chinese text and must
    # not depend on the platform default encoding.
    with open(plan_data_path, 'w', encoding='utf-8') as fout:
        fout.writelines(plan_data)
    for i in range(1, 5):
        with open(train_data_path[i] + '/' + f'keyword{i}' + '.txt', 'w',
                  encoding='utf-8') as f:
            f.writelines(gen_data[i - 1])

    # Couplet pairs are shared by several model directories.
    for idx in (0, 2, 4):
        with open(train_data_path[idx] + '/pair.txt', 'w',
                  encoding='utf-8') as f:
            f.writelines(gen_data_for_pair_train)

    with open(train_data_path[3] + '/nokeyword3.txt', 'w',
              encoding='utf-8') as f:
        f.writelines(sentence2_without_keyword)
    with open(train_data_path[4] + '/nokeyword4.txt', 'w',
              encoding='utf-8') as f:
        f.writelines(sentence4_without_keyword)