Example #1
import codecs
import random
from functools import reduce  # needed on Python 3

# Segmenter, get_pop_quatrains, get_word_ranks, train_path and kw_train_path
# are module-level helpers assumed to be in scope.
def _gen_train_data():
    segmenter = Segmenter()
    poems = get_pop_quatrains()
    random.shuffle(poems)
    ranks = get_word_ranks()
    print("Generating training data ...")
    data = []
    kw_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            flag = True
            rows = []
            kw_row = []
            for sentence in sentences:
                rows.append([sentence])
                # Keep only segments that have a TextRank entry.
                segs = list(filter(lambda seg: seg in ranks, segmenter.segment(sentence)))
                if 0 == len(segs):
                    flag = False
                    break
                # The smallest rank value marks the most salient segment.
                keyword = reduce(lambda x, y: x if ranks[x] < ranks[y] else y, segs)
                kw_row.append(keyword)
                rows[-1].append(keyword)
            if flag:
                data.extend(rows)
                kw_data.append(kw_row)
        if 0 == (idx+1)%2000:
            print("[Training Data] %d/%d poems are processed." %(idx+1, len(poems)))
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        for row in data:
            fout.write('\t'.join(row)+'\n')
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        for kw_row in kw_data:
            fout.write('\t'.join(kw_row)+'\n')
    print("Training data is generated.")
Example #2
import codecs
import random

# get_pop_quatrains and cangtou_train_path are assumed module-level, as above.
def _gen_cangtou_train_data():
    poems = get_pop_quatrains()
    random.shuffle(poems)
    with codecs.open(cangtou_train_path, 'w', 'utf-8') as fout:
        for idx, poem in enumerate(poems):
            for sentence in poem['sentences']:
                fout.write(sentence + "\t" + sentence[0] + "\n")  # sentence TAB its first (acrostic) character
            if 0 == (idx + 1) % 2000:
                print("[Training Data] %d/%d poems are processed." %(idx+1, len(poems)))
    print("Cangtou training data is generated.")
Example #3
import codecs
import random
from functools import reduce  # needed on Python 3

import numpy as np

def _gen_train_data():
    # Draw 4000 poem indices to hold out as a test split (see the sketch of
    # random_int_list after this example).
    sampled_poems = np.array(random_int_list(1, 70000, 4000))
    segmenter = Segmenter()  # segmenter built from the sxhy dictionary
    poems = get_pop_quatrains()  # get the ~100,000 most popular quatrains
    random.shuffle(poems)  # randomize the order
    ranks = get_word_ranks()  # TextRank: word -> rank number
    print("Generating training data ...")
    data = []
    kw_data = []
    test_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            flag = True
            test_flag = True
            rows = []
            kw_row = []
            test_row = []
            if idx in sampled_poems:
                test_flag = False
            for sentence in sentences:
                rows.append([sentence])
                test_row.append([sentence])
                segs = list(
                    filter(lambda seg: seg in ranks,
                           segmenter.segment(sentence)))
                if 0 == len(segs):
                    flag = False
                    break
                keyword = reduce(lambda x, y: x if ranks[x] < ranks[y] else y,
                                 segs)  # pick the most salient keyword (lowest rank value)
                kw_row.append(keyword)
                rows[-1].append(keyword)
            if flag and test_flag:
                data.extend(rows)
                kw_data.append(kw_row)
            if flag and not test_flag:
                test_data.extend(test_row)

        if 0 == (idx + 1) % 2000:
            print("[Training Data] %d/%d poems are processed." %
                  (idx + 1, len(poems)))
    print(test_data)
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        for row in data:
            fout.write('\t'.join(row) + '\n')
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        for kw_row in kw_data:
            fout.write('\t'.join(kw_row) + '\n')
    with codecs.open(test_path, 'w', 'utf-8') as fout:
        for test_row in test_data:
            fout.write('\t'.join(test_row) + '\n')
    print("Training data is generated.")
Example #4

def main():
    if os.path.exists(human_samples_path):
        print('Poems already sampled, use the same human samples.')
        cleaned_poems = load_human_samples()
    else:
        print('Poems not yet sampled, use new human samples.')
        poems = get_pop_quatrains()
        sampled_poems = sample_poems(poems)
        # list(...) keeps the sentences reusable; a bare map() in Python 3 is a
        # one-shot iterator and would be exhausted after generate_human_samples.
        cleaned_poems = list(map(lambda poem: poem['sentences'], sampled_poems))

        print('Generating human samples.')
        generate_human_samples(cleaned_poems)

    print('Generating model samples')
    generate_rnn_samples(cleaned_poems)
Example #5

def main():
    if os.path.exists(human_samples_path):
        print('Poems already sampled, use the same human samples.')
        cleaned_poems = load_human_samples()
    else:
        print('Poems not yet sampled, use new human samples.')
        poems = get_pop_quatrains()  # get the ~100,000 most popular quatrains
        sampled_poems = sample_poems(poems)
        print(sampled_poems)
        cleaned_poems = list(map(lambda poem: poem['sentences'],
                                 sampled_poems))  # keep only the poem text

        print('Generating human samples.')
        generate_human_samples(cleaned_poems)

    print('Generating model samples')
    generate_rnn_samples(cleaned_poems)
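sample_poems and load_human_samples are project helpers that both main() variants rely on. Assuming sample_poems just draws a fixed-size random subset of the quatrains, a minimal stand-in might be (the default size is an assumption):

import random

def sample_poems(poems, n=100):
    # Hypothetical stand-in: draw n distinct poems without replacement.
    return random.sample(poems, min(n, len(poems)))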
Example #6
import codecs
import random
from functools import reduce  # needed on Python 3

def _gen_train_data():
    segmenter = Segmenter()
    poems = get_pop_quatrains()
    random.shuffle(poems)
    ranks = get_word_ranks()
    print("Generating training data ...")
    data = []
    kw_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            flag = True
            rows = []
            kw_row = []
            for sentence in sentences:
                rows.append([sentence])
                segs = list(filter(lambda seg: seg in ranks,
                                   segmenter.segment(sentence)))
                if 0 == len(segs):  # if a line has no segment in ranks, skip the whole poem
                    flag = False
                    break
                keyword = reduce(lambda x, y: x
                                 if ranks[x] < ranks[y] else y, segs)
                kw_row.append(keyword)
                rows[-1].append(keyword)  # each element of rows becomes [sentence, keyword]
            if flag:
                data.extend(rows)  # extend: data gets one element per sentence
                kw_data.append(kw_row)  # append: kw_data gets one row per poem
        if 0 == (idx + 1) % 2000:
            print("[Training Data] %d/%d poems are processed." % (idx + 1,
                                                                  len(poems)))
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        for row in data:
            fout.write('\t'.join(row) + '\n')  # each line: sentence TAB keyword
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        for kw_row in kw_data:
            fout.write('\t'.join(kw_row) + '\n')
    print("Training data is generated.")