コード例 #1
0
def refine(ch_rhyme, ch, alignment=False, topn=50):
    """Return a neighbour of ``ch`` that shares the rhyme of ``ch_rhyme``.

    The Word2Vec model stored at ``_w2v_model_path`` is asked for the ``topn``
    entries most similar to ``ch``.  The first of them that is present in the
    ``ch2int`` vocabulary returned by ``get_vocab()`` and whose rhyme (as
    reported by ``RhymeUtil.get_rhyme``) equals the rhyme of ``ch_rhyme`` is
    returned.

    Args:
        ch_rhyme: UTF-8 encoded bytes, or already-decoded text, whose rhyme
            is the target.
        ch: UTF-8 encoded bytes, or already-decoded text, to find a
            replacement for.
        alignment: Kept for backward compatibility only.  The original code
            loaded the same model in both branches of its conditional, so the
            flag has never had any effect.
        topn: Number of most-similar entries requested from the model.

    Returns:
        The first matching neighbour, or ``ch`` exactly as it was passed in
        when no neighbour matches.
    """

    def to_text(value):
        # Decode only real byte strings; text passes through untouched so the
        # function no longer fails on input that is already decoded.
        if isinstance(value, (bytes, bytearray)):
            return value.decode("utf-8")
        return value

    model = models.Word2Vec.load(_w2v_model_path)
    rdict = RhymeUtil()
    # Only the character -> index mapping is needed for the membership test.
    _, ch2int = get_vocab()
    rhyme = rdict.get_rhyme(to_text(ch_rhyme))

    neighbours = model.wv.most_similar(positive=[to_text(ch)], topn=topn)
    for neighbour in neighbours:
        # Each entry is indexed at 0 for the word itself, as in the original.
        candidate = neighbour[0]
        if candidate in ch2int and rdict.get_rhyme(candidate) == rhyme:
            return candidate
    return ch
コード例 #2
0
def _strip_parenthesized(body):
    """Return ``body`` without its ``(...)`` spans, or ``None`` if unbalanced.

    Spans are removed one at a time from the first ``(`` to the first ``)``.
    ``None`` is returned when a ``)`` is missing or appears before its ``(``,
    or when a stray ``)`` is left over after all ``(`` have been consumed.
    """
    left = body.find('(')
    while left >= 0:
        right = body.find(')')
        if right < left:
            # Covers both "no closing parenthesis" (-1) and ")" before "(".
            return None
        body = body[:left] + body[right + 1:]
        left = body.find('(')
    if body.find(')') >= 0:
        return None
    return body


def _parse_corpus(raw_file, json_file):
    """Parse a tab-separated poem corpus and dump the kept poems as JSON.

    The first line of ``raw_file`` is a tab-separated list of column tags.
    Every following line is split on tabs; the column tagged ``'body'`` is the
    poem text and every other column is stored in the poem dict under its tag.
    Reading stops at the first line that is empty after stripping whitespace.

    A poem is kept only when its body has balanced parentheses (parenthesized
    spans are removed) and every character of every sentence returned by
    ``split_sentences`` is accepted by ``RhymeUtil.has_char``.  Rows without a
    ``'body'`` column are skipped; previously such a row silently reused the
    body of the preceding row, or raised ``NameError`` on the first row.

    Args:
        raw_file: Path of the UTF-8 encoded, tab-separated corpus file.
        json_file: Path the resulting list of poems is written to as JSON.

    Returns:
        The list of poem dicts.  Each has a ``'source'`` key (base name of
        ``raw_file``), a ``'sentences'`` key and one key per non-body column.

    Raises:
        IndexError: If a row has more columns than the header line.
    """
    print("Parsing %s ..." % raw_file, end=' ')
    # NOTE: the original carried a disabled ``sys.stdout.flush()`` here,
    # marked "use in linux"; stdout is therefore not flushed explicitly.
    rdict = RhymeUtil()
    source = os.path.basename(raw_file)
    data = []
    with codecs.open(raw_file, 'r', 'utf-8') as fin:
        tags = fin.readline().strip().split('\t')
        line = fin.readline().strip()
        while line:
            poem = {'source': source}
            # Reset per row so a missing body column cannot inherit the
            # previous row's text.
            body = None
            for idx, tok in enumerate(line.split('\t')):
                if tags[idx] != 'body':
                    poem[tags[idx]] = tok
                else:
                    body = tok
            if body is not None:
                body = _strip_parenthesized(body)
            if body is not None:
                sentences = split_sentences(body)
                poem['sentences'] = sentences
                # Keep the poem only if every character is known to rdict.
                if all(rdict.has_char(ch)
                       for sentence in sentences
                       for ch in sentence):
                    data.append(poem)
            line = fin.readline().strip()
    with codecs.open(json_file, 'w', 'utf-8') as fout:
        json.dump(data, fout)
    print("Done (%d poems)" % len(data))
    return data
コード例 #3
0
ファイル: corpus.py プロジェクト: ASE-couplet/couplet
def _parse_couplet(raw_file, json_file):
    """Parse a couplet file into two-sentence poems and dump them as JSON.

    ``raw_file`` is read in groups of three lines: the first two (stripped of
    surrounding whitespace) form the couplet, and the third is read and
    discarded.  Reading stops as soon as either line of a pair is empty after
    stripping.

    Args:
        raw_file: Path of the UTF-8 encoded couplet file.
        json_file: Path the resulting list of poems is written to as JSON.

    Returns:
        The list of poem dicts, each with a ``'source'`` key (base name of
        ``raw_file``) and a ``'sentences'`` key holding the two lines.
    """
    # Written through sys.stdout so the progress output is the same on
    # Python 2 and Python 3: "Parsing <file> ... Done (<n> poems)".
    sys.stdout.write("Parsing %s ..." % raw_file + ' ')
    sys.stdout.flush()
    source = os.path.basename(raw_file)
    data = []
    with codecs.open(raw_file, 'r', 'utf-8') as fin:
        line1 = fin.readline().strip()
        line2 = fin.readline().strip()
        while line1 and line2:
            data.append({'source': source, 'sentences': [line1, line2]})
            # The line following each pair is consumed and ignored.
            fin.readline()
            line1 = fin.readline().strip()
            line2 = fin.readline().strip()
    with codecs.open(json_file, 'w', 'utf-8') as fout:
        json.dump(data, fout)
    sys.stdout.write("Done (%d poems)" % len(data) + '\n')
    return data