Example #1
0
            or (not os.path.isfile(path + '/' + 'chars.txt')):
        toolbox.get_vocab_tag(path, [train_file, dev_file], ngram=args.ngram)
    # Read the text/vocabulary information: read_vocab_tag is called with
    # path and args.ngram, and its three return values are bound to
    # chars, tags and ngram.
    chars, tags, ngram = toolbox.read_vocab_tag(path, args.ngram)

    # Read the pre-trained character embeddings.
    # emb stays None unless an embedding file is read below; emb_dim starts
    # from args.embeddings_dimension and is overwritten by the value read back.
    emb = None
    emb_dim = args.embeddings_dimension
    if args.word_vector:
        if args.embeddings is not None:
            print 'Reading embeddings...'
            # short_emb is the text between the FIRST '/' and the FIRST '.'
            # of args.embeddings; str.index raises ValueError when either
            # character is absent.
            # NOTE(review): a value with several '/' or a leading './' yields
            # a surprising slice here -- confirm the expected format.
            short_emb = args.embeddings[args.embeddings.index('/') +
                                        1:args.embeddings.index('.')]
            # Only call get_sample_embedding when '<short_emb>_sub.txt' is
            # not already present under path (presumably that call creates
            # the file -- confirm in toolbox).
            if not os.path.isfile(path + '/' + short_emb + '_sub.txt'):
                toolbox.get_sample_embedding(path, args.embeddings, chars)
            emb_dim, emb = toolbox.read_sample_embedding(path, short_emb)
            # The dimension read back must equal the configured one.
            # NOTE: assert statements are skipped when Python runs with -O.
            assert args.embeddings_dimension == emb_dim
        else:
            print 'Using random embeddings...'
    else:
        # With word vectors disabled, args.pixels must be truthy.
        assert args.pixels

    # Read the radical dictionary. rad_dic stays None unless args.radical
    # is set, in which case it holds the result of toolbox.get_radical_dic().
    rad_dic = None
    if args.radical:
        print 'Using Radical dictionary...'
        rad_dic = toolbox.get_radical_dic()

    # Read the character image (pixel) information; pixels defaults to None.
    pixels = None
    if args.pixels:
Example #2
0
                                 tag_scheme=args.tags)

    # Call get_chars on the raw train/dev files when args.reset is set or
    # path + '/chars.txt' does not exist (presumably this (re)generates
    # chars.txt -- confirm in toolbox).
    if args.reset or not os.path.isfile(path + '/chars.txt'):
        toolbox.get_chars(path, ['raw_train.txt', 'raw_dev.txt'], sea=is_space)

    # get_dicts returns six values, unpacked here into the char/tag lookup
    # tables, the unknown-character collection and trans_dict.
    char2idx, unk_chars, idx2char, tag2idx, idx2tag, trans_dict = toolbox.get_dicts(
        path, args.sent_seg, args.tags, args.crf)

    # Load embeddings when args.embeddings is given; otherwise fall back to
    # emb = None with the configured dimension.
    if args.embeddings is not None:
        print 'Reading embeddings...'
        # short_emb is the text between the FIRST '/' and the FIRST '.' of
        # args.embeddings; str.index raises ValueError when either character
        # is absent.
        # NOTE(review): a value with several '/' or a leading './' yields a
        # surprising slice here -- confirm the expected format.
        short_emb = args.embeddings[args.embeddings.index('/') +
                                    1:args.embeddings.index('.')]
        # Call get_sample_embedding when args.reset is set or
        # '<short_emb>_sub.txt' is missing under path (presumably that call
        # creates the file -- confirm in toolbox).
        if args.reset or not os.path.isfile(path + '/' + short_emb +
                                            '_sub.txt'):
            toolbox.get_sample_embedding(path, args.embeddings, char2idx)
        emb_dim, emb, valid_chars = toolbox.read_sample_embedding(
            path, short_emb, char2idx)
        # Every character returned in valid_chars is taken out of unk_chars.
        # NOTE(review): if unk_chars is a list, remove() drops only the first
        # occurrence and each membership test is linear -- confirm its type.
        for vch in valid_chars:
            if vch in unk_chars:
                unk_chars.remove(vch)
    else:
        # No embeddings given: use the configured dimension and no matrix.
        # valid_chars is not bound on this branch.
        emb_dim = args.embeddings_dimension
        emb = None

    train_x, train_y, max_len_train = toolbox.get_input_vec(
        path,
        'tag_train.txt',
        char2idx,
        tag2idx,
        limit=args.sent_limit,
        sent_seg=args.sent_seg,
        is_space=is_space,