Example #1
0
    # Read the character image (pixel) information.
    pixels = None
    if args.pixels:
        print 'Reading characters as pixels...'
        # Strip the extension from the font file name (e.g. 'simsun.ttf' -> 'simsun').
        # NOTE(review): str.index raises ValueError if the font name has no dot — confirm callers always pass one.
        font_name = args.font[:args.font.index('.')]
        # Render the character pixels once and cache them on disk; later runs reuse the file.
        if not os.path.isfile(path + '/' + font_name + str(args.picture_size) +
                              '_pixels.txt'):
            toolbox.get_chars_pixels(path, chars, args.font, args.picture_size)
        pixels = toolbox.read_chars_pixels(path, font_name, args.picture_size)

    # Build bidirectional character/tag index maps.
    char2idx, idx2char, tag2idx, idx2tag = toolbox.get_dic(chars, tags)

    # train_x: shape=(2, num_sentences); the 2 feature rows are the characters
    # themselves plus their radicals (rad_dic).
    train_x, train_y, train_max_slen_c, train_max_slen_w, train_max_wlen = \
        toolbox.get_input_vec(path, train_file, char2idx, tag2idx, rad_dic=rad_dic, tag_scheme=args.tag_scheme)
    dev_x, dev_y, dev_max_slen_c, dev_max_slen_w, dev_max_wlen = \
        toolbox.get_input_vec(path, dev_file, char2idx, tag2idx, rad_dic=rad_dic, tag_scheme=args.tag_scheme)

    # Read the n-gram vectors (only when n-gram features are enabled).
    nums_grams = None
    ng_embeddings = None

    if args.ngram > 1:
        gram2idx = toolbox.get_ngram_dic(ngram)
        train_gram = toolbox.get_gram_vec(path, train_file, gram2idx)
        dev_gram = toolbox.get_gram_vec(path, dev_file, gram2idx)
        # After this line train_x has shape (4, num_sentences), because the
        # 2-gram and 3-gram feature rows were appended.
        train_x += train_gram
        dev_x += dev_gram
        nums_grams = []
Example #2
0
    # Optional radical-dictionary feature.
    rad_dic = None
    if args.radical:
        print 'Using Radical dictionary...'
        rad_dic = toolbox.get_radical_dic()

    # Optional character-pixel feature: render each character with the given
    # font at picture_size and cache the pixel matrix on disk for reuse.
    pixels = None
    if args.pixels:
        print 'Reading characters as pixels...'
        # NOTE(review): str.index raises ValueError when the font file name has
        # no extension dot — confirm callers always pass one.
        font_name = args.font[:args.font.index('.')]
        if not os.path.isfile(path + '/' + font_name + str(args.picture_size) + '_pixels.txt'):
            toolbox.get_chars_pixels(path, chars, args.font, args.picture_size)
        pixels = toolbox.read_chars_pixels(path, font_name, args.picture_size)

    # Build bidirectional character/tag index maps.
    char2idx, idx2char, tag2idx, idx2tag = toolbox.get_dic(chars, tags)

    # Vectorize the training/dev corpora (characters plus optional radicals).
    train_x, train_y, train_max_slen_c, train_max_slen_w, train_max_wlen = toolbox.get_input_vec(path, train_file, char2idx, tag2idx, rad_dic=rad_dic, tag_scheme=args.tag_scheme)
    dev_x, dev_y, dev_max_slen_c, dev_max_slen_w, dev_max_wlen = toolbox.get_input_vec(path, dev_file, char2idx, tag2idx, rad_dic=rad_dic, tag_scheme=args.tag_scheme)

    # Optional n-gram features.
    nums_grams = None
    ng_embs = None

    if args.ngram > 1:
        gram2idx = toolbox.get_ngram_dic(ngram)
        train_gram = toolbox.get_gram_vec(path, train_file, gram2idx)
        dev_gram = toolbox.get_gram_vec(path, dev_file, gram2idx)
        # Append the n-gram feature rows to the input vectors.
        train_x += train_gram
        dev_x += dev_gram
        nums_grams = []
        # Record the vocabulary size of each n-gram order.
        for dic in gram2idx:
            nums_grams.append(len(dic.keys()))
Example #3
0
                                            '_sub.txt'):
            toolbox.get_sample_embedding(path, args.embeddings, char2idx)
        # Load the subset of pretrained embeddings that covers the corpus
        # vocabulary.
        emb_dim, emb, valid_chars = toolbox.read_sample_embedding(
            path, short_emb, char2idx)
        # Characters covered by the pretrained embeddings are no longer
        # treated as unknown.
        for vch in valid_chars:
            if vch in unk_chars:
                unk_chars.remove(vch)
    else:
        # No pretrained embeddings: keep the configured dimension and let the
        # model initialize vectors randomly (emb is None).
        emb_dim = args.embeddings_dimension
        emb = None

    # Vectorize the tagged training corpus.
    train_x, train_y, max_len_train = toolbox.get_input_vec(
        path,
        'tag_train.txt',
        char2idx,
        tag2idx,
        limit=args.sent_limit,
        sent_seg=args.sent_seg,
        is_space=is_space,
        train_size=args.train_size,
        ignore_space=args.ignore_space)

    # Raw (untagged) dev input: only x vectors and the max length are produced.
    dev_x, max_len_dev = toolbox.get_input_vec_raw(
        path,
        'raw_dev.txt',
        char2idx,
        limit=args.sent_limit,
        sent_seg=args.sent_seg,
        is_space=is_space,
        ignore_space=args.ignore_space)
    if args.sent_seg:
        print 'Joint sentence segmentation...'
        # Derive the short embedding name from its path,
        # e.g. 'dir/glove.txt' -> 'glove'.
        # NOTE(review): assumes args.embeddings contains both '/' and '.' —
        # str.index raises ValueError otherwise; confirm.
        short_emb = args.embeddings[args.embeddings.index('/') +
                                    1:args.embeddings.index('.')]
        if not os.path.isfile(path + '/' + short_emb + '_sub.txt'):
            toolbox.get_sample_embedding(path, args.embeddings,
                                         map(lambda x: x[0], chars))
        emb_dim, emb = toolbox.read_sample_embedding(path, short_emb)
        assert args.embeddings_dimension == emb_dim
    else:
        print 'Using random embeddings...'

    # Index maps, including per-character frequencies when char_freq_loss is
    # enabled.
    char2idx, idx2char, char2freq, tag2idx, idx2tag = toolbox.get_dic(
        chars, tags, args.char_freq_loss)

    # train_x: shape=(2, num_sentences). NOTE(review): the original comment
    # said "characters + radicals", but this call passes char2freq rather than
    # rad_dic — verify what the second feature row is here.
    train_x, train_y, train_max_slen_c, train_max_slen_w, train_max_wlen = \
        toolbox.get_input_vec(path, train_file, char2idx, tag2idx, char2freq, tag_scheme=args.tag_scheme)
    dev_x, dev_y, dev_max_slen_c, dev_max_slen_w, dev_max_wlen = \
        toolbox.get_input_vec(path, dev_file, char2idx, tag2idx, char2freq, tag_scheme=args.tag_scheme)

    # Read the n-gram vectors.
    nums_grams = None
    ng_embeddings = None
    # NOTE(review): sibling snippets test `args.ngram > 1` here; bare `ngram`
    # may be a distinct local in this context — verify against the full file.
    if ngram > 1:
        gram2idx = toolbox.get_ngram_dic(ngram)
        train_gram = toolbox.get_gram_vec(path, train_file, gram2idx)
        dev_gram = toolbox.get_gram_vec(path, dev_file, gram2idx)
        # After this line train_x has shape (4, num_sentences), because the
        # 2-gram and 3-gram feature rows were appended.
        train_x += train_gram
        dev_x += dev_gram
        nums_grams = []
        for dic in gram2idx: