Example #1
0
    char2idx, idx2char, tag2idx, idx2tag = toolbox.get_dic(chars, tags)

    # train_x: shape=(2, number of sentences); the 2 rows are the characters themselves plus their radicals
    train_x, train_y, train_max_slen_c, train_max_slen_w, train_max_wlen = \
        toolbox.get_input_vec(path, train_file, char2idx, tag2idx, rad_dic=rad_dic, tag_scheme=args.tag_scheme)
    dev_x, dev_y, dev_max_slen_c, dev_max_slen_w, dev_max_wlen = \
        toolbox.get_input_vec(path, dev_file, char2idx, tag2idx, rad_dic=rad_dic, tag_scheme=args.tag_scheme)

    # Read the n-gram vectors
    nums_grams = None
    ng_embeddings = None

    if args.ngram > 1:
        gram2idx = toolbox.get_ngram_dic(ngram)
        train_gram = toolbox.get_gram_vec(path, train_file, gram2idx)
        dev_gram = toolbox.get_gram_vec(path, dev_file, gram2idx)
        # After this statement train_x has shape=(4, number of sentences), because the 2-gram and 3-gram vectors were added
        train_x += train_gram
        dev_x += dev_gram
        nums_grams = []
        for dic in gram2idx:
            nums_grams.append(len(dic.keys()))

        if args.ngram_embeddings is not None:
            print 'Reading N-gram Embeddings...'
            short_ng_emb = args.ngram_embeddings[args.ngram_embeddings.
                                                 index('/') + 1:]
            if not os.path.isfile(path + '/' + short_ng_emb + '_' +
                                  str(args.ngram) + 'gram_sub.txt'):
                toolbox.get_ngram_embedding(path, args.ngram_embeddings, ngram)
Example #2
0
    nums_grams = None
    ng_embs = None

    if args.ngram > 1 and (
            args.reset
            or not os.path.isfile(path + '/' + str(args.ngram) + 'gram.txt')):
        toolbox.get_ngrams(path, args.ngram, is_space)

    ngram = toolbox.read_ngrams(path, args.ngram)

    if args.ngram > 1:
        gram2idx = toolbox.get_ngram_dic(ngram)
        train_gram = toolbox.get_gram_vec(path,
                                          'tag_train.txt',
                                          gram2idx,
                                          limit=args.sent_limit,
                                          sent_seg=args.sent_seg,
                                          is_space=is_space,
                                          ignore_space=args.ignore_space)
        dev_gram = toolbox.get_gram_vec(path,
                                        'raw_dev.txt',
                                        gram2idx,
                                        is_raw=True,
                                        limit=args.sent_limit,
                                        sent_seg=args.sent_seg,
                                        is_space=is_space,
                                        ignore_space=args.ignore_space)
        train_x += train_gram
        dev_x += dev_gram
        nums_grams = []
        for dic in gram2idx: