# Prepare the model inputs: index mappings, train/dev input vectors and,
# when args.ngram > 1, the extra n-gram features (plus the n-gram embedding
# subset file if args.ngram_embeddings is given).
#
# NOTE(review): this fragment had been collapsed onto a single physical
# line, so everything after its first '#' was lexically a comment and never
# executed.  The statement/indentation structure below is reconstructed
# from the statement order -- confirm it against the upstream script.

# Index mappings returned by toolbox.get_dic for chars and tags.
char2idx, idx2char, tag2idx, idx2tag = toolbox.get_dic(chars, tags)

# train_x: shape=(2, number of sentences); the 2 stands for the characters
# themselves plus their radicals.
train_x, train_y, train_max_slen_c, train_max_slen_w, train_max_wlen = \
    toolbox.get_input_vec(path, train_file, char2idx, tag2idx,
                          rad_dic=rad_dic, tag_scheme=args.tag_scheme)
dev_x, dev_y, dev_max_slen_c, dev_max_slen_w, dev_max_wlen = \
    toolbox.get_input_vec(path, dev_file, char2idx, tag2idx,
                          rad_dic=rad_dic, tag_scheme=args.tag_scheme)

# Read the n-gram vectors.
nums_grams = None
ng_embeddings = None

if args.ngram > 1:
    gram2idx = toolbox.get_ngram_dic(ngram)
    train_gram = toolbox.get_gram_vec(path, train_file, gram2idx)
    dev_gram = toolbox.get_gram_vec(path, dev_file, gram2idx)

    # After these two statements train_x: shape=(4, number of sentences),
    # because the 2-gram and 3-gram features were added.
    train_x += train_gram
    dev_x += dev_gram

    # One entry per n-gram dictionary: its number of keys.
    nums_grams = [len(dic.keys()) for dic in gram2idx]

    # NOTE(review): assumed to be nested under `args.ngram > 1` because it
    # relies on `ngram` and `args.ngram` -- confirm against upstream.
    if args.ngram_embeddings is not None:
        # Parenthesised single-argument form prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print('Reading N-gram Embeddings...')

        # Everything after the FIRST '/' of the embeddings path; str.index
        # raises ValueError when the path contains no '/'.
        short_ng_emb = args.ngram_embeddings[
            args.ngram_embeddings.index('/') + 1:]

        # Only call toolbox.get_ngram_embedding when the '<n>gram_sub.txt'
        # file for these embeddings is not already present under `path`.
        sub_emb_file = (path + '/' + short_ng_emb + '_' + str(args.ngram)
                        + 'gram_sub.txt')
        if not os.path.isfile(sub_emb_file):
            toolbox.get_ngram_embedding(path, args.ngram_embeddings, ngram)
# N-gram feature setup (variant that reads 'tag_train.txt' / 'raw_dev.txt').
#
# NOTE(review): the statements below are collapsed onto one physical line
# and the fragment stops at the `for dic in gram2idx:` header, whose body is
# not visible here; the original line structure needs to be restored before
# this can run.  The nesting described below is inferred from statement
# order -- confirm against the upstream script.
#
# What the visible statements do, in order:
#   * reset nums_grams and ng_embs to None;
#   * when args.ngram > 1 and either args.reset is set or the file
#     path + '/' + str(args.ngram) + 'gram.txt' does not exist, call
#     toolbox.get_ngrams(path, args.ngram, is_space);
#   * load `ngram` via toolbox.read_ngrams(path, args.ngram);
#   * when args.ngram > 1, build gram2idx with toolbox.get_ngram_dic(ngram),
#     compute gram vectors for 'tag_train.txt' and for 'raw_dev.txt' (the
#     latter with is_raw=True), both with the sent_limit / sent_seg /
#     is_space / ignore_space options, extend train_x and dev_x with them
#     in place, and start a fresh nums_grams list before iterating over
#     gram2idx.
nums_grams = None ng_embs = None if args.ngram > 1 and ( args.reset or not os.path.isfile(path + '/' + str(args.ngram) + 'gram.txt')): toolbox.get_ngrams(path, args.ngram, is_space) ngram = toolbox.read_ngrams(path, args.ngram) if args.ngram > 1: gram2idx = toolbox.get_ngram_dic(ngram) train_gram = toolbox.get_gram_vec(path, 'tag_train.txt', gram2idx, limit=args.sent_limit, sent_seg=args.sent_seg, is_space=is_space, ignore_space=args.ignore_space) dev_gram = toolbox.get_gram_vec(path, 'raw_dev.txt', gram2idx, is_raw=True, limit=args.sent_limit, sent_seg=args.sent_seg, is_space=is_space, ignore_space=args.ignore_space) train_x += train_gram dev_x += dev_gram nums_grams = [] for dic in gram2idx: