```python
# Build the radical dictionary (optional feature).
rad_dic = None
if args.radical:
    print 'Using Radical dictionary...'
    rad_dic = toolbox.get_radical_dic()

# Read character glyphs as pixel images (optional feature).
pixels = None
if args.pixels:
    print 'Reading characters as pixels...'
    font_name = args.font[:args.font.index('.')]
    if not os.path.isfile(path + '/' + font_name + str(args.picture_size) + '_pixels.txt'):
        toolbox.get_chars_pixels(path, chars, args.font, args.picture_size)
    pixels = toolbox.read_chars_pixels(path, font_name, args.picture_size)

char2idx, idx2char, tag2idx, idx2tag = toolbox.get_dic(chars, tags)

# train_x: shape = (2, num_sentences); the 2 streams are the characters
# themselves plus their radicals.
train_x, train_y, train_max_slen_c, train_max_slen_w, train_max_wlen = \
    toolbox.get_input_vec(path, train_file, char2idx, tag2idx, rad_dic=rad_dic,
                          tag_scheme=args.tag_scheme)
dev_x, dev_y, dev_max_slen_c, dev_max_slen_w, dev_max_wlen = \
    toolbox.get_input_vec(path, dev_file, char2idx, tag2idx, rad_dic=rad_dic,
                          tag_scheme=args.tag_scheme)

# Read the n-gram vectors.
nums_grams = None
ng_embs = None
if args.ngram > 1:
    gram2idx = toolbox.get_ngram_dic(ngram)
    train_gram = toolbox.get_gram_vec(path, train_file, gram2idx)
    dev_gram = toolbox.get_gram_vec(path, dev_file, gram2idx)
    # After these two lines train_x has shape (4, num_sentences), because
    # the 2-gram and 3-gram streams have been appended.
    train_x += train_gram
    dev_x += dev_gram
    nums_grams = []
    for dic in gram2idx:
        nums_grams.append(len(dic.keys()))
```
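To make the shape comments above concrete, here is a toy sketch of how `train_x` is laid out; the indices are made up and `toolbox` is not involved:

```python
# Toy illustration of the train_x layout (made-up indices, not toolbox output).
# Each element of train_x is one feature stream, parallel over the sentences.
char_ids = [[4, 7, 2], [9, 3]]       # stream 0: character indices per sentence
radical_ids = [[1, 5, 1], [2, 2]]    # stream 1: radical indices per sentence
train_x = [char_ids, radical_ids]    # the "(2, num_sentences)" shape

bigram_ids = [[11, 12, 13], [14, 15]]    # hypothetical 2-gram indices
trigram_ids = [[21, 22, 23], [24, 25]]   # hypothetical 3-gram indices
train_gram = [bigram_ids, trigram_ids]

train_x += train_gram    # now 4 streams: chars, radicals, 2-grams, 3-grams
assert len(train_x) == 4
```

So `train_x += train_gram` does not concatenate within sentences; it appends whole feature streams, which is why the "shape" grows from (2, num_sentences) to (4, num_sentences).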
```python
if args.embeddings is not None:
    short_emb = args.embeddings[args.embeddings.index('/') + 1: args.embeddings.index('.')]
    if not os.path.isfile(path + '/' + short_emb + '_sub.txt'):
        toolbox.get_sample_embedding(path, args.embeddings, char2idx)
    emb_dim, emb, valid_chars = toolbox.read_sample_embedding(path, short_emb, char2idx)
    # Characters that received a pretrained vector are no longer unknown.
    for vch in valid_chars:
        if vch in unk_chars:
            unk_chars.remove(vch)
else:
    emb_dim = args.embeddings_dimension
    emb = None

train_x, train_y, max_len_train = toolbox.get_input_vec(
    path, 'tag_train.txt', char2idx, tag2idx, limit=args.sent_limit,
    sent_seg=args.sent_seg, is_space=is_space, train_size=args.train_size,
    ignore_space=args.ignore_space)
dev_x, max_len_dev = toolbox.get_input_vec_raw(
    path, 'raw_dev.txt', char2idx, limit=args.sent_limit,
    sent_seg=args.sent_seg, is_space=is_space, ignore_space=args.ignore_space)

if args.sent_seg:
    print 'Joint sentence segmentation...'
```
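The `get_sample_embedding` / `read_sample_embedding` pair above caches a vocabulary-restricted copy of the pretrained embeddings as `<short_emb>_sub.txt`, so the full embedding file is only scanned once. A minimal sketch of that caching pattern, assuming a word2vec-style text format; `subset_embeddings` and the file names are hypothetical, not the toolbox API:

```python
import os

def subset_embeddings(emb_path, out_path, vocab):
    # Hypothetical stand-in for toolbox.get_sample_embedding: keep only the
    # lines whose token appears in the task vocabulary and cache them.
    with open(emb_path) as src, open(out_path, 'w') as dst:
        for line in src:
            token = line.split(' ', 1)[0]
            if token in vocab:
                dst.write(line)

# Cache on first use, mirroring the os.path.isfile check above:
# if not os.path.isfile('emb_sub.txt'):
#     subset_embeddings('emb.txt', 'emb_sub.txt', set(char2idx))
```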
```python
if args.embeddings is not None:
    short_emb = args.embeddings[args.embeddings.index('/') + 1: args.embeddings.index('.')]
    if not os.path.isfile(path + '/' + short_emb + '_sub.txt'):
        toolbox.get_sample_embedding(path, args.embeddings, map(lambda x: x[0], chars))
    emb_dim, emb = toolbox.read_sample_embedding(path, short_emb)
    assert args.embeddings_dimension == emb_dim
else:
    print 'Using random embeddings...'

char2idx, idx2char, char2freq, tag2idx, idx2tag = toolbox.get_dic(
    chars, tags, args.char_freq_loss)

# train_x: shape = (2, num_sentences); the 2 streams are the characters
# themselves plus their radicals.
train_x, train_y, train_max_slen_c, train_max_slen_w, train_max_wlen = \
    toolbox.get_input_vec(path, train_file, char2idx, tag2idx, char2freq,
                          tag_scheme=args.tag_scheme)
dev_x, dev_y, dev_max_slen_c, dev_max_slen_w, dev_max_wlen = \
    toolbox.get_input_vec(path, dev_file, char2idx, tag2idx, char2freq,
                          tag_scheme=args.tag_scheme)

# Read the n-gram vectors.
nums_grams = None
ng_embeddings = None
if ngram > 1:
    gram2idx = toolbox.get_ngram_dic(ngram)
    train_gram = toolbox.get_gram_vec(path, train_file, gram2idx)
    dev_gram = toolbox.get_gram_vec(path, dev_file, gram2idx)
    # After these two lines train_x has shape (4, num_sentences), because
    # the 2-gram and 3-gram streams have been appended.
    train_x += train_gram
    dev_x += dev_gram
    nums_grams = []
    for dic in gram2idx:
        nums_grams.append(len(dic.keys()))
```
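The final loop sizes `nums_grams` from the n-gram dictionaries, so the model can allocate one embedding matrix per n-gram order. A rough sketch of what such a dictionary might contain; `build_gram_dic` is a hypothetical simplification, not `toolbox.get_ngram_dic`:

```python
from collections import Counter

def build_gram_dic(sentences, n, min_count=1):
    # Hypothetical simplification: count character n-grams (with edge
    # padding) and map each surviving n-gram to an index; 0 is reserved
    # for padding/unknown.
    counts = Counter()
    for sent in sentences:
        padded = '#' * (n - 1) + sent + '#' * (n - 1)
        for i in range(len(padded) - n + 1):
            counts[padded[i:i + n]] += 1
    grams = [g for g, c in counts.items() if c >= min_count]
    return {g: i + 1 for i, g in enumerate(grams)}

sents = ['上海浦东', '开发银行']
gram2idx = [build_gram_dic(sents, n) for n in (2, 3)]
nums_grams = [len(dic) for dic in gram2idx]   # one vocab size per n-gram order
print(nums_grams)
```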