# Make sure the vocabulary files exist under `path`: when either the n-gram
# file or chars.txt is missing, toolbox.get_vocab_tag is called on the
# training and development files (presumably to generate them).
ngram = args.ngram
_ngram_vocab_file = path + '/' + str(ngram) + 'gram.txt'
_char_vocab_file = path + '/' + 'chars.txt'
if not (os.path.isfile(_ngram_vocab_file) and os.path.isfile(_char_vocab_file)):
    toolbox.get_vocab_tag(path, [train_file, dev_file], ngram=ngram)

# Read the text information (characters, tags, n-grams).
chars, tags, ngram = toolbox.read_vocab_tag(path, ngram)

# Embeddings: `emb` stays None unless a pretrained file is supplied.
emb = None
emb_dim = args.embeddings_dimension
if args.embeddings is None:
    print('Using random embeddings...')
else:
    # Read the pretrained character embeddings.
    print('Reading embeddings...')
    # The short name is the slice between the first '/' and the first '.'
    # of the embeddings path; str.index raises ValueError if either
    # character is absent.
    # NOTE(review): a path with several directories or dots yields a
    # surprising short name -- confirm against how toolbox names the file.
    _emb_source = args.embeddings
    _name_start = _emb_source.index('/') + 1
    _name_stop = _emb_source.index('.')
    short_emb = _emb_source[_name_start:_name_stop]
    # Build the '<short_emb>_sub.txt' sample only when it is not cached yet;
    # the helper receives the first element of every `chars` entry.
    if not os.path.isfile(path + '/' + short_emb + '_sub.txt'):
        toolbox.get_sample_embedding(
            path, _emb_source, map(lambda item: item[0], chars))
    emb_dim, emb = toolbox.read_sample_embedding(path, short_emb)
    # The dimension read from disk must match the requested one.
    assert args.embeddings_dimension == emb_dim

char2idx, idx2char, char2freq, tag2idx, idx2tag = toolbox.get_dic(
    chars, tags, args.char_freq_loss)

# train_x: shape=(2, number of sentences); the 2 stands for the character
# itself plus its radical (translated from the original author's note).
(train_x, train_y,
 train_max_slen_c, train_max_slen_w, train_max_wlen) = toolbox.get_input_vec(
    path, train_file, char2idx, tag2idx, char2freq,
    tag_scheme=args.tag_scheme)
(dev_x, dev_y,
 dev_max_slen_c, dev_max_slen_w, dev_max_wlen) = toolbox.get_input_vec(
    path, dev_file, char2idx, tag2idx, char2freq,
    tag_scheme=args.tag_scheme)
# Read the n-gram vectors.
# (Re)build the vocabulary when the n-gram file is needed but missing
# (only checked for ngram > 1), or when chars.txt is missing.
_ngram_vocab_file = path + '/' + str(args.ngram) + 'gram.txt'
_char_vocab_file = path + '/' + 'chars.txt'
if (args.ngram > 1 and not os.path.isfile(_ngram_vocab_file)) \
        or not os.path.isfile(_char_vocab_file):
    toolbox.get_vocab_tag(path, [train_file, dev_file], ngram=args.ngram)

# Read the text information (characters, tags, n-grams).
chars, tags, ngram = toolbox.read_vocab_tag(path, args.ngram)

# Read the pretrained character embeddings.
emb = None
emb_dim = args.embeddings_dimension
if not args.word_vector:
    # Without character vectors the pixel input has to be enabled.
    assert args.pixels
elif args.embeddings is None:
    print('Using random embeddings...')
else:
    print('Reading embeddings...')
    # The short name is the slice between the first '/' and the first '.'
    # of the embeddings path; str.index raises ValueError if either
    # character is absent.
    _emb_source = args.embeddings
    _name_start = _emb_source.index('/') + 1
    _name_stop = _emb_source.index('.')
    short_emb = _emb_source[_name_start:_name_stop]
    # Build the '<short_emb>_sub.txt' sample only when it is not cached yet.
    if not os.path.isfile(path + '/' + short_emb + '_sub.txt'):
        toolbox.get_sample_embedding(path, _emb_source, chars)
    emb_dim, emb = toolbox.read_sample_embedding(path, short_emb)
    # The dimension read from disk must match the requested one.
    assert args.embeddings_dimension == emb_dim

# Read the radical dictionary.
rad_dic = None
if args.radical:
    print('Using Radical dictionary...')
    rad_dic = toolbox.get_radical_dic()

# Read the character image (pixel) information.
pixels = None
tag_scheme=args.tags)
# ^ tail of a call that starts before the visible part of the file.

# Call toolbox.get_chars when a reset is requested or chars.txt is missing
# (presumably this regenerates chars.txt -- confirm in toolbox).
# NOTE(review): this check uses `path` while the code below uses `path_`;
# confirm the two are meant to differ.
if args.reset or not os.path.isfile(path + '/chars.txt'):
    toolbox.get_chars(path, ['raw_train.txt', 'raw_dev.txt'], sea=is_space)

# Character/tag lookup tables plus the collection of unknown-character indices.
char2idx, unk_chars_idx, idx2char, tag2idx, idx2tag, trans_dict = toolbox.get_dicts_new(
    path_, args.sent_seg, args.tags, args.crf)

if args.embeddings is not None:
    print 'Reading embeddings...'
    # Short name = slice between the first '/' and the first '.' of the
    # embeddings path; str.index raises ValueError if either is absent.
    short_emb = args.embeddings[args.embeddings.index('/') + 1:args.embeddings.index('.')]
    # Build the '<short_emb>_sub.txt' sample on reset or when it is not cached.
    if args.reset or not os.path.isfile(path_ + '/' + short_emb + '_sub.txt'):
        toolbox.get_sample_embedding(path_, args.embeddings, char2idx)
    emb_dim, emb, valid_chars = toolbox.read_sample_embedding(
        path_, short_emb, char2idx)
    # Every character reported in valid_chars is removed from the
    # unknown-index collection (presumably because the embedding file
    # covers it -- confirm in toolbox.read_sample_embedding).
    for vch in valid_chars:
        if char2idx[vch] in unk_chars_idx:
            unk_chars_idx.remove(char2idx[vch])
else:
    # No pretrained file: keep the configured dimension and no matrix.
    emb_dim = args.emb_dimension
    emb = None

# Start of a call whose remaining arguments lie beyond the visible part
# of the file.
train_x1, train_x2, train_y, max_len_train = toolbox.get_input_vec_new(
    path_, char2idx, tag2idx, limit=args.sent_limit, sent_seg=args.sent_seg,