or (not os.path.isfile(path + '/' + 'chars.txt')):
    # The line above is the tail of a condition whose beginning is outside
    # this view. When it holds, the vocabulary/tag files are (re)built from
    # the training and development files.
    # NOTE(review): presumably this is what creates chars.txt -- confirm
    # against toolbox.get_vocab_tag.
    toolbox.get_vocab_tag(path, [train_file, dev_file], ngram=args.ngram)

# Read the text information (characters, tags and n-grams).
chars, tags, ngram = toolbox.read_vocab_tag(path, args.ngram)

# Read the pre-trained character embeddings.
# Defaults: no embedding matrix, dimension taken from the command line.
emb = None
emb_dim = args.embeddings_dimension
if args.word_vector:
    if args.embeddings is not None:
        print 'Reading embeddings...'
        # Short name = text between the FIRST '/' and the FIRST '.' of the
        # embeddings path; it is used as the prefix of the '_sub.txt' file.
        # NOTE(review): str.index raises ValueError when the path has no '/'
        # or no '.', and a path with several directories keeps an inner '/'
        # in the name. Not changed here because toolbox may derive the same
        # name internally -- verify before switching to os.path helpers.
        short_emb = args.embeddings[args.embeddings.index('/') + 1:args.embeddings.index('.')]
        # Only call get_sample_embedding when the '_sub.txt' file is missing.
        # NOTE(review): presumably that call writes the file -- confirm.
        if not os.path.isfile(path + '/' + short_emb + '_sub.txt'):
            toolbox.get_sample_embedding(path, args.embeddings, chars)
        emb_dim, emb = toolbox.read_sample_embedding(path, short_emb)
        # The dimension given on the command line must match the loaded one.
        # NOTE(review): assert is stripped under `python -O`, so this check
        # silently disappears in optimized runs.
        assert args.embeddings_dimension == emb_dim
    else:
        print 'Using random embeddings...'
else:
    # Without word vectors, args.pixels must be set.
    assert args.pixels

# Read the radical dictionary.
rad_dic = None
if args.radical:
    print 'Using Radical dictionary...'
    rad_dic = toolbox.get_radical_dic()

# Read the character image information.
# NOTE(review): the body of the following `if` lies outside this view.
pixels = None
if args.pixels:
tag_scheme=args.tags)

# NOTE(review): the line above closes a call whose beginning is outside this
# view; it is reproduced unchanged.

# Call get_chars when a reset is requested or chars.txt does not exist yet.
# NOTE(review): presumably get_chars (re)creates chars.txt from the two raw
# files -- confirm against toolbox.
if args.reset or not os.path.isfile(path + '/chars.txt'):
    toolbox.get_chars(path, ['raw_train.txt', 'raw_dev.txt'], sea=is_space)

# Lookup tables in both directions for characters and tags, plus the list of
# unknown characters and a transition dictionary (names as returned here;
# their exact contents are defined in toolbox.get_dicts).
char2idx, unk_chars, idx2char, tag2idx, idx2tag, trans_dict = toolbox.get_dicts(
    path, args.sent_seg, args.tags, args.crf)

if args.embeddings is not None:
    print 'Reading embeddings...'
    # Short name = text between the FIRST '/' and the FIRST '.' of the
    # embeddings path; it is used as the prefix of the '_sub.txt' file.
    # NOTE(review): str.index raises ValueError when the path has no '/' or
    # no '.', and a path with several directories keeps an inner '/' in the
    # name. Not changed here because toolbox may derive the same name
    # internally -- verify before switching to os.path helpers.
    short_emb = args.embeddings[args.embeddings.index('/') + 1:args.embeddings.index('.')]
    # Call get_sample_embedding on reset or when the '_sub.txt' file is missing.
    if args.reset or not os.path.isfile(path + '/' + short_emb + '_sub.txt'):
        toolbox.get_sample_embedding(path, args.embeddings, char2idx)
    emb_dim, emb, valid_chars = toolbox.read_sample_embedding(
        path, short_emb, char2idx)
    # Every character returned in valid_chars is dropped from unk_chars.
    # NOTE(review): if unk_chars is a list, each `in`/`remove` is a linear
    # scan; a set would be cheaper -- check the type returned by get_dicts.
    for vch in valid_chars:
        if vch in unk_chars:
            unk_chars.remove(vch)
else:
    # No pre-trained file: keep the configured dimension and no matrix.
    emb_dim = args.embeddings_dimension
    emb = None

# Build the training inputs from tag_train.txt.
# NOTE(review): the argument list continues beyond this view.
train_x, train_y, max_len_train = toolbox.get_input_vec(
    path, 'tag_train.txt', char2idx, tag2idx, limit=args.sent_limit,
    sent_seg=args.sent_seg, is_space=is_space,