pixels = toolbox.read_chars_pixels(path, font_name, args.picture_size)
char2idx, idx2char, tag2idx, idx2tag = toolbox.get_dic(chars, tags)

# train_x: shape = (2, number of sentences); the 2 channels are the
# characters themselves and their radicals.
train_x, train_y, train_max_slen_c, train_max_slen_w, train_max_wlen = \
    toolbox.get_input_vec(path, train_file, char2idx, tag2idx,
                          rad_dic=rad_dic, tag_scheme=args.tag_scheme)
dev_x, dev_y, dev_max_slen_c, dev_max_slen_w, dev_max_wlen = \
    toolbox.get_input_vec(path, dev_file, char2idx, tag2idx,
                          rad_dic=rad_dic, tag_scheme=args.tag_scheme)

# Read the n-gram vectors.
nums_grams = None
ng_embeddings = None
if args.ngram > 1:
    gram2idx = toolbox.get_ngram_dic(ngram)
    train_gram = toolbox.get_gram_vec(path, train_file, gram2idx)
    dev_gram = toolbox.get_gram_vec(path, dev_file, gram2idx)
    # After these two lines train_x has shape (4, number of sentences),
    # because the 2-gram and 3-gram channels have been appended.
    train_x += train_gram
    dev_x += dev_gram
    nums_grams = []
    for dic in gram2idx:
        nums_grams.append(len(dic.keys()))
    if args.ngram_embeddings is not None:
        print('Reading N-gram Embeddings...')
        short_ng_emb = args.ngram_embeddings[args.ngram_embeddings.index('/') + 1:]
        if not os.path.isfile(path + '/' + short_ng_emb + '_' +
                              str(args.ngram) + 'gram_sub.txt'):
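To make the shape comments above concrete, here is a toy sketch (all index values are invented, not real toolbox output) of how the feature channels accumulate: train_x starts as a list of two parallel channels, and `train_x += train_gram` simply extends that list with the 2-gram and 3-gram channels.

# Toy illustration only; every index value below is made up.
char_ids    = [[5, 9, 2], [7, 1]]       # one list of character indices per sentence
radical_ids = [[3, 3, 8], [4, 6]]       # radical indices for the same sentences
train_x = [char_ids, radical_ids]       # "shape (2, number of sentences)"

bigram_ids  = [[11, 12, 13], [14, 15]]  # 2-gram indices
trigram_ids = [[21, 22, 23], [24, 25]]  # 3-gram indices
train_gram = [bigram_ids, trigram_ids]

train_x += train_gram                   # plain list extension: now 4 channels
assert len(train_x) == 4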
def tokenize(self, text):
    """Tokenize the input text.

    :param text: the text to tokenize.
    :return: a list of the sentences contained in the text. Each element is
        a list of tokens, and each token is a tuple made of its raw string
        and its offset in the text.
    """
    grams, gram2idx = None, None
    # Load the character/tag dictionaries saved with the trained model.
    char2idx, unk_chars_idx, idx2char, tag2idx, idx2tag, _ = \
        toolbox.get_dicts(self.path, True, self.tag_scheme, self.crf)
    if self.ngram > 1:
        grams = toolbox.read_ngrams(self.path, self.ngram)

    # The toolbox readers are file-based, so write the input text to a
    # temporary file first.
    raw_file_f = tempfile.NamedTemporaryFile(mode='w')
    raw_file_f.write(text)
    raw_file_f.flush()
    raw_file = raw_file_f.name

    # Index the characters that were unseen at training time, keeping only
    # those covered by the pre-trained embeddings when they are available.
    new_chars, new_grams = None, None
    new_chars = toolbox.get_new_chars(raw_file, char2idx, self.is_space)
    if self.emb_path is not None:
        valid_chars = toolbox.get_valid_chars(new_chars, self.emb_path)
    else:
        valid_chars = None
    char2idx, idx2char, unk_chars_idx, sub_dict = \
        toolbox.update_char_dict(char2idx, new_chars, unk_chars_idx,
                                 valid_chars)

    raw_x, raw_len = toolbox.get_input_vec_tag(
        None, raw_file, char2idx, limit=self.sent_limit + 100,
        is_space=self.is_space)

    if self.ngram > 1:
        gram2idx = toolbox.get_ngram_dic(grams)
        new_grams = toolbox.get_new_grams(raw_file, gram2idx, is_raw=True,
                                          is_space=self.is_space)
        raw_grams = toolbox.get_gram_vec_tag(
            None, raw_file, gram2idx, limit=self.sent_limit + 100,
            is_space=self.is_space)
        raw_x += raw_grams

    # Pad every feature channel to a fixed sentence length.
    for k in range(len(raw_x)):
        raw_x[k] = toolbox.pad_zeros(raw_x[k], self.sent_limit + 100)

    # Extend the model's embedding tables with the newly seen characters,
    # then run the tagger over the padded input.
    with self.main_graph.as_default():
        self.model.define_updates(new_chars=new_chars,
                                  emb_path=self.emb_path,
                                  char2idx=char2idx)
    with tf.device(self.gpu_config):
        self.model.run_updates(self.main_sess)

    sentences = self.model.tag(raw_x, idx2tag, idx2char, unk_chars_idx,
                               sub_dict, self.sess,
                               batch_size=self.tag_batch)
    return sentences
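For reference, a minimal usage sketch. The wrapper class name and constructor arguments below are assumptions made up for illustration; only tokenize() and the shape of its return value (sentences, each a list of (raw string, offset) tuples) come from the docstring above.

# Hypothetical wrapper name and constructor; only tokenize() and its
# return shape are taken from the method above.
tokenizer = Tokenizer(model_dir='models/zh')   # assumed constructor
sentences = tokenizer.tokenize('这是一个例子。这是第二句。')
for sentence in sentences:
    for raw, offset in sentence:
        print(offset, raw)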