    print 'Test set: %d instances.' % len(test_x[0])

    max_step = test_max_slen_c

    print 'Longest sentence by character is %d. ' % test_max_slen_c
    print 'Longest sentence by word is %d. ' % test_max_slen_w
    print 'Longest word is %d. ' % test_max_wlen

    # Optional pixel (glyph image) features for characters unseen in training.
    if graphic:
        new_pixels = toolbox.get_new_pixels(new_chars, font, pic_size)
        pixels += new_pixels

    # Optional character n-gram features for the test set.
    if ngram > 1:
        gram2idx = toolbox.get_ngram_dic(grams)
        new_grams = toolbox.get_new_grams(path + '/' + test_file, gram2idx)
        if args.ngram_embeddings is not None:
            new_grams = toolbox.get_valid_grams(new_grams, args.ngram_embeddings)
        gram2idx = toolbox.update_gram_dicts(gram2idx, new_grams)
        test_gram = toolbox.get_gram_vec(path, test_file, gram2idx)
        test_x += test_gram

    # Pad every input and output channel to max_step
    # (see the zero-padding sketch after this block).
    for k in range(len(test_x)):
        test_x[k] = toolbox.pad_zeros(test_x[k], max_step)
    for k in range(len(test_y)):
        test_y[k] = toolbox.pad_zeros(test_y[k], max_step)

elif args.action == 'tag':
    assert args.raw is not None
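# Illustration (assumption, not part of the original toolbox): pad_zeros is
# used above to bring every feature channel to the common length max_step.
# A minimal stand-alone sketch of such zero-padding, with a hypothetical
# helper name:
def pad_to_length(sequences, max_len):
    """Right-pad (and truncate) each sequence with zeros to max_len."""
    return [(list(seq) + [0] * max_len)[:max_len] for seq in sequences]

# Example: pad_to_length([[3, 1, 4], [1, 5]], 5)
# -> [[3, 1, 4, 0, 0], [1, 5, 0, 0, 0]]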
def tokenize(self, text):
    """Tokenize the input text.

    :param text: the text to tokenize.
    :return: a list of the sentences contained in the text. Each element is
        a list of tokens, and each token is a tuple made of its raw string
        and its offset in the text.
    """
    grams, gram2idx = None, None
    (char2idx, unk_chars_idx, idx2char, tag2idx,
     idx2tag, _) = toolbox.get_dicts(self.path, True, self.tag_scheme, self.crf)
    if self.ngram > 1:
        grams = toolbox.read_ngrams(self.path, self.ngram)

    # Write the raw text to a temporary file so the toolbox readers can use it.
    raw_file_f = tempfile.NamedTemporaryFile(mode='w')
    raw_file_f.write(text)
    raw_file_f.flush()
    raw_file = raw_file_f.name

    # Extend the character dictionary with characters unseen during training.
    new_chars, new_grams = None, None
    new_chars = toolbox.get_new_chars(raw_file, char2idx, self.is_space)
    if self.emb_path is not None:
        valid_chars = toolbox.get_valid_chars(new_chars, self.emb_path)
    else:
        valid_chars = None
    (char2idx, idx2char, unk_chars_idx,
     sub_dict) = toolbox.update_char_dict(char2idx, new_chars, unk_chars_idx,
                                          valid_chars)

    # Vectorise the raw text.
    raw_x, raw_len = toolbox.get_input_vec_tag(None, raw_file, char2idx,
                                               limit=self.sent_limit + 100,
                                               is_space=self.is_space)

    # Optional character n-gram features.
    if self.ngram > 1:
        gram2idx = toolbox.get_ngram_dic(grams)
        new_grams = toolbox.get_new_grams(raw_file, gram2idx, is_raw=True,
                                          is_space=self.is_space)
        raw_grams = toolbox.get_gram_vec_tag(None, raw_file, gram2idx,
                                             limit=self.sent_limit + 100,
                                             is_space=self.is_space)
        raw_x += raw_grams

    # Pad every channel to a fixed length.
    for k in range(len(raw_x)):
        raw_x[k] = toolbox.pad_zeros(raw_x[k], self.sent_limit + 100)

    # Propagate the updated dictionaries/embeddings to the model, then tag.
    with self.main_graph.as_default():
        self.model.define_updates(new_chars=new_chars, emb_path=self.emb_path,
                                  char2idx=char2idx)
    with tf.device(self.gpu_config):
        self.model.run_updates(self.main_sess)

    sentences = self.model.tag(raw_x, idx2tag, idx2char, unk_chars_idx,
                               sub_dict, self.sess, batch_size=self.tag_batch)
    return sentences
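# Usage sketch (assumption: "tokenizer" stands for an instance of the
# surrounding class, constructed elsewhere with its model and dictionaries
# loaded). Per the docstring above, tokenize() returns one list per sentence,
# each holding (raw_string, offset) tuples:
#
#     sentences = tokenizer.tokenize(u'Hello world. How are you?')
#     for sentence in sentences:
#         for token, offset in sentence:
#             print(token, offset)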
                                  limit=args.sent_limit + 100,
                                  sent_seg=sent_seg, is_space=is_space,
                                  ignore_space=args.ignore_space)

max_step = max_len_test

if sent_seg:
    print 'Joint sentence segmentation...'
else:
    print 'Test set: %d instances.' % len(test_x[0])

if ngram > 1:
    gram2idx = toolbox.get_ngram_dic(grams)
    new_grams = toolbox.get_new_grams(path + '/' + test_file, gram2idx,
                                      is_space=is_space)
    test_grams = toolbox.get_gram_vec(path, 'raw_test.txt', gram2idx,
                                      is_raw=True, limit=args.sent_limit + 100,
                                      sent_seg=sent_seg, is_space=is_space,
                                      ignore_space=args.ignore_space)
    test_x += test_grams

for k in range(len(test_x)):
    test_x[k] = toolbox.pad_zeros(test_x[k], max_step)
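# Illustration (assumption, not the actual toolbox implementation): the gram
# features appended to test_x above are index sequences obtained by looking up
# character n-grams in gram2idx, with a reserved index for unseen grams. A
# minimal sketch for bigrams, using a hypothetical helper and unknown index 0:
def bigram_indices(chars, gram2idx, unk_idx=0):
    """Map each adjacent character pair to its index in gram2idx."""
    bigrams = [chars[i] + chars[i + 1] for i in range(len(chars) - 1)]
    return [gram2idx.get(bg, unk_idx) for bg in bigrams]

# Example: bigram_indices(list(u'ABCD'), {u'AB': 1, u'CD': 2}) -> [1, 0, 2]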
    if sent_seg:
        print 'Joint sentence segmentation...'
    else:
        print 'Raw sentences: %d instances.' % len(raw_x[0])
    max_step = raw_len
else:
    max_step = args.sent_limit

# Optional character n-gram features for the raw input.
if ngram > 1:
    gram2idx = toolbox.get_ngram_dic(grams)
    new_grams = toolbox.get_new_grams(raw_file, gram2idx, is_raw=True,
                                      is_space=is_space)

    if not args.segment_large:
        if sent_seg:
            raw_grams = toolbox.get_gram_vec_tag(None, raw_file, gram2idx,
                                                 limit=args.sent_limit + 100,
                                                 is_space=is_space)
        else:
            raw_grams = toolbox.get_gram_vec(None, raw_file, gram2idx,
                                             is_raw=True,