def predict(args, processor):
    # limit GPU memory growth
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    config = config_model(args)
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, NERModel, args.output_dir, config, logger)
        test_data = []
        with open(str(args.data_dir / "test.json"), 'r') as f:
            idx = 0
            for line in f:
                tokens = []
                json_d = {}
                line = json.loads(line.strip())
                textlist = list(line['text'])
                for i, word in enumerate(textlist):
                    token = tokenizer.tokenize(word)
                    assert len(token) == 1
                    tokens.extend(token)
                assert len(tokens) < args.max_seq_len
                ntokens = []
                segment_ids = []
                label_ids = []
                ntokens.append("[CLS]")  # prepend [CLS] to mark the start of the sentence
                segment_ids.append(0)
                for i, token in enumerate(tokens):
                    ntokens.append(token)
                    segment_ids.append(0)
                ntokens.append("[SEP]")
                segment_ids.append(0)  # append("O") or append("[SEP]") not sure!
                input_ids = tokenizer.convert_tokens_to_ids(ntokens)
                input_len = len(input_ids)
                input_mask = [1] * len(input_ids)
                while len(input_ids) < args.max_seq_len:
                    input_ids.append(0)
                    input_mask.append(0)
                    segment_ids.append(0)
                raw_text = []
                raw_text.append('[CLS]')
                raw_text.extend(textlist)
                raw_text.append('[SEP]')
                assert len(raw_text) == len(ntokens)
                assert len(input_ids) == args.max_seq_len
                assert len(input_mask) == args.max_seq_len
                assert len(segment_ids) == args.max_seq_len
                json_d['id'] = idx
                json_d['input_ids'] = input_ids
                json_d['input_mask'] = input_mask
                json_d['segment_ids'] = segment_ids
                json_d['input_len'] = input_len
                json_d['text'] = raw_text
                idx += 1
                test_data.append(json_d)
        results = []
        train_data = processor.get_train_examples()
        test_train = load_pickle(args.data_dir / 'train_test.bin')
        for step, line in enumerate(test_data):
            a_input_ids = []
            a_input_mask = []
            a_label_ids = []
            a_input_lens = []
            a_segment_ids = []
            aux_sentence = [train_data[i] for i in test_train[step][:args.aug_num]]
            for s in aux_sentence:
                a_input_ids.append(s['input_ids'])
                # a_label_ids.append(s['label_ids'])
                # address-information augmentation: replace all label info with the
                # address tag (all ones), so the input mask doubles as the label ids
                a_label_ids.append(s['input_mask'])
                a_input_mask.append(s['input_mask'])
                a_input_lens.append(s['input_len'])
                a_segment_ids.append(s['segment_ids'])
            input_ids = line['input_ids']
            input_mask = line['input_mask']
            input_lens = line['input_len']
            segment_ids = line['segment_ids']
            batch = {
                'ori': ([input_ids], [input_mask], [[]], [input_lens], [segment_ids]),
                'aug': ([a_input_ids], [a_input_mask], [a_label_ids],
                        [a_input_lens], [a_segment_ids])
            }
            tags = model.evaluate_line(sess, batch)
            label_entities = get_entities(tags[0], args.id2label)
            json_d = {}
            json_d['id'] = step
            tags[0] = [args.id2label[tag_id] for tag_id in tags[0]]
            json_d['tag_seq'] = " ".join(tags[0])
            json_d['entities'] = label_entities
            results.append(json_d)
        print(" ")
        output_predic_file = str(args.output_dir / "test_prediction.json")
        output_submit_file = str(args.output_dir / "cluener_submit.json")
        with open(output_predic_file, "w") as writer:
            for record in results:
                writer.write(json.dumps(record) + '\n')
        test_submit = []
        for x, y in zip(test_data, results):
            json_d = {}
            json_d['id'] = x['id']
            json_d['label'] = {}
            entities = y['entities']
            words = x['text']  # includes the [CLS]/[SEP] markers added above
            if len(entities) != 0:
                for subject in entities:
                    tag = subject[0]
                    start = subject[1]
                    end = subject[2]
                    word = "".join(words[start:end + 1])
                    if tag in json_d['label']:
                        if word in json_d['label'][tag]:
                            json_d['label'][tag][word].append([start, end])
                        else:
                            json_d['label'][tag][word] = [[start, end]]
                    else:
                        json_d['label'][tag] = {}
                        json_d['label'][tag][word] = [[start, end]]
            test_submit.append(json_d)
        json_to_text(output_submit_file, test_submit)
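
# Illustration only (the token ids and lengths below are made up): the batch handed
# to model.evaluate_line() above is a dict of two 5-tuples of
# (input_ids, input_mask, label_ids, input_lens, segment_ids). 'ori' holds the single
# test sentence; 'aug' holds the args.aug_num training sentences retrieved for it via
# train_test.bin, with their input_mask reused as all-ones label ids.
demo_batch = {
    'ori': ([[101, 8, 9, 102]],      # one padded test sentence ([CLS] ... [SEP])
            [[1, 1, 1, 1]],          # attention mask
            [[]],                    # no gold labels at prediction time
            [4],                     # unpadded length
            [[0, 0, 0, 0]]),         # segment ids
    'aug': ([[[101, 8, 102, 0], [101, 9, 102, 0]]],   # aug_num retrieved sentences
            [[[1, 1, 1, 0], [1, 1, 1, 0]]],
            [[[1, 1, 1, 0], [1, 1, 1, 0]]],           # mask reused as label ids
            [[3, 3]],
            [[[0, 0, 0, 0], [0, 0, 0, 0]]]),
}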
def predict(args, model, processor):
    model_path = args.output_dir / 'best-model.bin'
    model = load_model(model, model_path=str(model_path))
    test_data = []
    with open(str(args.data_dir / "test.json"), 'r') as f:
        idx = 0
        for line in f:
            json_d = {}
            line = json.loads(line.strip())
            text = line['text']
            words = list(text)
            labels = ['O'] * len(words)
            json_d['id'] = idx
            json_d['context'] = " ".join(words)
            json_d['tag'] = " ".join(labels)
            json_d['raw_context'] = "".join(words)
            idx += 1
            test_data.append(json_d)
    pbar = ProgressBar(n_total=len(test_data))
    results = []
    for step, line in enumerate(test_data):
        token_a = line['context'].split(" ")
        input_ids = [processor.vocab.to_index(w) for w in token_a]
        input_mask = [1] * len(token_a)
        input_lens = [len(token_a)]
        model.eval()
        with torch.no_grad():
            input_ids = torch.tensor([input_ids], dtype=torch.long)
            input_mask = torch.tensor([input_mask], dtype=torch.long)
            input_lens = torch.tensor([input_lens], dtype=torch.long)
            input_ids = input_ids.to(args.device)
            input_mask = input_mask.to(args.device)
            features = model.forward_loss(input_ids, input_mask, input_lens, input_tags=None)
            tags, _ = model.crf._obtain_labels(features, args.id2label, input_lens)
        label_entities = get_entities(tags[0], args.id2label)
        json_d = {}
        json_d['id'] = step
        json_d['tag_seq'] = " ".join(tags[0])
        json_d['entities'] = label_entities
        results.append(json_d)
        pbar(step=step)
    print(" ")
    output_predic_file = str(args.output_dir / "test_prediction.json")
    output_submit_file = str(args.output_dir / "test_submit.json")
    with open(output_predic_file, "w") as writer:
        for record in results:
            writer.write(json.dumps(record) + '\n')
    test_text = []
    with open(str(args.data_dir / 'test.json'), 'r') as fr:
        for line in fr:
            test_text.append(json.loads(line))
    test_submit = []
    for x, y in zip(test_text, results):
        json_d = {}
        json_d['id'] = x['id']
        json_d['label'] = {}
        entities = y['entities']
        words = list(x['text'])
        if len(entities) != 0:
            for subject in entities:
                tag = subject[0]
                start = subject[1]
                end = subject[2]
                word = "".join(words[start:end + 1])
                if tag in json_d['label']:
                    if word in json_d['label'][tag]:
                        json_d['label'][tag][word].append([start, end])
                    else:
                        json_d['label'][tag][word] = [[start, end]]
                else:
                    json_d['label'][tag] = {}
                    json_d['label'][tag][word] = [[start, end]]
        test_submit.append(json_d)
    json_to_text(output_submit_file, test_submit)
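
# A made-up example of one record appended to test_submit above (entity text and
# tags are illustrative): each record nests tag -> entity text -> list of
# [start, end] spans, where start/end are inclusive character offsets into the
# original test sentence. json_to_text() then writes one such record per line.
example_submit_record = {
    "id": 0,
    "label": {
        "address": {"北京": [[0, 1]]},
        "name": {"张三": [[5, 6]]},
    },
}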
def predict(args, model, processor):
    # limit GPU memory growth
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    config = config_model(args)
    config['vocab_size'] = len(processor.vocab)
    config['keep_prob'] = 1.0
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, NERModel, args.output_dir, config, logger)
        test_data = []
        with open(str(args.data_dir / "test.json"), 'r') as f:
            idx = 0
            for line in f:
                json_d = {}
                line = json.loads(line.strip())
                text = line['text']
                words = list(text)
                labels = ['O'] * len(words)
                json_d['id'] = idx
                json_d['context'] = " ".join(words)
                json_d['tag'] = " ".join(labels)
                json_d['raw_context'] = "".join(words)
                idx += 1
                test_data.append(json_d)
        results = []
        for step, line in enumerate(test_data):
            token_a = line['context'].split(" ")
            input_ids = [processor.vocab.to_index(w) for w in token_a]
            input_mask = [1] * len(token_a)
            input_lens = [len(token_a)]
            tags = model.evaluate_line(sess, ([input_ids], [input_mask], [[]], input_lens))
            label_entities = get_entities(tags[0], args.id2label)
            json_d = {}
            json_d['id'] = step
            tags[0] = [args.id2label[tag_id] for tag_id in tags[0]]
            json_d['tag_seq'] = " ".join(tags[0])
            json_d['entities'] = label_entities
            results.append(json_d)
        print(" ")
        output_predic_file = str(args.output_dir / "test_prediction.json")
        output_submit_file = str(args.output_dir / "cluener_submit.json")
        with open(output_predic_file, "w") as writer:
            for record in results:
                writer.write(json.dumps(record) + '\n')
        test_text = []
        with open(str(args.data_dir / 'test.json'), 'r') as fr:
            for line in fr:
                test_text.append(json.loads(line))
        test_submit = []
        for x, y in zip(test_text, results):
            json_d = {}
            json_d['id'] = x['id']
            json_d['label'] = {}
            entities = y['entities']
            words = list(x['text'])
            if len(entities) != 0:
                for subject in entities:
                    tag = subject[0]
                    start = subject[1]
                    end = subject[2]
                    word = "".join(words[start:end + 1])
                    if tag in json_d['label']:
                        if word in json_d['label'][tag]:
                            json_d['label'][tag][word].append([start, end])
                        else:
                            json_d['label'][tag][word] = [[start, end]]
                    else:
                        json_d['label'][tag] = {}
                        json_d['label'][tag][word] = [[start, end]]
            test_submit.append(json_d)
        json_to_text(output_submit_file, test_submit)
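
# Illustration only (ids are made up): unlike the augmented variant above, this
# evaluate_line() call takes a flat 4-tuple (input_ids, input_mask, label_ids,
# input_lens) with batch size 1, no padding, no segment ids and no 'aug' branch,
# since each test sentence is scored on its own with the character-level vocab.
demo_feed = ([[5, 8, 9]],   # character ids from processor.vocab
             [[1, 1, 1]],   # attention mask
             [[]],          # no gold labels at prediction time
             [3])           # sentence length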