Example #1
def predict(args, processor):
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    config = config_model(args)
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, NERModel, args.output_dir, config, logger)
        test_data = []
        with open(str(args.data_dir / "test.json"), 'r') as f:
            idx = 0
            for line in f:
                tokens = []
                json_d = {}
                line = json.loads(line.strip())
                textlist = list(line['text'])
                for word in textlist:
                    token = tokenizer.tokenize(word)
                    assert len(token) == 1
                    tokens.extend(token)
                # [CLS] and [SEP] are added below, so the raw tokens must fit
                assert len(tokens) <= args.max_seq_len - 2
                ntokens = []
                segment_ids = []
                ntokens.append("[CLS]")  # mark the start of the sentence with [CLS]
                segment_ids.append(0)
                for token in tokens:
                    ntokens.append(token)
                    segment_ids.append(0)
                ntokens.append("[SEP]")
                segment_ids.append(0)
                # append("O") or append("[SEP]") not sure!
                input_ids = tokenizer.convert_tokens_to_ids(ntokens)
                input_len = len(input_ids)
                input_mask = [1] * len(input_ids)
                while len(input_ids) < args.max_seq_len:
                    input_ids.append(0)
                    input_mask.append(0)
                    segment_ids.append(0)
                raw_text = []
                raw_text.append('[CLS]')
                raw_text.extend(textlist)
                raw_text.append('[SEP]')
                assert len(raw_text) == len(ntokens)
                assert len(input_ids) == args.max_seq_len
                assert len(input_mask) == args.max_seq_len
                assert len(segment_ids) == args.max_seq_len

                json_d['id'] = idx
                json_d['input_ids'] = input_ids
                json_d['input_mask'] = input_mask
                json_d['segment_ids'] = segment_ids
                json_d['input_len'] = input_len
                json_d['text'] = raw_text
                idx += 1
                test_data.append(json_d)
        results = []
        train_data = processor.get_train_examples()
        # precomputed retrieval result: for each test sentence, the indices of
        # its most similar training sentences (used below for augmentation)
        test_train = load_pickle(args.data_dir / 'train_test.bin')
        for step, line in enumerate(test_data):
            a_input_ids = []
            a_input_mask = []
            a_label_ids = []
            a_input_lens = []
            a_segment_ids = []
            aux_sentence = [
                train_data[i] for i in test_train[step][:args.aug_num]
            ]
            for s in aux_sentence:
                a_input_ids.append(s['input_ids'])
                # address augmentation: instead of the original label ids, reuse
                # the input mask so every real token carries the (all-ones)
                # address label
                a_label_ids.append(s['input_mask'])
                a_input_mask.append(s['input_mask'])
                a_input_lens.append(s['input_len'])
                a_segment_ids.append(s['segment_ids'])
            input_ids = line['input_ids']
            input_mask = line['input_mask']
            input_lens = line['input_len']
            segment_ids = line['segment_ids']
            batch = {
                'ori': ([input_ids], [input_mask], [[]], [input_lens],
                        [segment_ids]),
                'aug': ([a_input_ids], [a_input_mask], [a_label_ids],
                        [a_input_lens], [a_segment_ids]),
            }
            tags = model.evaluate_line(sess, batch)
            label_entities = get_entities(tags[0], args.id2label)
            json_d = {}
            json_d['id'] = step
            tags[0] = [args.id2label[idx] for idx in tags[0]]
            json_d['tag_seq'] = " ".join(tags[0])
            json_d['entities'] = label_entities
            results.append(json_d)
        print(" ")
        output_predic_file = str(args.output_dir / "test_prediction.json")
        output_submit_file = str(args.output_dir / "cluener_submit.json")
        with open(output_predic_file, "w") as writer:
            for record in results:
                writer.write(json.dumps(record) + '\n')

        test_submit = []
        for x, y in zip(test_data, results):
            json_d = {}
            json_d['id'] = x['id']
            json_d['label'] = {}
            entities = y['entities']
            # x['text'] still contains the [CLS]/[SEP] markers added above, so
            # the entity offsets from the tagged sequence line up with it
            words = x['text']
            if entities:
                for subject in entities:
                    tag = subject[0]
                    start = subject[1]
                    end = subject[2]
                    word = "".join(words[start:end + 1])
                    if tag in json_d['label']:
                        if word in json_d['label'][tag]:
                            json_d['label'][tag][word].append([start, end])
                        else:
                            json_d['label'][tag][word] = [[start, end]]
                    else:
                        json_d['label'][tag] = {}
                        json_d['label'][tag][word] = [[start, end]]
            test_submit.append(json_d)
        json_to_text(output_submit_file, test_submit)
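
Note: all three examples repeat the same aggregation of [tag, start, end]
triples into the nested {tag: {word: [[start, end], ...]}} submission
structure. A minimal standalone sketch of that step, factored into a helper
(entities_to_label is a hypothetical name; the triple format is taken from
the subject[0]/subject[1]/subject[2] indexing above):

def entities_to_label(entities, words):
    """Group [tag, start, end] triples into {tag: {word: [[start, end], ...]}}."""
    label = {}
    for tag, start, end in entities:
        word = "".join(words[start:end + 1])  # the end index is inclusive
        label.setdefault(tag, {}).setdefault(word, []).append([start, end])
    return label

# usage on literal data:
print(entities_to_label([["address", 0, 1], ["name", 3, 4]], list("北京的张三")))
# {'address': {'北京': [[0, 1]]}, 'name': {'张三': [[3, 4]]}}
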
Example #2
def predict(args, model, processor):
    model_path = args.output_dir / 'best-model.bin'
    model = load_model(model, model_path=str(model_path))
    test_data = []
    with open(str(args.data_dir / "test.json"), 'r') as f:
        idx = 0
        for line in f:
            json_d = {}
            line = json.loads(line.strip())
            text = line['text']
            words = list(text)
            labels = ['O'] * len(words)
            json_d['id'] = idx
            json_d['context'] = " ".join(words)
            json_d['tag'] = " ".join(labels)
            json_d['raw_context'] = "".join(words)
            idx += 1
            test_data.append(json_d)
    pbar = ProgressBar(n_total=len(test_data))
    results = []
    for step, line in enumerate(test_data):
        token_a = line['context'].split(" ")
        input_ids = [processor.vocab.to_index(w) for w in token_a]
        input_mask = [1] * len(token_a)
        input_lens = [len(token_a)]
        model.eval()
        with torch.no_grad():
            input_ids = torch.tensor([input_ids], dtype=torch.long)
            input_mask = torch.tensor([input_mask], dtype=torch.long)
            input_lens = torch.tensor([input_lens], dtype=torch.long)
            input_ids = input_ids.to(args.device)
            input_mask = input_mask.to(args.device)
            features = model.forward_loss(input_ids, input_mask, input_lens, input_tags=None)
            tags, _ = model.crf._obtain_labels(features, args.id2label, input_lens)
        label_entities = get_entities(tags[0], args.id2label)
        json_d = {}
        json_d['id'] = step
        json_d['tag_seq'] = " ".join(tags[0])
        json_d['entities'] = label_entities
        results.append(json_d)
        pbar(step=step)
    print(" ")
    output_predic_file = str(args.output_dir / "test_prediction.json")
    output_submit_file = str(args.output_dir / "test_submit.json")
    with open(output_predic_file, "w") as writer:
        for record in results:
            writer.write(json.dumps(record) + '\n')
    test_text = []
    with open(str(args.data_dir / 'test.json'), 'r') as fr:
        for line in fr:
            test_text.append(json.loads(line))
    test_submit = []
    for x, y in zip(test_text, results):
        json_d = {}
        json_d['id'] = x['id']
        json_d['label'] = {}
        entities = y['entities']
        words = list(x['text'])
        if entities:
            for subject in entities:
                tag = subject[0]
                start = subject[1]
                end = subject[2]
                word = "".join(words[start:end + 1])
                if tag in json_d['label']:
                    if word in json_d['label'][tag]:
                        json_d['label'][tag][word].append([start, end])
                    else:
                        json_d['label'][tag][word] = [[start, end]]
                else:
                    json_d['label'][tag] = {}
                    json_d['label'][tag][word] = [[start, end]]
        test_submit.append(json_d)
    json_to_text(output_submit_file, test_submit)
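
get_entities itself is not shown in these snippets; from the
subject[0]/subject[1]/subject[2] indexing it is assumed to return
[type, start, end] triples (end inclusive) over the tag sequence. A minimal
BIO-scheme sketch of that contract (bio_entities is a hypothetical stand-in,
not the repo's function):

def bio_entities(tags):
    """Collect [type, start, end] chunks (end inclusive) from BIO tags."""
    chunks, start, ctype = [], None, None
    for i, tag in enumerate(tags + ['O']):  # the 'O' sentinel flushes the last chunk
        if tag == 'O' or tag.startswith('B-'):
            if ctype is not None:
                chunks.append([ctype, start, i - 1])
                ctype = None
            if tag.startswith('B-'):
                start, ctype = i, tag[2:]
        elif tag.startswith('I-') and ctype != tag[2:]:
            # an I- tag that does not continue the open chunk starts a new one
            if ctype is not None:
                chunks.append([ctype, start, i - 1])
            start, ctype = i, tag[2:]
    return chunks

print(bio_entities(['B-address', 'I-address', 'O', 'B-name', 'I-name']))
# [['address', 0, 1], ['name', 3, 4]]
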
Example #3
def predict(args, model, processor):
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    config = config_model(args)
    config['vocab_size'] = len(processor.vocab)
    config['keep_prob'] = 1.0
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, NERModel, args.output_dir, config, logger)
        test_data = []
        with open(str(args.data_dir / "test.json"), 'r') as f:
            idx = 0
            for line in f:
                json_d = {}
                line = json.loads(line.strip())
                text = line['text']
                words = list(text)
                labels = ['O'] * len(words)
                json_d['id'] = idx
                json_d['context'] = " ".join(words)
                json_d['tag'] = " ".join(labels)
                json_d['raw_context'] = "".join(words)
                idx += 1
                test_data.append(json_d)
        results = []
        for step, line in enumerate(test_data):
            token_a = line['context'].split(" ")
            input_ids = [processor.vocab.to_index(w) for w in token_a]
            input_mask = [1] * len(token_a)
            input_lens = [len(token_a)]

            tags = model.evaluate_line(
                sess, ([input_ids], [input_mask], [[]], input_lens))
            label_entities = get_entities(tags[0], args.id2label)
            json_d = {}
            json_d['id'] = step
            tags[0] = [args.id2label[idx] for idx in tags[0]]
            json_d['tag_seq'] = " ".join(tags[0])
            json_d['entities'] = label_entities
            results.append(json_d)
        print(" ")
        output_predic_file = str(args.output_dir / "test_prediction.json")
        output_submit_file = str(args.output_dir / "cluener_submit.json")
        with open(output_predic_file, "w") as writer:
            for record in results:
                writer.write(json.dumps(record) + '\n')
        test_text = []
        with open(str(args.data_dir / 'test.json'), 'r') as fr:
            for line in fr:
                test_text.append(json.loads(line))
        test_submit = []
        for x, y in zip(test_text, results):
            json_d = {}
            json_d['id'] = x['id']
            json_d['label'] = {}
            entities = y['entities']
            words = list(x['text'])
            if entities:
                for subject in entities:
                    tag = subject[0]
                    start = subject[1]
                    end = subject[2]
                    word = "".join(words[start:end + 1])
                    if tag in json_d['label']:
                        if word in json_d['label'][tag]:
                            json_d['label'][tag][word].append([start, end])
                        else:
                            json_d['label'][tag][word] = [[start, end]]
                    else:
                        json_d['label'][tag] = {}
                        json_d['label'][tag][word] = [[start, end]]
            test_submit.append(json_d)
        json_to_text(output_submit_file, test_submit)
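
A caveat shared by all three examples: json.dumps escapes non-ASCII text by
default, so the Chinese in the prediction files comes out as \uXXXX sequences.
If readable files are wanted, ensure_ascii=False is the standard switch:

import json

record = {"id": 0, "label": {"name": {"张三": [[3, 4]]}}}
print(json.dumps(record))                      # ...{"\u5f20\u4e09": [[3, 4]]}...
print(json.dumps(record, ensure_ascii=False))  # ...{"张三": [[3, 4]]}...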