Example No. 1
    def compute_train_pre(self, label_paths, pred_paths):
        """
        train过程中计算每个batch的精确率
        """
        #origin = []
        found = []
        right = []
        for label_path, pre_path in zip(label_paths, pred_paths):
            label_entities = get_entities(label_path, self.id2label)
            pre_entities = get_entities(pre_path, self.id2label)
            #origin.extend(label_entities)
            found.extend(pre_entities)
            right.extend([pre_entity for pre_entity in pre_entities if pre_entity in label_entities])

        return 0 if len(found) == 0 else len(right) / len(found)
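The precision above counts exact entity matches, so its behaviour depends on what get_entities returns. Its source is not shown here, but the later examples index each result as subject[0] (tag), subject[1] (start) and subject[2] (end, inclusive), so it presumably yields chunks of the form [type, start, end]. A minimal sketch of that assumption (the label strings are made up; in the method above, id2label may first be needed to map ids back to label strings):

labels = ['O', 'B-PER', 'I-PER', 'O', 'B-LOC']
# the prediction code in the later examples reads chunk[0] = type, chunk[1] = start,
# chunk[2] = end, so get_entities(labels, id2label) presumably returns something like:
expected = [['PER', 1, 2], ['LOC', 4, 4]]  # end index is inclusive (words[start:end + 1])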
Example No. 2
    def update(self, label_paths, pred_paths):
        '''
        Accumulate gold, predicted, and correctly predicted entities.

        :param label_paths: list of gold label sequences, e.g. [[], [], ...]
        :param pred_paths: list of predicted label sequences, e.g. [[], [], ...]
        :return: None
        Example:
            >>> label_paths = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
            >>> pred_paths = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        '''
        for label_path, pre_path in zip(label_paths, pred_paths):
            label_entities = get_entities(label_path, self.id2label, self.markup)
            pre_entities = get_entities(pre_path, self.id2label, self.markup)
            self.origins.extend(label_entities)
            self.founds.extend(pre_entities)
            self.rights.extend([pre_entity for pre_entity in pre_entities if pre_entity in label_entities])
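Once update has accumulated origins (gold entities), founds (predicted entities) and rights (their exact-match intersection), micro precision, recall and F1 follow directly, as Example No. 8 computes at the end. A minimal sketch with zero-division guards; micro_scores is an illustrative name, not part of the project:

def micro_scores(origins, founds, rights):
    # illustrative helper: micro precision / recall / F1 over exact entity matches,
    # with zero-division guards added as a safety net (the originals divide directly)
    precision = len(rights) / len(founds) if founds else 0.0
    recall = len(rights) / len(origins) if origins else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1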
Example No. 3
    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for i, line in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line['words']
            labels = line['labels']
            subject = get_entities(labels, id2label=None, markup='bios')
            examples.append(InputExample(guid=guid, text_a=text_a, subject=subject))
        return examples
Example No. 4
    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for i, line in enumerate(lines):
            if i == 0:
                continue  # skip the first line
            guid = "%s-%s" % (set_type, i)
            text_a = line['words']
            labels = []
            for x in line['labels']:
                # map BMES-style tags onto the BIOS scheme: M- and E- both become I-
                if 'M-' in x:
                    labels.append(x.replace('M-', 'I-'))
                elif 'E-' in x:
                    labels.append(x.replace('E-', 'I-'))
                else:
                    labels.append(x)
            if i < 2:  # debug output for the first kept example
                print(text_a)
                print(labels)
            subject = get_entities(labels, id2label=None, markup='bios')
            examples.append(InputExample(guid=guid, text_a=text_a, subject=subject))
        return examples
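The inner loop above converts BMES-style tags into the BIOS scheme expected by get_entities(..., markup='bios'): both M- and E- collapse to I-, while everything else (including S-) passes through unchanged. A quick illustration with made-up labels:

raw = ['B-LOC', 'M-LOC', 'E-LOC', 'O', 'S-PER']
bios = [x.replace('M-', 'I-').replace('E-', 'I-') for x in raw]
print(bios)  # ['B-LOC', 'I-LOC', 'I-LOC', 'O', 'S-PER'] -- S- tags are kept as-is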
Example No. 5
def predict(args, model, processor):
    model_path = args.output_dir / 'best-model.bin'
    model = load_model(model, model_path=str(model_path))
    test_data = []
    with open(str(args.data_dir / "test.json"), 'r') as f:
        idx = 0
        for line in f:
            json_d = {}
            line = json.loads(line.strip())
            text = line['text']
            words = list(text)
            labels = ['O'] * len(words)
            json_d['id'] = idx
            json_d['context'] = " ".join(words)
            json_d['tag'] = " ".join(labels)
            json_d['raw_context'] = "".join(words)
            idx += 1
            test_data.append(json_d)
    pbar = ProgressBar(n_total=len(test_data))
    results = []
    for step, line in enumerate(test_data):
        token_a = line['context'].split(" ")
        input_ids = [processor.vocab.to_index(w) for w in token_a]
        input_mask = [1] * len(token_a)
        input_lens = [len(token_a)]
        model.eval()
        with torch.no_grad():
            input_ids = torch.tensor([input_ids], dtype=torch.long)
            input_mask = torch.tensor([input_mask], dtype=torch.long)
            input_lens = torch.tensor([input_lens], dtype=torch.long)
            input_ids = input_ids.to(args.device)
            input_mask = input_mask.to(args.device)
            features = model.forward_loss(input_ids, input_mask, input_lens, input_tags=None)
            tags, _ = model.crf._obtain_labels(features, args.id2label, input_lens)
        label_entities = get_entities(tags[0], args.id2label)
        json_d = {}
        json_d['id'] = step
        json_d['tag_seq'] = " ".join(tags[0])
        json_d['entities'] = label_entities
        results.append(json_d)
        pbar(step=step)
    print(" ")
    output_predic_file = str(args.output_dir / "test_prediction.json")
    output_submit_file = str(args.output_dir / "test_submit.json")
    with open(output_predic_file, "w") as writer:
        for record in results:
            writer.write(json.dumps(record) + '\n')
    test_text = []
    with open(str(args.data_dir / 'test.json'), 'r') as fr:
        for line in fr:
            test_text.append(json.loads(line))
    test_submit = []
    for x, y in zip(test_text, results):
        json_d = {}
        json_d['id'] = x['id']
        json_d['label'] = {}
        entities = y['entities']
        words = list(x['text'])
        if len(entities) != 0:
            for subject in entities:
                tag = subject[0]
                start = subject[1]
                end = subject[2]
                word = "".join(words[start:end + 1])
                if tag in json_d['label']:
                    if word in json_d['label'][tag]:
                        json_d['label'][tag][word].append([start, end])
                    else:
                        json_d['label'][tag][word] = [[start, end]]
                else:
                    json_d['label'][tag] = {}
                    json_d['label'][tag][word] = [[start, end]]
        test_submit.append(json_d)
    json_to_text(output_submit_file, test_submit)
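The nested submission structure built in the final loop above (and repeated almost verbatim in Examples No. 6 and 7) can be summarised as a small helper. This is only an illustrative sketch, assuming entities are [tag, start, end] triples with an inclusive end index, as the indexing above implies; entities_to_label_dict is a hypothetical name, not part of the project:

def entities_to_label_dict(entities, words):
    # illustrative helper: build the nested
    # {tag: {entity_text: [[start, end], ...]}} structure written to the submit file
    label = {}
    for tag, start, end in entities:
        word = "".join(words[start:end + 1])  # end index is inclusive
        label.setdefault(tag, {}).setdefault(word, []).append([start, end])
    return label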
Example No. 6
def predict(args, processor):
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    config = config_model(args)
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, NERModel, args.output_dir, config, logger)
        test_data = []
        with open(str(args.data_dir / "test.json"), 'r') as f:
            idx = 0
            for line in f:
                tokens = []
                json_d = {}
                line = json.loads(line.strip())
                textlist = list(line['text'])
                for i, word in enumerate(textlist):
                    token = tokenizer.tokenize(word)
                    assert len(token) == 1
                    tokens.extend(token)
                assert len(tokens) < args.max_seq_len
                ntokens = []
                segment_ids = []
                label_ids = []
                ntokens.append("[CLS]")  # 句子开始设置CLS 标志
                segment_ids.append(0)
                for i, token in enumerate(tokens):
                    ntokens.append(token)
                    segment_ids.append(0)
                ntokens.append("[SEP]")
                segment_ids.append(0)
                # append("O") or append("[SEP]") not sure!
                input_ids = tokenizer.convert_tokens_to_ids(ntokens)
                input_len = len(input_ids)
                input_mask = [1] * len(input_ids)
                while len(input_ids) < args.max_seq_len:
                    input_ids.append(0)
                    input_mask.append(0)
                    segment_ids.append(0)
                raw_text = []
                raw_text.append('[CLS]')
                raw_text.extend(textlist)
                raw_text.append('[SEP]')
                assert len(raw_text) == len(ntokens)
                assert len(input_ids) == args.max_seq_len
                assert len(input_mask) == args.max_seq_len
                assert len(segment_ids) == args.max_seq_len

                json_d['id'] = idx
                json_d['input_ids'] = input_ids
                json_d['input_mask'] = input_mask
                json_d['segment_ids'] = segment_ids
                json_d['input_len'] = input_len
                json_d['text'] = raw_text
                idx += 1
                test_data.append(json_d)
        results = []
        train_data = processor.get_train_examples()
        test_train = load_pickle(args.data_dir / 'train_test.bin')
        for step, line in enumerate(test_data):
            a_input_ids = []
            a_input_mask = []
            a_label_ids = []
            a_input_lens = []
            a_segment_ids = []
            aux_sentence = [
                train_data[i] for i in test_train[step][:args.aug_num]
            ]
            for s in aux_sentence:
                a_input_ids.append(s['input_ids'])
                #                 a_label_ids.append(s['label_ids'])
                # address augmentation: replace all label information with an
                # "address" label (all 1s), i.e. reuse the input mask as label ids
                a_label_ids.append(s['input_mask'])
                a_input_mask.append(s['input_mask'])
                a_input_lens.append(s['input_len'])
                a_segment_ids.append(s['segment_ids'])
            input_ids = line['input_ids']
            input_mask = line['input_mask']
            input_lens = line['input_len']
            segment_ids = line['segment_ids']
            batch = {
                'ori':
                ([input_ids], [input_mask], [[]], [input_lens], [segment_ids]),
                'aug': ([a_input_ids], [a_input_mask], [a_label_ids],
                        [a_input_lens], [a_segment_ids])
            }
            tags = model.evaluate_line(sess, batch)
            label_entities = get_entities(tags[0], args.id2label)
            json_d = {}
            json_d['id'] = step
            tags[0] = [args.id2label[idx] for idx in tags[0]]
            json_d['tag_seq'] = " ".join(tags[0])
            json_d['entities'] = label_entities
            results.append(json_d)
        print(" ")
        output_predic_file = str(args.output_dir / "test_prediction.json")
        output_submit_file = str(args.output_dir / "cluener_submit.json")
        with open(output_predic_file, "w") as writer:
            for record in results:
                writer.write(json.dumps(record) + '\n')
        test_text = []

        test_submit = []
        for x, y in zip(test_data, results):
            json_d = {}
            json_d['id'] = x['id']
            json_d['label'] = {}
            entities = y['entities']
            # the text here already includes the added [CLS]/[SEP] markers
            words = x['text']
            if len(entities) != 0:
                for subject in entities:
                    tag = subject[0]
                    start = subject[1]
                    end = subject[2]
                    word = "".join(words[start:end + 1])
                    if tag in json_d['label']:
                        if word in json_d['label'][tag]:
                            json_d['label'][tag][word].append([start, end])
                        else:
                            json_d['label'][tag][word] = [[start, end]]
                    else:
                        json_d['label'][tag] = {}
                        json_d['label'][tag][word] = [[start, end]]
            test_submit.append(json_d)
        json_to_text(output_submit_file, test_submit)
Example No. 7
def predict(args, model, processor):
    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    config = config_model(args)
    config['vocab_size'] = len(processor.vocab)
    config['keep_prob'] = 1.0
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, NERModel, args.output_dir, config, logger)
        test_data = []
        with open(str(args.data_dir / "test.json"), 'r') as f:
            idx = 0
            for line in f:
                json_d = {}
                line = json.loads(line.strip())
                text = line['text']
                words = list(text)
                labels = ['O'] * len(words)
                json_d['id'] = idx
                json_d['context'] = " ".join(words)
                json_d['tag'] = " ".join(labels)
                json_d['raw_context'] = "".join(words)
                idx += 1
                test_data.append(json_d)
        results = []
        for step, line in enumerate(test_data):
            token_a = line['context'].split(" ")
            input_ids = [processor.vocab.to_index(w) for w in token_a]
            input_mask = [1] * len(token_a)
            input_lens = [len(token_a)]

            tags = model.evaluate_line(
                sess, ([input_ids], [input_mask], [[]], input_lens))
            label_entities = get_entities(tags[0], args.id2label)
            json_d = {}
            json_d['id'] = step
            tags[0] = [args.id2label[idx] for idx in tags[0]]
            json_d['tag_seq'] = " ".join(tags[0])
            json_d['entities'] = label_entities
            results.append(json_d)
        print(" ")
        output_predic_file = str(args.output_dir / "test_prediction.json")
        output_submit_file = str(args.output_dir / "cluener_submit.json")
        with open(output_predic_file, "w") as writer:
            for record in results:
                writer.write(json.dumps(record) + '\n')
        test_text = []
        with open(str(args.data_dir / 'test.json'), 'r') as fr:
            for line in fr:
                test_text.append(json.loads(line))
        test_submit = []
        for x, y in zip(test_text, results):
            json_d = {}
            json_d['id'] = x['id']
            json_d['label'] = {}
            entities = y['entities']
            words = list(x['text'])
            if len(entities) != 0:
                for subject in entities:
                    tag = subject[0]
                    start = subject[1]
                    end = subject[2]
                    word = "".join(words[start:end + 1])
                    if tag in json_d['label']:
                        if word in json_d['label'][tag]:
                            json_d['label'][tag][word].append([start, end])
                        else:
                            json_d['label'][tag][word] = [[start, end]]
                    else:
                        json_d['label'][tag] = {}
                        json_d['label'][tag][word] = [[start, end]]
            test_submit.append(json_d)
        json_to_text(output_submit_file, test_submit)
Example No. 8
def predict(args, model, processor):
    model_path = args.output_dir / 'best-model.bin'
    model = load_model(model, model_path=str(model_path))

    # metric = SeqEntityScore(args.id2label, markup=args.markup)
    # load the data: test_data = [{id: , context: , tag: , raw_context: }, {}, {}...]
    start_time = time.time()
    test_data = load_and_cache_examples(args, processor, data_type='test')
    # test_data [{'context':,'tag':},{},{}]
    origins = []
    founds = []
    rights = []

    results = []
    for step, line in enumerate(test_data):
        token_a = line['context'].split(" ")
        tag_a = line['tag'].split(" ")
        input_ids = [processor.vocab.to_index(w) for w in token_a]
        input_mask = [1] * len(token_a)
        input_lens = [len(token_a)]
        model.eval()
        with torch.no_grad():
            input_ids = torch.tensor([input_ids], dtype=torch.long)
            input_mask = torch.tensor([input_mask], dtype=torch.long)
            input_lens = torch.tensor([input_lens], dtype=torch.long)
            input_ids = input_ids.to(args.device)
            input_mask = input_mask.to(args.device)
            features = model.forward_loss(input_ids,
                                          input_mask,
                                          input_lens,
                                          input_tags=None)
            tags, _ = model.crf._obtain_labels(features, args.id2label,
                                               input_lens)
        label_entities = get_entities(tags[0], args.id2label)
        gold_entities = get_entities(tag_a, args.id2label)
        # record gold and predicted entities
        origins.extend(gold_entities)
        founds.extend(label_entities)
        rights.extend([
            pre_entity for pre_entity in label_entities
            if pre_entity in gold_entities
        ])

        json_d = {}
        # json_d['tag_seq'] = " ".join(tags[0])
        json_d['pre'] = label_entities
        json_d['gold'] = gold_entities
        results.append(json_d)
    # result [{'pre': ,'gold': },{},{}]

    test_submit = []
    for x, y in zip(test_data, results):
        json_d = {}
        context = list(x['context'])
        json_d['context'] = ''.join(context)
        json_d['label'] = y['pre']
        # entities = y['pre']
        # if len(entities) != 0:
        #     for subject in entities:
        #         tag = subject[0]
        #         start = subject[1]
        #         end = subject[2]
        #         word = "".join(context[start:end + 1])
        #         json_d['label'][tag] = word

        json_d['gold'] = y['gold']
        test_submit.append(json_d)

    output_submit_file = str(args.output_dir / "test_submit.json")
    with open(output_submit_file, 'w') as writer:
        for x in test_submit:
            writer.write(json.dumps(x, ensure_ascii=False) + '\n')

    precision = len(rights) / len(founds) if founds else 0
    recall = len(rights) / len(origins) if origins else 0
    test_f1 = (2 * precision * recall) / (precision + recall) if precision + recall else 0
    logger.info(
        f'test_time: {time.time() - start_time:.1f}  test_f1: {test_f1}')