Example #1
def get_ner_BMES(outputs, return_str_or_not):
    entities = []

    for idx in range(len(outputs)):
        labelName = outputs[idx]

        if labelName[0] in ('S', 'B'):
            entity = Entity()
            entity.type = labelName[2:]
            entity.tkSpans.append([idx, idx])
            entity.labelSpans.append([labelName])
            entities.append(entity)

        elif labelName[0] in ('M', 'E'):
            if checkWrongState_BMES(outputs, idx + 1):
                entity = entities[-1]
                entity.tkSpans[-1][1] = idx
                entity.labelSpans[-1].append(labelName)

    answerEntities = entities

    if return_str_or_not:
        # convert each Entity into its string representation
        strEntities = []
        for answer in answerEntities:
            strEntity = answer.type
            for tkSpan in answer.tkSpans:
                strEntity += '[' + str(tkSpan[0]) + ',' + str(tkSpan[1]) + ']'
            strEntities.append(strEntity)
        return strEntities
    else:
        return answerEntities
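A quick sanity check for the decoder above: the following self-contained sketch (hypothetical, independent of the Entity and checkWrongState_BMES helpers used above) shows how B/M/E/S tags map to (type, start, end) token spans.

def decode_bmes(tags):
    # B = begin, M = middle, E = end, S = single-token entity
    spans, start, etype = [], None, None
    for i, tag in enumerate(tags):
        head, _, t = tag.partition('-')
        if head == 'S':
            spans.append((t, i, i))
            start, etype = None, None
        elif head == 'B':
            start, etype = i, t
        elif head == 'E' and start is not None and t == etype:
            spans.append((etype, start, i))
            start, etype = None, None
        elif head != 'M' or t != etype:
            # 'O' or an inconsistent tag closes any open span
            start, etype = None, None
    return spans

print(decode_bmes(['B-PER', 'E-PER', 'O', 'S-LOC']))  # [('PER', 0, 1), ('LOC', 3, 3)]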
Example #2
def parse_one_gold_file(annotation_dir, corpus_dir, fileName):
    document = Document()
    document.name = fileName[:fileName.find('.')]

    annotation_file = get_bioc_file(os.path.join(annotation_dir, fileName))
    bioc_passage = annotation_file[0].passages[0]
    entities = []

    for entity in bioc_passage.annotations:
        if entity.infons['type'] not in type_we_care:
            continue

        entity_ = Entity()
        entity_.id = entity.id
        processed_name = entity.text.replace('\\n', ' ')
        if len(processed_name) == 0:
            logging.debug("{}: entity {} name is empty".format(
                fileName, entity.id))
            continue
        entity_.name = processed_name
        entity_.type = entity.infons['type']
        entity_.spans.append(
            [entity.locations[0].offset, entity.locations[0].end])

        if ('SNOMED code' in entity.infons and entity.infons['SNOMED code'] != 'N/A') \
                and ('SNOMED term' in entity.infons and entity.infons['SNOMED term'] != 'N/A'):
            entity_.norm_ids.append(entity.infons['SNOMED code'])
            entity_.norm_names.append(entity.infons['SNOMED term'])

        elif ('MedDRA code' in entity.infons and entity.infons['MedDRA code'] != 'N/A') \
                and ('MedDRA term' in entity.infons and entity.infons['MedDRA term'] != 'N/A'):
            entity_.norm_ids.append(entity.infons['MedDRA code'])
            entity_.norm_names.append(entity.infons['MedDRA term'])
        else:
            logging.debug("{}: no norm id in entity {}".format(
                fileName, entity.id))
            # some entities may have no norm id
            continue

        entities.append(entity_)

    document.entities = entities

    corpus_file = get_text_file(
        os.path.join(corpus_dir,
                     fileName.split('.bioc')[0]))
    document.text = corpus_file

    return document
Example #3
def combineTwoEntity(a, b):
    c = Entity()
    c.type = a.type

    if a.tkSpans[0][0] < b.tkSpans[0][0]:
        # a starts first; adjacent spans fuse into a single span
        if a.tkSpans[0][1] + 1 == b.tkSpans[0][0]:
            c.tkSpans.append([a.tkSpans[0][0], b.tkSpans[0][1]])
        else:
            c.tkSpans.append(a.tkSpans[0])
            c.tkSpans.append(b.tkSpans[0])
    else:
        # b starts first
        if b.tkSpans[0][1] + 1 == a.tkSpans[0][0]:
            c.tkSpans.append([b.tkSpans[0][0], a.tkSpans[0][1]])
        else:
            c.tkSpans.append(b.tkSpans[0])
            c.tkSpans.append(a.tkSpans[0])

    return c
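For reference, the two branches behave as follows, using a minimal stand-in for the Entity class (an assumption; the real class carries more fields):

class Entity:
    def __init__(self, type_=None, span=None):
        self.type = type_
        self.tkSpans = [span] if span else []

a = Entity('X', [0, 2])
print(combineTwoEntity(a, Entity('X', [3, 5])).tkSpans)  # [[0, 5]]: 2 + 1 == 3, spans fuse
print(combineTwoEntity(a, Entity('X', [7, 8])).tkSpans)  # [[0, 2], [7, 8]]: gap, spans stay separate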
Example #4
def evaluate(documents, dictionary, dictionary_reverse, model):
    model.eval()

    ct_predicted = 0
    ct_gold = 0
    ct_correct = 0

    for document in documents:

        # copy entities from gold entities
        pred_entities = []
        for gold in document.entities:
            pred = Entity()
            pred.id = gold.id
            pred.type = gold.type
            pred.spans = gold.spans
            pred.section = gold.section
            pred.name = gold.name
            pred_entities.append(pred)

        model.process_one_doc(document, pred_entities, dictionary,
                              dictionary_reverse)

        p1, p2, p3 = evaluate_for_ehr(document.entities, pred_entities,
                                      dictionary)

        ct_gold += p1
        ct_predicted += p2
        ct_correct += p3

    if ct_gold == 0 or ct_predicted == 0:
        precision = 0
        recall = 0
    else:
        precision = ct_correct * 1.0 / ct_predicted
        recall = ct_correct * 1.0 / ct_gold

    if precision + recall == 0:
        f_measure = 0
    else:
        f_measure = 2 * precision * recall / (precision + recall)

    return precision, recall, f_measure
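The closing block is standard micro-averaged scoring over pooled counts. As a standalone sketch (mirroring the arithmetic above, with the zero-division guards made explicit):

def micro_prf(ct_gold, ct_predicted, ct_correct):
    # micro-averaged precision / recall / F1 over pooled counts
    precision = ct_correct * 1.0 / ct_predicted if ct_predicted else 0.0
    recall = ct_correct * 1.0 / ct_gold if ct_gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

print(micro_prf(10, 8, 6))  # (0.75, 0.6, 0.666...)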
Example #5
    def metamap_ner_my_norm(d):
        print("load umls ...")

        UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(d.config['norm_dict'])

        predict_dir = "/Users/feili/Desktop/umass/CancerADE_SnoM_30Oct2017_test/metamap"
        annotation_dir = os.path.join(opt.test_file, 'bioc')
        corpus_dir = os.path.join(opt.test_file, 'txt')
        annotation_files = [f for f in os.listdir(annotation_dir) if os.path.isfile(os.path.join(annotation_dir, f))]

        if opt.test_in_cpu:
            model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'), map_location='cpu')
        else:
            model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
        model.eval()

        ct_norm_predict = 0
        ct_norm_gold = 0
        ct_norm_correct = 0

        correct_counter = Counter()
        wrong_counter = Counter()

        for gold_file_name in annotation_files:
            print("# begin {}".format(gold_file_name))
            gold_document = parse_one_gold_file(annotation_dir, corpus_dir, gold_file_name)

            predict_document = metamap.load_metamap_result_from_file(
                os.path.join(predict_dir, gold_file_name[:gold_file_name.find('.')] + ".field.txt"))

            # copy entities from metamap entities
            pred_entities = []
            for gold in predict_document.entities:
                pred = Entity()
                pred.id = gold.id
                pred.type = gold.type
                pred.spans = gold.spans
                pred.section = gold.section
                pred.name = gold.name
                pred_entities.append(pred)

            model.process_one_doc(gold_document, pred_entities, UMLS_dict, UMLS_dict_reverse)

            p1, p2, p3 = evaluate_for_ehr(gold_document.entities, pred_entities, UMLS_dict,
                                          predict_document.entities, correct_counter, wrong_counter)

            ct_norm_gold += p1
            ct_norm_predict += p2
            ct_norm_correct += p3

        sorted_correct_entities = OrderedDict(correct_counter.most_common())
        sorted_correct_entities = json.dumps(sorted_correct_entities, indent=4)
        with codecs.open("sorted_correct_entities.txt", 'w', 'UTF-8') as fp:
            fp.write(sorted_correct_entities)

        sorted_wrong_entities = OrderedDict(wrong_counter.most_common())
        sorted_wrong_entities = json.dumps(sorted_wrong_entities, indent=4)
        with codecs.open("sorted_wrong_entities.txt", 'w', 'UTF-8') as fp:
            fp.write(sorted_wrong_entities)

        if ct_norm_predict == 0 or ct_norm_gold == 0:
            p = r = f1 = 0.0
        else:
            p = ct_norm_correct * 1.0 / ct_norm_predict
            r = ct_norm_correct * 1.0 / ct_norm_gold
            f1 = 2.0 * p * r / (p + r) if p + r != 0 else 0.0
        print("NORM p: %.4f | r: %.4f | f1: %.4f" % (p, r, f1))
Example #6
    for gold_file_name in annotation_files:
        print("# begin {}".format(gold_file_name))

        if file_count < 1:
            file_count += 1
            continue

        file_count += 1

        gold_document = parse_one_gold_file(annotation_dir, corpus_dir, gold_file_name)

        pred_entities = []
        for gold in gold_document.entities:
            pred = Entity()
            pred.id = gold.id
            pred.type = gold.type
            pred.spans = gold.spans
            pred.section = gold.section
            pred.name = gold.name
            pred_entities.append(pred)

        Xs, Ys = generate_instances_ehr(pred_entities, model.dict_alphabet, UMLS_dict_reverse)

        data_loader = DataLoader(MyDataset(Xs, Ys), opt.batch_size, shuffle=False, collate_fn=my_collate)
        data_iter = iter(data_loader)
        num_iter = len(data_loader)

        entity_start = 0

        for i in range(num_iter):
Example #7
def processOneFile(fileName, annotation_dir, corpus_dir, nlp_tool, isTraining,
                   types, type_filter):
    document = Document()
    document.name = fileName[:fileName.find('.')]

    ct_snomed = 0
    ct_meddra = 0
    ct_unnormed = 0

    if annotation_dir:
        annotation_file = get_bioc_file(join(annotation_dir, fileName))
        bioc_passage = annotation_file[0].passages[0]
        entities = []

        for entity in bioc_passage.annotations:
            if types and (entity.infons['type'] not in type_filter):
                continue
            entity_ = Entity()
            entity_.id = entity.id
            processed_name = entity.text.replace('\\n', ' ')
            if len(processed_name) == 0:
                logging.debug("{}: entity {} name is empty".format(
                    fileName, entity.id))
                continue
            entity_.name = processed_name

            entity_.type = entity.infons['type']
            entity_.spans.append(
                [entity.locations[0].offset, entity.locations[0].end])
            if ('SNOMED code' in entity.infons and entity.infons['SNOMED code'] != 'N/A')\
                    and ('SNOMED term' in entity.infons and entity.infons['SNOMED term'] != 'N/A'):
                entity_.norm_ids.append(entity.infons['SNOMED code'])
                entity_.norm_names.append(entity.infons['SNOMED term'])
                ct_snomed += 1
            elif ('MedDRA code' in entity.infons and entity.infons['MedDRA code'] != 'N/A')\
                    and ('MedDRA term' in entity.infons and entity.infons['MedDRA term'] != 'N/A'):
                entity_.norm_ids.append(entity.infons['MedDRA code'])
                entity_.norm_names.append(entity.infons['MedDRA term'])
                ct_meddra += 1
            else:
                logging.debug("{}: no norm id in entity {}".format(
                    fileName, entity.id))
                ct_unnormed += 1
                continue

            entities.append(entity_)

        document.entities = entities

    corpus_file = get_text_file(join(corpus_dir, fileName.split('.bioc')[0]))
    document.text = corpus_file

    if opt.nlp_tool == "spacy":
        if isTraining:
            sentences = get_sentences_and_tokens_from_spacy(
                corpus_file, nlp_tool, document.entities)
        else:
            sentences = get_sentences_and_tokens_from_spacy(
                corpus_file, nlp_tool, None)
    elif opt.nlp_tool == "nltk":
        if isTraining:
            sentences = get_sentences_and_tokens_from_nltk(
                corpus_file, nlp_tool, document.entities, None, None)
        else:
            sentences = get_sentences_and_tokens_from_nltk(
                corpus_file, nlp_tool, None, None, None)
    elif opt.nlp_tool == "stanford":
        if isTraining:
            sentences = get_sentences_and_tokens_from_stanford(
                corpus_file, nlp_tool, document.entities)
        else:
            sentences = get_sentences_and_tokens_from_stanford(
                corpus_file, nlp_tool, None)
    else:
        raise RuntimeError("invalid nlp tool")

    document.sentences = sentences

    return document, ct_snomed, ct_meddra, ct_unnormed
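Whatever tokenizer is chosen, each branch is expected to return the same shape: a list of sentences, each a list of token dicts with text and character offsets. A minimal sketch of the spaCy variant (an assumption about get_sentences_and_tokens_from_spacy, not the project's actual helper; the entities argument used for training-time alignment is omitted):

import spacy

def sentences_and_tokens_spacy_sketch(text, nlp):
    doc = nlp(text)
    sentences = []
    for sent in doc.sents:  # requires a pipeline with sentence boundaries
        sentences.append([
            {'text': t.text, 'start': t.idx, 'end': t.idx + len(t.text)}
            for t in sent if not t.is_space
        ])
    return sentences

# usage: sentences_and_tokens_spacy_sketch(corpus_file, spacy.load('en_core_web_sm'))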
Example #8
def evaluate(documents, dictionary, dictionary_reverse, vsm_model,
             neural_model, ensemble_model, d, isMeddra_dict):
    if vsm_model is not None:
        vsm_model.eval()

    if neural_model is not None:
        neural_model.eval()

    if ensemble_model is not None:
        ensemble_model.eval()

    ct_predicted = 0
    ct_gold = 0
    ct_correct = 0

    # optional diagnostics (disabled): per-system correct counts for the
    # rule / vsm / neural normalizers and their combinations

    for document in documents:

        # copy entities from gold entities
        pred_entities = []
        for gold in document.entities:
            pred = Entity()
            pred.id = gold.id
            pred.type = gold.type
            pred.spans = gold.spans
            pred.section = gold.section
            pred.name = gold.name
            pred_entities.append(pred)

        if opt.norm_rule and opt.norm_vsm and opt.norm_neural:
            if opt.ensemble == 'learn':
                ensemble_model.process_one_doc(document, pred_entities,
                                               dictionary, dictionary_reverse,
                                               isMeddra_dict)
            else:
                pred_entities2 = copy.deepcopy(pred_entities)
                pred_entities3 = copy.deepcopy(pred_entities)
                merge_entities = copy.deepcopy(pred_entities)
                multi_sieve.runMultiPassSieve(document, pred_entities,
                                              dictionary, isMeddra_dict)
                vsm_model.process_one_doc(document, pred_entities2, dictionary,
                                          dictionary_reverse, isMeddra_dict)
                neural_model.process_one_doc(document, pred_entities3,
                                             dictionary, dictionary_reverse,
                                             isMeddra_dict)
        elif opt.norm_rule:
            multi_sieve.runMultiPassSieve(document, pred_entities, dictionary,
                                          isMeddra_dict)
        elif opt.norm_vsm:
            vsm_model.process_one_doc(document, pred_entities, dictionary,
                                      dictionary_reverse, isMeddra_dict)
        elif opt.norm_neural:
            neural_model.process_one_doc(document, pred_entities, dictionary,
                                         dictionary_reverse, isMeddra_dict)
        else:
            raise RuntimeError("wrong configuration")

        if opt.norm_rule and opt.norm_vsm and opt.norm_neural:

            # upper bound of the ensemble (disabled diagnostics): an entity
            # would count as correct if at least one of the rule / vsm /
            # neural systems predicted an id contained in gold.norm_ids,
            # with per-combination counters for each subset of systems

            if opt.ensemble == 'learn':

                if isMeddra_dict:
                    p1, p2, p3 = evaluate_for_fda(document.entities,
                                                  pred_entities)
                else:
                    p1, p2, p3 = evaluate_for_ehr(document.entities,
                                                  pred_entities, dictionary)

                ct_gold += p1
                ct_predicted += p2
                ct_correct += p3

            else:
                ensemble.merge_result(pred_entities, pred_entities2,
                                      pred_entities3, merge_entities,
                                      dictionary, isMeddra_dict,
                                      vsm_model.dict_alphabet, d)

                if isMeddra_dict:
                    p1, p2, p3 = evaluate_for_fda(document.entities,
                                                  merge_entities)
                else:
                    p1, p2, p3 = evaluate_for_ehr(document.entities,
                                                  merge_entities, dictionary)

                ct_gold += p1
                ct_predicted += p2
                ct_correct += p3

        else:

            if isMeddra_dict:
                p1, p2, p3 = evaluate_for_fda(document.entities, pred_entities)
            else:
                p1, p2, p3 = evaluate_for_ehr(document.entities, pred_entities,
                                              dictionary)

            ct_gold += p1
            ct_predicted += p2
            ct_correct += p3

    # optional logging (disabled): the per-combination ensemble counts and
    # the gold / predicted / correct totals

    if ct_gold == 0 or ct_predicted == 0:
        precision = 0
        recall = 0
    else:
        precision = ct_correct * 1.0 / ct_predicted
        recall = ct_correct * 1.0 / ct_gold

    if precision + recall == 0:
        f_measure = 0
    else:
        f_measure = 2 * precision * recall / (precision + recall)

    return precision, recall, f_measure
Example #9
def metamap_ner_my_norm(d):
    print("load umls ...")

    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(
        d.config['norm_dict'])

    predict_dir = "/Users/feili/Desktop/umass/CancerADE_SnoM_30Oct2017_test/metamap"
    annotation_dir = os.path.join(opt.test_file, 'bioc')
    corpus_dir = os.path.join(opt.test_file, 'txt')
    annotation_files = [
        f for f in listdir(annotation_dir) if isfile(join(annotation_dir, f))
    ]

    if opt.norm_rule:
        multi_sieve.init(opt, None, d, UMLS_dict, UMLS_dict_reverse, False)
    elif opt.norm_neural:
        logging.info("use neural-based normer")
        if opt.test_in_cpu:
            neural_model = torch.load(os.path.join(opt.output,
                                                   'norm_neural.pkl'),
                                      map_location='cpu')
        else:
            neural_model = torch.load(
                os.path.join(opt.output, 'norm_neural.pkl'))
        neural_model.eval()
    elif opt.norm_vsm:
        logging.info("use vsm-based normer")
        if opt.test_in_cpu:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'),
                                   map_location='cpu')
        else:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
        vsm_model.eval()

    ct_norm_predict = 0
    ct_norm_gold = 0
    ct_norm_correct = 0

    for gold_file_name in annotation_files:
        print("# begin {}".format(gold_file_name))
        gold_document = parse_one_gold_file(annotation_dir, corpus_dir,
                                            gold_file_name)

        predict_document = metamap.load_metamap_result_from_file(
            join(predict_dir,
                 gold_file_name[:gold_file_name.find('.')] + ".field.txt"))

        # copy entities from metamap entities
        pred_entities = []
        for gold in predict_document.entities:
            pred = Entity()
            pred.id = gold.id
            pred.type = gold.type
            pred.spans = gold.spans
            pred.section = gold.section
            pred.name = gold.name
            pred_entities.append(pred)

        if opt.norm_rule:
            multi_sieve.runMultiPassSieve(gold_document, pred_entities,
                                          UMLS_dict, False)
        elif opt.norm_neural:
            neural_model.process_one_doc(gold_document, pred_entities,
                                         UMLS_dict, UMLS_dict_reverse, False)
        elif opt.norm_vsm:
            vsm_model.process_one_doc(gold_document, pred_entities, UMLS_dict,
                                      UMLS_dict_reverse, False)
        else:
            raise RuntimeError("wrong configuration")

        p1, p2, p3 = evaluate_for_ehr(gold_document.entities, pred_entities,
                                      UMLS_dict)

        ct_norm_gold += p1
        ct_norm_predict += p2
        ct_norm_correct += p3

    if ct_norm_predict == 0 or ct_norm_gold == 0:
        p = r = f1 = 0.0
    else:
        p = ct_norm_correct * 1.0 / ct_norm_predict
        r = ct_norm_correct * 1.0 / ct_norm_gold
        f1 = 2.0 * p * r / (p + r) if p + r != 0 else 0.0
    print("NORM p: %.4f | r: %.4f | f1: %.4f" % (p, r, f1))
Example #10
def generate_instances(document, word_alphabet, dict_alphabet, dictionary,
                       dictionary_reverse, isMeddra_dict):
    Xs = []
    Ys = []

    # copy entities from gold entities
    pred_entities = []
    for gold in document.entities:
        pred = Entity()
        pred.id = gold.id
        pred.type = gold.type
        pred.spans = gold.spans
        pred.section = gold.section
        pred.name = gold.name
        pred_entities.append(pred)

    multi_sieve.runMultiPassSieve(document, pred_entities, dictionary,
                                  isMeddra_dict)

    for idx, entity in enumerate(document.entities):

        if isMeddra_dict:
            if len(entity.norm_ids) > 0:
                Y = norm_utils.get_dict_index(dict_alphabet,
                                              entity.norm_ids[0])
                if 0 <= Y < norm_utils.get_dict_size(dict_alphabet):
                    Ys.append(Y)
                else:
                    continue
            else:
                Ys.append(0)
        else:
            if len(entity.norm_ids) > 0:
                if entity.norm_ids[0] in dictionary_reverse:
                    cui_list = dictionary_reverse[entity.norm_ids[0]]
                    # use the first cui to generate the instance
                    Y = norm_utils.get_dict_index(dict_alphabet, cui_list[0])
                    if 0 <= Y < norm_utils.get_dict_size(dict_alphabet):
                        Ys.append(Y)
                    else:
                        raise RuntimeError(
                            "entity {}, {}, cui not in dict_alphabet".format(
                                entity.id, entity.name))
                else:
                    logging.info(
                        "entity {}, {}, can't map to umls, ignored".format(
                            entity.id, entity.name))
                    continue
            else:
                Ys.append(0)

        X = dict()

        tokens = my_tokenize(entity.name)
        word_ids = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            word_ids.append(word_id)
        X['word'] = word_ids

        # one-hot feature for the rule-based (multi-sieve) prediction
        X['rule'] = [0] * norm_utils.get_dict_size(dict_alphabet)
        if pred_entities[idx].rule_id is not None:
            X['rule'][norm_utils.get_dict_index(
                dict_alphabet, pred_entities[idx].rule_id)] = 1

        Xs.append(X)

    return Xs, Ys
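Each instance pairs the mention's word ids with a one-hot encoding of the rule-based (multi-sieve) prediction over the dictionary alphabet. In isolation, with hypothetical sizes and indices:

dict_size = 5   # stand-in for norm_utils.get_dict_size(dict_alphabet)
rule_index = 2  # stand-in for the index of pred_entities[idx].rule_id
X_rule = [0] * dict_size
X_rule[rule_index] = 1
print(X_rule)   # [0, 0, 1, 0, 0]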
Example #11
def get_ner_BIOHD_1234(outputs, return_str_or_not):
    entities = []
    for idx in range(len(outputs)):
        labelName = outputs[idx]

        if labelName in ('B-X', 'HB-X', 'D1B-X', 'D2B-X', 'D3B-X', 'D4B-X'):
            entity = Entity()
            entity.type = 'X'
            entity.tkSpans.append([idx, idx])
            entity.labelSpans.append([labelName])
            entities.append(entity)
        elif labelName in ('I-X', 'HI-X', 'D1I-X', 'D2I-X', 'D3I-X', 'D4I-X'):
            if checkWrongState(outputs, idx + 1):
                entity = entities[-1]
                entity.tkSpans[-1][1] = idx
                entity.labelSpans[-1].append(labelName)

    # post-processing to rebuild entities
    postEntities = []
    HB_HI = []
    D1B_D1I = []
    D2B_D2I = []
    D3B_D3I = []
    D4B_D4I = []

    for temp in entities:
        labelSpan = temp.labelSpans[0]

        if labelSpan[0] == 'HB-X':
            HB_HI.append(temp)
        elif labelSpan[0] == 'D1B-X':
            D1B_D1I.append(temp)
        elif labelSpan[0] == 'D2B-X':
            D2B_D2I.append(temp)
        elif labelSpan[0] == 'D3B-X':
            D3B_D3I.append(temp)
        elif labelSpan[0] == 'D4B-X':
            D4B_D4I.append(temp)
        else:
            postEntities.append(temp)

    if len(HB_HI) != 0:
        for d1b in D1B_D1I:
            # combine with the nearest head entity to the left
            target = None
            for hb in HB_HI:
                if hb.tkSpans[0][0] < d1b.tkSpans[0][0]:
                    target = hb
                else:
                    break

            if target is not None:
                combined = combineTwoEntity(d1b, target)
                postEntities.append(combined)
                if len(D1B_D1I) == 1:
                    postEntities.append(target)

        for d3b in D3B_D3I:
            # combine with the nearest head entity to the right
            target = None
            for hb in reversed(HB_HI):
                if hb.tkSpans[0][0] > d3b.tkSpans[0][0]:
                    target = hb
                else:
                    break

            if target is not None:
                combined = combineTwoEntity(d3b, target)
                postEntities.append(combined)
                if len(D3B_D3I) == 1:
                    postEntities.append(target)

    else:
        for d2b in D2B_D2I:
            # combine with the nearest non-head entity to the left, i.e. the
            # candidate with the largest start position before d2b
            target = None
            for group in (D1B_D1I, D2B_D2I, D3B_D3I, D4B_D4I):
                for db in group:
                    if db.tkSpans[0][0] < d2b.tkSpans[0][0]:
                        if target is None or target.tkSpans[0][0] < db.tkSpans[0][0]:
                            target = db
                    else:
                        break

            if target is not None:
                combined = combineTwoEntity(d2b, target)
                postEntities.append(combined)

        for d4b in D4B_D4I:
            # combine with the nearest non-head entity to the right, i.e. the
            # candidate with the smallest start position after d4b
            target = None
            for group in (D1B_D1I, D2B_D2I, D3B_D3I, D4B_D4I):
                for db in reversed(group):
                    if db.tkSpans[0][0] > d4b.tkSpans[0][0]:
                        if target is None or target.tkSpans[0][0] > db.tkSpans[0][0]:
                            target = db
                    else:
                        break

            if target is not None:
                combined = combineTwoEntity(d4b, target)
                postEntities.append(combined)

    # re-sort by start position and drop duplicate spans
    answerEntities = []
    for temp in postEntities:
        isIn = False
        for answer in answerEntities:
            if answer.equalsTkSpan(temp):
                isIn = True
                break

        if not isIn:
            insert_pos = 0
            for old in answerEntities:
                if old.tkSpans[0][0] > temp.tkSpans[0][0]:
                    break
                insert_pos += 1

            answerEntities.insert(insert_pos, temp)

    if return_str_or_not:
        # convert each Entity into its string representation
        strEntities = []
        for answer in answerEntities:
            strEntity = 'X'
            for tkSpan in answer.tkSpans:
                strEntity += '[' + str(tkSpan[0]) + ',' + str(tkSpan[1]) + ']'
            strEntities.append(strEntity)
        return strEntities
    else:
        return answerEntities
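A hedged usage sketch (assuming Entity, checkWrongState, and combineTwoEntity from the surrounding examples are in scope, and reading HB/HI as a shared head segment that D1/D3 parts attach to from the left/right):

tags = ['HB-X', 'HI-X', 'O', 'D1B-X', 'D1I-X']
for s in get_ner_BIOHD_1234(tags, True):
    print(s)
# expected: the discontinuous mention 'X[0,1][3,4]' plus the head 'X[0,1]'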
Example #12
def load_data_pubtator(file_path):

    # stat
    ct_doc = 0
    ct_entity = 0

    documents = []
    with codecs.open(file_path, 'r', 'UTF-8') as fp:

        document = None

        for line in fp:

            line = line.strip()

            if line == '':
                if document is None:
                    continue
                else:
                    # save the document
                    documents.append(document)
                    document = None
                    ct_doc += 1
            elif line.find('|t|') != -1:
                # a new document
                document = Document()
                columns = line.split('|t|')
                document.name = columns[0]
                document.text = columns[1] + " "  # annotation offsets assume a separator after the title

            elif line.find('|a|') != -1:

                columns = line.split('|a|')

                document.text += columns[1]

                generator = nlp_tool.span_tokenize(document.text)
                for t in generator:
                    document.all_sents_inds.append(t)

                for ind in range(len(document.all_sents_inds)):
                    t_start = document.all_sents_inds[ind][0]
                    t_end = document.all_sents_inds[ind][1]

                    tmp_tokens = FoxTokenizer.tokenize(
                        t_start, document.text[t_start:t_end], False)
                    sentence_tokens = []
                    for token in tmp_tokens:
                        token_dict = {
                            'text': token[0],
                            'start': token[1],
                            'end': token[2],
                        }
                        sentence_tokens.append(token_dict)

                    document.sentences.append(sentence_tokens)

            else:
                columns = line.split('\t')

                if columns[1] == 'CID':  # for the CDR corpus, ignore relation lines
                    continue

                if columns[4].find("Chemical") != -1:
                    # for the CDR corpus, ignore chemical entities
                    continue

                entity = Entity()
                entity.spans.append([int(columns[1]), int(columns[2])])
                entity.name = columns[3]
                entity.type = columns[4]

                # norm ids may be concatenated with '|' or '+'
                if columns[5].find('|') != -1:
                    ids = columns[5].split('|')
                elif columns[5].find('+') != -1:
                    ids = columns[5].split('+')
                else:
                    ids = [columns[5]]

                for norm_id in ids:
                    if len(ids) > 1 and norm_id == '-1':
                        raise RuntimeError("id == -1")
                    # strip the "OMIM:" prefix if present
                    if norm_id.find("OMIM:") != -1:
                        norm_id = norm_id[norm_id.find("OMIM:") + len("OMIM:"):]
                    entity.norm_ids.append(norm_id)

                # columns[6]: CDR may contain individual mentions; we don't use them yet

                # we assume each entity has only one span
                for sent_idx, (sent_start, sent_end) in enumerate(document.all_sents_inds):
                    if entity.spans[0][0] >= sent_start and entity.spans[0][1] <= sent_end:
                        entity.sent_idx = sent_idx
                        break
                if entity.sent_idx == -1:
                    logging.debug("can't find entity.sent_idx: {} ".format(
                        entity.name))
                    continue

                tkStart = -1
                tkEnd = -1
                for tkidx, token_dict in enumerate(
                        document.sentences[entity.sent_idx]):
                    if token_dict['start'] == entity.spans[0][0]:
                        tkStart = tkidx

                    if token_dict['end'] == entity.spans[0][1]:
                        tkEnd = tkidx

                    if tkStart != -1 and tkEnd != -1:
                        break

                if tkStart == -1 or tkEnd == -1:
                    raise RuntimeError('tkStart == -1 or tkEnd == -1')

                entity.tkSpans.append([tkStart, tkEnd])

                document.entities.append(entity)
                ct_entity += 1

    logging.info("document number {}, entity number {}".format(
        ct_doc, ct_entity))

    return documents
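load_data_pubtator expects PubTator-formatted text: a PMID|t|title line, a PMID|a|abstract line, then tab-separated mention lines (pmid, start, end, mention, type, norm id), with a blank line closing each document. An illustrative, made-up record (the loader above would skip the Chemical mention and keep the Disease one):

sample = ("123456|t|Famotidine-associated delirium.\n"
          "123456|a|A series of cases is reported.\n"
          "123456\t0\t10\tFamotidine\tChemical\tD015738\n"
          "123456\t22\t30\tdelirium\tDisease\tD003693\n"
          "\n")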
Example #13
def error_analysis(d, dictionary, dictionary_reverse, opt, isMeddra_dict):
    logging.info("error_analysis ...")

    test_data = loadData(opt.test_file, False, opt.types, opt.type_filter)

    logging.info("use my tokenizer")
    nlp_tool = None

    logging.info("use neural-based normer")
    if opt.test_in_cpu:
        neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'), map_location='cpu')
    else:
        neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
    neural_model.eval()

    ct_predicted = 0
    ct_gold = 0
    ct_correct = 0

    for document in test_data:

        logging.info("###### begin {}".format(document.name))

        # copy entities from gold entities
        pred_entities = []
        for gold in document.entities:
            pred = Entity()
            pred.id = gold.id
            pred.type = gold.type
            pred.spans = gold.spans
            pred.name = gold.name
            pred_entities.append(pred)

        neural_model.process_one_doc(document, pred_entities, dictionary, dictionary_reverse, isMeddra_dict)

        ct_norm_gold = len(document.entities)
        ct_norm_predict = len(pred_entities)
        ct_norm_correct = 0

        for predict_entity in pred_entities:

            for gold_entity in document.entities:

                if predict_entity.equals_span(gold_entity):

                    b_right = False

                    if len(gold_entity.norm_ids) == 0:
                        # if the gold entity has no norm annotation, count it as a TP
                        b_right = True
                        ct_norm_correct += 1
                    else:

                        if len(predict_entity.norm_ids) != 0 and predict_entity.norm_ids[0] in dictionary:
                            concept = dictionary[predict_entity.norm_ids[0]]

                            if gold_entity.norm_ids[0] in concept.codes:
                                ct_norm_correct += 1
                                b_right = True

                    if not b_right:
                        if len(predict_entity.norm_ids) != 0 and predict_entity.norm_ids[0] in dictionary:
                            concept = dictionary[predict_entity.norm_ids[0]]
                            logging.info("entity name: {} | gold id, name: {}, {} | pred cui, codes, names: {}, {}, {}"
                                         .format(predict_entity.name, gold_entity.norm_ids[0], gold_entity.norm_names[0],
                                                 concept.cui, concept.codes, concept.names))

                    break

        ct_predicted += ct_norm_predict
        ct_gold += ct_norm_gold
        ct_correct += ct_norm_correct

    if ct_gold == 0 or ct_predicted == 0:
        precision = 0
        recall = 0
    else:
        precision = ct_correct * 1.0 / ct_predicted
        recall = ct_correct * 1.0 / ct_gold

    if precision + recall == 0:
        f_measure = 0
    else:
        f_measure = 2 * precision * recall / (precision + recall)

    logging.info("Dev: p: %.4f, r: %.4f, f: %.4f" % (precision, recall, f_measure))