Example #1
import os
from typing import Optional

import spacy

from scispacy.data_util import read_full_med_mentions
from scispacy.umls_semantic_type_tree import construct_umls_tree_from_tsv

# `evaluate_ner` and `train` are assumed to be helpers defined alongside
# this script; they are not part of this snippet.


def train_ner(output_dir: str,
              data_path: str,
              run_test: bool = False,
              model: Optional[str] = None,
              n_iter: int = 100,
              label_granularity: Optional[int] = None):

    if label_granularity is not None:
        umls_tree = construct_umls_tree_from_tsv(
            "data/umls_semantic_type_tree.tsv")
        label_mapping = umls_tree.get_collapsed_type_id_map_at_level(
            label_granularity)
        # Granularity 0 collapses every type, so only spans are predicted;
        # assigning span_only here also avoids an UnboundLocalError for
        # granularities other than 0, which the original left unhandled.
        span_only = label_granularity == 0
    else:
        label_mapping = None
        span_only = False
    train_data, dev_data, test_data = read_full_med_mentions(
        data_path, label_mapping, span_only)
    os.makedirs(output_dir, exist_ok=True)
    if run_test:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
        evaluate_ner(nlp,
                     dev_data,
                     dump_path=os.path.join(output_dir, "dev_metrics.json"))
        evaluate_ner(nlp,
                     test_data,
                     dump_path=os.path.join(output_dir, "test_metrics.json"))
    else:
        train(model, train_data, dev_data, output_dir, n_iter)
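A minimal invocation sketch; the paths, the model name, and the granularity value below are illustrative assumptions, not part of the original example:

train_ner(output_dir="ner_output",         # hypothetical output directory
          data_path="data/med_mentions",   # hypothetical corpus location
          model="en_core_sci_sm",          # assumes this scispacy model is installed
          n_iter=50,
          label_granularity=0)             # 0 collapses all types to plain spans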
Example #2
import importlib.util
from pathlib import Path
from typing import Optional

import spacy

from scispacy.data_util import read_full_med_mentions, read_ner_from_tsv

# `evaluate_ner` is assumed to be a helper defined alongside this script.


def main(model_path: str, dataset: str, output_path: str, code: Optional[str],
         med_mentions_folder_path: Optional[str]):
    if code is not None:
        # Custom code must be imported before loading a spaCy model so that
        # any components it registers are available to the pipeline.
        # The original referenced undefined `name` and `loc`; derive both
        # from the `code` path instead.
        module_name = Path(code).stem
        spec = importlib.util.spec_from_file_location(module_name, code)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

    nlp = spacy.load(model_path)
    if dataset.startswith("medmentions"):
        train_data, dev_data, test_data = read_full_med_mentions(
            med_mentions_folder_path, None, False)
        data_split = dataset.split("-")[1]
        if data_split == "train":
            data = train_data
        elif data_split == "dev":
            data = dev_data
        elif data_split == "test":
            data = test_data
        else:
            raise Exception(f"Unrecognized split {data_split}")
    else:
        data = read_ner_from_tsv(dataset)

    evaluate_ner(nlp, data, dump_path=output_path)
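A hedged usage sketch for the entry point above; the model name and paths are assumptions:

main(model_path="en_core_sci_sm",          # assumes this model is installed
     dataset="medmentions-dev",            # split is encoded after the dash
     output_path="dev_metrics.json",
     code=None,                            # no custom code to import
     med_mentions_folder_path="data/med_mentions")  # hypothetical path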
Example #3
import warnings
from typing import Callable, Iterator

from spacy.language import Language
from spacy.training import Example

from scispacy.data_util import read_full_med_mentions


def med_mentions_reader(directory_path: str,
                        split: str) -> Callable[[Language], Iterator[Example]]:
    train, dev, test = read_full_med_mentions(directory_path,
                                              label_mapping=None,
                                              span_only=True,
                                              spacy_format=True)

    def corpus(nlp: Language) -> Iterator[Example]:
        if split == "train":
            original_examples = train
        elif split == "dev":
            original_examples = dev
        elif split == "test":
            original_examples = test
        else:
            raise Exception(f"Unexpected split {split}")

        for original_example in original_examples:
            doc = nlp.make_doc(original_example[0])
            with warnings.catch_warnings():
                # Example.from_dict warns when gold entity spans don't align
                # with the tokenization; suppress that noise here.
                warnings.simplefilter("ignore", category=UserWarning)
                spacy_example = Example.from_dict(doc, original_example[1])
            yield spacy_example

    return corpus
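A short sketch of consuming the reader with a blank pipeline; the directory path is an assumption:

import spacy

nlp = spacy.blank("en")                    # any Language object works here
dev_corpus = med_mentions_reader("data/med_mentions", split="dev")
first_example = next(dev_corpus(nlp))      # Examples are produced lazily
print(first_example.reference.ents)        # gold entity spans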
Example #4
from scispacy import data_util

# `prepare_umls_indices` and `linking` are assumed to be helpers defined
# alongside this script.


def main(medmentions_path: str, umls_path: str):

    umls_concept_dict_by_id, umls_concept_dict_by_name = prepare_umls_indices(
        umls_path)

    print('Reading corpus ... ')
    train_examples, dev_examples, test_examples = data_util.read_full_med_mentions(
        medmentions_path, spacy_format=False)

    missing_entity_ids = []  # entities in MedMentions but not in UMLS
    found_entity_ids = []  # entities in MedMentions and in UMLS
    entity_correct_links_count = 0  # number of correctly linked entities
    entity_wrong_links_count = 0  # number of wrongly linked entities
    entity_no_links_count = 0  # number of entities that are not linked
    print('Linking ... ')
    for example in dev_examples:  # only loop over the dev examples for now because we don't have a trained model
        for entity in example.entities:
            if entity.umls_id not in umls_concept_dict_by_id:
                missing_entity_ids.append(entity)
                continue
            found_entity_ids.append(entity)

            predicted_umls_concept = linking(entity, umls_concept_dict_by_name)
            if predicted_umls_concept is None:
                entity_no_links_count += 1
            elif predicted_umls_concept['concept_id'] == entity.umls_id:
                entity_correct_links_count += 1
            else:
                entity_wrong_links_count += 1

    print(f'MedMentions entities not in UMLS: {len(missing_entity_ids)}')
    print(f'MedMentions entities found in UMLS: {len(found_entity_ids)}')
    print(f'Correct linking: {100 * entity_correct_links_count / len(found_entity_ids):.2f}%')
    print(f'Wrong linking: {100 * entity_wrong_links_count / len(found_entity_ids):.2f}%')
    print(f'No linking: {100 * entity_no_links_count / len(found_entity_ids):.2f}%')
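A minimal sketch of running this linking evaluation; both paths are hypothetical:

main(medmentions_path="data/med_mentions",  # hypothetical corpus path
     umls_path="data/umls_kb.json")         # hypothetical UMLS KB export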
Example #5
import json
from collections import defaultdict
from typing import Dict, Optional, Set

import spacy

from scispacy import data_util
from scispacy.abbreviation import AbbreviationDetector

# `load_umls_kb`, `create_tfidf_ann_index`, `load_tfidf_ann_index`,
# `CandidateGenerator`, `load_linking_classifier`, `Linker`, and
# `eval_candidate_generation_and_linking` are assumed to be helpers defined
# alongside this script.


def main(medmentions_path: str,
         umls_path: str,
         model_path: str,
         ks: str,
         thresholds: Optional[str],
         use_gold_mentions: bool = False,
         train: bool = False,
         spacy_model: str = "",
         generate_linker_data: bool = False,
         use_soft_matching: bool = False,
         substitute_abbreviations: bool = False):

    umls_concept_list = load_umls_kb(umls_path)
    umls_concept_dict_by_id = {c['concept_id']: c for c in umls_concept_list}

    # We need to keep around a map from text to possible canonical ids that they map to.
    text_to_concept_id: Dict[str, Set[str]] = defaultdict(set)

    for concept in umls_concept_list:
        aliases = set(concept["aliases"]).union({concept["canonical_name"]})
        for alias in aliases:
            text_to_concept_id[alias].add(concept["concept_id"])

    if train:
        create_tfidf_ann_index(model_path, text_to_concept_id)
    ann_concept_aliases_list, tfidf_vectorizer, ann_index = load_tfidf_ann_index(
        model_path)
    candidate_generator = CandidateGenerator(ann_index, tfidf_vectorizer,
                                             ann_concept_aliases_list,
                                             text_to_concept_id, False)

    linking_classifier = load_linking_classifier(model_path)
    linker = Linker(umls_concept_dict_by_id, linking_classifier)

    print('Reading MedMentions...')
    train_examples, dev_examples, test_examples = data_util.read_full_med_mentions(
        medmentions_path, spacy_format=False)

    k_list = [int(k) for k in ks.split(',')]
    if thresholds is None:
        thresholds = [1.0]
    else:
        thresholds = [float(x) for x in thresholds.split(",")]

    if len(thresholds) > 1 or len(k_list) > 1:
        assert not generate_linker_data, \
            'generating linker training data should be for a single threshold and k'

    nlp = spacy.load(spacy_model)
    if substitute_abbreviations:
        # Note: this is the spaCy v2-style API; under spaCy v3 the component
        # is added by name instead, e.g. nlp.add_pipe("abbreviation_detector").
        abbreviation_detector = AbbreviationDetector(nlp)
        nlp.add_pipe(abbreviation_detector, last=True)

    if generate_linker_data:
        examples_list = [train_examples, dev_examples, test_examples]
        filenames = [
            f'{model_path}/train.jsonl', f'{model_path}/dev.jsonl',
            f'{model_path}/test.jsonl'
        ]
        for examples, filename in zip(examples_list, filenames):
            supervised_data = eval_candidate_generation_and_linking(
                examples, umls_concept_dict_by_id, candidate_generator, k_list,
                thresholds, use_gold_mentions, nlp, generate_linker_data,
                linker, use_soft_matching, substitute_abbreviations)
            with open(filename, 'w') as f:
                for d in supervised_data:
                    f.write(f'{json.dumps(d)}\n')
    else:
        print('Results on the DEV set')
        eval_candidate_generation_and_linking(
            dev_examples, umls_concept_dict_by_id, candidate_generator, k_list,
            thresholds, use_gold_mentions, nlp, generate_linker_data, linker,
            use_soft_matching, substitute_abbreviations)
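A hedged invocation sketch; every path and the model name below are assumptions:

main(medmentions_path="data/med_mentions",  # hypothetical corpus path
     umls_path="data/umls_kb.json",         # hypothetical UMLS KB export
     model_path="linking_model",            # hypothetical index/classifier dir
     ks="1,5,10",                           # evaluate recall at several k
     thresholds=None,                       # falls back to [1.0]
     spacy_model="en_core_sci_sm")          # assumes this model is installed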
Example #6
from scispacy import data_util

# `load_umls_kb`, `create_load_tfidf_ann_index`, and `generate_candidates`
# are assumed to be helpers defined alongside this script.


def main(medmentions_path: str, umls_path: str, ann_index_path: str,
         tfidf_vectorizer_path: str, ks: str):

    umls_concept_list = load_umls_kb(umls_path)
    umls_concept_dict_by_id = {c['concept_id']: c for c in umls_concept_list}

    ann_concept_id_list, tfidf_vectorizer, ann_index = create_load_tfidf_ann_index(
        ann_index_path, tfidf_vectorizer_path, umls_concept_list)

    print('Reading MedMentions ... ')
    train_examples, dev_examples, test_examples = data_util.read_full_med_mentions(
        medmentions_path, spacy_format=False)

    missing_entity_ids = []  # entities in MedMentions but not in UMLS
    found_entity_ids = []  # entities in MedMentions and in UMLS

    # don't care about context for now. Just do the processing based on mention text only
    # collect all the data in one list to use ann.knnQueryBatch which is a lot faster than
    # calling ann.knnQuery for each individual example
    mention_texts = []
    gold_umls_ids = []

    # Only loop over the dev examples for now because we don't have a trained model.
    for example in dev_examples:
        for entity in example.entities:
            if entity.umls_id not in umls_concept_dict_by_id:
                # The UMLS release doesn't contain all UMLS concepts.
                missing_entity_ids.append(entity)
                continue
            found_entity_ids.append(entity)

            mention_texts.append(entity.mention_text)
            gold_umls_ids.append(entity.umls_id)

    k_list = [int(k) for k in ks.split(',')]
    for k in k_list:
        print(f'for k = {k}')
        entity_correct_links_count = 0  # number of correctly linked entities
        entity_wrong_links_count = 0  # number of wrongly linked entities
        entity_no_links_count = 0  # number of entities that are not linked

        candidate_neighbor_ids = generate_candidates(mention_texts, k,
                                                     tfidf_vectorizer,
                                                     ann_index,
                                                     ann_concept_id_list)

        # Use a distinct loop variable so the candidate list isn't shadowed.
        for mention_text, gold_umls_id, neighbor_ids in zip(
                mention_texts, gold_umls_ids, candidate_neighbor_ids):
            gold_canonical_name = umls_concept_dict_by_id[gold_umls_id][
                'canonical_name']
            if len(neighbor_ids) == 0:
                entity_no_links_count += 1
                # print(f'No candidates. Mention Text: {mention_text}, Canonical Name: {gold_canonical_name}')
            elif gold_umls_id in neighbor_ids:
                entity_correct_links_count += 1
            else:
                entity_wrong_links_count += 1
                # print(f'Wrong candidates. Mention Text: {mention_text}, Canonical Name: {gold_canonical_name}')

        print(f'MedMentions entities not in UMLS: {len(missing_entity_ids)}')
        print(f'MedMentions entities found in UMLS: {len(found_entity_ids)}')
        print(f'K: {k}')
        print(f'Gold concept in candidates: {100 * entity_correct_links_count / len(found_entity_ids):.2f}%')
        print(f'Gold concept not in candidates: {100 * entity_wrong_links_count / len(found_entity_ids):.2f}%')
        print(f'Candidate generation failed: {100 * entity_no_links_count / len(found_entity_ids):.2f}%')
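A usage sketch with hypothetical artifact paths:

main(medmentions_path="data/med_mentions",        # hypothetical corpus path
     umls_path="data/umls_kb.json",               # hypothetical UMLS KB export
     ann_index_path="ann_index.bin",              # hypothetical ANN index file
     tfidf_vectorizer_path="tfidf_vectorizer.joblib",  # hypothetical vectorizer file
     ks="10,100")                                 # recall at k = 10 and k = 100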