Code Example #1
    def __init__(
        self,
        file_path: str = None,
    ):
        if file_path is None:
            raise ValueError(
                "Do not use the default arguments to KnowledgeBase. "
                "Instead, use a subclass (e.g UmlsKnowledgeBase) or pass a path to a kb."
            )
        if file_path.endswith("jsonl"):
            raw = (json.loads(line) for line in open(cached_path(file_path)))
        else:
            raw = json.load(open(cached_path(file_path)))

        alias_to_cuis: Dict[str, Set[str]] = defaultdict(set)
        self.cui_to_entity: Dict[str, Entity] = {}

        for concept in raw:
            unique_aliases = set(concept["aliases"])
            unique_aliases.add(concept["canonical_name"])
            for alias in unique_aliases:
                alias_to_cuis[alias].add(concept["concept_id"])
            self.cui_to_entity[concept["concept_id"]] = Entity(**concept)

        self.alias_to_cuis: Dict[str, Set[str]] = {**alias_to_cuis}
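
A minimal usage sketch for the constructor above, assuming a local JSONL export whose records carry concept_id, canonical_name, and aliases fields; the file name and alias below are illustrative only.

# Hypothetical usage: build a KnowledgeBase from a local JSONL file and
# look up the CUIs registered for one alias.
kb = KnowledgeBase(file_path="my_concepts.jsonl")
for cui in kb.alias_to_cuis.get("aspirin", set()):
    print(cui, kb.cui_to_entity[cui].canonical_name)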
Code Example #2
def load_approximate_nearest_neighbours_index(
        tfidf_vectors_path: str = DEFAULT_PATHS["tfidf_umls_vectors"],
        ann_index_path: str = DEFAULT_PATHS["ann_index"],
        ef_search: int = 200) -> FloatIndex:
    """
    Load an approximate nearest neighbours index from disk.

    Parameters
    ----------
    tfidf_vectors_path : str, required.
        The path to the tfidf vectors of the items in the index.
    ann_index_path : str, required.
        The path to the ann index.
    ef_search: int, optional (default = 200)
        Controls speed performance at query time. Max value is 2000,
        but reducing to around ~100 will increase query speed by an order
        of magnitude for a small performance hit.
    """
    uml_concept_alias_tfidfs = scipy.sparse.load_npz(
        cached_path(tfidf_vectors_path)).astype(numpy.float32)
    ann_index = nmslib.init(method='hnsw',
                            space='cosinesimil_sparse',
                            data_type=nmslib.DataType.SPARSE_VECTOR)
    ann_index.addDataPointBatch(uml_concept_alias_tfidfs)
    ann_index.loadIndex(cached_path(ann_index_path))
    query_time_params = {'efSearch': ef_search}
    ann_index.setQueryTimeParams(query_time_params)

    return ann_index
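
A short sketch of calling the loader above with the packaged default paths but a smaller ef_search; the query step assumes the companion tf-idf vectorizer has already been loaded, and the variable names are illustrative.

# Trade a little recall for faster queries by lowering efSearch.
ann_index = load_approximate_nearest_neighbours_index(ef_search=100)

# Query with a sparse tf-idf vector produced by the matching vectorizer.
query_vector = tfidf_vectorizer.transform(["myocardial infarction"])
neighbour_ids, distances = ann_index.knnQueryBatch(query_vector, k=10)[0]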
Code Example #3
def load_approximate_nearest_neighbours_index(
    linker_paths: LinkerPaths,
    ef_search: int = 200,
) -> FloatIndex:
    """
    Load an approximate nearest neighbours index from disk.

    Parameters
    ----------
    linker_paths: LinkerPaths, required.
        Contains the paths to the data required for the entity linker.
    ef_search: int, optional (default = 200)
        Controls speed performance at query time. Max value is 2000,
        but reducing to around ~100 will increase query speed by an order
        of magnitude for a small performance hit.
    """
    concept_alias_tfidfs = scipy.sparse.load_npz(
        cached_path(linker_paths.tfidf_vectors)).astype(numpy.float32)
    ann_index = nmslib.init(
        method="hnsw",
        space="cosinesimil_sparse",
        data_type=nmslib.DataType.SPARSE_VECTOR,
    )
    ann_index.addDataPointBatch(concept_alias_tfidfs)
    ann_index.loadIndex(cached_path(linker_paths.ann_index))
    query_time_params = {"efSearch": ef_search}
    ann_index.setQueryTimeParams(query_time_params)

    return ann_index
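
A sketch of the same call with custom files, assuming LinkerPaths is a simple container of path strings; the field names mirror the attributes used in these examples, and the paths themselves are hypothetical.

# Hypothetical local paths for a custom linker.
custom_paths = LinkerPaths(
    ann_index="custom/nmslib_index.bin",
    tfidf_vectorizer="custom/tfidf_vectorizer.joblib",
    tfidf_vectors="custom/tfidf_vectors_sparse.npz",
    concept_aliases_list="custom/concept_aliases.json",
)
ann_index = load_approximate_nearest_neighbours_index(custom_paths, ef_search=150)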
Code Example #4
def init_model(lang, output_dir, freqs_loc=None,
               vectors_loc=None, no_expand_vectors=False,
               meta_overrides=None, prune_vectors=-1, min_word_frequency=50):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors.
    """
    output_dir = ensure_path(output_dir)
    if vectors_loc is not None:
        vectors_loc = cached_path(vectors_loc)
        vectors_loc = ensure_path(vectors_loc)
    if freqs_loc is not None:
        freqs_loc = cached_path(freqs_loc)
        freqs_loc = ensure_path(freqs_loc)

    if freqs_loc is not None and not freqs_loc.exists():
        msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
    probs, oov_prob = read_freqs(freqs_loc, min_freq=min_word_frequency) if freqs_loc is not None else ({}, -20)
    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
    nlp = create_model(lang, probs, oov_prob, vectors_data, vector_keys, not no_expand_vectors, prune_vectors)

    # Replace the base model's tokenizer. The custom combined rule tokenizer
    # is left commented out; a plain Tokenizer is used instead.
    # nlp.tokenizer = combined_rule_tokenizer(nlp)

    nlp.tokenizer = Tokenizer(nlp.vocab)

    if meta_overrides is not None:
        metadata = json.load(open(meta_overrides))
        nlp.meta.update(metadata)
        nlp.meta["version"] = VERSION

    if not output_dir.exists():
        os.makedirs(output_dir, exist_ok=True)
    nlp.to_disk(output_dir)
    return nlp
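
A hedged example of invoking init_model; all paths are hypothetical, and the frequency and vector files are expected in the formats consumed by read_freqs and read_vectors.

# Build a blank English model from word frequencies and pretrained vectors,
# writing the packaged model to ./my_model.
nlp = init_model(
    "en",
    "./my_model",
    freqs_loc="word_freqs.txt",
    vectors_loc="vectors.txt.gz",
    min_word_frequency=50,
)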
Code Example #5
def construct_umls_tree_from_tsv(filepath: str) -> UmlsSemanticTypeTree:
    """
    Reads in a tsv file which is formatted as a depth first traversal of
    a hierarchy tree, where nodes are of the format:

    Name TAB UMLS Semantic Type TAB Tree Depth

    Event	T051	1
      Activity	T052	2
        Behavior	T053	3
          Social Behavior	T054	4
          Individual Behavior	T055	4
        Daily or Recreational Activity	T056	3
    """

    node_stack: Deque[SemanticTypeNode] = deque()
    for line in open(cached_path(filepath), "r"):
        name, type_id, level = line.split("\t")
        name = name.strip()
        int_level = int(level.strip())
        node = SemanticTypeNode(type_id, name, [], int_level)

        node_stack.append(node)

    def attach_children(node: SemanticTypeNode, stack: Deque[SemanticTypeNode]):
        while stack and stack[0].level > node.level:
            popped = stack.popleft()
            attach_children(popped, stack)
            node.children.append(popped)

    first = node_stack.popleft()
    attach_children(first, node_stack)

    return UmlsSemanticTypeTree(first)
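
A small end-to-end sketch: write a three-row TSV in the documented "name TAB type id TAB depth" format and parse it. The file name is illustrative, and cached_path is expected to pass an existing local path through unchanged.

# Minimal depth-first TSV matching the format in the docstring.
with open("mini_semantic_types.tsv", "w") as f:
    f.write("Event\tT051\t1\n")
    f.write("Activity\tT052\t2\n")
    f.write("Behavior\tT053\t3\n")

tree = construct_umls_tree_from_tsv("mini_semantic_types.tsv")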
Code Example #6
    def __init__(self,
                 ann_index: FloatIndex = None,
                 tfidf_vectorizer: TfidfVectorizer = None,
                 ann_concept_aliases_list: List[str] = None,
                 umls: UmlsKnowledgeBase = None,
                 verbose: bool = False,
                 ef_search: int = 200) -> None:

        self.ann_index = ann_index or load_approximate_nearest_neighbours_index(ef_search=ef_search)

        self.vectorizer = tfidf_vectorizer or joblib.load(cached_path(DEFAULT_PATHS["tfidf_vectorizer"]))
        self.ann_concept_aliases_list = ann_concept_aliases_list or \
            json.load(open(cached_path(DEFAULT_PATHS["concept_aliases_list"])))

        self.umls = umls or UmlsKnowledgeBase()
        self.verbose = verbose
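
Two hedged ways to construct the generator above: rely entirely on the packaged defaults, or inject one component and let the rest fall back.

# All defaults: loads the packaged index, vectorizer, alias list and UMLS KB.
candidate_generator = CandidateGenerator(ef_search=100)

# Keep the default index but supply an explicit knowledge base.
candidate_generator = CandidateGenerator(umls=UmlsKnowledgeBase())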
Code Example #7
def init_umls_nlp_linker():
    base_dir = ''
    tfidf_path = base_dir + 'tfidf_vectors_sparse.npz'
    ann_path = base_dir + 'nmslib_index.bin'
    ann_index = load_approximate_nearest_neighbours_index(
        tfidf_vectors_path=tfidf_path, ann_index_path=ann_path)
    vec = joblib.load(cached_path(base_dir + 'tfidf_vectorizer.joblib'))
    ann_concept = json.load(
        open(cached_path(base_dir + 'concept_aliases.json')))
    umls_knowledge_base = UmlsKnowledgeBase(
        file_path=base_dir + 'umls_2017_aa_cat0129.json',
        types_file_path=base_dir + 'umls_semantic_type_tree.tsv')
    cg = CandidateGenerator(ann_index=ann_index,
                            tfidf_vectorizer=vec,
                            ann_concept_aliases_list=ann_concept,
                            umls=umls_knowledge_base)
    linker = UmlsEntityLinker(candidate_generator=cg,
                              max_entities_per_mention=1)
    # `nlp` is assumed to be a spaCy pipeline object defined at module level.
    nlp.add_pipe(linker)
    return linker
Code Example #8
File: mm_reader.py  Project: scott-8/MedLinker
def read_full_med_mentions(directory_path: str,
                           label_mapping: Dict[str, str] = None,
                           span_only: bool = False):

    def _cleanup_dir(dir_path: str):
        if os.path.exists(dir_path):
            shutil.rmtree(dir_path)

    resolved_directory_path = cached_path(directory_path)
    if "tar.gz" in directory_path:
        # Extract dataset to temp dir
        tempdir = tempfile.mkdtemp()
        print(f"extracting dataset directory {resolved_directory_path} to temp dir {tempdir}")
        with tarfile.open(resolved_directory_path, 'r:gz') as archive:
            archive.extractall(tempdir)
        # Postpone cleanup until exit in case the unarchived
        # contents are needed outside this function.
        atexit.register(_cleanup_dir, tempdir)

        resolved_directory_path = tempdir

    expected_names = ["corpus_pubtator.txt",
                      "corpus_pubtator_pmids_all.txt",
                      "corpus_pubtator_pmids_dev.txt",
                      "corpus_pubtator_pmids_test.txt",
                      "corpus_pubtator_pmids_trng.txt"]

    corpus = os.path.join(resolved_directory_path, expected_names[0])
    examples = med_mentions_example_iterator(corpus)

    train_ids = {x.strip() for x in open(os.path.join(resolved_directory_path, expected_names[4]))}
    dev_ids = {x.strip() for x in open(os.path.join(resolved_directory_path, expected_names[2]))}
    test_ids = {x.strip() for x in open(os.path.join(resolved_directory_path, expected_names[3]))}

    train_examples = []
    dev_examples = []
    test_examples = []

    for example in examples:
        if example.pubmed_id in train_ids:
            train_examples.append(example)

        elif example.pubmed_id in dev_ids:
            dev_examples.append(example)

        elif example.pubmed_id in test_ids:
            test_examples.append(example)

    return train_examples, dev_examples, test_examples
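
A minimal call sketch; the archive path is hypothetical and is expected to contain the five corpus_pubtator files listed above.

# Extracts the tar.gz to a temp dir, then splits examples by the PMID lists.
train, dev, test = read_full_med_mentions("corpus/med_mentions.tar.gz")
print(len(train), len(dev), len(test))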
Code Example #9
    def __init__(
        self,
        ann_index: FloatIndex = None,
        tfidf_vectorizer: TfidfVectorizer = None,
        ann_concept_aliases_list: List[str] = None,
        kb: KnowledgeBase = None,
        verbose: bool = False,
        ef_search: int = 200,
        name: str = None,
    ) -> None:

        if name is not None and any(
            [ann_index, tfidf_vectorizer, ann_concept_aliases_list, kb]):
            raise ValueError(
                "You cannot pass both a name argument and other constuctor arguments."
            )

        # Set the name to the default, after we have checked
        # the compatibility with the args above.
        if name is None:
            name = "umls"

        linker_paths = DEFAULT_PATHS.get(name, UmlsLinkerPaths)

        self.ann_index = ann_index or load_approximate_nearest_neighbours_index(
            linker_paths=linker_paths, ef_search=ef_search)
        self.vectorizer = tfidf_vectorizer or joblib.load(
            cached_path(linker_paths.tfidf_vectorizer))
        self.ann_concept_aliases_list = ann_concept_aliases_list or json.load(
            open(cached_path(linker_paths.concept_aliases_list)))

        self.kb = kb or DEFAULT_KNOWLEDGE_BASES[name]()
        self.verbose = verbose

        # TODO(Mark): Remove in scispacy v1.0.
        self.umls = self.kb
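
A short usage sketch for the name-based constructor; "umls" is the default shown in the code, and mixing name with explicit components raises a ValueError.

# Packaged defaults, selected by knowledge-base name.
cg = CandidateGenerator(name="umls")

# Equivalent: with no arguments the name falls back to "umls".
cg_default = CandidateGenerator()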
Code Example #10
    def __init__(self,
                 file_path: str = DEFAULT_UMLS_PATH,
                 types_file_path: str = DEFAULT_UMLS_TYPES_PATH):
        raw = json.load(open(cached_path(file_path)))

        alias_to_cuis: Dict[str, Set[str]] = defaultdict(set)
        self.cui_to_entity: Dict[str, UmlsEntity] = {}

        for concept in raw:
            unique_aliases = set(concept["aliases"])
            unique_aliases.add(concept["canonical_name"])
            for alias in unique_aliases:
                alias_to_cuis[alias].add(concept["concept_id"])
            self.cui_to_entity[concept["concept_id"]] = UmlsEntity(**concept)

        self.alias_to_cuis: Dict[str, Set[str]] = {**alias_to_cuis}
        self.semantic_type_tree: UmlsSemanticTypeTree = construct_umls_tree_from_tsv(
            types_file_path)
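
A minimal sketch of the default construction; the attributes used below are exactly the ones populated in the code above.

kb = UmlsKnowledgeBase()
print(len(kb.cui_to_entity), "concepts loaded")
# The semantic type tree built from the TSV is available as an attribute.
tree = kb.semantic_type_tree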
Code Example #11
def read_ner_from_tsv(filename: str) -> List[SpacyNerExample]:
    """
    Reads BIO formatted NER data from a TSV file, such as the
    NER data found here:
    https://github.com/cambridgeltl/MTL-Bioinformatics-2016

    Data is expected to be two tab-separated tokens per line, with
    sentences denoted by empty lines. Sentences read by this
    function will already be tokenized, but are returned as a string,
    as this is the format required by spaCy. Consider using the
    WhitespaceTokenizer (scispacy/util.py) to split this data
    with a spaCy model.

    Parameters
    ----------
    filename : str
        The path to the tsv data.

    Returns
    -------
    spacy_format_data : List[SpacyNerExample]
        The BIO tagged NER examples.
    """
    spacy_format_data = []
    examples: List[Tuple[str, str]] = []
    for line in open(cached_path(filename)):
        line = line.strip()
        if line.startswith("-DOCSTART-"):
            continue
        # We have reached the end of a sentence.
        if not line:
            if not examples:
                continue
            spacy_format_data.append(_handle_sentence(examples))
            examples = []
        else:
            word, entity = line.split("\t")
            examples.append((word, entity))
    if examples:
        spacy_format_data.append(_handle_sentence(examples))

    return spacy_format_data
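
A tiny round trip under the documented format (token TAB tag per line, blank line between sentences); the file name is illustrative.

# Write a two-sentence BIO-tagged TSV and read it back.
with open("mini_ner.tsv", "w") as f:
    f.write("Aspirin\tB-Chemical\nhelps\tO\n\nIbuprofen\tB-Chemical\n")

spacy_examples = read_ner_from_tsv("mini_ner.tsv")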
Code Example #12
File: convert_freqs.py  Project: swipswaps/scispacy
def main(input_path: str, output_path: str, min_word_frequency: int):
    if input_path is not None:
        input_path = cached_path(input_path)
        input_path = ensure_path(input_path)

    probs, oov_prob = (
        read_freqs(input_path, min_freq=min_word_frequency)
        if input_path is not None
        else ({}, -20)
    )

    with open(output_path, "w") as _jsonl_file:
        _jsonl_file.write(
            json.dumps({"lang": "en", "settings": {"oov_prob": -20.502029418945312}})
        )
        _jsonl_file.write("\n")

        for word, prob in probs.items():
            _jsonl_file.write(json.dumps({"orth": word, "prob": prob}))
            _jsonl_file.write("\n")
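
A hedged invocation sketch; the input is a raw frequency file readable by read_freqs, the output is a JSONL file with one settings line followed by one entry per word, and both paths are hypothetical.

main("word_freqs.txt", "word_probs.jsonl", min_word_frequency=50)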
Code Example #13
def train_parser_and_tagger(train_json_path: str,
                            dev_json_path: str,
                            test_json_path: str,
                            model_output_dir: str,
                            model_path: str = None,
                            ontonotes_path: str = None,
                            ontonotes_train_percent: float = 0.0):
    """Function to train the spacy parser and tagger from a blank model, with the default, en_core_web_sm vocab.
       Training setup is mostly copied from the spacy cli train command.

       @param train_json_path: path to the conll formatted training data
       @param dev_json_path: path to the conll formatted dev data
       @param test_json_path: path to the conll formatted test data
       @param model_output_dir: path to the output directory for the trained models
       @param model_path: path to the model to load
       @param ontonotes_path: path to the directory containing ontonotes in spacy format (optional)
       @param ontonotes_train_percent: percentage of the ontonotes training data to use (optional)
    """
    msg = Printer()

    train_json_path = cached_path(train_json_path)
    dev_json_path = cached_path(dev_json_path)
    test_json_path = cached_path(test_json_path)

    if model_path is not None:
        nlp = spacy.load(model_path)
    else:
        lang_class = util.get_lang_class('en')
        nlp = lang_class()

    if 'tagger' not in nlp.pipe_names:
        tagger = nlp.create_pipe('tagger')
        nlp.add_pipe(tagger, first=True)
    else:
        tagger = nlp.get_pipe('tagger')

    if 'parser' not in nlp.pipe_names:
        parser = nlp.create_pipe('parser')
        nlp.add_pipe(parser)
    else:
        parser = nlp.get_pipe('parser')

    train_corpus = GoldCorpus(train_json_path, dev_json_path)
    test_corpus = GoldCorpus(train_json_path, test_json_path)

    if ontonotes_path:
        onto_train_path = os.path.join(ontonotes_path, "train")
        onto_dev_path = os.path.join(ontonotes_path, "dev")
        onto_test_path = os.path.join(ontonotes_path, "test")
        onto_train_corpus = GoldCorpus(onto_train_path, onto_dev_path)
        onto_test_corpus = GoldCorpus(onto_train_path, onto_test_path)

    dropout_rates = util.decaying(0.2, 0.2, 0.0)
    batch_sizes = util.compounding(1., 16., 1.001)

    if model_path is not None:
        meta = nlp.meta
    else:
        meta = {}
        meta["lang"] = "en"
        meta["pipeline"] = ["tagger", "parser"]
        meta["name"] = "scispacy_core_web_sm"
        meta["license"] = "CC BY-SA 3.0"
        meta["author"] = "Allen Institute for Artificial Intelligence"
        meta["url"] = "allenai.org"
        meta["sources"] = ["OntoNotes 5", "Common Crawl", "GENIA 1.0"]
        meta["version"] = "1.0.0"
        meta["spacy_version"] = ">=2.2.1"
        meta["parent_package"] = "spacy"
        meta["email"] = "*****@*****.**"

    n_train_words = train_corpus.count_train()

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in ['tagger', 'parser']]
    with nlp.disable_pipes(*other_pipes):
        if ontonotes_path:
            optimizer = nlp.begin_training(lambda: itertools.chain(train_corpus.train_tuples, onto_train_corpus.train_tuples))
        else:
            optimizer = nlp.begin_training(lambda: train_corpus.train_tuples)
        nlp._optimizer = None

    train_docs = train_corpus.train_docs(nlp)
    train_docs = list(train_docs)

    train_mixture = train_docs
    if ontonotes_path:
        onto_train_docs = onto_train_corpus.train_docs(nlp)
        onto_train_docs = list(onto_train_docs)
        num_onto_docs = int(float(ontonotes_train_percent)*len(onto_train_docs))
        randomly_sampled_onto = random.sample(onto_train_docs, num_onto_docs)
        train_mixture += randomly_sampled_onto

    row_head, output_stats = _configure_training_output(nlp.pipe_names, -1, False)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}

    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)

    best_epoch = 0
    best_epoch_uas = 0.0
    for i in range(20):
        random.shuffle(train_mixture)
        with nlp.disable_pipes(*other_pipes):
            with tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                minibatches = list(util.minibatch(train_docs, size=batch_sizes))
                for batch in minibatches:
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               drop=next(dropout_rates), losses=losses)
                    pbar.update(sum(len(doc) for doc in docs))

        # save intermediate model and output results on the dev set
        with nlp.use_params(optimizer.averages):
            epoch_model_path = os.path.join(model_output_dir, "model"+str(i))
            os.makedirs(epoch_model_path, exist_ok=True)
            nlp.to_disk(epoch_model_path)

            with open(os.path.join(model_output_dir, "model"+str(i), "meta.json"), "w") as meta_fp:
                meta_fp.write(json.dumps(meta))

            nlp_loaded = util.load_model_from_path(epoch_model_path)
            dev_docs = train_corpus.dev_docs(nlp_loaded)
            dev_docs = list(dev_docs)
            nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
            start_time = timer()
            scorer = nlp_loaded.evaluate(dev_docs)
            end_time = timer()
            gpu_wps = None
            cpu_wps = nwords/(end_time-start_time)

            if ontonotes_path:
                onto_dev_docs = list(onto_train_corpus.dev_docs(nlp_loaded))
                onto_scorer = nlp_loaded.evaluate(onto_dev_docs)


        if scorer.scores["uas"] > best_epoch_uas:
            best_epoch_uas = scorer.scores["uas"]
            best_epoch = i
        progress = _get_progress(
            i, losses, scorer.scores, output_stats, cpu_wps=cpu_wps, gpu_wps=gpu_wps
        )
        msg.row(progress, **row_settings)

        if ontonotes_path:
            progress = _get_progress(
                i, losses, onto_scorer.scores, output_stats, cpu_wps=cpu_wps, gpu_wps=gpu_wps
            )
            msg.row(progress, **row_settings)

    # save final model and output results on the test set
    final_model_path = os.path.join(model_output_dir, "best")
    if os.path.exists(final_model_path):
        shutil.rmtree(final_model_path)
    shutil.copytree(os.path.join(model_output_dir, "model" + str(best_epoch)),
                    final_model_path)

    nlp_loaded = util.load_model_from_path(final_model_path)
    start_time = timer()
    test_docs = test_corpus.dev_docs(nlp_loaded)
    test_docs = list(test_docs)
    nwords = sum(len(doc_gold[0]) for doc_gold in test_docs)
    scorer = nlp_loaded.evaluate(test_docs)
    end_time = timer()
    gpu_wps = None
    cpu_wps = nwords/(end_time-start_time)
    meta["speed"] = {"gpu": None, "nwords": nwords, "cpu": cpu_wps}

    print("Retrained genia evaluation")
    print("Test results:")
    print("UAS:", scorer.uas)
    print("LAS:", scorer.las)
    print("Tag %:", scorer.tags_acc)
    print("Token acc:", scorer.token_acc)
    with open(os.path.join(model_output_dir, "genia_test.json"), "w+") as metric_file:
        json.dump(scorer.scores, metric_file)
    with open(os.path.join(model_output_dir, "best", "meta.json"), "w") as meta_fp:
        meta_fp.write(json.dumps(meta))

    if ontonotes_path:
        onto_test_docs = list(onto_test_corpus.dev_docs(nlp_loaded))
        print("Retrained ontonotes evaluation")
        scorer_onto_retrained = nlp_loaded.evaluate(onto_test_docs)
        print("Test results:")
        print("UAS:", scorer_onto_retrained.uas)
        print("LAS:", scorer_onto_retrained.las)
        print("Tag %:", scorer_onto_retrained.tags_acc)
        print("Token acc:", scorer_onto_retrained.token_acc)

        with open(os.path.join(model_output_dir, "ontonotes_test.json"), "w+") as metric_file:
            json.dump(scorer_onto_retrained.scores, metric_file)
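
A hedged call sketch for the training routine above; all paths are hypothetical, and the JSON files are expected in spaCy's CoNLL-converted training format.

train_parser_and_tagger(
    train_json_path="genia_train.json",
    dev_json_path="genia_dev.json",
    test_json_path="genia_test.json",
    model_output_dir="./parser_tagger_output",
)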
Code Example #14
def read_full_med_mentions(
    directory_path: str,
    label_mapping: Dict[str, str] = None,
    span_only: bool = False,
    spacy_format: bool = True,
):
    def _cleanup_dir(dir_path: str):
        if os.path.exists(dir_path):
            shutil.rmtree(dir_path)

    resolved_directory_path = cached_path(directory_path)
    if "tar.gz" in directory_path:
        # Extract dataset to temp dir
        tempdir = tempfile.mkdtemp()
        print(
            f"extracting dataset directory {resolved_directory_path} to temp dir {tempdir}"
        )
        with tarfile.open(resolved_directory_path, "r:gz") as archive:
            archive.extractall(tempdir)
        # Postpone cleanup until exit in case the unarchived
        # contents are needed outside this function.
        atexit.register(_cleanup_dir, tempdir)

        resolved_directory_path = tempdir

    expected_names = [
        "corpus_pubtator.txt",
        "corpus_pubtator_pmids_all.txt",
        "corpus_pubtator_pmids_dev.txt",
        "corpus_pubtator_pmids_test.txt",
        "corpus_pubtator_pmids_trng.txt",
    ]

    corpus = os.path.join(resolved_directory_path, expected_names[0])
    examples = med_mentions_example_iterator(corpus)

    train_ids = {
        x.strip()
        for x in open(os.path.join(resolved_directory_path, expected_names[4]))
    }
    dev_ids = {
        x.strip()
        for x in open(os.path.join(resolved_directory_path, expected_names[2]))
    }
    test_ids = {
        x.strip()
        for x in open(os.path.join(resolved_directory_path, expected_names[3]))
    }

    train_examples = []
    dev_examples = []
    test_examples = []

    def label_function(label):
        if span_only:
            return "ENTITY"
        if label_mapping is None:
            return label
        else:
            return label_mapping[label]

    for example in examples:
        spacy_format_entities = [
            (x.start, x.end, label_function(x.mention_type)) for x in example.entities
        ]
        spacy_format_entities = remove_overlapping_entities(
            sorted(spacy_format_entities, key=lambda x: x[0])
        )
        spacy_example = (example.text, {"entities": spacy_format_entities})
        if example.pubmed_id in train_ids:
            train_examples.append(spacy_example if spacy_format else example)

        elif example.pubmed_id in dev_ids:
            dev_examples.append(spacy_example if spacy_format else example)

        elif example.pubmed_id in test_ids:
            test_examples.append(spacy_example if spacy_format else example)

    return train_examples, dev_examples, test_examples
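
A variant call sketch for this newer signature: span_only collapses every mention type to a single "ENTITY" label in the spaCy-format tuples, while spacy_format=False would return the raw example objects instead. The archive path is hypothetical.

train, dev, test = read_full_med_mentions(
    "corpus/med_mentions.tar.gz",
    span_only=True,
)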