Example #1
def get_spacy_model(spacy_model_name: str,
                    pos_tags: bool,
                    parse: bool,
                    ner: bool,
                    with_custom_tokenizer: bool = False,
                    with_sentence_segmenter: bool = False) -> SpacyModelType:
    """
    In order to avoid loading spacy models repeatedly,
    we'll save references to them, keyed by the options
    we used to create the spacy model, so any particular
    configuration only gets loaded once.
    """
    options = (spacy_model_name, pos_tags, parse, ner, with_custom_tokenizer, with_sentence_segmenter)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            print(f"Spacy models '{spacy_model_name}' not found.  Downloading and installing.")
            spacy_download(spacy_model_name)
            spacy_model = spacy.load(spacy_model_name, disable=disable)

        if with_custom_tokenizer:
            spacy_model.tokenizer = combined_rule_tokenizer(spacy_model)
        if with_sentence_segmenter:
            spacy_model.add_pipe(combined_rule_sentence_segmenter, first=True)

        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
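Usage sketch for the cached loader above (the cache is assumed to be a module-level dict, as the lookup in the function implies; the model name is only an example):

# Module-level cache assumed by get_spacy_model, keyed by the full option tuple.
LOADED_SPACY_MODELS = {}

# Requesting the same configuration twice returns the same cached pipeline object.
nlp_a = get_spacy_model("en_core_web_sm", pos_tags=True, parse=False, ner=False)
nlp_b = get_spacy_model("en_core_web_sm", pos_tags=True, parse=False, ner=False)
assert nlp_a is nlp_b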
Example #2
def tmdm_one_sent_pipeline(getter: Callable[[Any], Tuple[str, str]] = default_one_sent_getter,
                           model="en_core_sci_lg"):
    nlp = spacy.load(model, disable=['ner', 'parser'])
    nlp.tokenizer = IDTokenizer(combined_rule_tokenizer(nlp), getter=getter)
    nlp.add_pipe(OneSentSentencizer())
    return nlp
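A minimal, hedged usage sketch: the pipeline factory above can be called with its defaults, assuming the en_core_sci_lg package is installed and default_one_sent_getter, IDTokenizer and OneSentSentencizer are importable as in the example:

# Build the one-sentence pipeline; inputs must be whatever objects
# default_one_sent_getter knows how to turn into an (id, text) pair.
nlp = tmdm_one_sent_pipeline()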
Example #3
def count_frequencies(language_class: Language, input_path: Path):
    """
    Given a file containing single documents per line
    (for scispacy, these are Pubmed abstracts), split the text
    using a science specific tokenizer and compute word and
    document frequencies for all words.
    """
    print(f"Processing {input_path}.")
    tokenizer = combined_rule_tokenizer(language_class())
    counts = Counter()
    doc_counts = Counter()
    with open(input_path, "r") as input_file:
        for line in input_file:
            words = [t.text for t in tokenizer(line)]
            counts.update(words)
            doc_counts.update(set(words))

    return counts, doc_counts
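A usage sketch for the frequency counter above, assuming its other imports (Counter, the scispacy tokenizer) are in scope; the input file name is illustrative:

from pathlib import Path
from spacy.lang.en import English

# One document (e.g. a PubMed abstract) per line in this hypothetical file.
word_counts, document_counts = count_frequencies(English, Path("abstracts.txt"))
print(word_counts.most_common(10))
print(document_counts["protein"])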
Example #4
def init_model(lang,
               output_dir,
               freqs_loc=None,
               vectors_loc=None,
               no_expand_vectors=False,
               meta_overrides=None,
               prune_vectors=-1,
               min_word_frequency=50,
               vectors_name=None):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors.
    """
    output_dir = ensure_path(output_dir)
    if vectors_loc is not None:
        vectors_loc = cached_path(vectors_loc)
        vectors_loc = ensure_path(vectors_loc)
    if freqs_loc is not None:
        freqs_loc = cached_path(freqs_loc)
        freqs_loc = ensure_path(freqs_loc)

    if freqs_loc is not None and not freqs_loc.exists():
        msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
    if freqs_loc is not None:
        probs, oov_prob = read_freqs(freqs_loc, min_freq=min_word_frequency)
    else:
        probs, oov_prob = {}, -20
    if vectors_loc:
        vectors_data, vector_keys = read_vectors(vectors_loc)
    else:
        vectors_data, vector_keys = None, None
    nlp = create_model(lang, probs, oov_prob, vectors_data, vector_keys,
                       not no_expand_vectors, prune_vectors, vectors_name)

    # Insert our custom tokenizer into the base model.
    nlp.tokenizer = combined_rule_tokenizer(nlp)

    if meta_overrides is not None:
        with open(meta_overrides) as meta_file:
            metadata = json.load(meta_file)
        nlp.meta.update(metadata)
        nlp.meta["version"] = VERSION

    if not output_dir.exists():
        os.makedirs(output_dir, exist_ok=True)
    nlp.to_disk(output_dir)
    return nlp
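A hedged invocation sketch for init_model; the file names and output directory are placeholders rather than paths from the original project:

# Build a base model from local frequency and vector files (both hypothetical)
# and write the packaged pipeline to ./base_model.
init_model(
    "en",
    "base_model",
    freqs_loc="word_freqs.txt",
    vectors_loc="vectors.txt.gz",
    min_word_frequency=50,
)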
Example #5
    def add_pipe(self, pipe):
        """Add Spacy pipes

        Args:
            pipe (str): pipe name
        """
        print('Loading Spacy pipe: {}'.format(pipe))
        pipe = pipe.lower()
        if pipe == 'abbreviation':  # Abbreviation extraction
            abbreviation_pipe = AbbreviationDetector(self.nlp)
            self.nlp.add_pipe(abbreviation_pipe)
        elif pipe == 'entitylinker':  # Entity linker
            linker = UmlsEntityLinker(resolve_abbreviations=True)
            self.nlp.add_pipe(linker)
        elif pipe == 'segmenter':  # Rule Segmenter
            self.nlp.add_pipe(combined_rule_sentence_segmenter, first=True)
        elif pipe == 'tokenizer':  # Tokenizer
            self.nlp.tokenizer = combined_rule_tokenizer(self.nlp)
        elif pipe == 'textrank':  # Textrank
            tr = pytextrank.TextRank()
            self.nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)
        print('Pipe loaded.')
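The method belongs to a wrapper class that is not shown here; a minimal sketch of how it might be called, with the class name purely hypothetical:

# Hypothetical wrapper exposing self.nlp plus the add_pipe method above.
processor = ScispacyWrapper()
processor.add_pipe('tokenizer')      # swap in the combined rule tokenizer
processor.add_pipe('abbreviation')   # add the AbbreviationDetector
processor.add_pipe('segmenter')      # rule-based sentence segmentation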
Example #6
def combined_rule_tokenizer_fixture():
    nlp = get_spacy_model('en_core_web_sm', True, True, True)
    tokenizer = combined_rule_tokenizer(nlp)
    return tokenizer
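A minimal pytest-style usage sketch, assuming the function above is registered as a fixture (e.g. decorated with @pytest.fixture in conftest.py); the test text is illustrative:

def test_combined_rule_tokenizer_runs(combined_rule_tokenizer_fixture):
    # The tokenizer returns a Doc-like sequence of tokens.
    tokens = combined_rule_tokenizer_fixture("Induction of cytokine expression in leukocytes.")
    assert len(tokens) > 0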
Example #7
def create_combined_rule_model() -> Language:
    nlp = spacy.load('en_core_web_sm')
    nlp.tokenizer = combined_rule_tokenizer(nlp)
    nlp.add_pipe(combined_rule_sentence_segmenter, first=True)
    return nlp
Example #8
def evaluate_sentence_splitting(model_path: str,
                                data_directory: str,
                                rule_segmenter: bool = False,
                                custom_tokenizer: bool = False,
                                citation_data_path: str = None):

    model = spacy.load(model_path)
    if rule_segmenter:
        model.add_pipe(combined_rule_sentence_segmenter, first=True)
    if custom_tokenizer:
        model.tokenizer = combined_rule_tokenizer(model)

    total_correct = 0
    total = 0
    total_abstracts = 0
    perfect = 0
    for abstract_name in os.listdir(data_directory):

        with open(os.path.join(data_directory, abstract_name), "r") as abstract_file:
            abstract_sentences = [x.strip() for x in abstract_file]

        full_abstract = " ".join(abstract_sentences)

        doc = model(full_abstract)

        sentences = [x.text for x in doc.sents]

        correct = []
        for sentence in sentences:
            if sentence in abstract_sentences:
                correct.append(1)
            else:
                correct.append(0)

        total += len(correct)
        total_correct += sum(correct)
        perfect += all(correct)
        total_abstracts += 1

    print(f"Sentence splitting performance for {model_path} :\n")

    print(f"Sentence level accuracy: {total_correct} of {total}, {total_correct / total}. ")
    print(f"Abstract level accuracy: {perfect} of {total_abstracts}, {perfect / total_abstracts}. ")

    if citation_data_path is None:
        return

    skipped = 0
    citation_total = 0
    citation_correct = 0
    with open(citation_data_path, "r") as citation_file:
        for line in citation_file:

            sentence = remove_new_lines(json.loads(line)["string"])

            # Skip sentence if it doesn't look roughly like a sentence,
            # or it is > 2 std deviations above the mean length.
            if not sentence[0].isupper() or sentence[-1] != "." or len(sentence) > 450:
                skipped += 1
                continue

            sentences = list(model(sentence).sents)

            if len(sentences) == 1:
                citation_correct += 1
            citation_total += 1
    print(f"Citation handling performance for {model_path}, skipped {skipped} examples :\n")
    print(f"Citation level accuracy: {citation_correct} of {citation_total}, {citation_correct / citation_total}. ")
Example #9
def create_combined_rule_model() -> Language:
    nlp = spacy.load("en_core_web_sm")
    nlp.tokenizer = combined_rule_tokenizer(nlp)
    nlp.add_pipe(pysbd_sentencizer, first=True)
    return nlp
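A short usage sketch for the pysbd-based variant above, assuming spacy, combined_rule_tokenizer and pysbd_sentencizer are imported as in the example; the text is illustrative:

nlp = create_combined_rule_model()
doc = nlp("Fig. 2 shows the effect of IL-6 on expression. Treatment reduced it.")
for sent in doc.sents:
    print(sent.text)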
Example #10
def replace_tokenizer(nlp: Language) -> Language:
    nlp.tokenizer = combined_rule_tokenizer(nlp)
    return nlp
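A minimal usage sketch for replace_tokenizer; the model name and text are examples only:

import spacy

nlp = spacy.load("en_core_web_sm")
nlp = replace_tokenizer(nlp)
print([token.text for token in nlp("EGFR-mutant non-small cell lung cancer")])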