def __init__(self, nlp: Language):
        """Initialise components"""

        extensions = [
            "_n_sentences",
            "_n_tokens",
            "_n_syllables",
            "token_length",
            "sentence_length",
            "syllables",
            "counts",
        ]
        ext_funs = [
            n_sentences,
            n_tokens,
            n_syllables,
            self.token_length,
            self.sentence_length,
            self.syllables,
            self.counts,
        ]
        for ext, fun in zip(extensions, ext_funs):
            if ext not in ["_n_sentences", "sentence_length", "syllables"]:
                if not Span.has_extension(ext):
                    Span.set_extension(ext, getter=fun)
            if not Doc.has_extension(ext):
                Doc.set_extension(ext, getter=fun)

        if not Doc.has_extension("_filtered_tokens"):
            Doc.set_extension("_filtered_tokens", default=[])
        if not Span.has_extension("_filtered_tokens"):
            Span.set_extension("_filtered_tokens", getter=filtered_tokens)
 def set_extensions():
     if not Doc.has_extension('coref_chains'):
         Doc.set_extension('coref_chains', default=None)
     if not Token.has_extension('coref_chains'):
         Token.set_extension('coref_chains', default=None)
     if not Doc.has_extension('holmes_document_info'):
         Doc.set_extension('holmes_document_info', default=None)
     if not Token.has_extension('holmes'):
         Token.set_extension('holmes', default=None)
Example #3
    def __init__(self):

        if not Doc.has_extension("association_scores_mean"):
            Doc.set_extension("association_scores_mean", default=[])
        if not Doc.has_extension("association_scores_min"):
            Doc.set_extension("association_scores_min", default=[])
        if not Doc.has_extension("association_scores_max"):
            Doc.set_extension("association_scores_max", default=[])

        print('Created inside AssociationScores.')
        self.database_manager = DatabaseManager()
def install_classification_extensions(
    category: str,
    labels: list,
    doc_extention: str,
):
    prop_getter, label_getter = make_classification_getter(
        category, labels, doc_extention)
    if not Doc.has_extension(f"{category}_prop"):
        Doc.set_extension(f"{category}_prop", getter=prop_getter)
    if not Doc.has_extension(category):
        Doc.set_extension(category, getter=label_getter)
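A hedged usage sketch of the helper above; the "sentiment" category, its labels, and the underlying extension name are made up for illustration, and it assumes make_classification_getter and a pipeline that populates the underlying extension are available:

# Hypothetical call; argument values are illustrative only.
install_classification_extensions(
    category="sentiment",
    labels=["positive", "negative"],
    doc_extention="sentiment_scores",
)
# Afterwards doc._.sentiment_prop and doc._.sentiment resolve lazily from
# whatever the pipeline stored under doc._.sentiment_scores.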
def test_json_to_doc_underscore(doc):
    if not Doc.has_extension("json_test1"):
        Doc.set_extension("json_test1", default=False)
    if not Doc.has_extension("json_test2"):
        Doc.set_extension("json_test2", default=False)

    doc._.json_test1 = "hello world"
    doc._.json_test2 = [1, 2, 3]
    json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
    new_doc = Doc(doc.vocab).from_json(json_doc, validate=True)
    assert all([new_doc.has_extension(f"json_test{i}") for i in range(1, 3)])
    assert new_doc._.json_test1 == "hello world"
    assert new_doc._.json_test2 == [1, 2, 3]
Example #6
    def __call__(self, doc):
        #put the row on the matrix
        feature_row = []

        # Some feature extraction classes might not have been added to the pipeline.
        # So we should only collect the features which have been extracted.
        if Doc.has_extension('features_lv'):
            feature_row += doc._.features_lv
        if Doc.has_extension('features_ls'):
            feature_row += doc._.features_ls
        if Doc.has_extension('features_la'):
            feature_row += doc._.features_la
        if Doc.has_extension('features_ca'):
            feature_row += doc._.features_ca
        if Doc.has_extension('features_ld'):
            feature_row += doc._.features_ld
        if Doc.has_extension('association_scores_mean'):
            feature_row += doc._.association_scores_mean
        if Doc.has_extension('association_scores_min'):
            feature_row += doc._.association_scores_min
        if Doc.has_extension('association_scores_max'):
            feature_row += doc._.association_scores_max

        # todo: collect features of new feature extractors
        # if Doc.has_extension('features_XX'):
        #     feature_row += doc._.features_XX

        self.feature_matrix.append(feature_row)

        # it's useful to have access to the feature vector
        # at document level
        doc._.features = feature_row
        return doc
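The assignment to doc._.features above only works if that extension exists; a minimal sketch of the registration this component presumably performs elsewhere (e.g. in its __init__), assuming a simple list default:

from spacy.tokens import Doc

if not Doc.has_extension("features"):
    Doc.set_extension("features", default=[])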
    def test_read(self):
        if Doc.has_extension("concepts"):
            Doc.remove_extension("concepts")
        ereader = EhostDocReader(
            nlp=English(),
            schema_file='data/ehost_test_corpus/config/projectschema.xml')
        doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
        self.eval(doc)

    def test_read_doc_name(self):
        ereader = EhostDocReader(
            nlp=English(),
            schema_file='data/ehost_test_corpus/config/projectschema.xml')

        doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
        assert (doc._.doc_name == 'doc1.txt')
        ereader.doc_name_depth = 1
        doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
        assert (doc._.doc_name == r'corpus/doc1.txt')
        ereader = EhostDocReader(
            nlp=English(),
            schema_file='data/ehost_test_corpus/config/projectschema.xml',
            doc_name_depth=2)
        doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
        assert (doc._.doc_name == r'ehost_test_corpus/corpus/doc1.txt')
Example #8
def test_component_initialize():
    data_path = Path(__file__).parent / "data"
    # With from_disk
    nlp = spacy.blank("en")
    s2v = nlp.add_pipe("sense2vec")
    if Doc.has_extension("s2v_phrases"):
        s2v.first_run = False  # don't set up extensions again
    s2v.from_disk(data_path)
    doc = Doc(nlp.vocab, words=["beekeepers"], pos=["NOUN"])
    s2v(doc)
    assert doc[0]._.s2v_key == "beekeepers|NOUN"
    most_similar = [item for item, score in doc[0]._.s2v_most_similar(2)]
    assert most_similar[0] == ("honey bees", "NOUN")
    assert most_similar[1] == ("Beekeepers", "NOUN")

    # With initialize
    nlp = spacy.blank("en")
    s2v = nlp.add_pipe("sense2vec")
    s2v.first_run = False  # don't set up extensions again
    init_cfg = {"sense2vec": {"data_path": str(data_path)}}
    nlp.config["initialize"]["components"] = init_cfg
    nlp.initialize()
    doc = Doc(nlp.vocab, words=["beekeepers"], pos=["NOUN"])
    s2v(doc)
    assert doc[0]._.s2v_key == "beekeepers|NOUN"
    most_similar = [item for item, score in doc[0]._.s2v_most_similar(2)]
    assert most_similar[0] == ("honey bees", "NOUN")
    assert most_similar[1] == ("Beekeepers", "NOUN")
Example #9
    def __call__(self, doc: Doc) -> Doc:
        mentions = []
        if self.resolve_abbreviations and Doc.has_extension("abbreviations"):

            for ent in doc.ents:
                # TODO: This is possibly sub-optimal - we might
                # prefer to look up both the long and short forms.
                if ent._.long_form is not None:
                    mentions.append(ent._.long_form)
                else:
                    mentions.append(ent)
        else:
            mentions = doc.ents

        mention_strings = [x.text for x in mentions]
        batch_candidates = self.candidate_generator(mention_strings, self.k)

        for mention, candidates in zip(doc.ents, batch_candidates):
            predicted = []
            for cand in candidates:
                score = max(cand.similarities)
                if (self.filter_for_definitions
                        and self.umls.cui_to_entity[cand.concept_id].definition
                        is None and score < self.no_definition_threshold):
                    continue
                if score > self.threshold:
                    predicted.append((cand.concept_id, score))
            sorted_predicted = sorted(predicted,
                                      reverse=True,
                                      key=lambda x: x[1])
            mention._.umls_ents = sorted_predicted[:self.max_entities_per_mention]

        return doc
Example #10
    def __call__(self, doc: Doc):
        """
        Collect all labels related to patterns matching tokens.
        Each token has a `labels` extension in which to store them.
        The supplied `Doc` has a `labelings` extension in which
        all labeled spans are collected.
        If the doc has abbreviations ("abbrs"), they contribute to the labeled spans.

        Parameters
        ----------
        doc: Doc
            The doc to label over.

        Returns
        -------
        Doc
            The doc after labeling.
        """
        for key, start, end in self._matcher(doc):
            label = doc.vocab.strings[key]
            span = Span(doc, start, end, label)
            for token in span:
                if label in token._.labels:
                    continue
                token._.labels.append(label)
            doc._.labelings.append(span)
        _sort_labelings(doc)
        if doc.has_extension("abbrs"):
            _merge_abbrs_labelings(doc)
        if self._only_longest:
            _fix_overlabelings(doc)
        return doc
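The labeler above appends to token._.labels and doc._.labelings; a hedged sketch of the registrations it assumes, following the default=[] convention used elsewhere in these examples (note that a shared mutable default is a known spaCy gotcha, so a component would typically reset these per document):

from spacy.tokens import Doc, Token

if not Token.has_extension("labels"):
    Token.set_extension("labels", default=[])
if not Doc.has_extension("labelings"):
    Doc.set_extension("labelings", default=[])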
    def _analyze_health_text(self, doc: Doc):
        """
        Getter method. Makes the API call and aggregates the response.
        """

        assert doc.has_extension(STAGE.HEALTH_ANALYZER)
        if not self._endpoint:
            return {}

        headers = {}  # FIXME authorization / API key. Right now this goes to a preview deployment
        # FIXME change to new Azure Web API
        url = f"{self._endpoint}/text/analytics/v3.2-preview.1/entities/health"
        # TODO language
        language = "en"
        try:
            documents = self._split_into_documents(str(doc.text), language)
            response = requests.post(url, headers=headers, json=documents)
            if response.ok:
                docs = response.json()["documents"]
                result = self._collect_entities(docs)
                return result
            else:
                raise Exception(response.reason)

        except Exception as e:
            raise Exception(e)
Example #12
    def __init__(self, hunspell_object=None):
        self.stopwords_list = [
            'the',
            'a',
            'an',
            'are',
            'on',
            'to',
            'at',
            'every',
            'this'
        ]

        if hunspell_object is None:
            hunspell_object = get_hunspell_default()

        self.hobj = hunspell_object

        self.nlp = spacy.load('en_core_web_lg')  # we need another spaCy model as we want to tag the document with
        # corrected spelling mistakes
        # Todo: maybe always do that at the beginning and just save the
        #       spelling mistakes

        if not Doc.has_extension("lstFilteredDepParseCorpus"):
            Doc.set_extension("lstFilteredDepParseCorpus", default=[])
 def test_dir_reader2(self):
     if Doc.has_extension("concepts"):
         Doc.remove_extension("concepts")
     dir_reader = EhostDirReader(
         nlp=English(),
         support_overlap=True,
         recursive=True,
         schema_file='data/ehost_test_corpus/config/projectschema.xml')
     docs = dir_reader.read(txt_dir='data/ehost_test_corpus/')
     assert (len(docs) == 2)
     for doc in docs:
         assert (len(doc._.concepts) == 7)
         assert ('Doc_Level_Purulence_Assessment' in doc._.concepts)
         assert (str(
             doc._.concepts['Doc_Level_Purulence_Assessment'][0]) == 'CHIEF'
                 )
         assert ('Purulent' in doc._.concepts)
         assert (str(doc._.concepts['Purulent'][0]) == 'Abdominal pain')
         assert ('Non-Purulent' in doc._.concepts)
         assert (str(doc._.concepts['Non-Purulent'][0]) == 'PRESENT')
         assert ('Incision_and_Drainage' in doc._.concepts)
         assert (str(
             doc._.concepts['Incision_and_Drainage'][0]) == 'patient')
         assert ('PreAnnotated' in doc._.concepts)
         assert (str(doc._.concepts['PreAnnotated'][0]) == '71-year-old')
         assert ('Nonspecific_SSTI' in doc._.concepts)
         assert (str(
             doc._.concepts['Nonspecific_SSTI'][0]) == 'X. The patient')
         assert ('Exclusions' in doc._.concepts)
         assert (str(doc._.concepts['Exclusions'][0]) == 'presented')
Example #14
def set_doc_extensions():
    """
    Set :mod:`textacy.extract` custom property and method doc extensions
    on the global :class:`spacy.tokens.Doc`.
    """
    for name, kwargs in get_doc_extensions().items():
        if not Doc.has_extension(name):
            Doc.set_extension(name, **kwargs)
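A self-contained sketch of the same pattern: keep a registry of extension kwargs and apply it in a loop, guarded by has_extension. The names here mirror the snippet above but the extensions are hypothetical, not textacy's actual registry:

from spacy.tokens import Doc

def get_doc_extensions():
    return {
        "meta": {"default": None},                     # settable metadata slot
        "n_tokens": {"getter": lambda doc: len(doc)},  # computed lazily
    }

def set_doc_extensions():
    for name, kwargs in get_doc_extensions().items():
        if not Doc.has_extension(name):
            Doc.set_extension(name, **kwargs)

set_doc_extensions()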
Example #15
    def __init__(self):
        """Initialise the pipeline component.
        """
        if not Doc.has_extension("flesch_kincaid_grade_level"):
            Doc.set_extension("flesch_kincaid_grade_level",
                              getter=self.fk_grade)

        if not Doc.has_extension("flesch_kincaid_reading_ease"):
            Doc.set_extension("flesch_kincaid_reading_ease",
                              getter=self.fk_ease)

        if not Doc.has_extension("dale_chall"):
            Doc.set_extension("dale_chall", getter=self.dale_chall)

        if not Doc.has_extension("smog"):
            Doc.set_extension("smog", getter=self.smog)

        if not Doc.has_extension("coleman_liau_index"):
            Doc.set_extension("coleman_liau_index", getter=self.coleman_liau)

        if not Doc.has_extension("automated_readability_index"):
            Doc.set_extension("automated_readability_index", getter=self.ari)

        if not Doc.has_extension("forcast"):
            Doc.set_extension("forcast", getter=self.forcast)
Example #16
    def __init__(self, hunspell_objects=[]):
        if not hunspell_objects:
            hunspell_objects = [get_hunspell('en_US'), get_hunspell('en_GB')]

        self.hunspell_objects = hunspell_objects
        self.token_pattern = re.compile("^[A-Za-z]+$")

        if not Doc.has_extension("spell_errors"):
            Doc.set_extension("spell_errors", default=[])
Example #17
    def __init__(self, nlp: Language, use_pos: bool):
        """Initialise components"""
        self.use_pos = use_pos

        if not Doc.has_extension("pos_proportions"):
            Doc.set_extension("pos_proportions", getter=self.pos_proportions)

        if not Span.has_extension("pos_proportions"):
            Span.set_extension("pos_proportions", getter=self.pos_proportions)
 def test_parse_to_dicts(self):
     if Doc.has_extension("concepts"):
         Doc.remove_extension("concepts")
     ereader = EhostDocReader(nlp=English())
     spans, classes, attributes, relations = ereader.parse_to_dicts(
         'data/ehost_test_corpus/saved/doc1.txt.knowtator.xml')
     assert (len(spans) == 7)
     assert (len(classes) == 7)
     assert (len(attributes) == 6)
    def __call__(self, doc: Doc):
        if not doc.has_extension(STAGE.HEALTH_ANALYZER) and self._endpoint:
            doc.set_extension(STAGE.HEALTH_ANALYZER,
                              getter=self._analyze_health_text)
        if not self._endpoint:
            log.warning(
                "No endpoint for Azure Text Analytics for health, pls configure env vars ('AZ_TA_FOR_HEALTH_ENDPOINT' etc..)"
            )

        return doc
Example #20
    def _summarize(self, doc: Doc):
        assert doc.has_extension(STAGE.SUMMARIZER)

        summary_modes = {
            "gensim": self._createSummaryWithGensim,
        }
        selected_mode = "gensim"  # for now, the only one that produces ok results
        summary_sentences = summary_modes.get(
            selected_mode, self._createSummaryWithGensim)(doc)

        return summary_sentences[0:self.num_sentences]
 def test_parse_to_dicts(self):
     if Doc.has_extension("concepts"):
         Doc.remove_extension("concepts")
     breader = BratDocReader(nlp=English())
     spans, classes, attributes, relations = breader.parse_to_dicts(
         Path('data/brat_test_corpus/000-introduction.ann').read_text())
     assert (len(spans) == 12)
     assert (len(classes) == 17)
     assert (len(attributes) == 6)
     assert (len(relations) == 5)
 def test_set_attributes(self):
     if Doc.has_extension("concepts"):
         Doc.remove_extension("concepts")
     BratDocReader(nlp=English(),
                   schema_file='data/brat_test_corpus/annotation.conf')
     nlp = English()
     doc = nlp('test status attribute')
     span = doc[1:2]
     assert (hasattr(span._, 'Negation'))
     assert (hasattr(span._, 'Confidence'))
 def test_set_attributes(self):
     if Doc.has_extension("concepts"):
         Doc.remove_extension("concepts")
     EhostDocReader(
         nlp=English(),
         schema_file='data/ehost_test_corpus/config/projectschema.xml')
     nlp = English()
     doc = nlp('test status attribute')
     assert (hasattr(doc[1:2]._, 'status'))
     assert (doc[1:2]._.status == 'present')
Example #24
 def __init__(self, nlp: Language):
     """Initialise components"""
     if not Token.has_extension("dependency_distance"):
         Token.set_extension("dependency_distance",
                             getter=self.token_dependency)
     if not Span.has_extension("dependency_distance"):
         Span.set_extension("dependency_distance",
                            getter=self.span_dependency)
     if not Doc.has_extension("dependency_distance"):
         Doc.set_extension("dependency_distance",
                           getter=self.doc_dependency)
 def test_dir_reader(self):
     if Doc.has_extension("concepts"):
         Doc.remove_extension("concepts")
     dir_reader = EhostDirReader(
         nlp=English(),
         recursive=True,
         schema_file='data/ehost_test_corpus/config/projectschema.xml')
     docs = dir_reader.read(txt_dir='data/ehost_test_corpus/')
     assert (len(docs) == 2)
     for doc in docs:
         self.eval(doc)
Example #26
def test_extensions(nlp, read):
    nlp.add_pipe(read, last=True)
    doc = nlp("I contain four words.")
    assert Doc.has_extension("flesch_kincaid_grade_level")
    assert Doc.has_extension("flesch_kincaid_reading_ease")
    assert Doc.has_extension("dale_chall")
    assert Doc.has_extension("smog")
    assert Doc.has_extension("coleman_liau_index")
    assert Doc.has_extension("automated_readability_index")
    assert Doc.has_extension("forcast")
    def _calculate_score(self, doc: Doc):
        """
        Getter method. Calculates the score.
        """

        assert doc.has_extension(STAGE.CREDIBILITY_SCORE)
        try:
            result = self.get_features()
            return result

        except Exception as e:
            raise Exception(e)
Example #28
 def __init__(self, lang="en"):
     super().__init__()
     self.package_check(lang)
     self.load_macros(lang)
     self.load_patterns(lang)
     if not Doc.has_extension('arguments'):
         Doc.set_extension('arguments', getter=ArgumentTexts(self))
     else:
         default, method, getter, setter = Doc.get_extension('arguments')
         assert isinstance(getter, ArgumentTexts), \
             "Expected 'arguments' extension to be of type ArgumentTexts " \
             "but found {}. Namespace clash?".format(type(Doc.get_extension('arguments')))
Example #29
    def __call__(self, doc: Doc):
        if not doc.has_extension(STAGE.READABILITY):
            doc.set_extension(STAGE.READABILITY,
                              getter=self._calculate_readability)

        if doc.has_extension(STAGE.SUMMARIZER):
            # If the summarizer ran, we also calculate scores for the summary (not just fulltext)
            # spacy_readability needs a "Doc" object
            summary_sents = [str(s) for s in doc._.summarizer]

            summary_text = "\n".join([str(s) for s in summary_sents])

            # FIXME Sentencizer is needed by spacy_readability, but this here does not seem to work.
            # SMOG scores currently DON'T work !
            self.summary_doc = self.nlp.make_doc(summary_text)
            self.summary_doc = self.nlp.create_pipe("sentencizer")(
                self.summary_doc)

            # FIXME we could use the correct sentence boundaries to mark token.is_sent_start instead ?

        return doc
    def __init__(self, nlp: Language):
        """Initialize the pipeline component. The shared nlp instance is used to initialize the matcher.
    
    Args:
      nlp (spacy.Language): language environment
    """
        lang = nlp.meta["lang"]
        self.ext_name = "noun_phrases"
        self.rule_module_name = 'phrase_detective.{}.noun_phrases'.format(lang)
        self.nlp = nlp

        if not Doc.has_extension(self.ext_name):
            Doc.set_extension(self.ext_name, default=[])