def __init__(self, nlp: Language):
    """Initialise components"""
    extensions = [
        "_n_sentences",
        "_n_tokens",
        "_n_syllables",
        "token_length",
        "sentence_length",
        "syllables",
        "counts",
    ]
    ext_funs = [
        n_sentences,
        n_tokens,
        n_syllables,
        self.token_length,
        self.sentence_length,
        self.syllables,
        self.counts,
    ]
    for ext, fun in zip(extensions, ext_funs):
        # Sentence-level statistics are only meaningful on a full Doc,
        # so they are not registered on Span.
        if ext not in ["_n_sentences", "sentence_length", "syllables"]:
            if not Span.has_extension(ext):
                Span.set_extension(ext, getter=fun)
        if not Doc.has_extension(ext):
            Doc.set_extension(ext, getter=fun)
    if not Doc.has_extension("_filtered_tokens"):
        Doc.set_extension("_filtered_tokens", default=[])
    if not Span.has_extension("_filtered_tokens"):
        Span.set_extension("_filtered_tokens", getter=filtered_tokens)

def set_extensions():
    if not Doc.has_extension('coref_chains'):
        Doc.set_extension('coref_chains', default=None)
    if not Token.has_extension('coref_chains'):
        Token.set_extension('coref_chains', default=None)
    if not Doc.has_extension('holmes_document_info'):
        Doc.set_extension('holmes_document_info', default=None)
    if not Token.has_extension('holmes'):
        Token.set_extension('holmes', default=None)

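# A note on the `has_extension` guard used throughout these components: spaCy
# raises a ValueError if the same extension name is registered twice, so the
# check makes registration idempotent. A minimal, self-contained sketch (the
# name "my_attr" is hypothetical, not from the snippets here):
from spacy.tokens import Doc

if not Doc.has_extension("my_attr"):
    Doc.set_extension("my_attr", default=None)

# spaCy also offers force=True to overwrite an existing registration instead:
Doc.set_extension("my_attr", default=None, force=True)
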
def __init__(self):
    if not Doc.has_extension("association_scores_mean"):
        Doc.set_extension("association_scores_mean", default=[])
    if not Doc.has_extension("association_scores_min"):
        Doc.set_extension("association_scores_min", default=[])
    if not Doc.has_extension("association_scores_max"):
        Doc.set_extension("association_scores_max", default=[])
    print('Created inside AssociationScores.')
    self.database_manager = DatabaseManager()

def install_classification_extensions(
    category: str,
    labels: list,
    doc_extension: str,
):
    prop_getter, label_getter = make_classification_getter(
        category, labels, doc_extension)
    if not Doc.has_extension(f"{category}_prop"):
        Doc.set_extension(f"{category}_prop", getter=prop_getter)
    if not Doc.has_extension(category):
        Doc.set_extension(category, getter=label_getter)

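# A hedged sketch of what the getter-based classification pattern above
# amounts to in plain spaCy, assuming the scores live in doc.cats (the names
# "sentiment", "positive" and "negative" are illustrative only):
import spacy
from spacy.tokens import Doc

def _label_getter(doc):
    return max(doc.cats, key=doc.cats.get) if doc.cats else None

if not Doc.has_extension("sentiment"):
    Doc.set_extension("sentiment", getter=_label_getter)

nlp = spacy.blank("en")
example = nlp("great stuff")
example.cats = {"positive": 0.9, "negative": 0.1}
assert example._.sentiment == "positive"
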
def test_json_to_doc_underscore(doc):
    if not Doc.has_extension("json_test1"):
        Doc.set_extension("json_test1", default=False)
    if not Doc.has_extension("json_test2"):
        Doc.set_extension("json_test2", default=False)
    doc._.json_test1 = "hello world"
    doc._.json_test2 = [1, 2, 3]
    json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
    new_doc = Doc(doc.vocab).from_json(json_doc, validate=True)
    assert all([new_doc.has_extension(f"json_test{i}") for i in range(1, 3)])
    assert new_doc._.json_test1 == "hello world"
    assert new_doc._.json_test2 == [1, 2, 3]

def __call__(self, doc):
    # Put the row on the matrix.
    feature_row = []
    # Some feature extraction classes might not have been added to the pipeline,
    # so we should only collect the features which have been extracted.
    if Doc.has_extension('features_lv'):
        feature_row += doc._.features_lv
    if Doc.has_extension('features_ls'):
        feature_row += doc._.features_ls
    if Doc.has_extension('features_la'):
        feature_row += doc._.features_la
    if Doc.has_extension('features_ca'):
        feature_row += doc._.features_ca
    if Doc.has_extension('features_ld'):
        feature_row += doc._.features_ld
    if Doc.has_extension('association_scores_mean'):
        feature_row += doc._.association_scores_mean
    if Doc.has_extension('association_scores_min'):
        feature_row += doc._.association_scores_min
    if Doc.has_extension('association_scores_max'):
        feature_row += doc._.association_scores_max
    # TODO: collect features of new feature extractors
    # if Doc.has_extension('features_XX'):
    #     feature_row += doc._.features_XX
    self.feature_matrix.append(feature_row)
    # It's useful to have access to the feature vector at document level.
    doc._.features = feature_row
    return doc

def test_read(self):
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")
    ereader = EhostDocReader(
        nlp=English(),
        schema_file='data/ehost_test_corpus/config/projectschema.xml')
    doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
    self.eval(doc)

def test_read_doc_name(self):
    ereader = EhostDocReader(
        nlp=English(),
        schema_file='data/ehost_test_corpus/config/projectschema.xml')
    doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
    assert (doc._.doc_name == 'doc1.txt')
    ereader.doc_name_depth = 1
    doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
    assert (doc._.doc_name == r'corpus/doc1.txt')
    ereader = EhostDocReader(
        nlp=English(),
        schema_file='data/ehost_test_corpus/config/projectschema.xml',
        doc_name_depth=2)
    doc = ereader.read('data/ehost_test_corpus/corpus/doc1.txt')
    assert (doc._.doc_name == r'ehost_test_corpus/corpus/doc1.txt')

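# The remove-then-reregister pattern at the top of these tests keeps them
# independent: extensions are registered on the Doc class globally, so one
# test's registration would leak into the next unless it is cleared. A
# minimal, self-contained illustration (the name "concepts" mirrors the
# tests; force=True is used here only so the sketch runs in isolation):
from spacy.tokens import Doc

Doc.set_extension("concepts", default={}, force=True)
assert Doc.has_extension("concepts")
Doc.remove_extension("concepts")
assert not Doc.has_extension("concepts")
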
def test_component_initialize():
    data_path = Path(__file__).parent / "data"
    # With from_disk
    nlp = spacy.blank("en")
    s2v = nlp.add_pipe("sense2vec")
    if Doc.has_extension("s2v_phrases"):
        s2v.first_run = False  # don't set up extensions again
    s2v.from_disk(data_path)
    doc = Doc(nlp.vocab, words=["beekeepers"], pos=["NOUN"])
    s2v(doc)
    assert doc[0]._.s2v_key == "beekeepers|NOUN"
    most_similar = [item for item, score in doc[0]._.s2v_most_similar(2)]
    assert most_similar[0] == ("honey bees", "NOUN")
    assert most_similar[1] == ("Beekeepers", "NOUN")
    # With initialize
    nlp = spacy.blank("en")
    s2v = nlp.add_pipe("sense2vec")
    s2v.first_run = False  # don't set up extensions again
    init_cfg = {"sense2vec": {"data_path": str(data_path)}}
    nlp.config["initialize"]["components"] = init_cfg
    nlp.initialize()
    doc = Doc(nlp.vocab, words=["beekeepers"], pos=["NOUN"])
    s2v(doc)
    assert doc[0]._.s2v_key == "beekeepers|NOUN"
    most_similar = [item for item, score in doc[0]._.s2v_most_similar(2)]
    assert most_similar[0] == ("honey bees", "NOUN")
    assert most_similar[1] == ("Beekeepers", "NOUN")

def __call__(self, doc: Doc) -> Doc:
    mentions = []
    if self.resolve_abbreviations and Doc.has_extension("abbreviations"):
        for ent in doc.ents:
            # TODO: This is possibly sub-optimal - we might
            # prefer to look up both the long and short forms.
            if ent._.long_form is not None:
                mentions.append(ent._.long_form)
            else:
                mentions.append(ent)
    else:
        mentions = doc.ents
    mention_strings = [x.text for x in mentions]
    batch_candidates = self.candidate_generator(mention_strings, self.k)

    for mention, candidates in zip(doc.ents, batch_candidates):
        predicted = []
        for cand in candidates:
            score = max(cand.similarities)
            if (self.filter_for_definitions
                    and self.umls.cui_to_entity[cand.concept_id].definition is None
                    and score < self.no_definition_threshold):
                continue
            if score > self.threshold:
                predicted.append((cand.concept_id, score))
        sorted_predicted = sorted(predicted, reverse=True, key=lambda x: x[1])
        mention._.umls_ents = sorted_predicted[:self.max_entities_per_mention]
    return doc

def __call__(self, doc: Doc): """ Collect all labels related to patterns matching tokens. Each token have a `labels` extension in which to store them. The supplied `Doc` have a `labelings` extension in which all labeled spans are collected. If the doc has abbrs, they contribute to label spans. Parameters ---------- doc: Doc The doc to label over. Returns ------- Doc The doc after labeling. """ for key, start, end in self._matcher(doc): label = doc.vocab.strings[key] span = Span(doc, start, end, label) for token in span: if label in token._.labels: continue token._.labels.append(label) doc._.labelings.append(span) _sort_labelings(doc) if doc.has_extension("abbrs"): _merge_abbrs_labelings(doc) if self._only_longest: _fix_overlabelings(doc) return doc
def _analyze_health_text(self, doc: Doc):
    """
    Getter method. Makes the API call and aggregates the response.
    """
    assert doc.has_extension(STAGE.HEALTH_ANALYZER)
    if not self._endpoint:
        return {}
    headers = {}  # FIXME authorization / API key. Right now this goes to a preview deployment
    # FIXME change to new Azure Web API
    url = f"{self._endpoint}/text/analytics/v3.2-preview.1/entities/health"
    # TODO language
    language = "en"
    try:
        documents = self._split_into_documents(str(doc.text), language)
        response = requests.post(url, headers=headers, json=documents)
        if response.ok:
            docs = response.json()["documents"]
            result = self._collect_entities(docs)
            return result
        else:
            raise Exception(response.reason)
    except Exception as e:
        raise Exception(e)

def __init__(self, hunspell_object=None):
    self.stopwords_list = [
        'the', 'a', 'an', 'are', 'on', 'to', 'at', 'every', 'this'
    ]
    if hunspell_object is None:
        hunspell_object = get_hunspell_default()
    self.hobj = hunspell_object
    self.nlp = spacy.load('en_core_web_lg')
    # We need another spaCy model as we want to tag the document with
    # corrected spelling mistakes.
    # TODO: maybe always do that at the beginning and just save the
    # spelling mistakes.
    if not Doc.has_extension("lstFilteredDepParseCorpus"):
        Doc.set_extension("lstFilteredDepParseCorpus", default=[])

def test_dir_reader2(self):
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")
    dir_reader = EhostDirReader(
        nlp=English(), support_overlap=True, recursive=True,
        schema_file='data/ehost_test_corpus/config/projectschema.xml')
    docs = dir_reader.read(txt_dir='data/ehost_test_corpus/')
    assert (len(docs) == 2)
    for doc in docs:
        assert (len(doc._.concepts) == 7)
        assert ('Doc_Level_Purulence_Assessment' in doc._.concepts)
        assert (str(doc._.concepts['Doc_Level_Purulence_Assessment'][0]) == 'CHIEF')
        assert ('Purulent' in doc._.concepts)
        assert (str(doc._.concepts['Purulent'][0]) == 'Abdominal pain')
        assert ('Non-Purulent' in doc._.concepts)
        assert (str(doc._.concepts['Non-Purulent'][0]) == 'PRESENT')
        assert ('Incision_and_Drainage' in doc._.concepts)
        assert (str(doc._.concepts['Incision_and_Drainage'][0]) == 'patient')
        assert ('PreAnnotated' in doc._.concepts)
        assert (str(doc._.concepts['PreAnnotated'][0]) == '71-year-old')
        assert ('Nonspecific_SSTI' in doc._.concepts)
        assert (str(doc._.concepts['Nonspecific_SSTI'][0]) == 'X. The patient')
        assert ('Exclusions' in doc._.concepts)
        assert (str(doc._.concepts['Exclusions'][0]) == 'presented')

def set_doc_extensions():
    """
    Set :mod:`textacy.extract` custom property and method doc extensions
    on the global :class:`spacy.tokens.Doc`.
    """
    for name, kwargs in get_doc_extensions().items():
        if not Doc.has_extension(name):
            Doc.set_extension(name, **kwargs)

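# The **kwargs expansion above lets a single registry drive both
# default-valued and getter-based extensions. A sketch of what such a
# registry might look like (contents are illustrative, not textacy's):
from spacy.tokens import Doc

_doc_extensions = {
    "preview": {"getter": lambda doc: doc.text[:50]},
    "meta": {"default": None},
}

for name, kwargs in _doc_extensions.items():
    if not Doc.has_extension(name):
        Doc.set_extension(name, **kwargs)
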
def __init__(self):
    """Initialise the pipeline component."""
    if not Doc.has_extension("flesch_kincaid_grade_level"):
        Doc.set_extension("flesch_kincaid_grade_level", getter=self.fk_grade)
    if not Doc.has_extension("flesch_kincaid_reading_ease"):
        Doc.set_extension("flesch_kincaid_reading_ease", getter=self.fk_ease)
    if not Doc.has_extension("dale_chall"):
        Doc.set_extension("dale_chall", getter=self.dale_chall)
    if not Doc.has_extension("smog"):
        Doc.set_extension("smog", getter=self.smog)
    if not Doc.has_extension("coleman_liau_index"):
        Doc.set_extension("coleman_liau_index", getter=self.coleman_liau)
    if not Doc.has_extension("automated_readability_index"):
        Doc.set_extension("automated_readability_index", getter=self.ari)
    if not Doc.has_extension("forcast"):
        Doc.set_extension("forcast", getter=self.forcast)

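# Getter-based extensions like the readability scores above are computed
# lazily, on attribute access, not when the pipeline runs. A toy,
# self-contained illustration of the same mechanism (the metric below is
# made up, not one of the scores registered above):
import spacy
from spacy.tokens import Doc

def _avg_token_length(doc):
    return sum(len(t) for t in doc) / max(len(doc), 1)

if not Doc.has_extension("avg_token_length"):
    Doc.set_extension("avg_token_length", getter=_avg_token_length)

nlp = spacy.blank("en")
doc = nlp("Getters run on access.")
print(doc._.avg_token_length)  # computed here, not during nlp()
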
def __init__(self, hunspell_objects=None):
    # Avoid a mutable default argument; fall back to US and GB English.
    if not hunspell_objects:
        hunspell_objects = [get_hunspell('en_US'), get_hunspell('en_GB')]
    self.hunspell_objects = hunspell_objects
    self.token_pattern = re.compile("^[A-Za-z]+$")
    if not Doc.has_extension("spell_errors"):
        Doc.set_extension("spell_errors", default=[])

def __init__(self, nlp: Language, use_pos: bool):
    """Initialise components"""
    self.use_pos = use_pos
    if not Doc.has_extension("pos_proportions"):
        Doc.set_extension("pos_proportions", getter=self.pos_proportions)
    if not Span.has_extension("pos_proportions"):
        Span.set_extension("pos_proportions", getter=self.pos_proportions)

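# Registering one getter on both Doc and Span, as above, works because both
# behave as sequences of tokens. A hedged toy example (the "n_alpha"
# extension is illustrative, not part of the component above):
import spacy
from spacy.tokens import Doc, Span

def _n_alpha(doclike):
    return sum(token.is_alpha for token in doclike)

for cls in (Doc, Span):
    if not cls.has_extension("n_alpha"):
        cls.set_extension("n_alpha", getter=_n_alpha)

nlp = spacy.blank("en")
doc = nlp("Same getter, two levels.")
print(doc._.n_alpha, doc[0:2]._.n_alpha)
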
def test_parse_to_dicts(self):
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")
    ereader = EhostDocReader(nlp=English())
    spans, classes, attributes, relations = ereader.parse_to_dicts(
        'data/ehost_test_corpus/saved/doc1.txt.knowtator.xml')
    assert (len(spans) == 7)
    assert (len(classes) == 7)
    assert (len(attributes) == 6)

def __call__(self, doc: Doc):
    if not doc.has_extension(STAGE.HEALTH_ANALYZER) and self._endpoint:
        doc.set_extension(STAGE.HEALTH_ANALYZER,
                          getter=self._analyze_health_text)
    if not self._endpoint:
        log.warning(
            "No endpoint for Azure Text Analytics for health; please "
            "configure env vars ('AZ_TA_FOR_HEALTH_ENDPOINT' etc.)"
        )
    return doc

def _summarize(self, doc: Doc):
    assert doc.has_extension(STAGE.SUMMARIZER)
    summary_modes = {
        "gensim": self._createSummaryWithGensim,
    }
    selected_mode = "gensim"  # for now, the only one that produces ok results
    # Fall back to the gensim summarizer; the dict.get default must be the
    # callable itself, not the string "gensim".
    summary_sentences = summary_modes.get(
        selected_mode, self._createSummaryWithGensim)(doc)
    return summary_sentences[0:self.num_sentences]

def test_parse_to_dicts(self):
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")
    breader = BratDocReader(nlp=English())
    spans, classes, attributes, relations = breader.parse_to_dicts(
        Path('data/brat_test_corpus/000-introduction.ann').read_text())
    assert (len(spans) == 12)
    assert (len(classes) == 17)
    assert (len(attributes) == 6)
    assert (len(relations) == 5)

def test_set_attributes(self):
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")
    BratDocReader(nlp=English(),
                  schema_file='data/brat_test_corpus/annotation.conf')
    nlp = English()
    doc = nlp('test status attribute')
    span = doc[1:2]
    assert (hasattr(span._, 'Negation'))
    assert (hasattr(span._, 'Confidence'))

def test_set_attributes(self):
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")
    EhostDocReader(
        nlp=English(),
        schema_file='data/ehost_test_corpus/config/projectschema.xml')
    nlp = English()
    doc = nlp('test status attribute')
    assert (hasattr(doc[1:2]._, 'status'))
    assert (doc[1:2]._.status == 'present')

def __init__(self, nlp: Language):
    """Initialise components"""
    if not Token.has_extension("dependency_distance"):
        Token.set_extension("dependency_distance", getter=self.token_dependency)
    if not Span.has_extension("dependency_distance"):
        Span.set_extension("dependency_distance", getter=self.span_dependency)
    if not Doc.has_extension("dependency_distance"):
        Doc.set_extension("dependency_distance", getter=self.doc_dependency)

def test_dir_reader(self):
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")
    dir_reader = EhostDirReader(
        nlp=English(), recursive=True,
        schema_file='data/ehost_test_corpus/config/projectschema.xml')
    docs = dir_reader.read(txt_dir='data/ehost_test_corpus/')
    assert (len(docs) == 2)
    for doc in docs:
        self.eval(doc)

def test_extensions(nlp, read):
    nlp.add_pipe(read, last=True)
    doc = nlp("I contain four words.")
    assert Doc.has_extension("flesch_kincaid_grade_level")
    assert Doc.has_extension("flesch_kincaid_reading_ease")
    assert Doc.has_extension("dale_chall")
    assert Doc.has_extension("smog")
    assert Doc.has_extension("coleman_liau_index")
    assert Doc.has_extension("automated_readability_index")
    assert Doc.has_extension("forcast")

def _calculate_score(self, doc: Doc):
    """
    Getter method. Calculates the score.
    """
    assert doc.has_extension(STAGE.CREDIBILITY_SCORE)
    try:
        result = self.get_features()
        return result
    except Exception as e:
        raise Exception(e)

def __init__(self, lang="en"): super().__init__() self.package_check(lang) self.load_macros(lang) self.load_patterns(lang) if not Doc.has_extension('arguments'): Doc.set_extension('arguments', getter=ArgumentTexts(self)) else: default, method, getter, setter = Doc.get_extension('arguments') assert isinstance(getter, ArgumentTexts), \ "Expected 'arguments' extension to be of type ArgumentTexts " \ "but found {}. Namespace clash?".format(type(Doc.get_extension('arguments')))
def __call__(self, doc: Doc):
    if not doc.has_extension(STAGE.READABILITY):
        doc.set_extension(STAGE.READABILITY,
                          getter=self._calculate_readability)
    if doc.has_extension(STAGE.SUMMARIZER):
        # If the summarizer ran, we also calculate scores for the summary
        # (not just fulltext). spacy_readability needs a "Doc" object.
        summary_sents = [str(s) for s in doc._.summarizer]
        summary_text = "\n".join([str(s) for s in summary_sents])
        # FIXME Sentencizer is needed by spacy_readability, but this here
        # does not seem to work. SMOG scores currently DON'T work!
        self.summary_doc = self.nlp.make_doc(summary_text)
        self.summary_doc = self.nlp.create_pipe("sentencizer")(self.summary_doc)
        # FIXME we could use the correct sentence boundaries to mark
        # token.is_sent_start instead?
    return doc

def __init__(self, nlp: Language):
    """Initialize the pipeline component.

    The shared nlp instance is used to initialize the matcher.

    Args:
        nlp (spacy.Language): language environment
    """
    lang = nlp.meta["lang"]
    self.ext_name = "noun_phrases"
    self.rule_module_name = 'phrase_detective.{}.noun_phrases'.format(lang)
    self.nlp = nlp
    if not Doc.has_extension(self.ext_name):
        Doc.set_extension(self.ext_name, default=[])

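# A caveat worth noting for default=[] registrations like the one above:
# spaCy stores a single default object, shared by every Doc that has not
# assigned its own value, so mutating it in place leaks across documents.
# Components therefore typically assign a fresh list per doc in __call__.
# A minimal demonstration (the extension name is hypothetical):
import spacy
from spacy.tokens import Doc

Doc.set_extension("shared_list", default=[], force=True)
nlp = spacy.blank("en")
a, b = nlp("one"), nlp("two")
a._.shared_list.append("leak")
assert b._.shared_list == ["leak"]  # both docs see the same list object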