def __call__(self, doc):
    """Annotate *doc* with named entities fetched from a remote annotation
    service.

    The document text is URL-encoded, appended to ``self.url``, and the XML
    response's ``AnnotationSet`` is scanned for ``Token`` annotations whose
    ``NER-BIO1`` feature carries a BIO-style tag.  Matching spans are
    appended to ``doc.ents``.

    Parameters
    ----------
    doc : spaCy ``Doc``
        The document to annotate (modified in place).

    Returns
    -------
    The same ``doc``, with ``doc.ents`` extended.
    """
    text = urllib.parse.quote_plus(doc.text)
    result = urllib.request.urlopen(self.url + text).read()
    annotationset = ET.fromstring(result).find('AnnotationSet')
    # Element.getchildren() was removed in Python 3.9; iterating the
    # element directly is the supported equivalent.
    for annotation in annotationset:
        if annotation.get('Type') != 'Token':
            continue
        word_index = int(annotation.get('StartNode'))
        token = self.get_token_by_idx(word_index, doc)
        ner_tag = self.get_value_from_annotation(annotation, 'NER-BIO1')
        ner_tag = ner_tag.strip()
        if ner_tag != 'O':  # this is a capital o letter
            ner_tag_parts = ner_tag.split('-')
            label = doc.vocab.strings[ner_tag_parts[1]]
            # NOTE(review): '1' appears to denote a single-token entity in
            # this BIO1 variant — confirm against the service's tag set.
            if ner_tag_parts[0] == '1':
                doc.ents = (list(doc.ents)
                            + [Span(doc, token.i, token.i + 1, label=label)])
            elif ner_tag_parts[0] == 'B':
                # Remember where a multi-token entity starts; it is
                # completed when the matching 'E' tag arrives.
                self.ner_begin_idx = token.i
            elif ner_tag_parts[0] == 'E':
                if self.ner_begin_idx is None:
                    # Fixed: the two adjacent string literals previously
                    # concatenated without a space ("ner,when").
                    raise Exception('Found end of ner, '
                                    'when looking for beginning')
                else:
                    span = Span(doc, self.ner_begin_idx, token.i + 1,
                                label=label)
                    span.merge()
                    doc.ents = (list(doc.ents) + [span])
                    self.ner_begin_idx = None
    return doc
def __call__(self, doc):
    """Run every pipeline component over *doc* and register the entities
    they extract.

    Each component in ``self.pipe_`` receives the shared ``entities`` list
    and may append ``(start, end, label)`` triples to it.  Those triples are
    then turned into ``Span`` objects on ``doc.ents``.  When
    ``self.merge_entity_spans`` is truthy, every entity span is afterwards
    merged into a single token.
    """
    entities = []
    for component in self.pipe_:
        doc = component(doc, entities)
    # Convert the collected (start, end, label) triples into entity spans.
    for start, end, label in entities:
        new_span = Span(doc, start, end, label=label)
        doc.ents = list(doc.ents) + [new_span]
    # Optionally collapse each entity span into one token (defaults to off).
    if self.merge_entity_spans:
        for entity_span in doc.ents:
            entity_span.merge()
    return doc
def init_doc(self, words, spaces, spans):
    """Create a spaCy ``Doc`` from pre-tokenised text and attach GROBID
    entity spans to it.

    Parameters
    ----------
    words : list of str
        Token texts for the new document.
    spaces : list of bool
        Whether each token is followed by whitespace.
    spans : list of dict
        GROBID entities with ``tokenStart``, ``tokenEnd``, ``type`` and
        ``id`` keys, plus optional ``boundingBoxes``, ``formattedText``,
        ``links`` and ``linkable`` keys.

    Returns
    -------
    The constructed ``Doc`` with entities set, each entity span merged into
    a single token, tagging/parsing applied, and non-overlapping noun
    phrases merged as well.
    """
    ## Creating a new document with the text
    doc = Doc(self.nlp.vocab, words=words, spaces=spaces)
    ## Loading GROBID entities in the spaCy document
    entities = []
    for s in spans:
        span = Span(doc=doc, start=s['tokenStart'], end=s['tokenEnd'],
                    label=s['type'])
        span._.set('id', str(s['id']))
        if 'boundingBoxes' in s:
            span._.set('bounding_boxes', s['boundingBoxes'])
        if 'formattedText' in s:
            span._.set('formattedText', s['formattedText'])
        if 'links' in s:
            span._.set('links', s['links'])
        if 'linkable' in s:
            span._.set('linkable', s['linkable'])
        entities.append(span)
    doc.ents = entities
    for span in entities:
        # Iterate over all spans and merge them into one token. This is done
        # after setting the entities – otherwise, it would cause mismatched
        # indices!
        span.merge()
        # Copy the span-level extension attributes down to the merged token.
        for token in span:
            token._.id = span._.id
            token._.bounding_boxes = span._.bounding_boxes
            token._.formattedText = span._.formattedText
            token._.links = span._.links
            token._.linkable = span._.linkable
    self.nlp.tagger(doc)
    self.nlp.parser(doc)
    ## Merge entities and noun phrases, but only when they are not
    ## overlapping, to avoid losing the entity type information
    phrases_ents = self.extract_phrases_ents(doc)
    for span in phrases_ents:
        overlapping = False
        for ent in entities:
            # A phrase and an entity overlap when either endpoint of the
            # entity falls inside the phrase, or one fully contains the
            # other.  Fixed: the second clause was written as
            # `span.start <= ent.end >= span.end`, which chains to
            # `ent.end >= span.end` and misreported many non-overlapping
            # pairs as overlapping (so their phrases were never merged).
            if ((span.start <= ent.start <= span.end)
                    or (span.start <= ent.end <= span.end)
                    or (span.start >= ent.start and span.end <= ent.end)
                    or (span.start <= ent.start and span.end >= ent.end)):
                overlapping = True
                break
        # Entity and noun phrase do not overlap: safe to merge the phrase.
        if not overlapping:
            span.merge()
    return doc