def test_token_indices():
    text = ": nation on"
    sentence = Sentence(text)
    assert text == sentence.to_original_text()

    text = ": nation on"
    sentence = Sentence(text, use_tokenizer=SegtokTokenizer())
    assert text == sentence.to_original_text()

    text = "I love Berlin."
    sentence = Sentence(text)
    assert text == sentence.to_original_text()

    text = (
        'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " '
        "in einer Weise aufgetreten , die alles andere als überzeugend "
        'war " .'
    )
    sentence = Sentence(text)
    assert text == sentence.to_original_text()

    text = (
        'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " '
        "in einer Weise aufgetreten , die alles andere als überzeugend "
        'war " .'
    )
    sentence = Sentence(text, use_tokenizer=SegtokTokenizer())
    assert text == sentence.to_original_text()

def test_html_rendering():
    text = (
        "Boris Johnson has been elected new Conservative leader in a ballot of party members and will become the "
        "next UK prime minister. &"
    )
    sent = Sentence()
    sent.get_spans = MagicMock()
    sent.get_spans.return_value = [
        mock_ner_span("PER", 0, 13),
        mock_ner_span("MISC", 35, 47),
        mock_ner_span("LOC", 109, 111),
    ]
    sent.to_original_text = MagicMock()
    sent.to_original_text.return_value = text

    settings = {"colors": {"LOC": "yellow"}, "labels": {"LOC": "location"}}
    actual = Visualizer.render_ner_html([sent], settings=settings)

    expected_res = HTML_PAGE.format(
        text=TAGGED_ENTITY.format(color="#F7FF53", entity="Boris Johnson", label="PER")
        + " has been elected new "
        + TAGGED_ENTITY.format(color="#4647EB", entity="Conservative", label="MISC")
        + " leader in a ballot of party members and will become the next "
        + TAGGED_ENTITY.format(color="yellow", entity="UK", label="location")
        + " prime minister. &"
    )

    assert expected_res == actual

def test_html_rendering():
    text = (
        "Boris Johnson has been elected new Conservative leader in a ballot of party members and will become the "
        "next UK prime minister. &"
    )
    sent = Sentence()
    sent.get_spans = MagicMock()
    sent.get_spans.return_value = [
        mock_ner_span(text, "PER", 0, 13),
        mock_ner_span(text, "MISC", 35, 47),
        mock_ner_span(text, "LOC", 109, 111),
    ]
    sent.to_original_text = MagicMock()
    sent.to_original_text.return_value = text

    colors = {
        "PER": "#F7FF53",
        "ORG": "#E8902E",
        "LOC": "yellow",
        "MISC": "#4647EB",
        "O": "#ddd",
    }
    actual = render_ner_html([sent], colors=colors)

    expected_res = HTML_PAGE.format(
        text=PARAGRAPH.format(
            sentence=TAGGED_ENTITY.format(color="#F7FF53", entity="Boris Johnson", label="PER")
            + " has been elected new "
            + TAGGED_ENTITY.format(color="#4647EB", entity="Conservative", label="MISC")
            + " leader in a ballot of party members and will become the next "
            + TAGGED_ENTITY.format(color="yellow", entity="UK", label="LOC")
            + " prime minister. &"
        ),
        title="Flair",
    )

    assert expected_res == actual

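# A minimal end-to-end sketch of the rendering path that the two mocked tests
# above exercise, assuming flair's real entry points SequenceTagger.load("ner")
# and flair.visual.ner_html.render_ner_html; the mocks stand in for exactly
# this tagging step.
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.visual.ner_html import render_ner_html

tagger = SequenceTagger.load("ner")
sentence = Sentence("Boris Johnson will become the next UK prime minister.")
tagger.predict(sentence)

# render_ner_html returns a complete HTML page with the predicted spans highlighted.
html = render_ner_html([sentence], colors={"PER": "#F7FF53", "LOC": "yellow"})
with open("ner.html", "w") as f:
    f.write(html)
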
def test_token_indices():
    text = ': nation on'
    sentence = Sentence(text)
    assert text == sentence.to_original_text()

    text = ': nation on'
    sentence = Sentence(text, use_tokenizer=True)
    assert text == sentence.to_original_text()

    text = 'I love Berlin.'
    sentence = Sentence(text)
    assert text == sentence.to_original_text()

    text = 'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " in einer Weise aufgetreten , die alles andere als überzeugend war " .'
    sentence = Sentence(text)
    assert text == sentence.to_original_text()

    text = 'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " in einer Weise aufgetreten , die alles andere als überzeugend war " .'
    sentence = Sentence(text, use_tokenizer=True)
    assert text == sentence.to_original_text()

def split_to_spans(s: Sentence):
    """Split a sentence into (text, tag) pairs covering the whole original text."""
    orig = s.to_original_text()
    last_idx = 0
    spans = []
    tagged_ents = s.get_spans('ner')
    for ent in tagged_ents:
        if last_idx != ent.start_pos:
            # Untagged text between the previous entity and this one.
            spans.append((orig[last_idx:ent.start_pos], None))
        spans.append((orig[ent.start_pos:ent.end_pos], ent.tag))
        last_idx = ent.end_pos
    if last_idx < len(orig):
        # Trailing untagged text after the last entity.
        spans.append((orig[last_idx:], None))
    return spans

def split_to_spans(s: Sentence):
    """Split a sentence into (text, tag) pairs covering the whole original text."""
    orig = s.to_original_text()
    last_idx = 0
    spans = []
    tagged_ents = s.get_labels("ner")
    for ent in tagged_ents:
        if last_idx != ent.span.start_pos:
            # Untagged text between the previous entity and this one.
            spans.append((orig[last_idx:ent.span.start_pos], None))
        spans.append((ent.span.text, ent.value))
        assert ent.span.end_pos is not None
        last_idx = ent.span.end_pos
    if last_idx < len(orig):
        # Trailing untagged text after the last entity.
        spans.append((orig[last_idx:], None))
    return spans

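# A small sketch of consuming the (text, tag) pairs produced by split_to_spans,
# e.g. to build a bracket-annotated string; rebuild_text is a hypothetical
# helper, not part of the snippets above.
def rebuild_text(spans):
    parts = []
    for text, tag in spans:
        # Untagged gaps carry tag=None and are emitted verbatim.
        parts.append(f"[{text}|{tag}]" if tag else text)
    return "".join(parts)

# rebuild_text(split_to_spans(sentence)) might yield, for example:
# 'I love [Berlin|LOC].'
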
def _label(self, sentence: Sentence):
    """
    Add a complex label to the given sentence for every match.span()
    of every registered mapping. If a match span overlaps a token
    boundary, an exception is raised.
    """
    collection = RegexpTagger.TokenCollection(sentence)

    for label, pattern in self._regexp_mapping.items():
        # Run each registered regexp over the untokenized original text.
        for match in pattern.finditer(sentence.to_original_text()):
            span: Tuple[int, int] = match.span()
            try:
                # Map the character offsets back to a token span.
                token_span = collection.get_token_span(span)
            except ValueError:
                raise Exception(
                    f"The match span {span} for label '{label}' is overlapping with a token!"
                )
            sentence.add_complex_label(label, SpanLabel(token_span, label))

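# A hedged sketch of driving the tagger that owns _label above, assuming the
# flair RegexpTagger constructor accepts a (regexp, label) mapping and exposes
# predict(); the exact signature is an assumption based on the flair docs.
from flair.data import Sentence
from flair.models import RegexpTagger

tagger = RegexpTagger(mapping=(r'(["\'])(?:(?=(\\?))\2.)*?\1', "quoted"))
sentence = Sentence('He said "really nice to meet you" .')
tagger.predict(sentence)  # calls _label internally for each sentence
# A match that does not align with token boundaries raises the exception above.
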
def get_reason_for_appearance(organisation: Span, sentence: Sentence):
    """
    Extract the reason for the appearance of an 'ORG' NER tag in a sentence.
    """
    # Find where the organisation ends in the sentence.
    org_end = organisation.end_pos
    frame_tags = sentence.get_spans("frame")

    # Extract frame and past-tense POS tags after the organisation occurrence.
    pos_tags = list(
        filter(lambda span: "VBD" in span.tag, sentence.get_spans("pos")))
    frame_tags_after_org = list(
        filter(lambda span: span.start_pos > org_end, frame_tags))
    pos_tags_after_org = list(
        filter(lambda span: span.start_pos > org_end, pos_tags))

    # If neither frame nor POS tags are usable, there is no reason to extract.
    if not frame_tags_after_org and not pos_tags_after_org:
        return None
    # Prefer frame tags; fall back to POS tags.
    first_after_org = (frame_tags_after_org[0]
                       if frame_tags_after_org else pos_tags_after_org[0])

    original = sentence.to_original_text()
    # The reason is everything from the first tag after the organisation onwards.
    reason = original[first_after_org.start_pos:]
    return reason

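# A hypothetical driver for get_reason_for_appearance, assuming the sentence
# has already been annotated by 'ner', 'pos' and 'frame' taggers; the helper
# name reasons_for_orgs is an assumption, not taken from the snippet above.
def reasons_for_orgs(sentence: Sentence):
    reasons = {}
    for span in sentence.get_spans("ner"):
        if span.tag == "ORG":
            # May map to None when nothing usable follows the organisation.
            reasons[span.text] = get_reason_for_appearance(span, sentence)
    return reasons
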
def transform(self, X, y=None, **kwargs):
    """
    Transform the data according to what was learned in the fit method.

    :param X: features - DataFrame
    :param y: target vector - Series
    :param kwargs: free parameters - dictionary
    :return: embedding matrix for the 'text' column - numpy ndarray
    """
    X = X['text']
    dataset_hash = hash(str(X) + str(self.embedder.__dict__))
    # Reuse previously computed embeddings for an identical dataset.
    if dataset_hash in self.dataset_cache:
        return self.dataset_cache[dataset_hash]

    embeddings = []
    for first in trange(0, len(X), self.batch_size):
        subset = X[first:first + self.batch_size]
        sentences = []
        for element in subset:
            sentence = Sentence(element)
            # sentence.tokens = sentence.tokens[:200]
            sentences.append(sentence)
        self.embedder.embed(sentences)

        for sentence in sentences:
            # Cache per-sentence vectors keyed by the original text.
            key = sentence.to_original_text()
            if key in self.vector_cache:
                vector = self.vector_cache[key]
            else:
                vector = sentence.get_embedding().cpu().detach().numpy()
                self.vector_cache[key] = vector
            embeddings.append(vector)

    embedding_dataset = numpy.vstack(embeddings)
    self.dataset_cache[dataset_hash] = embedding_dataset
    return embedding_dataset

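# A hedged usage sketch: transform above follows the scikit-learn transformer
# protocol, so the embedder can sit inside a Pipeline. The wrapper class name
# FlairEmbeddingTransformer and its constructor arguments are assumptions.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ("embed", FlairEmbeddingTransformer(batch_size=32)),  # hypothetical wrapper
    ("clf", LogisticRegression()),
])
# pipeline.fit(train_df, train_df["label"])  # train_df needs a 'text' column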