def test_unclosed_brackets(self):
    sentences = [
        "The medial preoptic area (MPOA), and 2) did not decrease Fos-lir.",
        "However, olfactory desensitizations did decrease Fos-lir."
    ]
    self.assertSequenceEqual(sentences, list(split_single(' '.join(sentences))))
def __init__(self, text: str = None, use_tokenizer: bool = False, labels: List[str] = None):
    self.tokens: List[Token] = []
    self.labels: List[str] = labels
    self._embeddings: Dict = {}

    # optionally, directly instantiate with sentence tokens
    if text is not None:

        # tokenize the text first if option selected, otherwise assumes whitespace tokenized text
        if use_tokenizer:
            sentences = split_single(text)
            tokens = []
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)
            text = ' '.join(tokens)

        # add each word in tokenized string as Token object to Sentence
        for word in text.split(' '):
            self.add_token(Token(word))
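# Usage sketch, not part of the original snippets: constructing a flair Sentence with
# use_tokenizer=True exercises the segtok pipeline shown above (split_single ->
# word_tokenizer -> split_contractions). The example text and the exact resulting
# tokens are assumptions about segtok's rules, so they are printed rather than asserted.
from flair.data import Sentence

sentence = Sentence("The grass is green. It's warm outside.", use_tokenizer=True)
print([token.text for token in sentence.tokens])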
def test_long_bracket_abbervation(self):
    sentences = [
        "This is expected, on the basis of (Olmsted, M. C., C. F. Anderson, "
        "and M. T. Record, Jr. 1989. Proc. Natl. Acad. Sci. USA. 100:100), "
        "to decrease sharply."
    ]
    self.assertSequenceEqual(sentences, list(split_single(" ".join(sentences))))
def wikipedia_api(word):
    try:
        wikipediaPage = wikipedia.page(word)
    except wikipedia.exceptions.DisambiguationError as e:
        if len(e.options) == 0:
            return []
        else:
            wikipediaPage = wikipedia.page(e.options[0])
    sentence_list = list(segmenter.split_single(wikipediaPage.summary))
    topic = wikipediaPage.title
    shortDescription = sentence_list[0]
    description = "".join(sentence_list[0:3])
    entity = ""
    imgUrl = wikipediaPage.images.pop()
    return [{
        'topic': topic,
        'shortDescription': shortDescription,
        'description': description,
        'entity': entity,
        'imgUrl': imgUrl,
        'source': 'Wikipedia'
    }]
def test_names(self):
    sentences = [
        "Written by A. McArthur, K. Elvin, and D. Eden.",
        "This is Mr. A. Starr over there.",
        "B. Boyden is over there.",
    ]
    self.assertSequenceEqual(sentences, list(split_single(" ".join(sentences))))
def test_parenthesis(self):
    sentences = [
        "Nested ((Parenthesis. (With words right (inside))) (More stuff. "
        "Uff, this is it!))",
        "In the Big City."
    ]
    self.assertSequenceEqual(sentences, list(split_single(' '.join(sentences))))
def test_alpha_items(self):
    sentences = [
        "This is figure A, B, and C.",
        "This is table A and B.",
        "That is item A, B."
    ]
    self.assertSequenceEqual(sentences, list(split_single(' '.join(sentences))))
def predict_paragraph(self, paragraph) -> dict:
    """Predict for paragraph.

    Args:
        paragraph (str): Input paragraph.

    Returns:
        spacy formatted dict with entities
    """
    # mismatch if there are empty sentences
    sentences = [
        Sentence(x, use_tokenizer=self.tokenizer) for x in split_single(paragraph)
    ]
    # move to separate function
    if self.max_length:
        sentences = self._split_long_sentences(sentences)
    self.model.predict(sentences)
    json_sentences = []
    for sent in sentences:
        spacy_format_ner = flair_to_spacy(sent)
        json_sentences.append(spacy_format_ner)
    ner = concat_json_sentences(json_sentences)
    return ner
def split_text(self, dataset, is_flair=False):
    """
    Splits text into sentences with optional spans (the format is a requirement for GERBIL usage).
    This behavior is required for the default NER-tagger, which during experiments
    was found to achieve higher performance on sentence-level input.

    :return: dictionary with sentences and optional given spans per sentence.
    """
    res = {}
    splits = [0]
    processed_sentences = []
    for doc in dataset:
        text, spans = dataset[doc]
        sentences = split_single(text)
        res[doc] = {}
        i = 0
        for sent in sentences:
            if len(sent.strip()) == 0:
                continue

            # Match gt to sentence.
            pos_start = text.find(sent)
            pos_end = pos_start + len(sent)

            # ngram, start_pos, end_pos
            spans_sent = [
                [text[x[0]:x[0] + x[1]], x[0], x[0] + x[1]]
                for x in spans
                if pos_start <= x[0] < pos_end
            ]
            res[doc][i] = [sent, spans_sent]
            if len(spans) == 0:
                processed_sentences.append(sent)
            i += 1
        splits.append(splits[-1] + i)

    return res, processed_sentences, splits
def test_continuations(self):
    sentences = [
        "colonic colonization inhibits development of inflammatory lesions.",
        "to investigate whether an inf. of the pancreas was the case...",
        "though we hate to use capital lett. that usually separate sentences.",
    ]
    self.assertSequenceEqual(sentences, list(split_single(" ".join(sentences))))
def test_species_names_tough(self):
    sentences = [
        "The level of the genus Allomonas gen. nov. with so "
        "far the only species A. enterica known."
    ]
    self.assertSequenceEqual(sentences, list(split_single(' '.join(sentences))))
def split_text(self, dataset):
    """
    Splits text into sentences. This behavior is required for the default NER-tagger,
    which during experiments was found to perform better on sentence-level input.

    :return: dictionary with sentences and optional given spans per sentence.
    """
    res = {}
    splits = [0]
    processed_sentences = []
    for doc in dataset:
        text, spans = dataset[doc]
        sentences = split_single(text)
        res[doc] = {}
        i = 0
        for sent in sentences:
            if len(sent.strip()) == 0:
                continue

            # Match gt to sentence.
            pos_start = text.find(sent)
            pos_end = pos_start + len(sent)

            # ngram, start_pos, end_pos
            spans_sent = [
                [text[x[0]:x[0] + x[1]], x[0], x[0] + x[1]]
                for x in spans
                if pos_start <= x[0] < pos_end
            ]
            res[doc][i] = [sent, spans_sent]
            if len(spans) == 0:
                processed_sentences.append(
                    Sentence(sent, use_tokenizer=True))
            i += 1
        splits.append(splits[-1] + i)

    return res, processed_sentences, splits
def __next__(self):
    if self._curr_row is None:
        raise StopIteration()

    row = self._curr_row
    if len(row) != self._row_len:
        msg = 'found %d columns, but expected %d at line %s:\n%s'
        raise IOError(msg % (len(row), self._row_len, self._line, str(row)))

    try:
        self._curr_row = next(self._row_gen)
        self._line += 1
    except StopIteration:
        self._curr_row = None

    data_or_text = lambda c: row[c] if c not in self.text_columns else []
    data = [data_or_text(col) for col in range(self._row_len)]

    for col in self.text_columns:
        for sentence in split_single(row[col]):
            sentence = self._decap(sentence)
            tokens = [self._lower(t) for t in word_tokenizer(sentence)]
            data[col].append(tokens)

    return data
def getSections(self):
    sectionsDict = {}
    content = self.wikiPage.content
    sections = re.findall('\n== (.*) ==\n', content)
    sections = [section for section in sections if section not in [
        "See also", "Bibliography", "Further reading", "References",
        "External links", "Notes", "Notes and references"]]
    for section in sections:
        start = content.index('== {0} =='.format(section))
        try:
            end = start + content[start:].index('\n== ')
        except ValueError:
            # On last heading, no headings follow it
            end = -1
        # Remove all subheadings
        sectionContent = clean(
            re.sub('==* .* ==*', '', content[start:end]))
        self.lengths[section] = len(sectionContent)
        # Split into sentences
        sentences = [sent for sent in split_single(sectionContent)]
        for sentence in sentences:
            # Add the source to the source map
            self.sourceMap[sentence] = self.wikiPage.url
        sectionsDict[section] = sentences
    return sectionsDict
def toSentences(infile, outfile):
    """
    simple function: splits the raw text in infile into sentences, one per line,
    and writes the result to outfile
    """
    triple = re.compile(r"\s*\n\s*\n\s*\n\s*", re.M)
    double = re.compile(r"\s*\n\s*\n\s*", re.M)
    sentpunct = re.compile(r"([!?.;]+)\s*")
    text = open(infile).read()
    outstr = ""
    for sect in triple.split(text):
        if len(double.split(sect)) != 2:
            # each section is expected to split into exactly a header and a body
            raise ValueError("unexpected section format: " + repr(sect[:80]))
        t = textCorrection(double.split(sect)[1])
        t = sentpunct.sub(r"\1 ", t)
        # print(t)
        t = "\n".join(split_single(t))
        outstr += t.strip() + "\n"
    open(outfile, "w").write(outstr)
def test_inner_names(self):
    sentences = [
        "Bla bla [Sim et al. (1981) Biochem. J. 193, 129-141].",
        "The adjusted (ml. min-1. 1.73 m-2) rate."
    ]
    self.assertSequenceEqual(sentences, list(split_single(' '.join(sentences))))
def predict(self, sentences: Union[str, Sentence, List[Sentence], List[str]],
            display_html: bool = True, html_file: str = None,
            display_str: bool = False, **kwargs):
    if type(sentences) == Sentence:
        sentences = [sentences]
    elif type(sentences) == str:
        sentences = split_single(sentences)
    if type(sentences[0]) == str:
        sentences = [Sentence(s, use_tokenizer=True) for s in sentences]

    self.model.predict(sentences)

    if display_html or html_file:
        html = render_ner_html(sentences, **kwargs)
        if display_html:
            display(HTML(html))
        if html_file:
            (self.path / html_file).write_text(html)
    if display_str:
        for sentence in sentences:
            print(sentence.to_tagged_string())
def test_species_names(self):
    sentences = [
        "Their presence was detected by transformation into S. lividans.",
        "Three subjects diagnosed as having something."
    ]
    self.assertSequenceEqual(sentences, list(split_single(' '.join(sentences))))
def test_parenthesis_with_sentences(self):
    sentences = [
        "The segmenter segments on single lines or to consecutive lines.",
        "(If you want to extract sentences that cross newlines, remove those line-breaks.",
        "Segtok assumes your content has some minimal semantical meaning.)",
        "It gracefully handles this and similar issues.",
    ]
    self.assertSequenceEqual(sentences, list(split_single(" ".join(sentences))))
def model_ner_SENT_spacy(paragraph, type_question):
    sentences = [nlp_spacy(sent) for sent in split_single(paragraph)]
    list_predictions_data = []
    for sentence in sentences:
        for entity in sentence.ents:
            if entity.label_ in interesting_entities(
                    type_question) and len(normalize_answer(entity.text)):
                list_predictions_data.append(entity.text)
    return list_predictions_data
def __init__(self, text: str, configs, pipeline):
    self.text = text
    self.sentences = [
        Sentence(sent, use_tokenizer=segtok_tokenizer)
        for sent in split_single(text)
    ]
    self.configs = configs
    self.pipeline = pipeline
    self.results: List[Dict] = []
def get_tokenized_sentences(snippets):
    """Split the snippets into sentences, then tokenize each sentence."""
    sentences = []
    id2sentences = []
    for snippet in snippets:
        snippet_sent = split_single(snippet[1])
        id2sentences.extend([snippet[0]] * len(snippet_sent))
        sentences.extend(snippet_sent)
    return [Sentence(sentence, use_tokenizer=True) for sentence in sentences], id2sentences
def run_tokenize(text: str) -> List[str]:
    words: List[str] = []

    sentences = split_single(text)
    for sentence in sentences:
        contractions = split_contractions(word_tokenizer(sentence))
        words.extend(contractions)

    words = list(filter(None, words))
    return words
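# Usage sketch, not part of the original snippets: calling run_tokenize above only needs
# the segtok imports the function already assumes (plus List from typing at definition
# time). How split_contractions separates clitics (e.g. "It's" -> "It", "'s") is an
# assumption about segtok's rules, so the output is printed rather than asserted.
from segtok.segmenter import split_single
from segtok.tokenizer import split_contractions, word_tokenizer

print(run_tokenize("It's a test. Dr. Smith wasn't there."))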
def split_sents(corpus):
    '''
    splits each document string into multiple strings containing one sentence each

    corpus: list of document dicts, each with a 'text' field

    returns list of lists of Sentence objects
    '''
    corp = []
    for doc in corpus:
        sentences = [Sentence(sent, use_tokenizer=True) for sent in split_single(doc['text'])]
        corp.append(sentences)
    return corp
def tokenize(text):
    """
    Inputs: text
    Outputs: tokens tokenized by segtok.tokenizer
    """
    tokens = []
    sentences = split_single(text)
    for sentence in sentences:
        contractions = split_contractions(word_tokenizer(sentence))
        tokens.extend(contractions)
    return tokens
def __init__(self, text: str = None, use_tokenizer: bool = False,
             labels: Union[List[Label], List[str]] = None):
    super(Sentence, self).__init__()

    self.tokens: List[Token] = []

    self.labels: List[Label] = []
    if labels is not None:
        self.add_labels(labels)

    self._embeddings: Dict = {}

    # if text is passed, instantiate sentence with tokens (words)
    if text is not None:

        # tokenize the text first if option selected
        if use_tokenizer:

            # use segtok for tokenization
            tokens = []
            sentences = split_single(text)
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)

            # determine offsets for whitespace_after field
            index = text.index
            running_offset = 0
            last_word_offset = -1
            last_token = None
            for word in tokens:
                token = Token(word)
                self.add_token(token)
                try:
                    word_offset = index(word, running_offset)
                except:
                    word_offset = last_word_offset + 1
                if word_offset - 1 == last_word_offset and last_token is not None:
                    last_token.whitespace_after = False
                word_len = len(word)
                running_offset = word_offset + word_len
                last_word_offset = running_offset - 1
                last_token = token

        # otherwise assumes whitespace tokenized text
        else:
            # add each word in tokenized string as Token object to Sentence
            for word in text.split(' '):
                if word:
                    token = Token(word)
                    self.add_token(token)
def sentence_tokenize(self, text):
    """Get string sentences from input string.

    Args:
        text: raw input string

    Yields:
        str: non-whitespace, non-empty sentence strings
    """
    for sentence in split_single(to_unix_linebreaks(text)):
        clean_sentence = sentence.strip()
        if len(clean_sentence) > 0:
            yield clean_sentence
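# Usage sketch, not part of the original snippets: the same pattern as sentence_tokenize
# above, written as a free function so it runs without the surrounding class. It assumes
# to_unix_linebreaks comes from segtok.segmenter and normalises \r\n line endings.
from segtok.segmenter import split_single, to_unix_linebreaks

def iter_sentences(text):
    # normalise linebreaks, split into sentences, and drop empty/whitespace-only results
    for sentence in split_single(to_unix_linebreaks(text)):
        cleaned = sentence.strip()
        if cleaned:
            yield cleaned

print(list(iter_sentences("First sentence.\r\nSecond sentence.  \r\n\r\n")))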
def model_ner_SENT_flair(paragraph, type_question):
    sentences = [
        Sentence(sent, use_tokenizer=True) for sent in split_single(paragraph)
    ]
    tagger.predict(sentences)
    list_predictions_data = []
    for sentence in sentences:
        for entity in sentence.get_spans('ner'):
            if entity.tag in interesting_entities(type_question) and len(
                    normalize_answer(entity.text)):
                list_predictions_data.append(entity.text)
    return list_predictions_data
def make_sentences(text: str, min_char: int = 3) -> list:
    """ Break apart text into a list of sentences """
    if len(text) > min_char:
        sentences: list = [
            sent for sent in split_single(text) if len(sent) > min_char
        ]
    else:
        sentences: list = []
    if not sentences:
        logger.warning("Default sentence was added")
        sentences: list = [SentimentAnalysisAPI.default_sentence]
    return sentences
def __init__(self, text=None, use_tokenizer=False, labels=None):
    super(Sentence, self).__init__()
    self.tokens = []
    self.labels = []
    if labels is not None:
        self.add_labels(labels)
    self._embeddings = {}
    if text is not None:
        if use_tokenizer:
            tokens = []
            sentences = split_single(text)
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)
            index = text.index
            running_offset = 0
            last_word_offset = -1
            last_token = None
            for word in tokens:
                try:
                    word_offset = index(word, running_offset)
                    start_position = word_offset
                except:
                    word_offset = last_word_offset + 1
                    start_position = running_offset + 1 if running_offset > 0 else running_offset
                token = Token(word, start_position=start_position)
                self.add_token(token)
                if word_offset - 1 == last_word_offset and last_token is not None:
                    last_token.whitespace_after = False
                word_len = len(word)
                running_offset = word_offset + word_len
                last_word_offset = running_offset - 1
                last_token = token
        else:
            word = ''
            for index, char in enumerate(text):
                if char == ' ':
                    if len(word) > 0:
                        token = Token(word, start_position=index - len(word))
                        self.add_token(token)
                    word = ''
                else:
                    word += char
            index += 1
            if len(word) > 0:
                token = Token(word, start_position=index - len(word))
                self.add_token(token)
def test_middle_name_initials(self):
    sentences = [
        "The administrative basis for Lester B. Pearson's foreign policy was developed later.",
        "This model was introduced by Dr. Edgar F. Codd after initial criticisms."
    ]
    self.assertSequenceEqual(sentences, list(split_single(' '.join(sentences))))
def test_linebreak(self):
    text = "This is a\nmultiline sentence."
    self.assertSequenceEqual(text.split("\n"), list(split_single(text)))
def test_simple_case(self):
    self.assertEqual(["This is a test."], list(split_single("This is a test.")))
def test_regex(self):
    self.assertSequenceEqual(SENTENCES, list(split_single(TEXT)))
def test_author_list(self):
    sentences = ["R. S. Kauffman, R. Ahmed, and B. N. Fields show stuff in their paper."]
    self.assertSequenceEqual(sentences, list(split_single(" ".join(sentences))))
def test_european_dates(self):
    sentences = [
        "Der Unfall am 24. Dezember 2016.",
        "Am 13. Jän. 2006 war es regnerisch.",
        "Am 13. 1. 2006 war es regnerisch."
    ]
    self.assertSequenceEqual(sentences, list(split_single(' '.join(sentences))))
def test_linebreak2(self):
    text = "Folding Beijing\nby Hao Jingfang"
    self.assertSequenceEqual(text.split('\n'), list(split_single(text)))