Example #1
 def test_unclosed_brackets(self):
     sentences = [
         "The medial preoptic area (MPOA), and 2) did not decrease Fos-lir.",
         "However, olfactory desensitizations did decrease Fos-lir."
     ]
     self.assertSequenceEqual(sentences,
                              list(split_single(' '.join(sentences))))
Example #2
    def __init__(self,
                 text: str = None,
                 use_tokenizer: bool = False,
                 labels: List[str] = None):

        self.tokens: List[Token] = []

        self.labels: List[str] = labels

        self._embeddings: Dict = {}

        # optionally, directly instantiate with sentence tokens
        if text is not None:

            # tokenize the text first if option selected, otherwise assumes whitespace tokenized text
            if use_tokenizer:
                sentences = split_single(text)
                tokens = []
                for sentence in sentences:
                    contractions = split_contractions(word_tokenizer(sentence))
                    tokens.extend(contractions)

                text = ' '.join(tokens)

            # add each word in tokenized string as Token object to Sentence
            for word in text.split(' '):
                self.add_token(Token(word))
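
The tokenization branch above boils down to segtok's sentence splitter plus its word tokenizer. A minimal, self-contained sketch of that pipeline (the sample text is made up for illustration):

from segtok.segmenter import split_single
from segtok.tokenizer import split_contractions, word_tokenizer

text = "Mr. Smith isn't here. He left for Washington, D.C. yesterday."

tokens = []
for sentence in split_single(text):
    # word_tokenizer separates punctuation; split_contractions further splits forms like "isn't"
    tokens.extend(split_contractions(word_tokenizer(sentence)))

print(tokens)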
Example #3
 def test_long_bracket_abbervation(self):
     sentences = [
         "This is expected, on the basis of (Olmsted, M. C., C. F. Anderson, "
         "and M. T. Record, Jr. 1989. Proc. Natl. Acad. Sci. USA. 100:100), "
         "to decrease sharply."
     ]
     self.assertSequenceEqual(sentences, list(split_single(" ".join(sentences))))
Example #4
def wikipedia_api(word):
    try:
        wikipediaPage = wikipedia.page(word)
    except wikipedia.exceptions.DisambiguationError as e:
        if len(e.options) == 0:
            return []
        else:
            wikipediaPage = wikipedia.page(e.options[0])

    sentence_list = list(segmenter.split_single(wikipediaPage.summary))

    topic = wikipediaPage.title
    shortDescription = sentence_list[0]
    description = "".join(sentence_list[0:3])
    entity = ""
    imgUrl = wikipediaPage.images.pop()

    return [{
        'topic': topic,
        'shortDescription': shortDescription,
        'description': description,
        'entity': entity,
        'imgUrl': imgUrl,
        'source': 'Wikipedia'
    }]
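
A minimal usage sketch for the helper above; the query term is only an example, and network access plus the wikipedia package are assumed:

results = wikipedia_api("Alan Turing")
if results:
    entry = results[0]
    print(entry['topic'])             # page title
    print(entry['shortDescription'])  # first sentence of the summary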
Example #5
 def test_names(self):
     sentences = [
         "Written by A. McArthur, K. Elvin, and D. Eden.",
         "This is Mr. A. Starr over there.",
         "B. Boyden is over there.",
     ]
     self.assertSequenceEqual(sentences, list(split_single(" ".join(sentences))))
Example #6
 def test_parenthesis(self):
     sentences = [
         "Nested ((Parenthesis. (With words right (inside))) (More stuff. "
         "Uff, this is it!))", "In the Big City."
     ]
     self.assertSequenceEqual(sentences,
                              list(split_single(' '.join(sentences))))
Example #7
 def test_names(self):
     sentences = [
         "Written by A. McArthur, K. Elvin, and D. Eden.",
         "This is Mr. A. Starr over there.", "B. Boyden is over there."
     ]
     self.assertSequenceEqual(sentences,
                              list(split_single(' '.join(sentences))))
Example #8
 def test_alpha_items(self):
     sentences = [
         "This is figure A, B, and C.", "This is table A and B.",
         "That is item A, B."
     ]
     self.assertSequenceEqual(sentences,
                              list(split_single(' '.join(sentences))))
Example #9
    def predict_paragraph(self, paragraph) -> dict:
        """Predict for paragraph.

        Args:
            paragraph (str): Input paragraph.

        Returns:
            spacy formatted dict with entities
        """
        # mismatch if there are empty sentences
        sentences = [
            Sentence(x, use_tokenizer=self.tokenizer) for x in split_single(paragraph)
        ]
        # move to separate function
        if self.max_length:
            sentences = self._split_long_sentences(sentences)

        self.model.predict(sentences)

        json_sentences = []
        for sent in sentences:
            spacy_format_ner = flair_to_spacy(sent)
            json_sentences.append(spacy_format_ner)

        ner = concat_json_sentences(json_sentences)
        return ner
Example #10
    def split_text(self, dataset, is_flair=False):
        """
        Splits text into sentences with optional spans (format is a requirement for GERBIL usage).
        This behavior is required for the default NER-tagger, which during experiments was experienced
        to achieve higher performance.

        :return: dictionary with sentences and optional given spans per sentence.
        """

        res = {}
        splits = [0]
        processed_sentences = []
        for doc in dataset:
            text, spans = dataset[doc]
            sentences = split_single(text)
            res[doc] = {}

            i = 0
            for sent in sentences:
                if len(sent.strip()) == 0:
                    continue
                # Match gt to sentence.
                pos_start = text.find(sent)
                pos_end = pos_start + len(sent)

                # ngram, start_pos, end_pos
                spans_sent = [[text[x[0]:x[0] + x[1]], x[0], x[0] + x[1]]
                              for x in spans if pos_start <= x[0] < pos_end]
                res[doc][i] = [sent, spans_sent]
                if len(spans) == 0:
                    processed_sentences.append(sent)
                i += 1
            splits.append(splits[-1] + i)
        return res, processed_sentences, splits
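
The loop above expects dataset to map a document key to a (text, spans) pair, with each span given as a (start, length) tuple into the text. A sketch of that input shape, assuming splitter is an instance of the containing class and using made-up values:

dataset = {
    "doc1": ("Barack Obama visited Berlin. He gave a speech.",
             [(0, 12), (21, 6)]),  # (start, length) pairs into the text
}
res, processed_sentences, splits = splitter.split_text(dataset)
# res["doc1"][0] -> [first sentence, the spans that start inside it]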
Example #11
 def test_continuations(self):
     sentences = [
         "colonic colonization inhibits development of inflammatory lesions.",
         "to investigate whether an inf. of the pancreas was the case...",
         "though we hate to use capital lett. that usually separate sentences.",
     ]
     self.assertSequenceEqual(sentences, list(split_single(" ".join(sentences))))
Example #12
 def test_species_names_tough(self):
     sentences = [
         "The level of the genus Allomonas gen. nov. with so "
         "far the only species A. enterica known."
     ]
     self.assertSequenceEqual(sentences,
                              list(split_single(' '.join(sentences))))
Example #13
    def split_text(self, dataset):
        """
        Splits text into sentences. This behavior is required for the default NER-tagger, which during experiments
        was experienced to perform more optimally in such a fashion.

        :return: dictionary with sentences and optional given spans per sentence.
        """

        res = {}
        splits = [0]
        processed_sentences = []
        for doc in dataset:
            text, spans = dataset[doc]
            sentences = split_single(text)
            res[doc] = {}

            i = 0
            for sent in sentences:
                if len(sent.strip()) == 0:
                    continue
                # Match gt to sentence.
                pos_start = text.find(sent)
                pos_end = pos_start + len(sent)

                # ngram, start_pos, end_pos
                spans_sent = [[text[x[0]:x[0] + x[1]], x[0], x[0] + x[1]]
                              for x in spans if pos_start <= x[0] < pos_end]
                res[doc][i] = [sent, spans_sent]
                if len(spans) == 0:
                    processed_sentences.append(
                        Sentence(sent, use_tokenizer=True))
                i += 1
            splits.append(splits[-1] + i)
        return res, processed_sentences, splits
Example #14
    def __next__(self):
        if self._curr_row is None:
            raise StopIteration()

        row = self._curr_row

        if len(row) != self._row_len:
            msg = 'found %d columns, but expected %d at line %s:\n%s'
            raise IOError(msg %
                          (len(row), self._row_len, self._line, str(row)))

        try:
            self._curr_row = next(self._row_gen)
            self._line += 1
        except StopIteration:
            self._curr_row = None

        data_or_text = lambda c: row[c] if c not in self.text_columns else []
        data = [data_or_text(col) for col in range(self._row_len)]

        for col in self.text_columns:
            for sentence in split_single(row[col]):
                sentence = self._decap(sentence)
                tokens = [self._lower(t) for t in word_tokenizer(sentence)]
                data[col].append(tokens)

        return data
Example #15
    def getSections(self):
        sectionsDict = {}
        content = self.wikiPage.content
        sections = re.findall('\n== (.*) ==\n', content)
        sections = [section for section in sections if section not in [
            "See also", "Bibliography", "Further reading", "References", "External links", "Notes", "Notes and references"]]
        for section in sections:
            start = content.index('== {0} =='.format(section))

            try:
                end = start + content[start:].index('\n== ')
            except ValueError:  # On last heading, no headings follow it
                end = -1

            # Remove all subheadings
            sectionContent = clean(
                re.sub('==* .* ==*', '', content[start:end]))
            self.lengths[section] = len(sectionContent)
            sentences = list(split_single(sectionContent))  # split the section into sentences
            for sentence in sentences:
                # Add the source to the source map
                self.sourceMap[sentence] = self.wikiPage.url
            sectionsDict[section] = sentences
        return sectionsDict
Example #16
def toSentences(infile, outfile):
    """
	simple function: just creates an html file to view a conllu file
	
	"""
    triple = re.compile(r"\s*\n\s*\n\s*\n\s*", re.M)
    double = re.compile(r"\s*\n\s*\n\s*", re.M)
    sentpunct = re.compile(r"([!?.;]+)\s*")

    text = open(infile).read()

    outstr = ""

    for sect in triple.split(text):

        # each section is expected to consist of exactly a header block and a body block
        if len(double.split(sect)) != 2:
            raise ValueError("unexpected section format")

        t = textCorrection(double.split(sect)[1])

        t = sentpunct.sub(r"\1 ", t)
        #print(t)
        t = "\n".join(split_single(t))

        outstr += t.strip() + "\n"

    open(outfile, "w").write(outstr)
Example #17
    def __next__(self):
        if self._curr_row is None:
            raise StopIteration()

        row = self._curr_row

        if len(row) != self._row_len:
            msg = 'found %d columns, but expected %d at line %s:\n%s'
            raise IOError(msg % (
                len(row), self._row_len, self._line, str(row)
            ))

        try:
            self._curr_row = next(self._row_gen)
            self._line += 1
        except StopIteration:
            self._curr_row = None

        data_or_text = lambda c: row[c] if c not in self.text_columns else []
        data = [data_or_text(col) for col in range(self._row_len)]

        for col in self.text_columns:
            for sentence in split_single(row[col]):
                sentence = self._decap(sentence)
                tokens = [self._lower(t) for t in word_tokenizer(sentence)]
                data[col].append(tokens)

        return data
Example #18
 def test_inner_names(self):
     sentences = [
         "Bla bla [Sim et al. (1981) Biochem. J. 193, 129-141].",
         "The adjusted (ml. min-1. 1.73 m-2) rate."
     ]
     self.assertSequenceEqual(sentences,
                              list(split_single(' '.join(sentences))))
Example #19
    def predict(self,
                sentences: Union[str, Sentence, List[Sentence], List[str]],
                display_html: bool = True,
                html_file: str = None,
                display_str: bool = False,
                **kwargs):
        if type(sentences) == Sentence:
            sentences = [sentences]
        elif type(sentences) == str:
            sentences = list(split_single(sentences))  # ensure a list so it can be indexed below

        if type(sentences[0]) == str:
            sentences = [Sentence(s, use_tokenizer=True) for s in sentences]

        self.model.predict(sentences)

        if display_html or html_file:
            html = render_ner_html(sentences, **kwargs)
            if display_html:
                display(HTML(html))
            if html_file:
                (self.path / html_file).write_text(html)
        if display_str:
            for sentence in sentences:
                print(sentence.to_tagged_string())
Example #20
 def test_species_names(self):
     sentences = [
         "Their presence was detected by transformation into S. lividans.",
         "Three subjects diagnosed as having something."
     ]
     self.assertSequenceEqual(sentences,
                              list(split_single(' '.join(sentences))))
Example #21
 def test_continuations(self):
     sentences = [
         "colonic colonization inhibits development of inflammatory lesions.",
         "to investigate whether an inf. of the pancreas was the case...",
         "though we hate to use capital lett. that usually separate sentences."
     ]
     self.assertSequenceEqual(sentences,
                              list(split_single(' '.join(sentences))))
Example #22
 def test_long_bracket_abbervation(self):
     sentences = [
         "This is expected, on the basis of (Olmsted, M. C., C. F. Anderson, "
         "and M. T. Record, Jr. 1989. Proc. Natl. Acad. Sci. USA. 100:100), "
         "to decrease sharply."
     ]
     self.assertSequenceEqual(sentences,
                              list(split_single(' '.join(sentences))))
Example #23
 def test_parenthesis_with_sentences(self):
     sentences = [
         "The segmenter segments on single lines or to consecutive lines.",
         "(If you want to extract sentences that cross newlines, remove those line-breaks.",
         "Segtok assumes your content has some minimal semantical meaning.)",
         "It gracefully handles this and similar issues.",
     ]
     self.assertSequenceEqual(sentences, list(split_single(" ".join(sentences))))
Example #24
 def test_parenthesis_with_sentences(self):
     sentences = [
         "The segmenter segments on single lines or to consecutive lines.",
         "(If you want to extract sentences that cross newlines, remove those line-breaks.",
         "Segtok assumes your content has some minimal semantical meaning.)",
         "It gracefully handles this and similar issues."
     ]
     self.assertSequenceEqual(sentences,
                              list(split_single(' '.join(sentences))))
Example #25
 def model_ner_SENT_spacy(paragraph, type_question):
     sentences = [nlp_spacy(sent) for sent in split_single(paragraph)]
     list_predictions_data = []
     for sentence in sentences:
         for entity in sentence.ents:
             if entity.label_ in interesting_entities(
                     type_question) and len(normalize_answer(entity.text)):
                 list_predictions_data.append(entity.text)
     return list_predictions_data
Example #26
 def __init__(self, text: str, configs, pipeline):
     self.text = text
     self.sentences = [
         Sentence(sent, use_tokenizer=segtok_tokenizer)
         for sent in split_single(text)
     ]
     self.configs = configs
     self.pipeline = pipeline
     self.results: List[Dict] = []
Example #27
def get_tokenized_sentences(snippets):
    """get_tokenized_sentences will first split the snippets in sentences"""
    sentences = []
    id2sentences = []
    for snippet in snippets:
        snippet_sent = list(split_single(snippet[1]))  # ensure a list so len() works below
        id2sentences.extend([snippet[0]] * len(snippet_sent))
        sentences.extend(snippet_sent)

    return [Sentence(sentence, use_tokenizer=True) for sentence in sentences], id2sentences
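
A usage sketch for the helper above, assuming each snippet is an (id, text) pair as the indexing implies:

snippets = [("snip-1", "First sentence. Second sentence."),
            ("snip-2", "Another snippet.")]
sentences, id2sentences = get_tokenized_sentences(snippets)
# id2sentences[i] holds the snippet id that sentences[i] was taken from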
Example #28
    def run_tokenize(text: str) -> List[str]:
        words: List[str] = []

        sentences = split_single(text)
        for sentence in sentences:
            contractions = split_contractions(word_tokenizer(sentence))
            words.extend(contractions)

        words = list(filter(None, words))

        return words
Example #29
def split_sents(corpus):
    '''
    splits each document's text into multiple strings containing one sentence each
    corpus: list of document dicts, each with a 'text' field
    returns list of lists of Sentence objects
    '''
    corp = []
    for doc in corpus:
        sentences = [Sentence(sent, use_tokenizer = True) for sent in split_single(doc['text'])]
        corp.append(sentences)  
    return corp
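
A usage sketch for split_sents, assuming corpus is a list of dicts with a 'text' field as the code requires:

corpus = [{"text": "First document. It has two sentences."},
          {"text": "Second document."}]
tokenized = split_sents(corpus)
# tokenized[0] is a list of flair Sentence objects, one per sentence of the first document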
Example #30
def tokenize(text):
    """
    Inputs: text
    Outputs: tokens tokenized by segtok.tokenizer
    """
    tokens = []
    sentences = split_single(text)
    for sentence in sentences:
        contractions = split_contractions(word_tokenizer(sentence))
        tokens.extend(contractions)
    return tokens
Example #31
    def __init__(self,
                 text: str = None,
                 use_tokenizer: bool = False,
                 labels: Union[List[Label], List[str]] = None):

        super(Sentence, self).__init__()

        self.tokens: List[Token] = []

        self.labels: List[Label] = []
        if labels is not None: self.add_labels(labels)

        self._embeddings: Dict = {}

        # if text is passed, instantiate sentence with tokens (words)
        if text is not None:

            # tokenize the text first if option selected
            if use_tokenizer:

                # use segtok for tokenization
                tokens = []
                sentences = split_single(text)
                for sentence in sentences:
                    contractions = split_contractions(word_tokenizer(sentence))
                    tokens.extend(contractions)

                # determine offsets for whitespace_after field
                index = text.index
                running_offset = 0
                last_word_offset = -1
                last_token = None
                for word in tokens:
                    token = Token(word)
                    self.add_token(token)
                    try:
                        word_offset = index(word, running_offset)
                    except ValueError:  # str.index raises ValueError when the word is not found
                        word_offset = last_word_offset + 1
                    if word_offset - 1 == last_word_offset and last_token is not None:
                        last_token.whitespace_after = False
                    word_len = len(word)
                    running_offset = word_offset + word_len
                    last_word_offset = running_offset - 1
                    last_token = token

            # otherwise assumes whitespace tokenized text
            else:
                # add each word in tokenized string as Token object to Sentence
                for word in text.split(' '):
                    if word:
                        token = Token(word)
                        self.add_token(token)
Example #32
    def sentence_tokenize(self, text):
        """Get list of string sentences from input string.

        Args:
            text: raw input string
        Yields:
            str: non-whitespace, non-empty sentence strings
        """
        for sentence in split_single(to_unix_linebreaks(text)):
            clean_sentence = sentence.strip()
            if len(clean_sentence) > 0:
                yield clean_sentence
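
A usage sketch, assuming tok is an instance of the class this method belongs to:

for sent in tok.sentence_tokenize("First sentence.\r\nSecond one.\n\n  "):
    print(repr(sent))  # stripped, non-empty sentence strings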
Example #33
 def model_ner_SENT_flair(paragraph, type_question):
     sentences = [
         Sentence(sent, use_tokenizer=True)
         for sent in split_single(paragraph)
     ]
     tagger.predict(sentences)
     list_predictions_data = []
     for sentence in sentences:
         for entity in sentence.get_spans('ner'):
             if entity.tag in interesting_entities(type_question) and len(
                     normalize_answer(entity.text)):
                 list_predictions_data.append(entity.text)
     return list_predictions_data
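
The tagger used above is not defined in the snippet; assuming it is a standard pretrained Flair sequence tagger, it would be set up roughly as follows:

from flair.models import SequenceTagger

tagger = SequenceTagger.load('ner')  # pretrained English 4-class NER model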
Example #34
    def make_sentences(text: str, min_char: int = 3) -> list:
        """ Break apart text into a list of sentences """
        if len(text) > min_char:
            sentences: list = [
                sent for sent in split_single(text) if len(sent) > min_char
            ]
        else:
            sentences: list = []

        if not sentences:
            logger.warning("Default sentence was added")
            sentences: list = [SentimentAnalysisAPI.default_sentence]
        return sentences
Example #35
 def __init__(self, text=None, use_tokenizer=False, labels=None):
     super(Sentence, self).__init__()
     self.tokens = []
     self.labels = []
     if labels is not None:
         self.add_labels(labels)
     self._embeddings = {}
     if text is not None:
         if use_tokenizer:
             # use segtok to split the text into sentences and word-tokenize each one
             tokens = []
             sentences = split_single(text)
             for sentence in sentences:
                 contractions = split_contractions(word_tokenizer(sentence))
                 tokens.extend(contractions)
             # determine each token's start offset and whitespace_after flag
             index = text.index
             running_offset = 0
             last_word_offset = -1
             last_token = None
             for word in tokens:
                 try:
                     word_offset = index(word, running_offset)
                     start_position = word_offset
                 except ValueError:
                     word_offset = last_word_offset + 1
                     start_position = running_offset + 1 if running_offset > 0 else running_offset
                 token = Token(word, start_position=start_position)
                 self.add_token(token)
                 if word_offset - 1 == last_word_offset and last_token is not None:
                     last_token.whitespace_after = False
                 word_len = len(word)
                 running_offset = word_offset + word_len
                 last_word_offset = running_offset - 1
                 last_token = token
         else:
             # otherwise assume whitespace-tokenized text and split on single spaces
             word = ''
             for index, char in enumerate(text):
                 if char == ' ':
                     if len(word) > 0:
                         token = Token(word, start_position=index - len(word))
                         self.add_token(token)
                     word = ''
                 else:
                     word += char
             index += 1
             if len(word) > 0:
                 token = Token(word, start_position=index - len(word))
                 self.add_token(token)
Example #36
 def test_middle_name_initials(self):
     sentences = ["The administrative basis for Lester B. Pearson's foreign policy was developed later.",
                  "This model was introduced by Dr. Edgar F. Codd after initial criticisms."]
     self.assertSequenceEqual(sentences, list(split_single(' '.join(sentences))))
Example #37
 def test_linebreak(self):
     text = "This is a\nmultiline sentence."
     self.assertSequenceEqual(text.split("\n"), list(split_single(text)))
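
split_single treats every line break as a sentence boundary, which is what the test above asserts. segtok also provides split_multi, which lets a sentence span a single line break; a quick sketch of the contrast (the outputs in the comments are indicative, not quoted from the test suite):

from segtok.segmenter import split_single, split_multi

text = "This is a\nmultiline sentence."
print(list(split_single(text)))  # two segments, split at the line break
print(list(split_multi(text)))   # expected to keep the sentence in one piece across the single break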
Example #38
 def test_simple_case(self):
     self.assertEqual(["This is a test."], list(split_single("This is a test.")))
Example #39
 def test_unclosed_brackets(self):
     sentences = [
         "The medial preoptic area (MPOA), and 2) did not decrease Fos-lir.",
         "However, olfactory desensitizations did decrease Fos-lir.",
     ]
     self.assertSequenceEqual(sentences, list(split_single(" ".join(sentences))))
Example #40
 def test_species_names_tough(self):
     sentences = ["The level of the genus Allomonas gen. nov. with so " "far the only species A. enterica known."]
     self.assertSequenceEqual(sentences, list(split_single(" ".join(sentences))))
Example #41
 def test_parenthesis(self):
     sentences = [
         "Nested ((Parenthesis. (With words right (inside))) (More stuff. " "Uff, this is it!))",
         "In the Big City.",
     ]
     self.assertSequenceEqual(sentences, list(split_single(" ".join(sentences))))
Example #42
 def test_species_names(self):
     sentences = [
         "Their presence was detected by transformation into S. lividans.",
         "Three subjects diagnosed as having something.",
     ]
     self.assertSequenceEqual(sentences, list(split_single(" ".join(sentences))))
Example #43
 def test_inner_names(self):
     sentences = [
         "Bla bla [Sim et al. (1981) Biochem. J. 193, 129-141].",
         "The adjusted (ml. min-1. 1.73 m-2) rate.",
     ]
     self.assertSequenceEqual(sentences, list(split_single(" ".join(sentences))))
Example #44
 def test_regex(self):
     self.assertSequenceEqual(SENTENCES, list(split_single(TEXT)))
Example #45
 def test_author_list(self):
     sentences = ["R. S. Kauffman, R. Ahmed, and B. N. Fields show stuff in their paper."]
     self.assertSequenceEqual(sentences, list(split_single(" ".join(sentences))))
Example #46
 def test_alpha_items(self):
     sentences = ["This is figure A, B, and C.", "This is table A and B.", "That is item A, B."]
     self.assertSequenceEqual(sentences, list(split_single(" ".join(sentences))))
Example #47
 def test_european_dates(self):
     sentences = ["Der Unfall am 24. Dezember 2016.",
                  "Am 13. Jän. 2006 war es regnerisch.",
                  "Am 13. 1. 2006 war es regnerisch."]
     self.assertSequenceEqual(sentences, list(split_single(' '.join(sentences))))
Example #48
 def test_linebreak2(self):
     text = "Folding Beijing\nby Hao Jingfang"
     self.assertSequenceEqual(text.split('\n'), list(split_single(text)))