Example #1
class NERTokenizer:
    def __init__(self):

        self._word_tokenizer = Tokenizer(split_camel_case=True,
                                         token_classes=False,
                                         extra_info=False)

        self._sentence_splitter = SentenceSplitter()

    def parse_text(self, text):
        tokens = self._word_tokenizer.tokenize_paragraph(text)

        sentences_tokenized = self._sentence_splitter.split(tokens)

        sentences = []
        for sen in sentences_tokenized:

            sen = [tok.replace(" ", "") for tok in sen]

            if len(sen) == 0:
                continue

            sentences.append((sen, []))

        return sentences
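A minimal usage sketch for the class above (assuming the pre-2.0 SoMaJo API with from somajo import Tokenizer, SentenceSplitter; the sample text and the expected output are illustrative):

from somajo import Tokenizer, SentenceSplitter  # assumed import for the pre-2.0 SoMaJo API

ner_tokenizer = NERTokenizer()
sentences = ner_tokenizer.parse_text("Das ist ein Satz. Das ist noch einer.")
# Each entry pairs a token list with an initially empty label list, roughly:
# [(['Das', 'ist', 'ein', 'Satz', '.'], []), (['Das', 'ist', 'noch', 'einer', '.'], [])]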
Example #2
def SentenceSplit(text):

    tokenizer = Tokenizer(split_camel_case=False,
                          token_classes=False,
                          extra_info=False)
    tokens = tokenizer.tokenize(text)

    sentence_splitter = SentenceSplitter(is_tuple=False)
    sentences = sentence_splitter.split(tokens)
    return sentences
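Called on a short paragraph, the helper above returns one token list per sentence (a sketch assuming the same pre-2.0 SoMaJo import; input and output are illustrative):

sentences = SentenceSplit("Heute regnet es. Morgen scheint die Sonne.")
for sentence in sentences:
    print(" ".join(sentence))
# Heute regnet es .
# Morgen scheint die Sonne .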
Example #3
def get_sents(texts):
    tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
    sentence_splitter = SentenceSplitter(is_tuple=False)
    
    results = []
    for text in texts:
        # text = clean(text, lang='de', lower=False)
        tokens = tokenizer.tokenize_paragraph(text)
        sentences = sentence_splitter.split(tokens)
        cleaned = [clean(' '.join(s), no_urls=True, no_digits=True, no_punct=True, no_line_breaks=True, lang='de') for s in sentences]
        results.append(cleaned)
    return results
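The clean calls presumably come from the clean-text package; a usage sketch under that assumption (the input paragraph is illustrative):

from cleantext import clean  # assumed source of clean()

paragraphs = ["Mehr Infos unter https://example.com. Der Preis liegt bei 10 Euro."]
for doc in get_sents(paragraphs):
    for sentence in doc:
        print(sentence)  # one cleaned sentence per line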
Example #4
class TestSentenceSplitter(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)
        self.sentence_splitter = SentenceSplitter()

    def _equal(self, raw, tokenized_sentences):
        """"""
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split(tokens)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
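A test method built on this helper might look like the following (the sentences are illustrative, not taken from the SoMaJo test suite):

    def test_simple_split(self):
        self._equal("Das ist ein Satz. Das ist noch einer.",
                    ["Das ist ein Satz .", "Das ist noch einer ."])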
Example #5
class SentenceTokenizer(object):
    def __init__(self, language='en'):
        self.language = language
        if language == 'en':
            self.tokenizer = PunktSentenceTokenizer()
        elif language == 'de':
            self.tokenizer = SentenceSplitter(is_tuple=False)
        else:
            raise NotImplementedError

    def tokenize(self, sentences):
        if self.language == 'en':
            return self.tokenizer.tokenize(sentences)
        else:
            return self.tokenizer.split(sentences)
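A usage sketch (assuming from nltk.tokenize import PunktSentenceTokenizer and the pre-2.0 SoMaJo import). Note that the two branches expect different input: the English path takes a raw string, while SoMaJo's SentenceSplitter.split expects an already tokenized list:

from nltk.tokenize import PunktSentenceTokenizer  # assumed import
from somajo import SentenceSplitter               # assumed import

en_tok = SentenceTokenizer(language='en')
print(en_tok.tokenize("This is one sentence. Here is another."))

de_tok = SentenceTokenizer(language='de')
print(de_tok.tokenize(["Das", "ist", "ein", "Satz", ".", "Hier", "noch", "einer", "."]))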
Example #6
class TestSentenceSplitterPretokenized(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.sentence_splitter = SentenceSplitter(language="de_CMC")

    def _equal(self, tokens, tokenized_sentences):
        """"""
        sentences = self.sentence_splitter.split(tokens.split())
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])

    def _equal_xml(self, tokens, tokenized_sentences):
        """"""
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        sentences = self.sentence_splitter.split_xml(tokens.split(), eos_tags)
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])
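A test method built on these helpers might look like the following (the pretokenized string is illustrative):

    def test_pretokenized_split(self):
        self._equal("Das ist ein Satz . Das ist noch einer .",
                    ["Das ist ein Satz .", "Das ist noch einer ."])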
Example #7
def splitSentTokenIdx(text):
    # generate tokens from text:
    tokens = tokenSplit(text)

    # sort to sentences:
    sentence_splitter = SentenceSplitter(is_tuple=False)
    sentences = sentence_splitter.split(tokens)

    # add start and end indexes of token in text:
    endIdxUpdate = 0
    sents_idxd = []
    for sent in sentences:
        tokens_idxd = []
        for token in sent:
            startIdx = text.find(token, endIdxUpdate)
            endIdx = startIdx + len(token)
            if startIdx != -1:
                endIdxUpdate = endIdx
            tokens_idxd.append((token, startIdx, endIdx))
        sents_idxd.append(tokens_idxd)
    return sents_idxd
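tokenSplit is defined elsewhere; assuming it returns SoMaJo-style tokens as plain strings, the character offsets can be checked against the original text like this (the input is illustrative):

text = "Erster Satz. Zweiter Satz."
for sentence in splitSentTokenIdx(text):
    for token, start, end in sentence:
        print(token, start, end, text[start:end])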
Example #8
class TestSentenceSplitter(unittest.TestCase):
    """"""
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)
        self.sentence_splitter = SentenceSplitter()

    def _equal(self, raw, tokenized_sentences):
        """"""
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split(tokens)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)

    def _equal_xml(self, raw, tokenized_sentences):
        """"""
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br div ol ul dl table".split()
        eos_tags = set(eos_tags)
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split_xml(tokens, eos_tags)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
Example #9
def myprocessor(myinput):
    tokenizer = Tokenizer(language="de")
    sentsplitter = SentenceSplitter(language="de")
    tokenized = tokenizer.tokenize_paragraph(myinput)
    sentsplit = sentsplitter.split(tokenized)
    return sentsplit
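Called on a raw paragraph, the function returns one token list per sentence, roughly (input and output are illustrative):

print(myprocessor("Ein Satz. Noch ein Satz."))
# [['Ein', 'Satz', '.'], ['Noch', 'ein', 'Satz', '.']]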