from somajo import SentenceSplitter, Tokenizer  # legacy SoMaJo (pre-2.0) interface


class NERTokenizer:

    def __init__(self):
        self._word_tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
        self._sentence_splitter = SentenceSplitter()

    def parse_text(self, text):
        """Tokenize text, split it into sentences and return one (token list, empty label list) pair per sentence."""
        tokens = self._word_tokenizer.tokenize_paragraph(text)
        sentences_tokenized = self._sentence_splitter.split(tokens)

        sentences = []
        for sen in sentences_tokenized:
            # remove spaces inside tokens and drop empty sentences
            sen = [tok.replace(" ", "") for tok in sen]
            if len(sen) == 0:
                continue
            sentences.append((sen, []))
        return sentences
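A short usage sketch (hypothetical; the German sample sentence and variable names are invented for illustration):

ner_tokenizer = NERTokenizer()
for tokens, labels in ner_tokenizer.parse_text("Das ist ein Satz. Hier kommt noch einer."):
    print(tokens, labels)  # one (token list, empty label list) pair per sentence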
from somajo import SentenceSplitter, Tokenizer  # legacy SoMaJo (pre-2.0) interface


def SentenceSplit(text):
    tokenizer = Tokenizer(split_camel_case=False, token_classes=False, extra_info=False)
    tokens = tokenizer.tokenize(text)
    sentence_splitter = SentenceSplitter(is_tuple=False)
    sentences = sentence_splitter.split(tokens)
    return sentences
from cleantext import clean
from somajo import SentenceSplitter, Tokenizer  # legacy SoMaJo (pre-2.0) interface


def get_sents(texts):
    tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
    sentence_splitter = SentenceSplitter(is_tuple=False)
    results = []
    for text in texts:
        # text = clean(text, lang='de', lower=False)
        tokens = tokenizer.tokenize_paragraph(text)
        sentences = sentence_splitter.split(tokens)
        cleaned = [clean(' '.join(s), no_urls=True, no_digits=True, no_punct=True,
                         no_line_breaks=True, lang='de')
                   for s in sentences]
        results.append(cleaned)
    return results
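A hypothetical call, assuming get_sents receives an iterable of raw German texts; the sample strings are invented:

documents = [
    "Erster Text. Er enthält zwei Sätze.",
    "Zweiter Text mit einer URL: https://example.com",
]
for cleaned_sentences in get_sents(documents):
    print(cleaned_sentences)  # one list of cleaned sentence strings per input text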
import unittest

from somajo import SentenceSplitter, Tokenizer


class TestSentenceSplitter(unittest.TestCase):

    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)
        self.sentence_splitter = SentenceSplitter()

    def _equal(self, raw, tokenized_sentences):
        """Compare the split sentences of `raw` with the expected `tokenized_sentences`."""
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split(tokens)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
from nltk.tokenize import PunktSentenceTokenizer
from somajo import SentenceSplitter  # legacy SoMaJo (pre-2.0) interface


class SentenceTokenizer(object):

    def __init__(self, language='en'):
        self.language = language
        if language == 'en':
            self.tokenizer = PunktSentenceTokenizer()
        elif language == 'de':
            self.tokenizer = SentenceSplitter(is_tuple=False)
        else:
            raise NotImplementedError

    def tokenize(self, sentences):
        if self.language == 'en':
            # Punkt operates on a raw string
            return self.tokenizer.tokenize(sentences)
        else:
            # SoMaJo's SentenceSplitter.split expects a list of tokens
            return self.tokenizer.split(sentences)
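A usage sketch for both branches; note the assumption that the German splitter receives a pre-tokenized list, since SoMaJo's split() operates on tokens rather than raw text (the sample inputs are invented):

en_splitter = SentenceTokenizer(language='en')
print(en_splitter.tokenize("First sentence. Second one."))

de_splitter = SentenceTokenizer(language='de')
print(de_splitter.tokenize(["Erster", "Satz", ".", "Zweiter", "Satz", "."]))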
import unittest

from somajo import SentenceSplitter  # exact import path may differ across SoMaJo versions


class TestSentenceSplitterPretokenized(unittest.TestCase):

    def setUp(self):
        """Necessary preparations"""
        self.sentence_splitter = SentenceSplitter(language="de_CMC")

    def _equal(self, tokens, tokenized_sentences):
        """Compare the split sentences of the whitespace-tokenized input with the expected sentences."""
        sentences = self.sentence_splitter.split(tokens.split())
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])

    def _equal_xml(self, tokens, tokenized_sentences):
        """Same as _equal, but for XML input with sentence-delimiting tags."""
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        eos_tags = set(eos_tags)
        sentences = self.sentence_splitter.split_xml(tokens.split(), eos_tags)
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])
from somajo import SentenceSplitter  # legacy SoMaJo (pre-2.0) interface


def splitSentTokenIdx(text):
    # generate tokens from the text (tokenSplit is a project-specific tokenizer defined elsewhere):
    tokens = tokenSplit(text)
    # group the tokens into sentences:
    sentence_splitter = SentenceSplitter(is_tuple=False)
    sentences = sentence_splitter.split(tokens)
    # add the start and end index of each token in the original text:
    endIdxUpdate = 0
    sents_idxd = []
    for sent in sentences:
        tokens_idxd = []
        for token in sent:
            startIdx = text.find(token, endIdxUpdate)
            endIdx = startIdx + len(token)
            if startIdx != -1:
                endIdxUpdate = endIdx
                tokens_idxd.append((token, startIdx, endIdx))
        sents_idxd.append(tokens_idxd)
    return sents_idxd
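A hypothetical usage sketch, assuming a working tokenSplit is in scope; it prints each token together with its character offsets in the input string (the sample text is invented):

for sentence in splitSentTokenIdx("Guten Tag. Wie geht es Ihnen?"):
    for token, start, end in sentence:
        print(token, start, end)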
import unittest

from somajo import SentenceSplitter, Tokenizer


class TestSentenceSplitter(unittest.TestCase):

    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = Tokenizer(split_camel_case=True)
        self.sentence_splitter = SentenceSplitter()

    def _equal(self, raw, tokenized_sentences):
        """Compare the split sentences of `raw` with the expected `tokenized_sentences`."""
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split(tokens)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)

    def _equal_xml(self, raw, tokenized_sentences):
        """Same as _equal, but for XML input with sentence-delimiting tags."""
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br div ol ul dl table".split()
        eos_tags = set(eos_tags)
        tokens = self.tokenizer.tokenize(raw)
        sentences = self.sentence_splitter.split_xml(tokens, eos_tags)
        sentences = [" ".join(s) for s in sentences]
        self.assertEqual(sentences, tokenized_sentences)
from somajo import SentenceSplitter, Tokenizer  # legacy SoMaJo (pre-2.0) interface


def myprocessor(myinput):
    tokenizer = Tokenizer(language="de")
    sentsplitter = SentenceSplitter(language="de")
    tokenized = tokenizer.tokenize_paragraph(myinput)
    sentsplit = sentsplitter.split(tokenized)
    return sentsplit
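An end-to-end sketch of myprocessor on an invented two-sentence paragraph; each returned sentence is a list of token strings:

for sentence in myprocessor("Ein kurzer Absatz. Er besteht aus zwei Sätzen."):
    print(" ".join(sentence))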