class ProcessorSentenceSplitter:
    """Performs sentence splitting using simple rules.

    Simple wrapper around the NLTK Punkt component. Suitable for
    european languages.
    """

    def __init__(self, delay_init=False):
        # Tokenizer is built lazily when delay_init is set; callers must
        # then invoke init() themselves before using the object.
        self.sent_tokeniser_ = None
        if not delay_init:
            self.init()

    def init(self):
        """Build the Punkt tokenizer once; safe to call repeatedly."""
        if self.sent_tokeniser_ is None:
            punkt_param = PunktParameters()
            # PunktParameters.abbrev_types is a set in NLTK; assigning a
            # list would still work for membership tests but with O(n)
            # lookups — convert explicitly.
            punkt_param.abbrev_types = set(self.compile_abbreviations())
            self.sent_tokeniser_ = PunktSentenceTokenizer(punkt_param)

    def __call__(self, tokens):
        """Split a token sequence into sentence spans.

        Args:
            tokens: iterable of objects exposing a ``text`` attribute.

        Returns:
            list of ``Sentence(begin, end)`` objects holding half-open
            token-index ranges.
        """
        assert self.sent_tokeniser_
        sents = self.sent_tokeniser_.sentences_from_tokens(
            (e.text for e in tokens))
        curr = 0
        res_sents = []
        for sent in sents:
            res_sents.append(Sentence(curr, curr + len(sent)))
            curr += len(sent)
        return res_sents

    def compile_abbreviations(self):
        """Compile the abbreviation list fed to Punkt.

        Combines every dotted two-letter pair of the Russian and English
        alphabets with abbreviations cleaned from the module-level
        ``_ru_abbrevs`` / ``_en_abbrevs`` regexp lists.

        Returns:
            list of unique abbreviation strings (order unspecified).
        """
        def get_dot_pairs(alphabet):
            # ('a', 'b') -> 'a.b' for every unordered pair of letters.
            return ['.'.join(pair) for pair in combinations(alphabet, 2)]

        def clean_regexps(regexps):
            # Strip regexp syntax from the source patterns: drop the
            # trailing segment after the final dot, remove escapes and
            # soft hyphens, then map spaces -> dots and '?' -> spaces.
            # (Input is lowercased up front; the replacements introduce
            # no uppercase, so no second lower() pass is needed.)
            return [
                ''.join(abbrev.lower().split('.')[:-1])
                .replace('\\', '')
                .replace(u'\xad', '')
                .replace(' ', '.')
                .replace('?', ' ')
                for abbrev in regexps
            ]

        ru_abbrevs = get_dot_pairs('цукенгшзхфвапролджэячсмитбю')
        ru_abbrevs += clean_regexps(_ru_abbrevs)
        en_abbrevs = get_dot_pairs('qwertyuiopasdfghjklzxcvbnm')
        en_abbrevs += clean_regexps(_en_abbrevs)
        # Deduplicate; ordering is irrelevant to Punkt.
        return list(set(ru_abbrevs + en_abbrevs))
class SentenceSplitter(object):
    """Groups token spans into sentences using NLTK's Punkt tokenizer."""

    def __init__(self):
        super(SentenceSplitter, self).__init__()
        self.sent_tokeniser_ = PunktSentenceTokenizer()

    def process(self, text, tokens):
        """Split a tokenised document into sentences.

        Args:
            text: the raw document string.
            tokens: sequence of (begin, end) character offsets into
                ``text``.

        Returns:
            list of sentences, each a list of ``Span`` objects covering
            the tokens of that sentence.
        """
        words = [text[t[0]:t[1]] for t in tokens]
        grouped = self.sent_tokeniser_.sentences_from_tokens(words)

        result = []
        offset = 0
        for group in grouped:
            chunk = tokens[offset:offset + len(group)]
            result.append([Span(begin=t[0], end=t[1]) for t in chunk])
            offset += len(group)
        return result