import ftfy

# Assumes Sequence (polyglot.base) plus the module-level regexes PUNCTSYM,
# LIKENUM, LIKEUNIT, CONTRACTION1 and CONTRACTION2 (see the sketch below).
def tokenizer(text, tokenizer_fn, to_lower=False):
    text = ftfy.fix_text(text)
    if to_lower:
        text = text.lower()
    try:
        seq = Sequence(text.strip())
    except ValueError:
        # Sequence rejects empty input.
        return
    tokens = tokenizer_fn.transform(seq)
    new_tokens = []
    for token in tokens:
        if token.strip() == '':
            continue
        elif PUNCTSYM.search(token):
            token = '$'  # collapse pure punctuation/symbol tokens
        elif LIKENUM.search(token):
            token = '0'  # collapse number-like tokens
        elif LIKEUNIT.search(token):
            token = LIKEUNIT.sub(r'0 \1', token)  # "10km" -> "0 km"
        elif token == "can't":
            token = 'can not'
        elif CONTRACTION1.search(token):
            token = CONTRACTION1.sub(r"\1 '\2", token)  # "he's" -> "he 's"
        elif CONTRACTION2.search(token):
            token = CONTRACTION2.sub(r"\1 n't", token)  # "don't" -> "do n't"
        new_tokens.append(token)
    if new_tokens:
        return ' '.join(new_tokens).strip()
    return
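# The regex constants above are defined elsewhere in the original module.
# A minimal sketch of plausible definitions, inferred from how each one is
# used above (hypothetical patterns, not the original module's):
import re

PUNCTSYM = re.compile(r'^[^\w\s]+$')                    # only punctuation/symbols
LIKENUM = re.compile(r'^[\d.,:/-]*\d[\d.,:/-]*$')       # number-like token
LIKEUNIT = re.compile(r'^\d+([A-Za-z]+)$')              # digits + unit, e.g. "10km"
CONTRACTION1 = re.compile(r"^(\w+)'(s|re|ve|d|ll|m)$")  # "he's" -> "he 's"
CONTRACTION2 = re.compile(r"^(\w+)n't$")                # "don't" -> "do n't"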
def transform(self, sequence):
    seq = Sequence(sequence.text)
    seq.idx = [0]
    for segment in sequence:
        offset = seq.idx[-1]
        # self.breaker (an ICU BreakIterator) yields boundary offsets within
        # the current segment; shift them by the running offset so seq.idx
        # holds cumulative positions into the full text.
        self.breaker.setText(segment)
        seq.idx.extend([offset + x for x in self.breaker])
    return seq
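# A minimal usage sketch, assuming this transform() comes from polyglot's
# tokenizer and the usual polyglot package layout. Consecutive entries of
# seq.idx are cumulative offsets, so adjacent pairs slice tokens out of
# seq.text (the same pattern _create_sentence_objects uses below):
from polyglot.base import Sequence
from polyglot.tokenize import WordTokenizer

w_tok = WordTokenizer(locale='en')
seq = w_tok.transform(Sequence(u"Hello world."))
tokens = [seq.text[s:e] for s, e in zip(seq.idx[:-1], seq.idx[1:])]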
from nltk.tokenize import sent_tokenize

def sentSegment(par, lang):
    """Split a paragraph into sentences, falling back from NLTK to polyglot."""
    try:
        sents = sent_tokenize(par, lang)
    except Exception:
        # No NLTK model for this language; fall back to polyglot.
        try:
            par_seq = Sequence(par)
            st = SentenceTokenizer(locale=lang_map[lang])
            sents = [sent for sent in st.transform(par_seq)]
        except Exception:
            return None
    return sents
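# Hypothetical usage; lang_map (NLTK language name -> polyglot locale code)
# is defined elsewhere in the original module, along the lines of:
lang_map = {'english': 'en', 'german': 'de', 'french': 'fr'}

sents = sentSegment(u"Dr. Smith arrived late. He sat down.", 'english')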
def segment(args):
    lang = args.lang
    w_tokenizer = WordTokenizer(locale=lang)
    s_tokenizer = SentenceTokenizer(locale=lang)
    if args.only_sent:
        for line in args.input:
            seq = Sequence(line)
            if not seq.empty():
                _print(u'\n'.join(s_tokenizer.transform(seq)))
    elif args.only_word:
        for line in args.input:
            seq = Sequence(line)
            if not seq.empty():
                _print(u' '.join(w_tokenizer.transform(seq)))
    else:
        # Default: split into sentences, then emit each sentence's tokens.
        for line in args.input:
            seq = Sequence(line)
            sents = s_tokenizer.transform(seq)
            words = w_tokenizer.transform(seq)
            for tokenized_sent in words.split(sents):
                if not tokenized_sent.empty():
                    _print(u' '.join(tokenized_sent.tokens()))
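# A sketch of the argparse wiring this entry point assumes (flag and
# attribute names inferred from the code above; hypothetical, not the
# original parser):
import argparse
import sys

parser = argparse.ArgumentParser()
parser.add_argument('--lang', default='en')
group = parser.add_mutually_exclusive_group()
group.add_argument('--only-sent', dest='only_sent', action='store_true')
group.add_argument('--only-word', dest='only_word', action='store_true')
parser.add_argument('--input', type=argparse.FileType('r'), default=sys.stdin)
# segment(parser.parse_args())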
def _create_sentence_objects(self):
    """Returns a list of Sentence objects from the raw text."""
    sentence_objects = []
    sent_tokenizer = SentenceTokenizer(locale=self.language.code)
    seq = Sequence(self.raw)
    seq = sent_tokenizer.transform(seq)
    # Consecutive idx entries delimit one sentence inside the raw text.
    for start_index, end_index in zip(seq.idx[:-1], seq.idx[1:]):
        sent = seq.text[start_index:end_index].strip()
        if not sent:
            continue
        # Sentences share the same models as their parent blob.
        s = Sentence(sent, start_index=start_index, end_index=end_index)
        s.detected_languages = self.detected_languages
        sentence_objects.append(s)
    return sentence_objects
def tokenizer(text, tokenizer_fn):
    seq = Sequence(text.strip())
    # Drop the bare-space tokens emitted between words; note filter() is
    # lazy in Python 3, so wrap the result in list() if a list is needed.
    return filter(lambda w: w != ' ', tokenizer_fn.transform(seq))
def tokens(self):
    """Return a list of tokens, using this blob's tokenizer object
    (defaults to :class:`WordTokenizer <textblob.tokenizers.WordTokenizer>`).
    """
    seq = self.word_tokenizer.transform(Sequence(self.raw))
    return WordList(seq.tokens(), parent=self, language=self.language.code)
def transform2words(self, text):
    return self.word_tokenizer.transform(Sequence(text)).tokens()
def tokens(self):
    """Return a list of tokens, using this blob's tokenizer object
    (defaults to :class:`WordTokenizer <textblob.tokenizers.WordTokenizer>`).
    """
    seq = self.word_tokenizer.transform(Sequence(self.raw))
    tokens = WordList(seq.tokens(), parent=self, language=self.language.code)
    # Re-join hyphenated words: a '-' followed by a non-punctuation token is
    # merged into the previous entry, so 'state', '-', 'of', '-', 'the', '-',
    # 'art' collapses into 'state-of-the-art'.
    fix_hyphen = []
    i = 0
    while i < len(tokens):
        if (fix_hyphen and tokens[i] == '-' and i + 1 < len(tokens)
                and tokens[i + 1] not in string.punctuation):
            fix_hyphen[-1] = fix_hyphen[-1] + tokens[i] + tokens[i + 1]
            i += 2  # consume both the hyphen and the token merged after it
        else:
            fix_hyphen.append(tokens[i])
            i += 1
    if not self.split_apostrophe:
        return WordList(fix_hyphen, parent=self, language=self.language.code)
    # Split on apostrophes, keeping the apostrophe as its own token:
    # "don't" -> ['don', "'", 't'].
    fix_apostrophe = []
    for token in fix_hyphen:
        if "'" not in token:
            fix_apostrophe.append(token)
            continue
        parts = token.split("'")
        for j, part in enumerate(parts):
            if part:  # drop empty fragments from leading/trailing apostrophes
                fix_apostrophe.append(part)
            if j != len(parts) - 1:
                fix_apostrophe.append("'")
    return WordList(fix_apostrophe, parent=self, language=self.language.code)
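# A small standalone check of the hyphen re-join logic above, using a plain
# token list (hypothetical helper; it mirrors the loop inside tokens()):
import string

def rejoin_hyphens(tokens):
    out = []
    i = 0
    while i < len(tokens):
        if (out and tokens[i] == '-' and i + 1 < len(tokens)
                and tokens[i + 1] not in string.punctuation):
            out[-1] += tokens[i] + tokens[i + 1]
            i += 2
        else:
            out.append(tokens[i])
            i += 1
    return out

assert rejoin_hyphens(['state', '-', 'of', '-', 'the', '-', 'art']) == ['state-of-the-art']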