def capitalized(self):
    """Returns the first capitalized type for this class if it exists"""
    # Fall back to the normalized form when no recorded type is capitalized.
    return next(
        (candidate for candidate in self.types
         if punctuation.is_capitalized(candidate)),
        self.normalized,
    )
def record(self, token, left_context=EMPTY_TOKEN, right_context=EMPTY_TOKEN):
    """Record one observation of *token* for this token class.

    Updates the running statistics: the set of surface types seen, the
    total occurrence count, the capitalized-occurrence count, and the
    abbreviation count.

    :param token: the observed token (its ``type`` attribute is recorded)
    :param left_context: token to the left; kept for interface symmetry,
        currently unused by the statistics below
    :param right_context: token to the right; a following ``.`` marks
        *token* as a possible abbreviation
    """
    self.types.add(token.type)
    self.count += 1
    if punctuation.is_capitalized(token.type):
        self.upper_count += 1
    # Count as an abbreviation when the token does not itself start with
    # a period but either ends with one or is immediately followed by one.
    # (was: `startswith(u'.') == False` — compare with `not`, not `== False`)
    if not token.type.startswith(u'.') and \
            (right_context.type == u'.' or token.type.endswith(u'.')):
        self.abbr_count += 1
def segment(text_or_tokens, raw=False, classifier=CLASSIFIER):
    """Split Spanish text (or a pre-tokenized sequence) into sentences.

    :param text_or_tokens: the raw string when ``raw`` is True (it is
        tokenized here and sentences are returned as slices of it);
        otherwise an indexable sequence of token objects exposing
        ``type``, ``start``, ``end`` and ``match``
    :param raw: selects string-slice output (True) or token-list output
        (False)
    :param classifier: abbreviation / proper-noun classifier; when falsy
        a fresh ``TokenClassifier`` is built and trained on the input
    :returns: list of sentences — substrings when ``raw``, otherwise
        lists of tokens
    """
    tokens = tokenize(text_or_tokens) if raw else text_or_tokens
    n_tokens = len(tokens)
    # Classify tokens for abbreviation and proper noun detection
    if not classifier:
        classifier = TokenClassifier()
        classifier.train(tokens)
    sentences, cache = [], []
    closing = False  # True once a sentence-final marker has been seen
    # Tracks still-open paired punctuation (e.g. inverted question marks)
    # so a sentence is not closed while inside a paired span.
    stack = SpanishPunctuationStack()
    for i, t in enumerate(tokens):
        # Always segment on a newline
        if t.match(punctuation.NEWLINE_RE):
            # Newline: flush any cached tokens as a sentence, then reset
            # the punctuation stack and the closing flag.
            if cache:
                if raw:
                    start, end = cache[0].start, cache[-1].end
                    sentences.append(text_or_tokens[start:end])
                else:
                    sentences.append(cache)
            cache = []
            stack = SpanishPunctuationStack()
            closing = False
            continue
        stack.feed(t.type)
        cache.append(t)
        # Classify local context
        current_class = classifier.classify(t)
        next_token = tokens[i + 1] if i < n_tokens - 1 else EMPTY_TOKEN
        next_class = classifier.classify(next_token)
        # Look for the next sentence segment marker.
        # The candidates are abbreviations and literal markers.
        if not closing:
            if t.type.endswith(u'.') or current_class.is_abbreviation:
                # Segment on this abbreviation only if the next token is
                # capitalized and not a proper noun.
                if punctuation.is_capitalized(next_token.type) \
                        and not next_class.is_proper_noun:
                    closing = True
            elif t.match(punctuation.SEGMENT_RE):
                # Segment at this literal segment marker
                # if punctuation.is_capitalized(next_token):
                closing = True
        # If we have found the sentence segment marker or reached the end,
        # check for any remaining tokens then end the sentence.
        # NOTE(review): the end-of-input test relies on EMPTY_TOKEN being
        # falsy so the final flush fires on the last token — confirm
        # EMPTY_TOKEN's truthiness.
        if closing or not next_token:
            # Defer the split while paired punctuation is still open,
            # unless input is exhausted.
            if not stack.pending() or not next_token:
                if raw:
                    start, end = cache[0].start, cache[-1].end
                    sentences.append(text_or_tokens[start:end])
                else:
                    sentences.append(cache)
                cache = []
                stack = SpanishPunctuationStack()
                closing = False
    return sentences