Beispiel #1
0
    def capitalized(self):
        """Return a capitalized type recorded for this class, if there is one.

        Walks ``self.types`` in its iteration order and returns the first
        entry accepted by ``punctuation.is_capitalized``. When no entry
        qualifies, ``self.normalized`` is returned instead.
        """
        capitalized_forms = (
            form for form in self.types if punctuation.is_capitalized(form)
        )
        # The generator is lazy, so scanning stops at the first match.
        for form in capitalized_forms:
            return form
        return self.normalized
Beispiel #2
0
    def record(self, token,
        left_context=EMPTY_TOKEN, right_context=EMPTY_TOKEN):
        """Update the counters of this class with one observed token.

        ``token.type`` is added to ``self.types`` and ``self.count`` is
        incremented. ``self.upper_count`` is incremented when the type is
        capitalized according to ``punctuation.is_capitalized``.
        ``self.abbr_count`` is incremented when the type does not start with
        a period and either ends with a period or ``right_context.type`` is
        exactly a period.

        ``left_context`` is accepted for interface compatibility but is not
        read by this method.
        """
        surface = token.type
        self.types.add(surface)
        self.count += 1

        if punctuation.is_capitalized(surface):
            self.upper_count += 1

        # A leading period rules the token out as an abbreviation candidate;
        # otherwise a trailing period (attached or as the right context)
        # counts as abbreviation evidence.
        if not surface.startswith(u'.') and (
                right_context.type == u'.' or surface.endswith(u'.')):
            self.abbr_count += 1
Beispiel #3
0
def segment(text_or_tokens, raw=False, classifier=CLASSIFIER):
    """Split a token sequence (or raw text) into sentences.

    When ``raw`` is true, ``text_or_tokens`` is passed through ``tokenize``
    and each sentence is returned as the slice of the input running from the
    first token's ``start`` to the last token's ``end``. Otherwise the input
    is used directly as the token sequence and each sentence is the list of
    tokens it contains.

    A falsy ``classifier`` is replaced by a new ``TokenClassifier`` trained
    on the tokens being segmented.

    Returns the list of sentences in input order.
    """
    if raw:
        tokens = tokenize(text_or_tokens)
    else:
        tokens = text_or_tokens
    last_index = len(tokens) - 1

    # Token classes drive abbreviation and proper-noun detection below.
    if not classifier:
        classifier = TokenClassifier()
        classifier.train(tokens)

    sentences = []

    def emit(buffered):
        # Store one finished sentence: a text slice in raw mode, otherwise
        # the buffered token list itself.
        if raw:
            start, end = buffered[0].start, buffered[-1].end
            sentences.append(text_or_tokens[start:end])
        else:
            sentences.append(buffered)

    pending_tokens = []
    boundary_seen = False
    punct_stack = SpanishPunctuationStack()

    for position, token in enumerate(tokens):
        # A newline always ends the sentence collected so far and is not
        # itself kept in any sentence.
        if token.match(punctuation.NEWLINE_RE):
            if pending_tokens:
                emit(pending_tokens)
                pending_tokens, punct_stack, boundary_seen = (
                    [], SpanishPunctuationStack(), False)
            continue

        punct_stack.feed(token.type)
        pending_tokens.append(token)

        # Classify this token and the one after it (EMPTY_TOKEN stands in
        # when there is no following token).
        token_class = classifier.classify(token)
        if position < last_index:
            upcoming = tokens[position + 1]
        else:
            upcoming = EMPTY_TOKEN
        upcoming_class = classifier.classify(upcoming)

        # Until a boundary has been found, test whether this token is one.
        if not boundary_seen:
            if token.type.endswith(u'.') or token_class.is_abbreviation:
                # A period-final token or abbreviation only ends the
                # sentence when the next token is capitalized and is not
                # classified as a proper noun.
                if punctuation.is_capitalized(upcoming.type) \
                   and not upcoming_class.is_proper_noun:
                    boundary_seen = True
            elif token.match(punctuation.SEGMENT_RE):
                # A literal segment marker ends the sentence regardless of
                # how the next token is capitalized.
                boundary_seen = True

        # Keep collecting unless a boundary was found or the upcoming token
        # is falsy (presumably EMPTY_TOKEN at the end of the input).
        if not boundary_seen and upcoming:
            continue
        # A found boundary is deferred while the punctuation stack still
        # reports pending items, unless the upcoming token is falsy.
        if punct_stack.pending() and upcoming:
            continue

        emit(pending_tokens)
        pending_tokens, punct_stack, boundary_seen = (
            [], SpanishPunctuationStack(), False)

    return sentences