Example #1
0
def reassemble(sentences):
    """Join tokenised sentences back into a single string.

    Every fragment of every sentence is emitted in order.  A single space is
    put in front of a fragment unless the fragment matches
    ``PUNCTUATION_REGEX``, or it is the very first fragment, or it directly
    follows an opening double quote.  An opening ``"`` gets a space in front
    of it (except at the very start); a closing ``"`` is appended without one.

    ``Substitution`` fragments are rendered as
    ``<del>..</del><ins>..</ins>`` built from their unpacked items; all other
    fragments are appended unchanged.

    :param sentences: iterable of sentences, each an iterable of fragments.
    :returns: the concatenated text.
    """
    pieces = []
    # True while the next ordinary fragment must NOT be preceded by a space
    # (start of the text, or right after an opening quote).
    suppress_space = True
    quote_open = False

    for token in (frag for sentence in sentences for frag in sentence):
        if token == '"':
            if not quote_open:
                # Opening quote: separate it from the preceding text and
                # make the following fragment hug the quote.
                if not suppress_space:
                    pieces.append(' ')
                suppress_space = True
            quote_open = not quote_open
        elif not (PUNCTUATION_REGEX.match(unicode(token)) or suppress_space):
            pieces.append(' ')
        else:
            suppress_space = False

        pieces.append(
            '<del>{}</del><ins>{}</ins>'.format(*token)
            if isinstance(token, Substitution)
            else token
        )

    return ''.join(pieces)
Example #2
0
    def tag(self, sentence, tokenize=True):
        """Tag a string `sentence`.

        :param str sentence: The text to tag. ``.strip()`` is called on it,
            so a string is expected.
        :param tokenize: (optional) When true, `sentence` is run through
            ``self.tokenizer`` and re-joined with single spaces first. When
            ``False`` the string has to be tokenized already (space
            separated string).
        :returns: ``[]`` for blank input. When the stripped input is found in
            ``PUNCTUATION``: a one-element list ``[(symbol, tag)]`` if
            ``self.include_punc`` is set, else ``[]``. Otherwise the result of
            ``pattern_tag``; unless ``self.include_punc`` is set, entries
            whose tag matches ``PUNCTUATION_REGEX`` are removed.

        """
        stripped = sentence.strip()

        # Blank input: nothing to tag (Issue #3)
        if stripped == "":
            return []

        # Input consisting only of a punctuation mark is answered directly,
        # without calling the tagger (Issue #4)
        if stripped in PUNCTUATION:
            if not self.include_punc:
                return []
            # '.', '?' and '!' all receive the tag "."; any other symbol is
            # used as its own tag.
            label = "." if stripped in tuple('.?!') else stripped
            return [(stripped, label)]

        if tokenize:
            sentence = " ".join(self.tokenizer.tokenize(sentence))

        # pattern_tag is always called with tokenize=False: the text has
        # either been tokenized above or was supplied pre-tokenized.
        tagged = pattern_tag(sentence, tokenize=False)
        if self.include_punc:
            return tagged

        without_punc = []
        for token, token_tag in tagged:
            if PUNCTUATION_REGEX.match(unicode(token_tag)):
                continue
            without_punc.append((token, token_tag))
        return without_punc
Example #3
0
    def pos_tags(self):
        '''Return a list of tuples of the form (word, POS tag).

        ``self.raw`` is passed to ``self.pos_tagger.tag``.  Each resulting
        pair becomes ``(Word(word, pos_tag=tag), unicode(tag))``; pairs whose
        tag matches ``PUNCTUATION_REGEX`` are left out.

        Example:
        ::

            [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
                    ('Thursday', 'NNP'), ('morning', 'NN')]

        :rtype: list of tuples
        '''
        tagged_words = []
        for token, label in self.pos_tagger.tag(self.raw):
            label_text = unicode(label)
            # Skip entries the tagger labelled as punctuation.
            if PUNCTUATION_REGEX.match(label_text):
                continue
            tagged_words.append((Word(token, pos_tag=label), label_text))
        return tagged_words
Example #4
0
    def pos_tags(self):
        """Return a list of tuples of the form (word, POS tag).

        The tagger output for ``self.raw`` is filtered so that entries whose
        tag matches ``PUNCTUATION_REGEX`` are dropped.  Every remaining word
        is wrapped in ``Word`` (with ``pos_tag`` set to its tag) and paired
        with the tag converted via ``unicode``.

        Example:
        ::

            [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
                    ('Thursday', 'NNP'), ('morning', 'NN')]

        :rtype: list of tuples
        """
        raw_pairs = self.pos_tagger.tag(self.raw)
        pairs = []
        for text, tag_value in raw_pairs:
            tag_text = unicode(tag_value)
            if not PUNCTUATION_REGEX.match(tag_text):
                wrapped = Word(text, pos_tag=tag_value)
                pairs.append((wrapped, tag_text))
        return pairs
Example #5
0
    def pos_tags(self):
        """Return a list of tuples of the form (word, POS tag).

        When ``self`` is a ``TextBlob`` the result is the concatenation of the
        ``pos_tags`` of each item in ``self.sentences``, in order.  Otherwise
        ``self`` is handed to ``self.pos_tagger.tag`` and every pair whose tag
        does not match ``PUNCTUATION_REGEX`` is returned as
        ``(Word(unicode(word), pos_tag=tag), unicode(tag))``.

        Example:
        ::

            [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
                    ('Thursday', 'NNP'), ('morning', 'NN')]

        :rtype: list of tuples
        """
        if not isinstance(self, TextBlob):
            pairs = []
            for token, label in self.pos_tagger.tag(self):
                label_text = unicode(label)
                if PUNCTUATION_REGEX.match(label_text):
                    continue
                pairs.append((Word(unicode(token), pos_tag=label), label_text))
            return pairs

        # Collect every sentence's tags first, then flatten them in order.
        # pos_tags is read as an attribute here, not called.
        per_sentence = [sentence.pos_tags for sentence in self.sentences]
        combined = []
        for sentence_tags in per_sentence:
            combined.extend(sentence_tags)
        return combined
Example #6
0
    def pos_tags(self):
        """Return a list of tuples of the form (word, POS tag).

        For a ``TextBlob`` the tags of all items in ``self.sentences`` are
        flattened into one list.  For anything else, ``self`` is tagged with
        ``self.pos_tagger`` and each pair whose tag does not match
        ``PUNCTUATION_REGEX`` is returned as
        ``(Word(word, pos_tag=tag), unicode(tag))``.

        Example:
        ::

            [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
                    ('Thursday', 'NNP'), ('morning', 'NN')]

        :rtype: list of tuples
        """
        collected = []

        if isinstance(self, TextBlob):
            # pos_tags is read as an attribute of each sentence, not called.
            tag_lists = [part.pos_tags for part in self.sentences]
            for tag_list in tag_lists:
                for entry in tag_list:
                    collected.append(entry)
            return collected

        for text, tag_value in self.pos_tagger.tag(self):
            tag_text = unicode(tag_value)
            if not PUNCTUATION_REGEX.match(tag_text):
                collected.append((Word(text, pos_tag=tag_value), tag_text))
        return collected
    def tag(self, sentence, tokenize=True):
        """Tag a string `sentence`.

        :param str sentence: The text to tag. ``.strip()`` is called on it,
            so a string is expected.
        :param tokenize: (optional) When true, `sentence` is run through
            ``self.tokenizer`` and re-joined with single spaces first. When
            ``False`` the string has to be tokenized already (space
            separated string).
        :returns: ``[]`` for blank input. When the stripped input is found in
            ``PUNCTUATION``: a one-element list ``[(symbol, tag)]`` if
            ``self.include_punc`` is set, else ``[]``. Otherwise the result of
            ``pattern_tag`` (called with ``self.encoding`` and
            ``self.tagset``); unless ``self.include_punc`` is set, entries
            whose tag matches ``PUNCTUATION_REGEX`` are removed.

        """
        symbol = sentence.strip()

        #: Blank input produces no tags (Issue #3)
        if symbol == "":
            return []

        #: Input that is only a punctuation mark never reaches the tagger
        #: (Issue #4)
        if symbol in PUNCTUATION:
            if self.include_punc:
                # '.', '?' and '!' are all tagged "."; other symbols are
                # tagged with themselves.
                if symbol in tuple('.?!'):
                    return [(symbol, ".")]
                return [(symbol, symbol)]
            return []

        if tokenize:
            pieces = self.tokenizer.tokenize(sentence)
            sentence = " ".join(pieces)

        # pattern_tag is always called with tokenize=False: the text has
        # either been tokenized above or was supplied pre-tokenized.
        tagged = pattern_tag(sentence, tokenize=False,
                             encoding=self.encoding,
                             tagset=self.tagset)
        if self.include_punc:
            return tagged

        kept = []
        for token, token_tag in tagged:
            if not PUNCTUATION_REGEX.match(unicode(token_tag)):
                kept.append((token, token_tag))
        return kept