Ejemplo n.º 1
0
 def test_slice_repr(self):
     # Slicing a WordList must give back a WordList, visible in its repr.
     first_two = tb.WordList(['Schön', 'ist', 'besser'])[:2]
     if not PY2:
         assert_equal(repr(first_two), "WordList(['Schön', 'ist'])")
         return
     # On PY2 the expected repr has the umlaut escaped and each item
     # prefixed with ``u``.
     assert_equal(unicode(repr(first_two)),
                  u"WordList([u'Sch\\xf6n', u'ist'])")
Ejemplo n.º 2
0
    def tag(self, sentence, tokenize=True):
        """Tag a string `sentence`.

        :param sentence: The text to tag. ``.strip()`` is called on it, so a
            string is expected.
        :param tokenize: (optional) If ``False``, `sentence` must already be
            tokenized (space separated string); otherwise it is run through
            ``self.tokenizer`` and re-joined with single spaces first.
        :returns: ``[]`` for blank input, a one-element list for input that
            matches ``PUNCTUATION`` (when ``self.include_punc`` is set),
            otherwise the output of ``pattern_tag`` -- presumably a list of
            ``(word, tag)`` pairs -- with punctuation tags filtered out
            unless ``self.include_punc`` is set.

        """
        stripped = sentence.strip()
        # Blank input is never handed to the tagger (Issue #3).
        if stripped == "":
            return []
        # Input found in PUNCTUATION is answered directly (Issue #4).
        if stripped in PUNCTUATION:
            if not self.include_punc:
                return []
            # Sentence-final marks share the "." tag; others tag as themselves.
            mark_tag = "." if stripped in tuple('.?!') else stripped
            return [(stripped, mark_tag)]
        if tokenize:
            sentence = " ".join(self.tokenizer.tokenize(sentence))
        # By this point the sentence is space-tokenized (by the caller or by
        # the step above), so pattern's own tokenizer stays switched off.
        tagged = pattern_tag(sentence, tokenize=False)
        if self.include_punc:
            return tagged
        return [(word, t) for word, t in tagged
                if not PUNCTUATION_REGEX.match(unicode(t))]
Ejemplo n.º 3
0
 def test_slice_repr(self):
     # The repr of a WordList slice is expected to read as a WordList.
     words = tb.WordList(['Schön', 'ist', 'besser'])
     head = words[:2]
     if PY2:
         # PY2 expectation: u-prefixed items with the umlaut escaped.
         actual = unicode(repr(head))
         expected = u"WordList([u'Sch\\xf6n', u'ist'])"
     else:
         actual = repr(head)
         expected = "WordList(['Schön', 'ist'])"
     assert_equal(actual, expected)
Ejemplo n.º 4
0
 def test_translate_detects_language_by_default(self):
     # translate() is called with no arguments on Arabic-script input; any of
     # the German renderings listed below counts as a pass.
     accepted = (
         "Vollständig souveränen",
         "Völlig souverän",
         "Mit voller Souveränität",
         "Mit vollen Souveränität",
         "Volle Souveränität",
         "Voll souverän",
     )
     blob = tb.TextBlobDE(unicode("ذات سيادة كاملة"))
     translation = blob.translate()
     assert_true(translation in accepted)
Ejemplo n.º 5
0
 def test_repr(self):
     words = tb.WordList(['Schön', 'ist', 'besser'])
     # PY2 needs its own expectation: with ``from __future__ import
     # unicode_literals`` the expected value becomes one single unicode
     # string, while repr() yields u-prefixed, escaped items. Without this
     # branch PY2 fails with an AssertionError:
     # "WordList([u'Sch\\xf6n', u'ist', u'besser'])" != \
     # u"WordList(['Sch\xf6n', 'ist', 'besser'])"
     if PY2:
         actual = unicode(repr(words))
         expected = u"WordList([u'Sch\\xf6n', u'ist', u'besser'])"
     else:
         actual = repr(words)
         expected = "WordList(['Schön', 'ist', 'besser'])"
     assert_equal(actual, expected)
Ejemplo n.º 6
0
 def test_repr(self):
     word_list = tb.WordList(['Schön', 'ist', 'besser'])
     if not PY2:
         assert_equal(repr(word_list), "WordList(['Schön', 'ist', 'besser'])")
         return
     # PY2 only: ``from __future__ import unicode_literals`` makes the
     # expected value one single unicode string, so it has to spell out the
     # u-prefixed, escaped items that repr() produces. Comparing against the
     # PY3 literal instead raises an AssertionError:
     # "WordList([u'Sch\\xf6n', u'ist', u'besser'])" != \
     # u"WordList(['Sch\xf6n', 'ist', 'besser'])"
     py2_repr = unicode(repr(word_list))
     assert_equal(py2_repr, u"WordList([u'Sch\\xf6n', u'ist', u'besser'])")
Ejemplo n.º 7
0
    def pos_tags(self):
        """Return a list of tuples of the form (word, POS tag).

        ``self.raw`` is passed to ``self.pos_tagger.tag``; every resulting
        pair becomes a ``Word`` (carrying the tag as ``pos_tag``) together
        with the tag converted via ``unicode``.

        Example:
        ::

            [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
                    ('Thursday', 'NNP'), ('morning', 'NN')]

        :rtype: list of tuples

        """
        # No punctuation filtering happens here; per the original note this
        # is controlled through PatternTagger(include_punc=False) instead.
        pairs = []
        for token, tag in self.pos_tagger.tag(self.raw):
            pairs.append((Word(token, pos_tag=tag), unicode(tag)))
        return pairs
Ejemplo n.º 8
0
    def pos_tags(self):
        """Return the POS-tagged words as a list of (word, POS tag) tuples.

        The tagger stored in ``self.pos_tagger`` is run on ``self.raw``. The
        first element of every tuple is a ``Word`` created with the tag as
        its ``pos_tag``; the second is the tag passed through ``unicode``.

        Example:
        ::

            [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
                    ('Thursday', 'NNP'), ('morning', 'NN')]

        :rtype: list of tuples

        """
        tagger_output = self.pos_tagger.tag(self.raw)
        # Nothing is filtered at this level; the original note refers to the
        # PatternTagger(include_punc=False) keyword for dropping punctuation.
        return [(Word(text, pos_tag=label), unicode(label))
                for text, label in tagger_output]
Ejemplo n.º 9
0
    def tag(self, sentence, tokenize=True):
        """Tag a string `sentence`.

        :param sentence: The text to tag. ``.strip()`` is called on it, so a
            string is expected.
        :param tokenize: (optional) If ``False``, `sentence` must already be
            tokenized (space separated string); otherwise it is tokenized
            with ``self.tokenizer`` and joined with single spaces first.
        :returns: ``[]`` for blank input, a one-element list for input that
            matches ``PUNCTUATION`` (when ``self.include_punc`` is set),
            otherwise the output of ``pattern_tag`` -- presumably a list of
            ``(word, tag)`` pairs -- minus punctuation tags unless
            ``self.include_punc`` is set.

        """
        text = sentence.strip()
        #: Do not process empty strings (Issue #3)
        if text == "":
            return []
        #: Do not process strings found in PUNCTUATION (Issue #4)
        if text in PUNCTUATION:
            if self.include_punc:
                # '.', '?' and '!' are all tagged "."; anything else is
                # tagged with the symbol itself.
                return [(text, "." if text in tuple('.?!') else text)]
            return []
        if tokenize:
            tokens = self.tokenizer.tokenize(sentence)
            sentence = " ".join(tokens)
        # The sentence is space-tokenized at this point (either submitted
        # that way or tokenized above), so pattern's tokenizer is disabled.
        tagged = pattern_tag(
            sentence,
            tokenize=False,
            encoding=self.encoding,
            tagset=self.tagset)
        if self.include_punc:
            return tagged
        # Drop every pair whose tag matches the punctuation pattern.
        kept = []
        for word, t in tagged:
            if PUNCTUATION_REGEX.match(unicode(t)):
                continue
            kept.append((word, t))
        return kept
Ejemplo n.º 10
0
 def test_translate_detects_language_by_default(self):
     # No source language is passed to translate(); the Arabic-script input
     # must come back as one of the accepted German renderings.
     accepted = ("Vollständig souveränen", "Mit voller Souveränität")
     translation = tb.TextBlobDE(unicode("ذات سيادة كاملة")).translate()
     assert_true(translation in accepted)