def test_slice_repr(self):
    """A slice of a WordList has the expected repr on both PY2 and PY3."""
    words = tb.WordList(['Schön', 'ist', 'besser'])
    sliced = words[:2]
    if PY2:
        # On PY2 the expected repr must be compared as one unicode string.
        assert_equal(unicode(repr(sliced)), u"WordList([u'Sch\\xf6n', u'ist'])")
    else:
        assert_equal(repr(sliced), "WordList(['Schön', 'ist'])")
def tag(self, sentence, tokenize=True):
    """Tag a string `sentence`.

    :param str or list sentence: A string or a list of sentence strings.
    :param tokenize: (optional) If ``False`` string has to be tokenized
        before (space separated string).
    """
    stripped = sentence.strip()
    # Empty input yields no tags (Issue #3).
    if stripped == "":
        return []
    # A lone punctuation mark is answered without invoking the tagger (Issue #4).
    if stripped in PUNCTUATION:
        if not self.include_punc:
            return []
        tag_value = "." if stripped in tuple('.?!') else stripped
        return [(stripped, tag_value)]
    if tokenize:
        sentence = " ".join(self.tokenizer.tokenize(sentence))
    # The sentence is always pre-tokenized before pattern.de.tag sees it
    # (either the caller tokenized it, or we just did above).
    tagged = pattern_tag(sentence, tokenize=False)
    if self.include_punc:
        return tagged
    return [(word, t) for word, t in tagged
            if not PUNCTUATION_REGEX.match(unicode(t))]
def test_translate_detects_language_by_default(self):
    """Arabic input is auto-detected and rendered into one of the known German variants."""
    arabic = unicode("ذات سيادة كاملة")
    blob = tb.TextBlobDE(arabic)
    accepted = ("Vollständig souveränen",
                "Völlig souverän",
                "Mit voller Souveränität",
                "Mit vollen Souveränität",
                "Volle Souveränität",
                "Voll souverän")
    assert_true(blob.translate() in accepted)
def test_repr(self):
    """repr() of a WordList matches the expected form on PY2 and PY3."""
    words = tb.WordList(['Schön', 'ist', 'besser'])
    # NOTE: ``from __future__ import unicode_literals`` makes every literal
    # here unicode, so on PY2 the expected repr has to be compared as a
    # single unicode string; otherwise the assertion fails with
    # "WordList([u'Sch\\xf6n', u'ist', u'besser'])" != \
    # u"WordList(['Sch\xf6n', 'ist', 'besser'])"
    if PY2:
        assert_equal(unicode(repr(words)),
                     u"WordList([u'Sch\\xf6n', u'ist', u'besser'])")
    else:
        assert_equal(repr(words), "WordList(['Schön', 'ist', 'besser'])")
def test_repr(self):
    """A WordList reproduces its expected repr under both Python majors."""
    wordlist = tb.WordList(['Schön', 'ist', 'besser'])
    # ``unicode_literals`` turns the expected value into one unicode string;
    # without the PY2 branch the comparison would fail as
    # "WordList([u'Sch\\xf6n', u'ist', u'besser'])" != \
    # u"WordList(['Sch\xf6n', 'ist', 'besser'])"
    if not PY2:
        assert_equal(repr(wordlist), "WordList(['Schön', 'ist', 'besser'])")
    else:
        assert_equal(unicode(repr(wordlist)),
                     u"WordList([u'Sch\\xf6n', u'ist', u'besser'])")
def pos_tags(self):
    """Return a list of tuples of the form (word, POS tag).

    Example: ::

        [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
        ('Thursday', 'NNP'), ('morning', 'NN')]

    :rtype: list of tuples
    """
    # NOTE: punctuation filtering is now configured on the tagger itself
    # (``PatternTagger(include_punc=...)``); the old inline
    # ``PUNCTUATION_REGEX`` filter that used to live in this comprehension
    # was dead commented-out code and has been removed.
    return [(Word(word, pos_tag=t), unicode(t))
            for word, t in self.pos_tagger.tag(self.raw)]
def tag(self, sentence, tokenize=True):
    """Tag a string `sentence`.

    :param str or list sentence: A string or a list of sentence strings.
    :param tokenize: (optional) If ``False`` string has to be tokenized
        before (space separated string).
    """
    bare = sentence.strip()
    if bare == "":
        # Empty strings produce no tags (Issue #3).
        return []
    if bare in PUNCTUATION:
        # A single punctuation mark never reaches the tagger (Issue #4).
        if self.include_punc:
            pos = "." if bare in tuple('.?!') else bare
            return [(bare, pos)]
        return []
    if tokenize:
        sentence = " ".join(self.tokenizer.tokenize(sentence))
    # pattern.de.tag always receives a pre-tokenized sentence
    # (either supplied tokenized by the caller, or tokenized just above).
    result = pattern_tag(sentence, tokenize=False,
                         encoding=self.encoding, tagset=self.tagset)
    if not self.include_punc:
        result = [(word, t) for word, t in result
                  if not PUNCTUATION_REGEX.match(unicode(t))]
    return result
def test_translate_detects_language_by_default(self):
    """Translating without a source language auto-detects the Arabic input."""
    blob = tb.TextBlobDE(unicode("ذات سيادة كاملة"))
    german = blob.translate()
    assert_true(german in ("Vollständig souveränen",
                           "Mit voller Souveränität"))