def test_stem(self):  # only PorterStemmer tested
    """Porter stemming of representative regular/irregular words."""
    for raw, expected in (("cars", "car"), ("wolves", "wolv"), ("went", "went")):
        assert_equal(tb.Word(raw).stem(), expected)
def test_pop(self):
    """pop() removes and returns the last Word; popping empty raises IndexError."""
    words = tb.WordList(['cats', 'dogs'])
    assert_equal(words.pop(), tb.Word('dogs'))
    # After one pop, index 1 is out of range.
    assert_raises(IndexError, words.__getitem__, 1)
    assert_equal(words.pop(), tb.Word('cats'))
    assert_equal(len(words), 0)
    assert_raises(IndexError, words.pop)
def test_lemmatize(self):
    """Default lemmatization handles nouns; verbs need an explicit POS."""
    assert_equal(tb.Word("cars").lemmatize(), "car")
    assert_equal(tb.Word("wolves").lemmatize(), "wolf")
    assert_equal(tb.Word("went").lemmatize("v"), "go")
def test_lemmatize(self):
    """Lemmatization accepts both WordNet and Penn Treebank POS tags."""
    assert_equal(tb.Word("cars").lemmatize(), "car")
    assert_equal(tb.Word("wolves").lemmatize(), "wolf")
    verb = tb.Word("went")
    assert_equal(verb.lemmatize("v"), "go")    # wordnet tagset
    assert_equal(verb.lemmatize("VBD"), "go")  # penn treebank tagset
def swap_paragraph(mapping, paragraph):
    """Rebuild *paragraph* with rule-based word substitutions applied.

    Each sentence is POS-tagged, then every (word, next_word) pair is offered
    to ``mapping.map``.  A word with candidate replacements becomes a
    ``Substitution``; all others pass through unchanged.  The transformed
    sentences are glued back together by ``reassemble``.

    NOTE(review): uses the ``unicode`` builtin, so this is Python 2 code.
    """
    # Normalise double hyphens to an en-dash before tokenizing.
    b = textblob.TextBlob(paragraph.replace('--', u' – '))
    new_sentences = []
    for sentence in b.sentences:
        new_words = []
        lengths = []
        # Inlined to exclude 'if not PUNCTUATION_REGEX.match(unicode(t))]'
        sentence_pos_tags = [
            (textblob.Word(word, pos_tag=t), unicode(t))
            for word, t in sentence.pos_tagger.tag(sentence.raw)
        ]
        # shift_zip pairs each tagged word with its successor, so mapping
        # rules can condition on the following token.
        for (word, pos_tag), (next_word, next_pos_tag) in shift_zip(sentence_pos_tags):
            replacements = mapping.map(word, next_word)
            if replacements:
                # Multiple candidates are joined with '|' inside one Substitution.
                new_word = Substitution(word, '|'.join(replacements))
            else:
                new_word = word
            new_words.append(new_word)
            # Column widths for the (commented-out) aligned debug printout below.
            lengths.append(max(len(x) for x in (word, pos_tag, new_word)))
        # TODO: remove whitespace around punctuation.
        #print ' '.join('%*s' % (l, x) for l, (x, t) in zip(lengths, sentence.pos_tags))
        #print ' '.join('%*s' % (l, t) for l, (x, t) in zip(lengths, sentence.pos_tags))
        #print ' '.join('%*s' % (l, y) for l, y in zip(lengths, new_words))
        #print
        new_sentences.append(new_words)
    return reassemble(new_sentences)
def map(self, original_word, successor):
    """Return a set of candidate replacement Words for *original_word*.

    Plural nouns (NNS) are singularized before rule matching and the chosen
    replacement is re-pluralized on the way out.  The first matching rule
    wins; an empty set means "no substitution".  *successor* is currently
    unused (see the commented-out if_followed_by check).
    """
    if original_word.pos_tag == PosTag.NNS:
        # Match rules against the singular form; re-pluralize at the end.
        word = original_word.singularize()
        word.pos_tag = PosTag.NN
    else:
        word = original_word
    for rule in self.rules:
        if word.pos_tag in rule.pos and rule.matches(word):
            replacement = textblob.Word(rule.apply(word), pos_tag=word.pos_tag)
            # Hard-coded blacklist of one known-bad rule production.
            if replacement == 'hress':
                continue
            #if replacement.definitions == []:
            #    continue
            #if rule.if_followed_by is None or (
            #    successor is not None and
            #    successor.pos_tag in rule.if_followed_by)
            if original_word.pos_tag == PosTag.NNS:
                return { replacement.pluralize() }
            else:
                return { replacement }
    return set()
def is_body_part(word):
    """Return True if any WordNet sense of *word* traverses to 'body_part'.

    The original checked only the first synset, which misses words whose
    body-part sense is not the most common one; the sibling
    ``is_motor_vehicle`` already checks every sense, so this is made
    consistent with it.  (Stale copy-paste debug comment removed.)
    """
    synsets = textblob.Word(word).synsets
    if not synsets:
        return False
    return any(traverse(ss, "body_part") for ss in synsets)
def lemmatize(word):
    """Lemmatize *word*, memoizing results in the class-level cache.

    EAFP: a single dict lookup on the hot (cached) path instead of the
    original `in`-test followed by a second lookup.
    """
    try:
        return ShannonEntropy.lemmas[word]
    except KeyError:
        lemma = textblob.Word(word).lemmatize()
        ShannonEntropy.lemmas[word] = lemma
        return lemma
def iter_filth( self, text, document_name: Optional[str] = None ) -> Generator[Filth, None, None]: """Yields discovered filth in the provided ``text``. :param text: The dirty text to clean. :type text: str :param document_name: The name of the document to clean. :type document_name: str, optional :return: An iterator to the discovered :class:`Filth` :rtype: Iterator[:class:`Filth`] """ # find 'skype' in the text using a customized tokenizer. this makes # sure that all valid skype usernames are kept as tokens and not split # into different words tokenizer = nltk.tokenize.regexp.RegexpTokenizer(self.SKYPE_TOKEN) blob = textblob.TextBlob(text, tokenizer=tokenizer) skype_indices, tokens = [], [] for i, token in enumerate(blob.tokens): tokens.append(token) if 'skype' in token.lower(): skype_indices.append(i) # go through the words before and after skype words to identify # potential skype usernames. skype_usernames = [] for i in skype_indices: jmin = max(i - self.word_radius, 0) jmax = min(i + self.word_radius + 1, len(tokens)) for j in list(range(jmin, i)) + list(range(i + 1, jmax)): token = tokens[j] if self.SKYPE_USERNAME.match(token): # this token is a valid skype username. Most skype # usernames appear to be misspelled words. Word.spellcheck # does not handle the situation of an all caps word very # well, so we cast these to all lower case before checking # whether the word is misspelled if token.isupper(): token = token.lower() word = textblob.Word(token) suggestions = word.spellcheck() corrected_word, score = suggestions[0] if score < 0.5: skype_usernames.append(token) # replace all skype usernames if skype_usernames: self.regex = re.compile('|'.join(skype_usernames)) yield from super(SkypeDetector, self).iter_filth(text, document_name=document_name) return
def is_motor_vehicle(word):
    """Return True if any WordNet sense of *word* traverses to 'motor_vehicle'."""
    synsets = textblob.Word(word).synsets
    if not synsets:
        return False
    return any(traverse(ss, "motor_vehicle") for ss in synsets)
def run(self, message):
    """Speak up to four dictionary definitions of *message*.

    The original duplicated the whole lookup-and-say body inside the
    ``except RuntimeError`` handler to retry once; that duplication is
    factored into a local helper.  Behavior is unchanged: one retry on
    RuntimeError, and a second failure propagates.
    """
    def _define_and_say():
        # One lookup + speech attempt.
        defs = textblob.Word(message).definitions
        if len(defs) > 0:
            s = ""
            for item in defs[0:4]:
                s += item.capitalize() + ".\n"
            self.manager.say(s)
        else:
            self.manager.say("No result found.")

    try:
        _define_and_say()
    except RuntimeError:
        # Retry once (e.g. transient corpus-loading failure).
        _define_and_say()
def patriarchy(word_ending_ess):
    """Find shorter, related person-nouns for a feminine '-ess' word.

    For each noun.person synset of *word_ending_ess*, walk its hypernyms
    and hyponyms and collect lemma names that (a) differ from the input,
    (b) are strictly shorter, and (c) share a common prefix of at least 3
    characters with it.  Each result tuple also records the synset's
    lexname and whether its definition mentions 'woman'/'girl'/'female'.
    """
    w = textblob.Word(word_ending_ess)
    return {
        (lemma_name,
         ess_synset.lexname(),
         # Does the gloss explicitly mark the sense as female?
         any(x in ess_synset.definition() for x in ('woman', 'girl', 'female')))
        for ess_synset in w.synsets
        if ess_synset.lexname() == u'noun.person'
        for ess_hyp_nym in ess_synset.hypernyms() + ess_synset.hyponyms()
        for lemma_name in ess_hyp_nym.lemma_names()
        if lemma_name != word_ending_ess
        if len(lemma_name) < len(word_ending_ess)
        # takewhile counts the length of the common prefix of the two words.
        if len(
            list(
                itertools.takewhile(lambda t: t[0] == t[1],
                                    zip(word_ending_ess, lemma_name)))) >= 3
    }
def annotate(sent):
    """Yield a WordNode(raw, pos, root) for every token tagged in *sent*.

    Nouns and verbs get a lemmatized root; symbols and non-word tokens are
    re-tagged as '.'.
    """
    global TAGGER
    for raw, pos in TAGGER.tag(sent):
        kind = pos[0].lower()
        word = textblob.Word(raw.lower())
        root = str(word)
        if is_not_word(raw[0]) or (pos == "SYM"):
            pos = "."
        elif kind in ("n", "v"):
            root = word.lemmatize(kind)
        yield WordNode(raw=raw, pos=pos, root=root)
def test_spellcheck_special_cases(self):
    """Punctuation, numbers, and one-letter words spellcheck to themselves."""
    # Same cases as before: punctuation, integers, decimals, single letters.
    for text in ("!", "42", "12.34", "I", "A", "a"):
        assert_equal(tb.Word(text).spellcheck(), [(text, 1.0)])
def iter_filth(self, text):
    """Yield filth for skype usernames found near the word 'skype'.

    Fix: ``range(jmin, i) + range(i+1, jmax)`` is a TypeError on Python 3
    (range objects do not concatenate); wrap each in ``list(...)`` as the
    newer version of this detector already does.
    """
    # find 'skype' in the text using a customized tokenizer. this makes
    # sure that all valid skype usernames are kept as tokens and not split
    # into different words
    tokenizer = nltk.tokenize.regexp.RegexpTokenizer(
        self.filth_cls.SKYPE_TOKEN
    )
    blob = textblob.TextBlob(text, tokenizer=tokenizer)
    skype_indices, tokens = [], []
    for i, token in enumerate(blob.tokens):
        tokens.append(token)
        if 'skype' in token.lower():
            skype_indices.append(i)

    # go through the words before and after skype words to identify
    # potential skype usernames.
    skype_usernames = []
    for i in skype_indices:
        jmin = max(i - self.word_radius, 0)
        jmax = min(i + self.word_radius + 1, len(tokens))
        # list(...) + list(...): works on both Python 2 and 3.
        for j in list(range(jmin, i)) + list(range(i + 1, jmax)):
            token = tokens[j]
            if self.filth_cls.SKYPE_USERNAME.match(token):
                # this token is a valid skype username. Most skype
                # usernames appear to be misspelled words. Word.spellcheck
                # does not handle the situation of an all caps word very
                # well, so we cast these to all lower case before checking
                # whether the word is misspelled
                if token.isupper():
                    token = token.lower()
                word = textblob.Word(token)
                suggestions = word.spellcheck()
                corrected_word, score = suggestions[0]
                if score < 0.5:
                    skype_usernames.append(token)

    # replace all skype usernames
    if skype_usernames:
        self.filth_cls.regex = re.compile('|'.join(skype_usernames))
    else:
        self.filth_cls.regex = None
    return super(SkypeDetector, self).iter_filth(text)
def remove_noise(text, stop_words=()):
    """Lemmatize *text* and drop mentions/URLs, punctuation, and stop words.

    :param text: a TextBlob (anything with ``pos_tags``).
    :param stop_words: lowercase words to exclude from the result.
    :return: a new, lowercased, cleaned ``tb.TextBlob``.

    Improvements over the original: the noise regex is compiled once
    outside the loop, and the result string is assembled with a list +
    ``join`` instead of quadratic ``+=`` concatenation.
    """
    # Strips @mentions, URLs, and any char that is not alphanumeric/space/tab.
    noise = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")
    pieces = []
    for token, tag in text.pos_tags:
        token = noise.sub(" ", token)
        # Map Penn Treebank tags onto WordNet POS letters for lemmatization.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = tb.Word(token).lemmatize(pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            pieces.append(token.lower().strip())
    return tb.TextBlob(' '.join(pieces).strip())
def lookup_word(term):
    """Coerce *term* to a ``textblob.Word``, passing existing Words through."""
    if isinstance(term, textblob.Word):
        return term
    return textblob.Word(term)
def test_translate(self, mock_translate):
    """translate(to=...) returns the (mocked) translated text."""
    mock_translate.return_value = 'gato'
    translated = tb.Word("cat").translate(to="es")
    assert_equal(translated, "gato")
def test_init(self):
    """Word construction works with and without a POS tag."""
    tb.Word("cat")  # bare construction must not raise
    assert_true(isinstance(self.cat, tb.Word))
    tagged = tb.Word('cat', 'NN')
    assert_equal(tagged.pos_tag, 'NN')
def test_lemma(self):
    """The first lemma of 'eat' matches the canonical WordNet Lemma."""
    expected = wn.Lemma('eat.v.01.eat')
    assert_equal(tb.Word("eat").synsets[0].lemmas()[0], expected)
def test_translate_without_from_lang(self, mock_translate):
    """translate() with no source language still returns the mocked result."""
    mock_translate.return_value = 'hi'
    result = tb.Word('hola').translate()
    assert_equal(result, 'hi')
def test_define(self):
    """define() yields one definition per synset for the given POS."""
    word = tb.Word("hack")
    assert_equal(len(word.get_synsets(wn.NOUN)), len(word.define(wn.NOUN)))
def test_synset(self):
    """The first synset of 'dog' is the canonical dog.n.01."""
    expected = wn.Synset("dog.n.01")
    assert_equal(tb.Word("dog").synsets[0], expected)
def test_synsets_with_pos_argument(self):
    """get_synsets(pos=...) returns only synsets of the requested POS."""
    word = tb.Word("work")
    for synset in word.get_synsets(pos=wn.NOUN):
        assert_equal(synset.pos(), wn.NOUN)
def test_definitions(self):
    """Every entry of Word.definitions is a string."""
    word = tb.Word("octopus")
    for definition in word.definitions:
        print(type(definition))
        assert_true(isinstance(definition, basestring))
def test_synsets(self):
    """Word.synsets is a sequence of Synset objects."""
    synsets = tb.Word("car").synsets
    assert_true(isinstance(synsets, (list, tuple)))
    assert_true(isinstance(synsets[0], Synset))
def test_detect_language(self, mock_detect):
    """detect_language() returns the (mocked) detected language code."""
    mock_detect.return_value = 'fr'
    detected = tb.Word("bonjour").detect_language()
    assert_equal(detected, 'fr')
def test_lemma(self):
    """The lemma property uses the stored POS tag when one is given."""
    assert_equal(tb.Word("wolves").lemma, "wolf")
    assert_equal(tb.Word("went", "VBD").lemma, "go")
def test_spellcheck(self):
    """spellcheck() ranks the correct spelling first."""
    top_suggestion = tb.Word("speling").spellcheck()[0]
    assert_equal(top_suggestion[0], "spelling")
def test_correct(self):
    """correct() returns the fixed spelling as a Word instance."""
    corrected = tb.Word('speling').correct()
    assert_true(isinstance(corrected, tb.Word))
    assert_equal(corrected, tb.Word('spelling'))