def UnitTest(cls, norm=None): # Does it make me disturbed that these are the first sentences that came # to mind? Somewhat troubling... strings = [ "Hi there. Gonna getcha. I've decided you'll die tonight.", "r u scared yet? B/c Ill rip our ur guts.", "Whatcha up2? We're gonna go on a killin' /spree/.", "Holy crap dood.", "Are you going out?", "#Hi @I love/hate $0 http://yfrog.com/a3ss0sa *always* don't /you/....", "r u going out?", ] if not norm: norm = TokenNormalizer() tokens = [] norm_tokens = [] for s in strings: t = word_tokenize(s) tokens.append(t) print s print "" for t in tokens: nt = norm.normalize_tokens(t) norm_tokens.append(nt) print nt print "" for nt in norm_tokens: norm._count_tokens(nt) denorm_tokens = [] for nt in norm_tokens: dt = norm.denormalize_tokens(nt) denorm_tokens.append(dt) print dt for dt in denorm_tokens: print word_detokenize(dt)
def say_something(self, tagged_tokens=None):
    """Turn tagged tokens into a spoken sentence.

    Falls back to self._nm_hmm_phrase() when no tagged_tokens are
    supplied. Strips the tags off, runs hack_grammar() on the bare
    tokens in place, then detokenizes -- denormalizing first when a
    normalizer is attached.

    Returns a (sentence, tokens, tagged_tokens) triple, where tokens
    are the (possibly grammar-hacked) bare tokens, pre-denormalization.
    """
    if not tagged_tokens:
        tagged_tokens = self._nm_hmm_phrase()
    toks = [tagged[0] for tagged in tagged_tokens]
    self.hack_grammar(toks)
    if self.normalizer:
        # Denormalize only for the rendered string; the returned token
        # list stays in normalized form.
        denormed = self.normalizer.denormalize_tokens(toks)
        something = word_detokenize(denormed)
    else:
        something = word_detokenize(toks)
    return (something, toks, tagged_tokens)
def pos_tag(tokens, try_agfl=True, reject_agfl_fails=True, nltk_fallback=True):
    """POS-tag a token list, preferring the AGFL parser over nltk.pos_tag.

    tokens            -- flat list of word tokens (may span sentences).
    try_agfl          -- attempt AGFL first; if False (or AGFL is not
                         functional) fall straight back to nltk.pos_tag.
    reject_agfl_fails -- if True, return None when AGFL fails to parse a
                         sentence; if False, use nltk tags for that
                         sentence instead.
    nltk_fallback     -- if True, patch holes in the AGFL tags with the
                         nltk tags via agfl_repair().

    Returns a list of (token, tag) pairs, or None when tagging is
    rejected (parse failure or untagged token).
    """
    if try_agfl and agfl.agfl_ok():
        # AGFL works sentence-by-sentence: rebuild the text, then split
        # it into sentences with nltk.
        detoked = word_detokenize(tokens)
        sentences = nltk.sent_tokenize(detoked)
        all_tags = []
        for s in sentences:
            stokens = word_tokenize(s)
            tweaker = AGFLTweaker()
            # prune()/agfl_fix() mutate stokens in place to make the
            # sentence digestible by AGFL; deprune() later restores what
            # was pruned. nltk tags are computed on the pruned tokens so
            # they line up for repair below.
            tweaker.prune(stokens)
            nltk_tags = nltk.pos_tag(stokens)
            tweaker.agfl_fix(stokens, nltk_tags)
            s = word_detokenize(stokens)
            if not s:
                # Pruning can consume the whole sentence; skip it.
                print "Empty string for: "+str(stokens)
                continue
            #print "Parsing: |"+s+"|"
            agfl_tree = agfl.parse_sentence(s)
            # XXX: We can re-try failed '?' with '.'..
            if not agfl_tree:
                print "AGFL Parse fail for |"+s+"|"
                if not reject_agfl_fails:
                    # Best-effort mode: substitute nltk's tags for this
                    # sentence (depruned back to the original tokens).
                    all_tags.extend(tweaker.deprune(nltk.pos_tag(stokens)))
                else:
                    return None
            else:
                tags = agfl_tree.pos_tag()
                tags = tweaker.agfl_split(tags)
                did_join = tweaker.agfl_join(tags, stokens)
                if nltk_fallback:
                    # Fill tags AGFL left empty using the nltk tags.
                    tweaker.agfl_repair(tags, nltk_tags)
                # NOTE(review): return value ignored here, unlike the
                # deprune() call above -- presumably deprune also mutates
                # tags in place; confirm against AGFLTweaker.
                tweaker.deprune(tags)
                # Verify that we have labels for everything.
                # If some are still missing, drop.
                if tags:
                    for t in tags:
                        if not t[1]:
                            print "Tag fail for: |"+s+"|"
                            print str(tags)
                            if did_join:
                                print "Failed with attempted join: "+str(stokens)
                            return None
                    all_tags.extend(tags)
                else:
                    print "Tag fail for |"+s+"|"
                    return None
        return all_tags
    else:
        if try_agfl:
            print "AGFL not found/functional. Falling back to nltk.pos_tag()"
        return nltk.pos_tag(tokens)