Example #1
0
 def UnitTest(cls, norm=None):
     # Does it make me disturbed that these are the first sentences that came
     # to mind? Somewhat troubling...
     strings = [
         "Hi there. Gonna getcha. I've decided you'll die tonight.",
         "r u scared yet? B/c Ill rip our ur guts.",
         "Whatcha up2? We're gonna go on a killin' /spree/.",
         "Holy crap dood.",
         "Are you going out?",
         "#Hi @I love/hate $0 http://yfrog.com/a3ss0sa *always* don't /you/....",
         "r u going out?",
     ]
     if not norm:
         norm = TokenNormalizer()
     tokens = []
     norm_tokens = []
     for s in strings:
         t = word_tokenize(s)
         tokens.append(t)
         print s
     print ""
     for t in tokens:
         nt = norm.normalize_tokens(t)
         norm_tokens.append(nt)
         print nt
     print ""
     for nt in norm_tokens:
         norm._count_tokens(nt)
     denorm_tokens = []
     for nt in norm_tokens:
         dt = norm.denormalize_tokens(nt)
         denorm_tokens.append(dt)
         print dt
     for dt in denorm_tokens:
         print word_detokenize(dt)
Example #2
0
 def say_something(self, tagged_tokens=None):
     """Produce an utterance from tagged tokens.

     tagged_tokens -- sequence of (token, tag) pairs; when falsy (None
                      OR empty), a fresh phrase is generated via
                      self._nm_hmm_phrase().

     Returns a 3-tuple (something, toks, tagged_tokens) where
     `something` is the detokenized surface string and `toks` is the
     bare (possibly grammar-hacked) token list.
     """
     # NOTE: `not tagged_tokens` intentionally treats an empty list the
     # same as None and regenerates a phrase.
     if not tagged_tokens:
         tagged_tokens = self._nm_hmm_phrase()
     toks = [t[0] for t in tagged_tokens]
     self.hack_grammar(toks)
     # Denormalize only when a normalizer is configured; either way the
     # surface string comes from a single word_detokenize() call.
     # (Original code duplicated the detokenize call in both branches.)
     if self.normalizer:
         tokens = self.normalizer.denormalize_tokens(toks)
     else:
         tokens = toks
     something = word_detokenize(tokens)
     return (something, toks, tagged_tokens)
Example #3
0
def pos_tag(tokens, try_agfl=True, reject_agfl_fails=True,
            nltk_fallback=True):
  """Part-of-speech tag `tokens`, preferring the AGFL parser over nltk.

  When AGFL is available (and try_agfl is set) the tokens are re-joined
  into text, split into sentences, and each sentence is run through
  AGFL, with nltk.pos_tag used to pre-fix and (optionally) repair the
  AGFL output. Otherwise falls straight back to nltk.pos_tag(tokens).

  tokens            -- list of word tokens to tag.
  try_agfl          -- attempt AGFL parsing when agfl.agfl_ok().
  reject_agfl_fails -- if True, return None when AGFL fails to parse a
                       sentence; if False, use nltk tags for it instead.
  nltk_fallback     -- patch holes in AGFL tags from the nltk tags.

  Returns a list of (token, tag) pairs, or None when a sentence cannot
  be tagged and rejection is enabled.
  """
  if try_agfl and agfl.agfl_ok():
    # Round-trip through text so nltk can find sentence boundaries.
    detoked = word_detokenize(tokens)
    sentences = nltk.sent_tokenize(detoked)
    all_tags = []
    for s in sentences:
      stokens = word_tokenize(s)
      tweaker = AGFLTweaker()
      # prune()/agfl_fix() mutate stokens in place to suit AGFL's input
      # expectations; nltk tags are computed on the pruned tokens.
      tweaker.prune(stokens)
      nltk_tags = nltk.pos_tag(stokens)
      tweaker.agfl_fix(stokens, nltk_tags)
      s = word_detokenize(stokens)
      if not s:
        # Pruning can consume the whole sentence; skip it.
        print "Empty string for: "+str(stokens)
        continue
      #print "Parsing: |"+s+"|"
      agfl_tree = agfl.parse_sentence(s)
      # XXX: We can re-try failed '?' with '.'..
      if not agfl_tree:
        print "AGFL Parse fail for |"+s+"|"
        if not reject_agfl_fails:
          # Best-effort: fall back to the nltk tags for this sentence.
          all_tags.extend(tweaker.deprune(nltk.pos_tag(stokens)))
        else:
          return None
      else:
        tags = agfl_tree.pos_tag()
        tags = tweaker.agfl_split(tags)
        did_join = tweaker.agfl_join(tags, stokens)
        # Fill any tags AGFL left empty from the nltk tagging.
        if nltk_fallback: tweaker.agfl_repair(tags, nltk_tags)
        # NOTE(review): return value ignored here but used at the
        # extend() above — presumably deprune mutates in place; confirm.
        tweaker.deprune(tags)
        # Verify that we have labels for everything.
        # If some are still missing, drop.
        if tags:
          for t in tags:
            if not t[1]:
              print "Tag fail for: |"+s+"|"
              print str(tags)
              if did_join: print "Failed with attempted join: "+str(stokens)
              return None
          all_tags.extend(tags)
        else:
          print "Tag fail for |"+s+"|"
          return None
    return all_tags
  else:
    if try_agfl:
      print "AGFL not found/functional. Falling back to nltk.pos_tag()"
    return nltk.pos_tag(tokens)