Python TokenNormalizer Examples

Programming Language: Python

Namespace/Package Name: libs.SpeechModels

Class/Type: TokenNormalizer

Examples at hotexamples.com: 2

Python TokenNormalizer - 2 examples found. These are the top rated real world Python examples of libs.SpeechModels.TokenNormalizer extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

normalize_tokens(2)

Example #1

Show file

File: extract.py Project: tweetr/Resurrectron

  def __init__(self, directory):
    self.normalizer = TokenNormalizer()
    self.quote_engine_only = config.getboolean('soul', 'quote_engine_only')
    # FIXME: http://www.w3schools.com/HTML/html_entities.asp
    clean_ents = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")]
    tagged_tweets = []
    self.vocab = set([])
    for root, dirs, files in os.walk(directory):
      for f in files:
        # .jtwt: json-encoded twitter tweets, 1 per line
        # TODO: Add @msgs to this user as hidden text
        if f.endswith(".jtwt"):
          fl = open(root+"/"+f, "r")
          for jtweet in fl.readlines():
            tweet = json.loads(jtweet)
            txt = tweet['text'].encode('ascii', 'ignore')
            if re.search("( |^)RT(:| )", txt, re.IGNORECASE): continue
            if txt[0] == '@': txt = re.sub('^@[\S]+ ', '', txt)
            for e in clean_ents:
              txt = re.sub(e[0], e[1], txt)
            if self.quote_engine_only:
              tagged_tweets.append(txt)
            else:
              tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
              self.vocab.update(tokens)
              tagged_tweet = pos_tag(tokens,
                               config.getboolean("soul","attempt_agfl"),
                               config.getboolean("soul","reject_agfl_failures"),
                               config.getboolean("soul","agfl_nltk_fallback"))
              if tagged_tweet: tagged_tweets.append(tagged_tweet)
            print "Loaded tweet #"+str(len(tagged_tweets)) #+"/"+str(len(files))
        # .twt: plain-text tweets, 1 per line
        elif f.endswith(".twt"):
          fl = open(root+"/"+f, "r")
          for tweet in fl.readlines():
            txt = tweet.encode('ascii', 'ignore')
            if txt.startswith('RT'): continue
            if txt[0] == '@': txt = re.sub('^@[\S]+ ', '', txt)
            for e in clean_ents:
              txt = re.sub(e[0], e[1], txt)
            if self.quote_engine_only:
              tagged_tweets.append(txt)
            else:
              tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
              self.vocab.update(tokens)
              tagged_tweet = pos_tag(tokens,
                               config.getboolean("soul","attempt_agfl"),
                               config.getboolean("soul","reject_agfl_failures"),
                               config.getboolean("soul","agfl_nltk_fallback"))
              if tagged_tweet: tagged_tweets.append(tagged_tweet)
            print "Loaded tweet #"+str(len(tagged_tweets)) #+"/"+str(len(files))
          pass
        # .post: long-winded material (blog/mailinglist posts, essays, articles, etc)
        elif f.endswith(".post"):
          fl = open(root+"/"+f, "r")
          post = fl.read()
          tweets = self.post_to_tweets(post)
          for txt in tweets:
            #txt = txt.encode('ascii', 'ignore')
            for e in clean_ents:
              txt = re.sub(e[0], e[1], txt)
            if self.quote_engine_only:
              tagged_tweets.append(txt)
            else:
              tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
              if tokens:
                self.vocab.update(tokens)
                tagged_tweet = pos_tag(tokens,
                               config.getboolean("soul","attempt_agfl"),
                               config.getboolean("soul","reject_agfl_failures"),
                               config.getboolean("soul","agfl_nltk_fallback"))
                if tagged_tweet: tagged_tweets.append(tagged_tweet)
            print "Loaded post-tweet #"+str(len(tagged_tweets))
        # .irclog: irc log files. irssi format.
        elif f.endswith(".irclog"):
          pass
        # .4sq: foursquare data
        elif f.endswith(".4sq"):
          pass

    self.tagged_tweets = tagged_tweets

Example #2

Show file

File: extract.py Project: tdflatline/Resurrectron

class CorpusSoul:
  def __init__(self, directory):
    self.normalizer = TokenNormalizer()
    self.quote_engine_only = config.getboolean('soul', 'quote_engine_only')
    # FIXME: http://www.w3schools.com/HTML/html_entities.asp
    clean_ents = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")]
    tagged_tweets = []
    tweet_texts = []
    self.vocab = set([])
    for root, dirs, files in os.walk(directory):
      for f in files:
        # .jtwt: json-encoded twitter tweets, 1 per line
        # TODO: Add @msgs to this user as hidden text
        if f.endswith(".jtwt"):
          fl = open(root+"/"+f, "r")
          for jtweet in fl.readlines():
            tweet = json.loads(jtweet)
            txt = tweet['text'].encode('ascii', 'ignore')
            if re.search("( |^)RT(:| )", txt, re.IGNORECASE): continue
            if txt[0] == '@': txt = re.sub('^@[\S]+ ', '', txt)
            for e in clean_ents:
              txt = re.sub(e[0], e[1], txt)
            if self.quote_engine_only:
              tagged_tweets.append(txt)
            else:
              tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
              if tokens:
                self.vocab.update(tokens)
                tagged_tweet = pos_tag(tokens,
                                 config.getboolean("soul","attempt_agfl"),
                                 config.getboolean("soul","reject_agfl_failures"),
                                 config.getboolean("soul","agfl_nltk_fallback"))
                if tagged_tweet:
                  tweet_texts.append(word_detokenize(tokens))
                  tagged_tweets.append(tagged_tweet)
            print "Loaded tweet #"+str(len(tagged_tweets)) #+"/"+str(len(files))
        # .twt: plain-text tweets, 1 per line
        elif f.endswith(".twt"):
          fl = open(root+"/"+f, "r")
          for tweet in fl.readlines():
            txt = tweet.encode('ascii', 'ignore')
            if txt.startswith('RT'): continue
            if txt[0] == '@': txt = re.sub('^@[\S]+ ', '', txt)
            for e in clean_ents:
              txt = re.sub(e[0], e[1], txt)
            if self.quote_engine_only:
              tagged_tweets.append(txt)
            else:
              tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
              if tokens:
                self.vocab.update(tokens)
                tagged_tweet = pos_tag(tokens,
                              config.getboolean("soul","attempt_agfl"),
                              config.getboolean("soul","reject_agfl_failures"),
                              config.getboolean("soul","agfl_nltk_fallback"))
                if tagged_tweet:
                  tweet_texts.append(word_detokenize(tokens))
                  tagged_tweets.append(tagged_tweet)
            print "Loaded tweet #"+str(len(tagged_tweets)) #+"/"+str(len(files))
          pass
        # .post: long-winded material (blog/mailinglist posts, essays, articles, etc)
        elif f.endswith(".post"):
          fl = open(root+"/"+f, "r")
          post = fl.read()
          tweets = self.post_to_tweets(post)
          for txt in tweets:
            #txt = txt.encode('ascii', 'ignore')
            for e in clean_ents:
              txt = re.sub(e[0], e[1], txt)
            if self.quote_engine_only:
              tagged_tweets.append(txt)
            else:
              tokens = self.normalizer.normalize_tokens(word_tokenize(txt))
              if tokens:
                self.vocab.update(tokens)
                tagged_tweet = pos_tag(tokens,
                               config.getboolean("soul","attempt_agfl"),
                               config.getboolean("soul","reject_agfl_failures"),
                               config.getboolean("soul","agfl_nltk_fallback"))
                if tagged_tweet:
                  tweet_texts.append(word_detokenize(tokens))
                  tagged_tweets.append(tagged_tweet)
            print "Loaded post-tweet #"+str(len(tagged_tweets))
        # .irclog: irc log files. irssi format.
        elif f.endswith(".irclog"):
          pass
        # .4sq: foursquare data
        elif f.endswith(".4sq"):
          pass

    self.tagged_tweets = tagged_tweets

    num_clusters = config.getint('soul', 'tweet_topics')
    if num_clusters > 1:
      self.cluster_tweets(tweet_texts, num_clusters)
    else:
      self.cluster_rates = {}
      self.clustered_tweets = {}
      self.clustered_tweets[0] = tagged_tweets
      self.cluster_rates[0] = len(self.tagged_tweets)

  def post_to_tweets(self, post, summarize=False):
    # We do poorly with parentheticals. Just kill them.
    post = re.sub(r"\([^\)]+\)", "", post)
    if summarize:
      summ = SimpleSummarizer()
      post = summ.summarize(post, config.getint("soul", "post_summarize_len"))
    sentences = nltk.sent_tokenize(post)
    tweets = []
    tweet = ""
    for s in sentences:
      if len(s) > config.getint("soul","post_len"): continue
      if len(tweet + s) < config.getint("soul","post_len"):
        tweet += s+" "
      else:
        if tweet: tweets.append(tweet)
        tweet = ""
    return tweets

  def cluster_tweets(self, tweet_texts, num_clusters=3):
    # XXX: move SearchableTextCollection to libs
    from resurrect import SearchableTextCollection,SearchableText

    print "Scoring tweets.."
    tc = SearchableTextCollection(self.vocab)
    for tweet in tweet_texts:
      txt = SearchableText(tweet)
      tc.add_text(txt)
    tc.update_matrix()
    print "Scored tweets.."
    print "Clustering tweets.."
    cluster = nltk.cluster.KMeansClusterer(num_clusters,
                 nltk.cluster.util.euclidean_distance,
                 repeats=20*num_clusters)
    # EM takes waaaaaayy too long, even with SVD
    #cluster = nltk.cluster.EMClusterer(means, svd_dimensions=100)
    clustered = cluster.cluster(tc.D, assign_clusters=True)
    print "Clustered tweets.."
    clustered_tweets = {}
    for i in xrange(len(clustered)):
      if clustered[i] not in clustered_tweets:
        clustered_tweets[clustered[i]] = []
      clustered_tweets[clustered[i]].append(self.tagged_tweets[i])
    self.cluster_rates = {}
    for i in clustered_tweets.iterkeys():
      self.cluster_rates[i] = len(clustered_tweets[i])
      print
      print "Cluster "+str(i)+": "+str(len(clustered_tweets[i]))
      for t in clustered_tweets[i]:
        print t
    self.clustered_tweets = clustered_tweets