def featurize_tweet(t):
    """Convert a raw tweet string into a list of unique normalized token features.

    Lowercases and tokenizes the tweet, normalizes each token with
    ``norm_token``, drops tokens that normalize to a falsy value, and
    de-duplicates the remainder.

    Args:
        t: full text of a tweet.

    Returns:
        List of unique normalized tokens, in first-seen order.
    """
    tokens = tokenize(t.lower())
    normalized = [norm_token(tok) for tok in tokens]
    # Drop tokens whose normalization is empty/falsy.
    normalized = [tok for tok in normalized if tok]
    # De-duplicate while preserving first-seen order. The previous
    # list(set(...)) gave a different feature order on every run under
    # string-hash randomization, hurting reproducibility.
    seen = set()
    feats = []
    for tok in normalized:
        if tok not in seen:
            seen.add(tok)
            feats.append(tok)
    return feats
def normalize_tweet(tweet):
    """Break a tweet into normalized tokens.

    Lowercases and tokenizes the tweet, normalizes each token with
    ``norm_token``, and drops tokens that normalize to a falsy value.

    NOTE(review): an earlier docstring claimed that _NEG / _NEGFIRST tags
    are appended to tokens in negated context — no such negation handling
    exists in this function; only tokenization and per-token
    normalization are performed.

    Args:
        tweet: full text of a tweet.

    Returns:
        List of normalized (non-empty) tokens.
    """
    tokens = [norm_token(tok) for tok in tokenize(tweet.lower())]
    return [tok for tok in tokens if tok]
def apply(self, input):
    """Tokenize *input* and strip configured token classes.

    Lowercases the text, tokenizes it, then — depending on which flags
    are set on this instance — removes stop tokens, @username mentions,
    URL-shaped tokens, and punctuation-only tokens, in that order.

    Args:
        input: raw text to tokenize and filter.

    Returns:
        List of surviving tokens.
    """
    tokens = tokenize(input.lower())
    # Build the active filter pipeline from the instance's flags, then
    # apply each predicate in sequence (same order as the flag checks).
    predicates = []
    if self.stop_tokens:
        predicates.append(lambda tok: tok not in self.stop_token_list)
    if self.usernames:
        predicates.append(lambda tok: not tok.startswith('@'))
    if self.urls:
        predicates.append(lambda tok: re.match(self.url_regex, tok) is None)
    if self.punctuation:
        # translate() with the punctuation map empties punctuation-only
        # tokens; empty strings are falsy and therefore filtered out.
        predicates.append(lambda tok: bool(tok.translate(self.punctuation_map)))
    for keep in predicates:
        tokens = [tok for tok in tokens if keep(tok)]
    return tokens
def classify(self, tweet, results=None):
    """Rank known hashtags by probability of matching *tweet*.

    Tokenizes the tweet, discards tokens that are themselves hashtags,
    lowercases the remainder, and scores every 'seen' hashtag with the
    internal probability model.

    Args:
        tweet: full text of a tweet.
        results: if None, (hashtag, probability) pairs for ALL hashtags
            are returned; if an int, only that many of the
            highest-probability pairs are returned.

    Returns:
        List of (hashtag, probability) tuples, highest probability first.

    NOTE(review): this code targets Python 2 (`unicode`, `iteritems`) —
    kept as-is for consistency with the surrounding module.
    """
    # Twokenize the tweet.
    tweet_tokens = tokenize(tweet)
    # Filter out any existing hashtags; startswith() is safe on an empty
    # token, whereas the previous token[0] check raised IndexError.
    tokens = set()
    for token in tweet_tokens:
        if not token.startswith('#'):
            tokens.add(unicode(token).lower())
    probs = {}
    for hashtag in self._hashtags():
        probs[hashtag] = self._prob(tokens, hashtag)
    ranked = sorted(probs.iteritems(), key=lambda pair: pair[1], reverse=True)
    # A None slice bound means "no limit", so results=None returns all.
    return ranked[:results]