コード例 #1
0
def featurize_tweet(t):
    """Return the distinct normalized tokens found in the tweet text ``t``.

    The text is lower-cased, split with ``tokenize``, and every token is
    passed through ``norm_token``.  Tokens whose normalized form is falsy
    are discarded, and duplicates are collapsed through a set.

    The result is a list built from that set, so its ordering follows set
    iteration and should not be relied upon.
    """
    unique_tokens = set()
    for raw_token in tokenize(t.lower()):
        normalized = norm_token(raw_token)
        # norm_token may yield a falsy value; such tokens are not features.
        if normalized:
            unique_tokens.add(normalized)

    return list(unique_tokens)
コード例 #2
0
def normalize_tweet(tweet):
    '''
    Split a tweet into normalized tokens.

    The tweet is lower-cased, tokenized with ``tokenize``, and each token
    is run through ``norm_token``; tokens whose normalized form is falsy
    are dropped.  Returns the remaining normalized tokens as a list, in
    the order they were produced (duplicates are kept).

    NOTE(review): earlier documentation for this function mentioned
    appending _NEG / _NEGFIRST tags for negated context.  No such tagging
    is visible in this function -- confirm whether norm_token or a caller
    is responsible for it.
    '''
    normalized_tokens = []
    for raw_token in tokenize(tweet.lower()):
        token = norm_token(raw_token)
        if token:
            normalized_tokens.append(token)
    return normalized_tokens
コード例 #3
0
ファイル: tokeniser.py プロジェクト: alabarga/3yp
    def apply(self, input):
        """Tokenize ``input`` and drop the token classes this instance filters.

        The text is lower-cased and split with ``tokenize``.  Each token is
        then checked against the filters that are switched on, in this order:

        * ``self.stop_tokens``: drop tokens contained in
          ``self.stop_token_list``.
        * ``self.usernames``: drop tokens starting with ``'@'``.
        * ``self.urls``: drop tokens that ``self.url_regex`` matches at
          their start (``re.match``).
        * ``self.punctuation``: drop tokens that become empty after
          ``translate(self.punctuation_map)`` -- presumably tokens made up
          only of punctuation; depends on how the map is built elsewhere.

        Returns the surviving tokens as a list, in their original order.
        """
        drop_stop_tokens = self.stop_tokens
        drop_usernames = self.usernames
        drop_urls = self.urls
        drop_punctuation = self.punctuation

        kept = []
        for token in tokenize(input.lower()):
            if drop_stop_tokens and token in self.stop_token_list:
                continue
            if drop_usernames and token.startswith('@'):
                continue
            if drop_urls and re.match(self.url_regex, token):
                continue
            # The translated value is only used as a truth test; the token
            # itself is kept untranslated.
            if drop_punctuation and not token.translate(self.punctuation_map):
                continue
            kept.append(token)

        return kept
コード例 #4
0
ファイル: classifier.py プロジェクト: alabarga/3yp
    def classify(self, tweet, results=None):
        """
        Score the text of a tweet against every hashtag the model knows.

        ``tweet`` is the full tweet text as a string.  It is tokenized with
        ``tokenize``; tokens that begin with '#' (hashtags already present in
        the tweet) are ignored, and the remaining tokens are converted to
        unicode, lower-cased and de-duplicated.  For every hashtag yielded by
        ``self._hashtags()`` the score ``self._prob(tokens, hashtag)`` is
        computed.

        Returns a list of ``(hashtag, score)`` pairs sorted by score, highest
        first.  If ``results`` is None, ALL hashtags are returned; if it is
        an int, only that many top-scoring pairs are returned.
        """
        # Distinct, lower-cased tokens, skipping hashtags already in the tweet.
        tokens = set(
            unicode(word).lower()
            for word in tokenize(tweet)
            if word[0] != '#'
        )

        # Keyed by hashtag, so a hashtag yielded twice is only scored once
        # in the output.
        scores = {}
        for tag in self._hashtags():
            scores[tag] = self._prob(tokens, tag)

        ranked = sorted(scores.iteritems(), key=lambda pair: pair[1], reverse=True)
        # Slicing with None keeps the whole list.
        return ranked[:results]