def twit_tokenize(raw_twit, ticker=None, normalize=True):
    """Tokenize a tweet for downstream NLP, filtering out noise tokens.

    Drops cash tags, punctuation, stopwords, and — when *ticker* is
    given — tokens matching that ticker. Surviving tokens are passed
    through ``process_token`` (lowercased, not stemmed).

    :param raw_twit: raw tweet text
    :param ticker: optional ticker symbol whose matching tokens are removed
    :param normalize: when True, run ``normalizeTextForTagger`` first
    :returns: list of processed token strings
    """
    text = normalizeTextForTagger(raw_twit) if normalize else raw_twit

    def _keep(tok):
        # Reject any token that matches one of the noise categories.
        if token_is_cash_tag(tok) or token_is_punct(tok):
            return False
        if ticker is not None and token_matches_ticker(tok, ticker):
            return False
        return not token_is_stopword(tok)

    return [process_token(tok, lowercase=True, stem=False)
            for tok in tokenize(text) if _keep(tok)]
def get_hashtags(tweet_json):
    """Return the hashtags of a tweet as a list of strings.

    Prefers the pre-parsed ``entities`` payload (texts are lowercased).
    For old tweets lacking ``entities``, falls back to tokenizing the
    text and collecting distinct "#"-prefixed tokens — note the fallback
    does NOT lowercase, matching the original behavior.

    :param tweet_json: a tweet as a parsed JSON dict
    :returns: list of hashtag strings (fallback order is unspecified
        because it is derived from a set)
    """
    text = get_text_from_tweet_json(tweet_json)
    if 'entities' in tweet_json:
        return [entity['text'].lower()
                for entity in tweet_json['entities']['hashtags']]
    # Old tweet: scan tokens, dedupe via a set comprehension
    # (replaces the redundant `[x for x in set([...])]` double pass).
    return list({t for t in tokenize(text)
                 if t.startswith("#") and t != "#"})
def get_mentions(tweet_json, return_id=False):
    """Return the users mentioned in a tweet.

    Prefers the pre-parsed ``entities`` payload, yielding either screen
    names or numeric ids depending on *return_id*. For old tweets
    lacking ``entities``, falls back to tokenizing the text and
    collecting distinct "@"-prefixed tokens with the "@" stripped.

    :param tweet_json: a tweet as a parsed JSON dict
    :param return_id: when True, return user ids instead of screen names
    :returns: list of screen names or ids (fallback order is unspecified
        because it is derived from a set)
    """
    text = get_text_from_tweet_json(tweet_json)
    key = 'id' if return_id else 'screen_name'
    if 'entities' in tweet_json:
        return [entity[key]
                for entity in tweet_json['entities']['user_mentions']
                if key in entity]
    # Old tweet: scan tokens, dedupe via a set comprehension
    # (replaces the redundant `[x for x in set([...])]` double pass).
    # NOTE: replace("@", "") strips ALL '@' chars, not just the leading
    # one — preserved from the original implementation.
    return list({t.replace("@", "") for t in tokenize(text)
                 if t.startswith("@") and t != "@"})
# Used for experiment, not actually used in pipeline import sys,json from nlp import twokenize for line in sys.stdin: tweet = json.loads(line.split('\t')[-1]) print u' '.join(twokenize.tokenize(tweet['text'])).encode('utf8')