import util  # TweetMotif's util module (provides unicodify); assumed to be importable here

def type_clean(val, type):
    # Coerce val to the requested type; booleans accept several common spellings.
    if type == bool:
        if val in (False, 0, '0', 'f', 'false', 'False', 'no', 'n'):
            return False
        if val in (True, 1, '1', 't', 'true', 'True', 'yes', 'y'):
            return True
        raise Exception("bad bool value %s" % repr(val))
    if type == str or type == unicode:
        # nope no strings, you're gonna get unicode instead!
        return util.unicodify(val)
    return type(val)
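# A minimal usage sketch (not part of the original module); _demo_type_clean is
# a hypothetical helper added only to illustrate the conversions above.
def _demo_type_clean():
    assert type_clean('yes', bool) is True
    assert type_clean('false', bool) is False
    assert type_clean(1, bool) is True
    assert type_clean('3', int) == 3
    assert type_clean('2.5', float) == 2.5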
def truncate_at(s, max=40):
    # Truncate s to at most `max` characters, marking the cut with an ellipsis.
    s = util.unicodify(s)
    if len(s) > max:
        s = s[:max] + u"\u2026"  # "…"; the u-literal avoids mixing bytes and unicode under Python 2
    return s
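# Illustration only (not in the original source): truncate_at leaves short
# strings alone and cuts long ones to `max` characters plus one ellipsis char.
def _demo_truncate_at():
    assert truncate_at("short") == u"short"
    assert len(truncate_at("x" * 100, max=10)) == 11  # 10 kept chars + the ellipsis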
def output(s):
    # Print s, normalized to unicode.
    print util.unicodify(s)
# Read tab-separated lines on stdin, tokenize the last column with TweetMotif,
# and replace that column with a column of unigram/bigram features.
import sys
sys.path.insert(0, '/usr2/corpora/tweets/tweetmotif')
import twokenize, util, bigrams
util.fix_stdio()
from sane_re import *

# Re-attach contraction suffixes the tokenizer splits off, e.g. "' s" -> "'s".
AposFix = _R(r"( |^)(' [stm])( |$)")

for line in sys.stdin:
    parts = util.unicodify(line[:-1]).split("\t")
    text = parts[-1]
    toks = twokenize.simple_tokenize(text)
    toked = " ".join(toks)
    #print "\t".join(parts[:-1]) + "\t" + toked
    #try: AposFix.show_match(toked)
    #except: pass
    featstr = AposFix.gsub(toked, lambda m: m[1] + m[2].replace(" ", "") + m[3])
    featstr = featstr.lower()
    toks = featstr.split()
    feats = [ug[0] for ug in bigrams.filtered_unigrams(toks)]
    feats += ["_".join(ng) for ng in bigrams.filtered_bigrams(toks)]
    print "\t".join(parts[:-1]) + "\t" + util.unicodify(" ".join(feats))
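# A standalone sketch (not one of the original scripts) showing what the AposFix
# rule above does: it glues sequences like "' s" in the tokenized string back
# into "'s".  It assumes the same TweetMotif checkout path as the script above.
import sys
sys.path.insert(0, '/usr2/corpora/tweets/tweetmotif')
from sane_re import *

AposFix = _R(r"( |^)(' [stm])( |$)")
fixed = AposFix.gsub("that ' s what i ' m saying",
                     lambda m: m[1] + m[2].replace(" ", "") + m[3])
print fixed  # expected: "that 's what i 'm saying"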
from __future__ import print_function
import json
import tweepy
import datetime  # used for time stamping program start/stop
import traceback
from util import unicodify

# Attributes I don't want
UNWANTED_ATTR = [
    "contributors", "current_user_retweet", "favorited", "geo", "id",
    "in_reply_to_status_id", "lang", "quoted_status_id", "retweeted",
    "source", "in_reply_to_user_id", "entities", "extended_entities",
    "in_reply_to_screen_name"
]
UNWANTED_ATTR = unicodify(UNWANTED_ATTR)

# Nested attributes that I want
WANTED_NESTED_ATTR = {
    "user": [
        "followers_count", "friends_count", "geo_enabled", "id_str",
        "location", "protected", "time_zone", "statuses_count", "created_at"
    ],
    "extended_tweet": ["full_text"]
}
WANTED_NESTED_ATTR = unicodify(WANTED_NESTED_ATTR)


def trim_tweet(tweet_json):
    """Trims and returns a tweet (JSON object)."""
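# The body of trim_tweet is cut off above.  The sketch below is a hypothetical
# reconstruction (not the original code) of how the UNWANTED_ATTR and
# WANTED_NESTED_ATTR tables could drive the trimming.
def _trim_tweet_sketch(tweet_json):
    trimmed = {}
    for key, value in tweet_json.items():
        if key in UNWANTED_ATTR:
            continue  # drop attributes we never want
        if key in WANTED_NESTED_ATTR and isinstance(value, dict):
            # keep only the whitelisted sub-fields of nested objects
            wanted = WANTED_NESTED_ATTR[key]
            trimmed[key] = {k: v for k, v in value.items() if k in wanted}
        else:
            trimmed[key] = value
    return unicodify(trimmed)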