def make(item_map, axis, top=None):
    """
    item_map can be url/hashtag -> [(user_id, tweet_id), ...]

    make item_time_map =
        url/hashtag -> [(user_id, tweet_id, created_at datetime, axis place), ...]
    """
    min_dt, max_dt = axis
    axis_map, center_date_map = map_axis(min_dt, max_dt)
    print('Axis Map Done')

    time_table = db.execute(
        db.mk_connection(),
        'select user_id, tweet_id, created_at from tweets')
    time_map = {}
    for user_id, tweet_id, dt in time_table:
        time_map[(user_id, tweet_id)] = dt
    print('Time Map Done')

    item_time_map = {}
    key_set = item_map.keys() if top is None else sorted(
        item_map.keys(), key=lambda k: len(item_map[k]), reverse=True)[:top]
    for item in key_set:  # keys can be url/hashtag
        instances = item_map[item]
        time_instances = [(user_id, tweet_id, time_map[(user_id, tweet_id)])
                          for user_id, tweet_id in instances]
        time_instances = [(user_id, tweet_id, dt)
                          for user_id, tweet_id, dt in time_instances
                          if dt is not None]
        time_instances.sort(key=lambda t: t[2])
        item_time_map[item] = [
            (user_id, tweet_id, dt, axis_map[(dt.year, quarter(dt))])
            for user_id, tweet_id, dt in time_instances]
        print('Done Item', item)
    return item_time_map, center_date_map
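# map_axis and quarter are not defined in this file; what follows is a minimal
# sketch of what make() appears to assume: quarter() buckets a datetime into
# 1-4, and map_axis() enumerates the (year, quarter) slots between min_dt and
# max_dt, mapping each slot to an axis index and each index to a center date.
import datetime

def quarter(dt):
    # months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
    return (dt.month - 1) // 3 + 1

def map_axis(min_dt, max_dt):
    axis_map, center_date_map = {}, {}
    i = 0
    year, q = min_dt.year, quarter(min_dt)
    while (year, q) <= (max_dt.year, quarter(max_dt)):
        axis_map[(year, q)] = i
        # 15th of the quarter's middle month as a representative center date
        center_date_map[i] = datetime.datetime(year, (q - 1) * 3 + 2, 15)
        i += 1
        year, q = (year, q + 1) if q < 4 else (year + 1, 1)
    return axis_map, center_date_map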
def runner(user, sos=False):
    try:
        tweets = user_tweets[user]
        conn = db.mk_connection()
        for ts in tweets:
            try:
                tweet = db.execute(
                    conn,
                    'select tweet from connection_tweets'
                    ' where user_id = %s and tweet_id = %s' % (user, ts))[0][0]
                obj = json.loads(tweet)
                # Twitter's created_at format -> SQL datetime format
                t = datetime.datetime.strptime(
                    obj['created_at'],
                    '%a %b %d %H:%M:%S +0000 %Y').strftime('%Y-%m-%d %H:%M:%S')
                db.execute(
                    conn,
                    'update tweets set created_at = %s'
                    ' where user_id = %s and tweet_id = %s',
                    has_res=False, args=[t, user, ts])
            except Exception:
                # t may not be bound yet if the select or the parse failed,
                # so only the key is reported
                print('Error', user, ts)
                if sos:
                    raise
        conn.commit()
        conn.close()
        print('Done', user)
    except Exception:
        if sos:
            raise
        print('Error', user)
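# A quick check of the conversion above: Twitter's classic created_at string
# parses with '%a %b %d %H:%M:%S +0000 %Y' and reformats into the
# '%Y-%m-%d %H:%M:%S' shape stored in the tweets table. The timestamp below
# is a made-up sample, not one from the data set.
import datetime

raw = 'Wed Oct 10 20:19:24 +0000 2018'
parsed = datetime.datetime.strptime(raw, '%a %b %d %H:%M:%S +0000 %Y')
print(parsed.strftime('%Y-%m-%d %H:%M:%S'))  # -> 2018-10-10 20:19:24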
def runner(user, sos=False):
    conn = db.mk_connection()
    tweets = user_tweets[user]

    try:
        metadata = {}
        for tweet in tweets:
            metadata[tweet] = db.execute(
                conn,
                'select tweet from connection_tweets'
                ' where user_id = %s and tweet_id = %s' % (user, tweet))[0][0]
    except Exception:
        print(user, 'Error', '(fetch metadata)')
        if sos:
            raise
        return  # the later stages depend on complete metadata

    try:
        urls = []
        for tweet in tweets:
            tweet_urls = json.loads(metadata[tweet])['entities']['urls']
            for url in tweet_urls:
                urls.append((tweet, url['expanded_url']))
    except Exception:
        print(user, 'Error', '(parse metadata)')
        if sos:
            raise
        return

    try:
        for tweet, expanded_url in urls:
            db.execute(
                conn,
                'insert into tweets_url values (%s, %s, %s)',
                has_res=False,
                args=[user, tweet,
                      expanded_url.encode('ascii', 'ignore').decode('ascii')])
        conn.commit()
        conn.close()
    except Exception:
        print(user, 'Error', '(insert url)')
        if sos:
            raise
        return

    print('Done', user)
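# For reference, the slice of a Twitter v1.1 tweet object that the parse stage
# reads; a hand-written example of the shape, not a record from the data set.
sample = {
    'entities': {
        'urls': [
            {'url': 'https://t.co/abc123',
             'expanded_url': 'https://example.com/some/article'}
        ]
    }
}
for url in sample['entities']['urls']:
    print(url['expanded_url'])  # -> https://example.com/some/article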
def runner(args):
    user_id, tweet_id, url = args
    try:
        r = requests.get(url, timeout=REQUEST_TIMEOUT)
        try:
            conn = db.mk_connection()
            db.execute(
                conn,
                'update tweets_url set expanded_url = %s'
                ' where user_id = %s and tweet_id = %s',
                has_res=False, args=[r.url, user_id, tweet_id])
            db.close_connection(conn)
        except Exception:
            print('Insert Error', user_id, tweet_id, r.url)
    except requests.exceptions.Timeout:
        print('Request Timeout', user_id, tweet_id, url)
    except Exception:
        print('Request Error', user_id, tweet_id, url)
        try:
            conn = db.mk_connection()
            db.execute(
                conn,
                'update tweets_url set expanded_url = %s'
                ' where user_id = %s and tweet_id = %s',
                has_res=False, args=['Request Error', user_id, tweet_id])
            db.close_connection(conn)
        except Exception:
            print('Request Error [Insert Error]', user_id, tweet_id)
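# requests follows redirects by default, so r.url above ends up as the final
# landing URL rather than the shortener link. A standalone demonstration
# against httpbin.org (any redirecting endpoint would do):
import requests

r = requests.get('https://httpbin.org/redirect-to?url=https://example.com/',
                 timeout=5)
print(r.url)  # -> https://example.com/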
def runner(user):
    try:
        conn = db.mk_connection()
        for tweet in tweets[user]:
            text = db.get_tweet_text(conn, user, tweet).encode(
                'ascii', 'ignore').decode('ascii')
            db.insert_tweet_text(conn, user, tweet, text)
        conn.commit()
        conn.close()
        print('Done', user)
    except Exception:
        print('Exception on', user)
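# The encode/decode pair above is a lossy ASCII fold: characters outside
# ASCII are silently dropped, not transliterated.
assert 'café 😀'.encode('ascii', 'ignore').decode('ascii') == 'caf '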
def map():
    """
    create url -> (user_id, tweet_id) list map

    frequency of url = len(url_map[url])
    rank all urls = sorted(url_map.keys(),
                           key=lambda url: len(url_map[url]), reverse=True)
    """
    table = db.execute(
        db.mk_connection(),
        'select user_id, tweet_id, url, expanded_url from tweets_url')
    print(len(table), 'Table Entries')
    url_map = {}
    for user_id, tweet_id, url, expanded_url in table:
        # fall back to the original url when unshortening failed or never ran
        url = normalize(url) if expanded_url is None \
            or expanded_url == 'Request Error' else normalize(expanded_url)
        if url not in url_map:
            url_map[url] = []
        url_map[url].append((user_id, tweet_id))
    return url_map
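# Usage sketch following the docstring: rank URLs by how many
# (user_id, tweet_id) pairs mention them and print the head of the ranking.
if __name__ == '__main__':
    url_map = map()
    ranked = sorted(url_map.keys(), key=lambda u: len(url_map[u]), reverse=True)
    for u in ranked[:10]:
        print(len(url_map[u]), u)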
def map():
    """
    create hashtag -> (user_id, tweet_id) list map

    frequency of hashtag = len(hashtag_map[hashtag])
    rank all hashtags = sorted(hashtag_map.keys(),
                               key=lambda hashtag: len(hashtag_map[hashtag]),
                               reverse=True)
    """
    table = db.execute(
        db.mk_connection(),
        'select user_id, tweet_id, hashtag from tweets_hashtags')
    print(len(table), 'Table Entries')
    hashtag_map = {}
    for user_id, tweet_id, hashtag in table:
        hashtag = normalize(hashtag)
        if hashtag not in hashtag_map:
            hashtag_map[hashtag] = []
        hashtag_map[hashtag].append((user_id, tweet_id))
    return hashtag_map
if __name__ == '__main__':
    if len(sys.argv) == 3:
        # single-pair mode: runner unpacks a (user_id, tweet_id, url) triple,
        # so the stored url is looked up first
        user_id = int(sys.argv[1])
        tweet_id = int(sys.argv[2])
        url = db.execute(
            db.mk_connection(),
            'select url from tweets_url'
            ' where user_id = %s and tweet_id = %s' % (user_id, tweet_id))[0][0]
        runner((user_id, tweet_id, url))
    if len(sys.argv) <= 1:
        # batch mode: unshorten everything that has no expanded_url yet
        table = db.execute(
            db.mk_connection(),
            'select user_id, tweet_id, url, expanded_url from tweets_url')
        print(len(table), 'Table Entries')
        table = [(user_id, tweet_id, url)
                 for user_id, tweet_id, url, expanded_url in table
                 if expanded_url is None]
        print(len(table), 'Not Unshortened Entries')
        random.shuffle(table)
        with multiprocessing.Pool(processes=PROCESSES) as pool:
            # drain the iterator so every task actually runs
            for _ in pool.imap_unordered(runner, table):
                pass
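# Invocation sketch (the script name is assumed, not given in the source):
# pass a user_id and tweet_id to retry one URL, or no arguments to unshorten
# every entry whose expanded_url is still NULL.
#
#   python unshorten.py 12345 678901234567890
#   python unshorten.py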
if __name__ == '__main__':
    import db
    import requests
    from termcolor import colored as cr

    connection = db.mk_connection()
    urls = [t[0] for t in db.execute(connection, 'select url from tweets_url')]
    print(len(urls), 'URLs')
    connection.close()

    redirect_counter = 0
    seen_counter = 0
    for url in urls:
        seen_counter += 1
        try:
            r = requests.get(url, timeout=5)
            if r.url != url:
                redirect_counter += 1
                print(cr(redirect_counter, 'red', attrs=['bold']), '/',
                      cr(seen_counter, 'green', attrs=['bold']),
                      url, '->', r.url, flush=True)
        except requests.exceptions.Timeout:
            print('Timeout', url)
        except Exception:
            print('Fail', url)
    return \
        'const data = {\n' + \
        ' datasets: [\n' + \
        ',\n'.join([make_data(i) for i in range(limit)]) + \
        '\n ]\n};'


if __name__ == '__main__':
    import sys
    import db

    if len(sys.argv) >= 2:
        if sys.argv[1] == 'hashtag':
            import hashtag_frequency
            import timeline

            axis = db.execute(
                db.mk_connection(),
                'select min(created_at), max(created_at) from tweets')[0]
            limit = int(sys.argv[2]) if len(sys.argv) >= 3 else 15
            m, center_date_map = timeline.make(
                hashtag_frequency.map(), axis, top=limit)

            def get_x_date(axis_t):
                d = center_date_map[axis_t]
                return '"%s"' % d.strftime('%d/%m/%Y')

            def get_y_reverse(item_i):
                # stack items top-down so the most frequent sits highest
                return 10 + (limit - item_i) * 16

            s = make(m, limit, xfn=get_x_date, yfn=get_y_reverse)
            fname = sys.argv[3] if len(sys.argv) >= 4 else 'data.js'
            with open(fname, 'w') as f:
                f.write(s)

        if sys.argv[1] == 'url':
            import url_frequency
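# Invocation sketch (the script name is assumed, not given in the source):
# choose the item kind, an optional top-N limit, and an optional output file;
# the script writes a JS 'const data = {datasets: [...]};' snippet.
#
#   python make_data.py hashtag 15 data.js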