def do_dependency_parse(fil):
    """Dependency-parse the tweets stored in ``fil``.

    The output path is derived from ``fil`` by removing every ".json" and
    ".gz" substring and replacing "/json/" with "/dep_parse/".

    :param fil: path of the tweet file handed to
        ``TwitterUser.populate_tweets_from_file``.
    :return: ``'empty, success'`` when the file yields no tweets (the output
        file is then only touched), otherwise ``'completed'``.
    """
    user = TwitterUser()
    user.populate_tweets_from_file(fil, do_tokenize=False)

    out_file_name = (fil.replace(".json", "")
                     .replace(".gz", "")
                     .replace("/json/", "/dep_parse/"))
    print(out_file_name)

    if len(user.tweets) == 0:
        # Touch the output file. os.utime() on its own never creates a file,
        # and on Python 2 it requires an explicit ``times`` argument, so
        # create the file first and then refresh its timestamps.
        with open(out_file_name, "a"):
            pass
        os.utime(out_file_name, None)
        return 'empty, success'

    dependency_parse_tweets(TWEEBOPARSER_LOC, user.tweets, out_file_name)
    return 'completed'
def do_dependency_parse(fil):
    """Dependency-parse the tweets stored in ``fil``.

    The output path is derived from ``fil`` by removing every ".json" and
    ".gz" substring and replacing "/json/" with "/dep_parse/".

    :param fil: path of the tweet file handed to
        ``TwitterUser.populate_tweets_from_file``.
    :return: ``'empty, success'`` when the file yields no tweets (the output
        file is then only touched), otherwise ``'completed'``.
    """
    user = TwitterUser()
    user.populate_tweets_from_file(fil, do_tokenize=False)

    out_file_name = (fil.replace(".json", "")
                     .replace(".gz", "")
                     .replace("/json/", "/dep_parse/"))
    print(out_file_name)

    if len(user.tweets) == 0:
        # Touch the output file. os.utime() on its own never creates a file,
        # and on Python 2 it requires an explicit ``times`` argument, so
        # create the file first and then refresh its timestamps.
        with open(out_file_name, "a"):
            pass
        os.utime(out_file_name, None)
        return 'empty, success'

    dependency_parse_tweets(TWEEBOPARSER_LOC, user.tweets, out_file_name)
    return 'completed'
def _iter_tweet_lines(file_name):
    """Yield the lines of ``file_name`` as unicode text, closing the file when done.

    Files ending in ".gz" are read fully and decoded as UTF-8 line by line;
    anything else is streamed through a UTF-8 ``codecs`` reader.
    """
    if file_name.endswith(".gz"):
        with gzip.open(file_name) as gz_file:
            raw_lines = gz_file.read().splitlines()
        for raw_line in raw_lines:
            yield raw_line.decode("utf8")
    else:
        with codecs.open(file_name, "r", "utf8") as text_file:
            for line in text_file:
                yield line


def return_users_from_json_file(file_name, user_id_field='id',
                                only_english=False,
                                min_tweet_count_for_user=5, verbose=True,
                                stopwords=None, return_tweet_json=False):
    """Group the tweets in a line-delimited JSON file by user.

    Every line of ``file_name`` is parsed with ``json.loads``; lines that do
    not parse are reported with 'failed tweet' and skipped (they still count
    towards the total number of lines seen).

    :param file_name: path to the tweet file; a ".gz" suffix selects gzip.
    :param user_id_field: key inside ``tweet['user']`` used to group tweets.
    :param only_english: when true, keep only tweets whose language is 'en'.
        The language is ``tweet['lang']`` if present, otherwise the first
        element of ``langid.classify(tweet['text'])``.
    :param min_tweet_count_for_user: users with fewer kept tweets are dropped.
    :param verbose: print summary counts before returning.
    :param stopwords: passed through to ``TwitterUser``.
    :param return_tweet_json: also return a dict mapping ``tweet['id']`` to
        the raw tweet for every tweet of a kept user.
    :return: a list of ``TwitterUser`` objects, or a
        ``(twitter_users, tweet_dict)`` tuple when ``return_tweet_json`` is
        true. NOTE: when the file has no lines (or, with ``only_english``,
        no English tweets) a bare ``[]`` is returned even if
        ``return_tweet_json`` is true; this is kept for backward
        compatibility.
    """
    users = defaultdict(list)
    n_tweets = 0
    n_non_english = 0

    for line in _iter_tweet_lines(file_name):
        n_tweets += 1
        try:
            tweet = json.loads(line)
        except ValueError:
            # Skip the line: falling through would re-append the previously
            # parsed tweet (or hit an unbound ``tweet`` on the first line).
            print('failed tweet')
            continue

        if only_english:
            # Language is only needed for filtering, so langid is not run
            # at all when every tweet is kept anyway.
            lang = (tweet['lang'] if 'lang' in tweet
                    else langid.classify(tweet['text'])[0])
            if lang != 'en':
                n_non_english += 1
                continue

        users[tweet['user'][user_id_field]].append(tweet)

    # Float so that the ratio printed below is a true division on Python 2.
    n_tweets = float(n_tweets)
    if n_tweets == 0 or (only_english and n_tweets - n_non_english == 0):
        return []

    good_users = [tweets for tweets in users.values()
                  if len(tweets) >= min_tweet_count_for_user]
    twitter_users = [TwitterUser(list_of_tweets=tweets, stopwords=stopwords)
                     for tweets in good_users]

    if verbose:
        print('\tPercent non english tweets ignored:\t{:0.2f}'.format(
            n_non_english / n_tweets))
        print('\tNum used tweets:\t{0}'.format(n_tweets - n_non_english))
        print('\tN users pre min selection:\t{0}'.format(len(users)))
        print('\tN users post min selection:\t{0}'.format(len(twitter_users)))

    if not return_tweet_json:
        return twitter_users

    tweet_dict = {}
    for tweets in good_users:
        for tweet in tweets:
            tweet_dict[tweet['id']] = tweet
    return twitter_users, tweet_dict
# Collect tweets for every listed user and write three CSV files into the
# current working directory: mentions, hashtags and per-user attributes.
print('n authed users:  {0}'.format(len(handles)))

# NOTE(review): out_dir is read from the command line but the CSV files below
# are opened with bare file names -- confirm whether they should live in out_dir.
out_dir = sys.argv[2]

# The first comma-separated field of every line is used as the screen name.
with open(sys.argv[3]) as id_file:
    user_ids = [line.strip().split(",")[0] for line in id_file]
print('num users:  {0}'.format(len(user_ids)))

# Context managers guarantee the files are flushed and closed even when an
# API call or a write fails part-way through.
with codecs.open("mention_net.csv", "w", "utf8") as net_out, \
        codecs.open("ht_net.csv", "w", "utf8") as ht_out, \
        codecs.open("att_net.csv", "w", "utf8") as att_out:
    net_out.write("sender,mentioned,date\n")
    ht_out.write("user,hashtag,date\n")
    att_out.write("user,user_name,times_listed,n_followers,n_following\n")

    # NOTE(review): handles[i] pairs the i-th API handle with the i-th user, so
    # this needs at least as many handles as user ids -- confirm that is intended.
    for i, screen_name in enumerate(user_ids):
        user = TwitterUser(handles[i], screen_name=screen_name)
        user.populate_tweets_from_api(sleep_var=False)
        print(user.screen_name)

        for t in user.tweets:
            date_str = t.created_at.strftime("%Y-%m-%d")
            for m in t.mentions_sns:
                net_out.write(",".join([user.screen_name, m, date_str]) + "\n")
            for h in t.hashtags:
                ht_out.write(",".join([user.screen_name, h, date_str]) + "\n")

        # Best effort: a user whose attributes cannot be written is skipped,
        # but the reason is reported instead of being silently discarded.
        try:
            att_out.write(",".join([user.screen_name,
                                    user.name,
                                    str(user.times_listed),
                                    str(user.followers_count),
                                    str(user.following_count)]) + "\n")
        except Exception as err:
            print('could not write attributes for %s: %r'
                  % (user.screen_name, err))
"""
A simple example of how to use a single api hook to get tweets for a particular user.

Usage: [login_credentials_file] [user_screen_name] [output_file]

The text of every collected tweet is written to the output file, one tweet
per line (newlines inside a tweet are replaced by spaces). No output file is
created when no tweets are collected.
"""
import codecs
import sys

from twitter_dm import TwitterApplicationHandler
from twitter_dm import TwitterUser

if len(sys.argv) != 4:
    print('usage: [login_credentials_file] [user_screen_name] [output_file]')
    sys.exit(-1)

# get all the handles we have to the api
app_handler = TwitterApplicationHandler(pathToConfigFile=sys.argv[1])
print('n authed users:  {0}'.format(len(app_handler.api_hooks)))

# only the first api hook is used in this example
user = TwitterUser(app_handler.api_hooks[0], screen_name=sys.argv[2])
# Single pre-formatted string: passing two arguments in parentheses would
# print a tuple repr under the Python 2 print statement.
print('\tgetting tweets for:  {0}'.format(sys.argv[2]))
user.populate_tweets_from_api(sleep_var=False)

if len(user.tweets) > 0:
    # The context manager closes the file even if a write fails.
    with codecs.open(sys.argv[3], "w", "utf8") as out_fil:
        for tweet in user.tweets:
            out_fil.write(tweet.text.replace("\n", " ") + "\n")
"""
A simple example of how to use a single api hook to get tweets for a particular user.

Usage: [login_credentials_file] [user_screen_name] [output_file]

The text of every collected tweet is written to the output file, one tweet
per line (newlines inside a tweet are replaced by spaces). No output file is
created when no tweets are collected.
"""
import sys
import codecs

from twitter_dm import TwitterUser
from twitter_dm import TwitterApplicationHandler

if len(sys.argv) != 4:
    print('usage: [login_credentials_file] [user_screen_name] [output_file]')
    sys.exit(-1)

# get all the handles we have to the api
app_handler = TwitterApplicationHandler(pathToConfigFile=sys.argv[1])
print('n authed users:  {0}'.format(len(app_handler.api_hooks)))

# only the first api hook is used in this example
user = TwitterUser(app_handler.api_hooks[0], screen_name=sys.argv[2])
# Single pre-formatted string: passing two arguments in parentheses would
# print a tuple repr under the Python 2 print statement.
print('\tgetting tweets for:  {0}'.format(sys.argv[2]))
user.populate_tweets_from_api(sleep_var=False)

if len(user.tweets) > 0:
    # The context manager closes the file even if a write fails.
    with codecs.open(sys.argv[3], "w", "utf8") as out_fil:
        for tweet in user.tweets:
            out_fil.write(tweet.text.replace("\n", " ") + "\n")