def gen_json_for_tweets_of_interest(data, identity_list):
    of_id, uid_list = data
    json_of_name = os.path.join(JSON_OUTPUT_DIRECTORY, str(of_id) + ".json.gz")
    print 'inp: ', json_of_name, len(uid_list), uid_list[0:2]
    tweets_to_write = []
    if not os.path.exists(json_of_name):
        for i, uid in enumerate(uid_list):
            if i % 25 == 0:
                print i, len(tweets_to_write)
            try:
                u = TwitterUser()
                u.populate_tweets_from_file(
                    os.path.join(JSON_INPUT_DIRECTORY, uid + ".json.gz"),
                    store_json=True)
                tweets_to_keep = []
                for t in u.tweets:
                    # keep non-retweets with more than 4 tokens that match the identity list
                    if not t.retweeted and len(t.tokens) > 4:
                        expanded_token_set = copy(t.tokens)
                        for token in t.tokens:
                            expanded_token_set += get_alternate_wordforms(token)
                        if len(set(expanded_token_set) & identity_list):
                            tweets_to_keep.append(t)
                tweets_to_write += tweets_to_keep
            except:
                print 'FAILED JSON FOR USER: ', uid

        print 'WRITING JSON'
        out_fil = gzip.open(json_of_name, "wb")
        for tweet in tweets_to_write:
            out_fil.write(json.dumps(tweet.raw_json).strip().encode("utf8") + "\n")
        out_fil.close()
def gen_conll_file(fil, ptb_dir, dp_dir):
    user = TwitterUser()
    user.populate_tweets_from_file(fil, do_tokenize=False)

    if 50 <= user.n_total_tweets <= 15000 and\
       user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:
        dp_filename = os.path.join(dp_dir, str(user.user_id) + ".gz")
        ptb_filename = os.path.join(ptb_dir, str(user.user_id) + ".txt.gz")
        if not os.path.exists(dp_filename) or not os.path.exists(ptb_filename):
            return ['no_dp_ptb',
                    [user.user_id, os.path.exists(dp_filename), os.path.exists(ptb_filename)]]

        penntreebank = {x[0]: x[1:] for x in read_grouped_by_newline_file(ptb_filename)}
        dependency_parse = read_grouped_by_newline_file(dp_filename)

        # keep original, URL-free tweets classified as English (drop likely non-English speakers or spam)
        tweet_set = [(i, t) for i, t in enumerate(user.tweets)
                     if t.retweeted is None and
                     len(t.urls) == 0 and 'http:' not in t.text and
                     langid.classify(t.text)[0] == 'en']

        if len(tweet_set) < 40:
            return ['notweets', user.user_id]

        data_to_return = []
        for twit_it, tweet in tweet_set:
            data_for_tweet = []
            ptb_for_tweet = penntreebank[str(tweet.id)]
            dp_for_tweet = dependency_parse[twit_it]

            # the PTB token and the dependency-parse token should agree; skip the tweet if not
            if ptb_for_tweet[0].split("\t")[2] != DependencyParseObject(dp_for_tweet[0]).text:
                print 'ahhhhh, weird stuff'
                continue

            for i, p in enumerate(dp_for_tweet):
                d = DependencyParseObject(
                    tsn([p, tweet.id, user.user_id,
                         tweet.created_at.strftime("%m-%d-%y")], newline=False))
                # get java features
                spl_java = ptb_for_tweet[i].split("\t")
                java_id, penn_pos_tag, word = spl_java[:3]
                java_features = '' if len(spl_java) == 3 else spl_java[3]
                d.features += [x for x in java_features.split("|") if x != '']
                d.features.append("penn_treebank_pos=" + penn_pos_tag)
                data_for_tweet.append(d)

            data_to_return.append(data_for_tweet)

        return ['success', [user.user_id, data_to_return]]
    else:
        return ['baduser', user.user_id]
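For context, a minimal sketch of how the tagged return value above might be consumed; the file name and the ptb/dp directory paths here are placeholders for illustration, not paths used by the original code.

# Hypothetical driver for gen_conll_file; dispatch on the status tag it returns.
status, payload = gen_conll_file("1234567.json.gz", "ptb_dir", "dp_dir")
if status == 'success':
    user_id, parses = payload
    print 'parsed', len(parses), 'tweets for user', user_id
elif status == 'no_dp_ptb':
    user_id, has_dp, has_ptb = payload
    print 'missing parse files for', user_id, '(dp:', has_dp, ', ptb:', has_ptb, ')'
else:
    # 'notweets' or 'baduser'; payload is the user id
    print status, payload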
def gen_output(data, json_data_dir):
    term, is_reply, tweets_needed = data
    dataset = []

    # get all user files
    files = glob.glob(os.path.join(json_data_dir, "*"))
    random.shuffle(files)

    for f in files:
        user = TwitterUser()
        user.populate_tweets_from_file(f, store_json=True,
                                       do_arabic_stemming=False, lemmatize=False)
        if 50 <= user.n_total_tweets <= 10000 and\
           user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:
            tweet_set = [t for t in user.tweets if t.retweeted is None and
                         len(t.urls) == 0 and 'http:' not in t.text and
                         len(t.tokens) > 5 and
                         t.created_at >= MIN_TWEET_DATE and
                         (term == '' or term in t.tokens) and
                         langid.classify(t.text)[0] == 'en' and
                         sentiment(t.text)['compound'] != 0]

            if is_reply:
                tweet_set = [t for t in tweet_set if t.reply_to]
            else:
                tweet_set = [t for t in tweet_set if not t.reply_to]

            if len(tweet_set) == 0:
                print 'size 0', term, tweets_needed, is_reply
                continue

            tweet = random.sample(tweet_set, 1)[0]
            print user.screen_name, term, tweets_needed, is_reply, ":::: ", tweet.text
            dataset.append(tweet)
            tweets_needed -= 1

            if tweets_needed == 0:
                name = term if term != '' else 'random'
                name += '_reply' if is_reply else '_non_reply'
                pickle.dump(dataset, open(name + ".p", 'wb'))
                print 'done with: ', name, is_reply
                return
        else:
            print 'failed user'
def gen_json_for_tweets_of_interest(input_filename, output_filename, keep_only_tweets_with_terms=None):
    """
    This function generates a cleaned json file so that the identity extraction only happens
    on "interesting" tweets. Right now, interesting is defined as non-retweets that have
    more than 4 tokens. Feel free to redefine as you feel is suitable.

    :param input_filename: input json file name (can be gzipped)
    :param output_filename: cleaned output json filename
    :param keep_only_tweets_with_terms: if you only want to keep tweets containing a specific
        set of terms, pass in a set of terms here
    :return:
    """
    tweets_to_write = []
    if not os.path.exists(output_filename):
        try:
            u = TwitterUser()
            u.populate_tweets_from_file(input_filename, store_json=True)
            tweets = [t for t in u.tweets if not t.retweeted and len(t.tokens) > 4]

            tweets_to_keep = []
            if keep_only_tweets_with_terms:
                for t in tweets:
                    # expand each tweet's tokens with alternate word forms before matching terms
                    expanded_token_set = copy(t.tokens)
                    for token in t.tokens:
                        expanded_token_set += get_alternate_wordforms(token)
                    if len(set(expanded_token_set) & keep_only_tweets_with_terms):
                        tweets_to_keep.append(t)
            else:
                tweets_to_keep = tweets

            tweets_to_write += tweets_to_keep
        except:
            print 'FAILED TO PARSE JSON FILE: ', input_filename

        print 'WRITING JSON'
        out_fil = gzip.open(output_filename, "wb")
        for tweet in tweets_to_write:
            out_fil.write(json.dumps(tweet.raw_json).strip().encode("utf8") + "\n")
        out_fil.close()
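A minimal usage sketch for the function above; the file names and the example term set are assumptions for illustration, not values from the original code.

# Hypothetical call: filter one user's archive down to tweets mentioning family terms.
identity_terms = set(['mother', 'father', 'sister', 'brother'])
gen_json_for_tweets_of_interest("12345.json.gz",
                                "12345_cleaned.json.gz",
                                keep_only_tweets_with_terms=identity_terms)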
def get_user_network(self, this_user_network_dir_name, user_ids, restrict_output_to_ids, stored_user_list):
    counter_val = 0
    for uid in user_ids:
        counter_val += 1
        if counter_val % 10 == 0:
            print (counter_val, " / ", len(user_ids),
                   this_user_network_dir_name.replace(self.network_dir, ""))

        # try to find user in stored_users
        if str(uid) in stored_user_list:
            user = pickle.load(open(self.pickle_dir + "/" + str(uid), "rb"))
        else:
            user = TwitterUser(self.api_hook, user_id=uid)
            user.populate_tweets_from_api()
            out_fil = open(self.pickle_dir + "/" + str(uid), "wb")
            pickle.dump(user, out_fil)
            out_fil.close()

        self.write_user_network(this_user_network_dir_name, user, uid, restrict_output_to_ids)
def run(self): print ("Worker started") while True: user_id, snow_sample_number = self.queue.get(True) print ("Starting: ", user_id, snow_sample_number) stored_user_list = set([os.path.basename(user_pickle) for user_pickle in glob.glob(self.pickle_dir + "*")]) # Get the ego if user_id in stored_user_list: print ("\tgot pickled: ", user_id) user = pickle.load(open(self.pickle_dir + "/" + str(user_id), "rb")) else: user = TwitterUser(self.api_hook, user_id=user_id) print ("\tgetting tweets for: ", user_id) user.populate_tweets_from_api() print ("\t num tweets received for: ", user_id, " ", len(user.tweets)) # print '\tgetting followers for: ', screen_name # user.populate_followers() print ("\tgetting friends for: ", user_id) user.populate_friends() print ("pickling: ", user_id) pickle.dump(user, open(self.pickle_dir + "/" + user_id, "wb")) ##write out their following network and add each id to queue network_fil = codecs.open(os.path.join(self.network_dir, user_id), "w", "utf-8") added = 0 for following_id in user.friend_ids: if snow_sample_number < 2: added += 1 self.queue.put([str(following_id), snow_sample_number + 1]) network_fil.write(",".join([user_id, str(following_id)]) + "\n") network_fil.close() print "finished collecting data for: ", user_id print "added: ", added
def getTweets(twitterid):
    '''
    Function to get the twitter data for an individual twitter ID.
    This function is written to work with Kenny's github example here:
    https://github.com/kennyjoseph/twitter_dm
    Input: string of twitterID
    Output: list of the raw string of all tweets for twitterID
    '''
    from twitter_dm.TwitterUser import TwitterUser

    tweets = []
    u = TwitterUser()
    # Need to figure out if we can use a numeric ID (123456789.json) or a name (kenny_joseph.json)
    u.populate_tweets_from_file(twitterid + '.json')
    for t in u.tweets:
        # Not sure if tokens is exactly what we want; we want the raw words, not necessarily
        # tokens. We'll check this.
        tweets.append(t.tokens)

    # texts = {}
    # source_filename = 'Datasets/Twitter/members.zip'
    # parser = etree.XMLParser(encoding='utf8', recover=True)
    # with zipfile.ZipFile(source_filename) as zf:
    #     for i, member in enumerate(zf.infolist()):
    #         name = member.filename.split('/')[1].split('.')[0]  # filename is Raw3/name.csv
    #         if idx == name:
    #             # print idx, name
    #             raw = zf.open(member)
    #             data = csv.reader(raw)
    #             for j, line in enumerate(data):
    #                 if j > 0:
    #                     texts[idx + '_' + str(j)] = line[0]
    # if texts == {}:
    #     print 'no tweets for ', idx

    return tweets
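A quick sketch of calling the helper above; the ID is a placeholder and assumes a 123456789.json dump sits in the working directory.

# Hypothetical usage: expects 123456789.json in the current directory.
tweet_tokens = getTweets('123456789')
print 'collected token lists for', len(tweet_tokens), 'tweets'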
def run(self): print ("Worker started") while True: user_id, snow_sample_number = self.queue.get(True) print "Starting: ", user_id, snow_sample_number stored_user_list = set( [os.path.basename(user_pickle) for user_pickle in glob.glob(os.path.join(self.out_dir, "obj", "*"))] ) # Get the ego if user_id in stored_user_list: print ("\tgot pickled: ", user_id) user = pickle.load(open(os.path.join(self.out_dir, "obj", str(user_id)), "rb")) else: user = TwitterUser(self.api_hook, user_id=user_id) user.populate_tweets_from_api(json_output_directory=os.path.join(self.out_dir, "json")) if len(user.tweets) == 0: print "pickling and dumping: ", user.screen_name pickle.dump(user, open(os.path.join(self.out_dir, "obj", user_id), "wb")) continue print "populating friends, ", user.screen_name user.populate_friends() print "pickling and dumping (no tweets): ", user.screen_name user.tweets = [] pickle.dump(user, open(os.path.join(self.out_dir, "obj", user_id), "wb")) ##write out their following network and add each id to queue # network_fil = codecs.open(os.path.join(self.network_dir,user_id),"w", "utf-8") added = 0 for following_id in user.mentioned.keys(): if snow_sample_number < self.step_count: added += 1 self.queue.put([str(following_id), snow_sample_number + 1]) # network_fil.write(",".join([user_id,str(following_id)])+"\n") # network_fil.close() print "finished collecting data for: ", user_id print "added: ", added
spl = dataset_descrip.split("=")
if spl[0] == 'random':
    datasets_to_collect.append(['', int(spl[1]), []])
else:
    datasets_to_collect.append([spl[0], int(spl[1]), []])

# get all user files
files = glob.glob(os.path.join(json_data_dir, "*"))
curr_dataset = datasets_to_collect[0]
print datasets_to_collect

for f in files:
    user = TwitterUser(filename_for_tweets=f)
    if user.n_total_tweets < 10000 and user.n_total_tweets > 50 and\
       user.followers_count < 25000 and user.creation_date <= MIN_ACCOUNT_AGE:
        tweet_set = [t for t in user.tweets if t.retweeted is None and
                     len(t.urls) == 0 and
                     len(t.tokens) > 5 and
                     t.created_at <= MIN_TWEET_DATE and
                     curr_dataset[0] in t.tokens and
                     langid.classify(t.text)[0] == 'en' and
                     sentiment(t.text)['compound'] != 0]

        if len(tweet_set) == 0:
            continue

        tweet = random.sample(tweet_set, 1)[0]
from twitter_dm.TwitterAPIHook import TwitterAPIHook
from twitter_dm.TwitterUser import TwitterUser

username_to_collect_data_for = 'Jackie_Pooo'

consumer_key = "YOUR_CONSUMER_KEY_HERE"
consumer_secret = "YOUR_CONSUMER_SECRET_HERE"
access_token = "YOUR_ACCESS_TOKEN_HERE"
access_token_secret = "YOUR_ACCESS_TOKEN_SECRET_HERE"

# get a "hook", or connection, to the API using your consumer key/secret and access token/secret
api_hook = TwitterAPIHook(consumer_key, consumer_secret,
                          access_token=access_token, access_token_secret=access_token_secret)

# creates a TwitterUser object to fill with information from the API
user = TwitterUser(api_hook, screen_name=username_to_collect_data_for)

# We call populate_tweets_from_api, which goes to the Twitter API and collects the user's data;
# the data is written to the file username_you_put.json.
# The sleep_var param tells the function it shouldn't worry about rate limits
# (we're only collecting for one user, so it doesn't really matter).
# If you remove the is_gzip argument, the output file will be gzipped.
print 'populating users tweets!'
user.populate_tweets_from_api(json_output_filename=username_to_collect_data_for + ".json",
                              sleep_var=False,
                              is_gzip=False,
                              since_id=None)

for t in user.tweets:
    print t.mentions

print 'user had {n_tweets} tweets'.format(n_tweets=len(user.tweets))
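Once the script above has written its .json dump, the same data can be re-read offline with populate_tweets_from_file; a small sketch reusing the filename from the example above (assumes the script has already run).

# Re-load the saved tweets from disk without touching the API (sketch).
from twitter_dm.TwitterUser import TwitterUser

offline_user = TwitterUser()
offline_user.populate_tweets_from_file(username_to_collect_data_for + ".json")
print 'reloaded {n} tweets from disk'.format(n=len(offline_user.tweets))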
# Get the handles to the Twitter API
handles = get_handles(glob.glob(os.path.join(sys.argv[1], "*.txt")))
print 'n authed users: ', len(handles)

out_dir = sys.argv[2]
os.mkdir(out_dir)

# user_sns = [line.strip() for line in open(sys.argv[3]).readlines()]
user_sns = ['Neuro_Skeptic']
print 'num users: ', len(user_sns)

of = codecs.open("output_fil.tsv", "w", "utf8")

for i in range(len(user_sns)):
    # creates a TwitterUser object to fill with information from the API
    user = TwitterUser(handles[i], screen_name=user_sns[i])
    user.populate_tweets_from_api(json_output_filename=out_dir + user_sns[i] + ".json",
                                  sleep_var=False)
    user.populate_followers()

    rts = 0
    gt = 0
    for t in user.tweets:
        if t.retweeted is not None:
            rts += 1
        if t.geocode_info is not None:
            gt += 1

    of.write(tab_stringify_newline([user.screen_name,
                                    gt,
                                    rts,
                                    len(user.tweets),
from twitter_dm.TwitterAPIHook import TwitterAPIHook
from twitter_dm.TwitterUser import TwitterUser

username_to_collect_data_for = '_kenny_joseph'

consumer_key = "YOUR_CONSUMER_KEY_HERE"
consumer_secret = "YOUR_CONSUMER_SECRET_HERE"
access_token = "YOUR_ACCESS_TOKEN_HERE"
access_token_secret = "YOUR_ACCESS_TOKEN_SECRET_HERE"

# get a "hook", or connection, to the API using your consumer key/secret and access token/secret
api_hook = TwitterAPIHook(consumer_key, consumer_secret,
                          access_token=access_token, access_token_secret=access_token_secret)

# creates a TwitterUser object to fill with information from the API
user = TwitterUser(api_hook, screen_name=username_to_collect_data_for)

# We call populate_tweets_from_api, which goes to the Twitter API and collects the user's data;
# the data is written to the file username_you_put.json.
# The sleep_var param tells the function it shouldn't worry about rate limits
# (we're only collecting for one user, so it doesn't really matter).
# If you remove the is_gzip argument, the output file will be gzipped.
print 'populating users tweets!'
user.populate_tweets_from_api(json_output_filename=username_to_collect_data_for + ".json",
                              sleep_var=False,
                              is_gzip=False)

print 'user had {n_tweets} tweets'.format(n_tweets=len(user.tweets))

# we now will collect the user's followers
print 'populating user followers!'
user.populate_followers(sleep_var=False)
    else:
        print 'Running with json file: ', args.json_file_or_folder
elif args.screen_name:
    print 'Running with screen name: ', args.screen_name
    args.json_file_or_folder = os.path.join(OUTPUT_DIR, args.screen_name + ".json.gz")

    if os.path.exists(args.json_file_or_folder):
        print "User's tweets already in the system at: ", args.json_file_or_folder
    else:
        print "Getting user's tweets and saving to: ", args.json_file_or_folder
        if not args.path_to_twitter_credentials_file:
            print "Can't do anything with a screen name without some API credentials, " \
                  "see the help for this script and this parameter!"
            sys.exit(-1)
        app_handler = TwitterApplicationHandler(pathToConfigFile=args.path_to_twitter_credentials_file)
        user = TwitterUser(screen_name=args.screen_name, api_hook=app_handler.api_hooks[0])
        user.populate_tweets_from_api(json_output_filename=args.json_file_or_folder, sleep_var=False)

########
# load the models and the files
########
print 'LOADING MODEL'
identity_model, feature_names = get_identity_model_and_features()
word_vector_model, all_dictionaries, ark_clusters, sets, names = get_init_data(GENSIM_MODEL_LOCATION,
                                                                                BROWN_CLUSTER_LOCATION)
print 'MODEL HAS BEEN LOADED'
def run(self):
    print('Worker started')
    # do some initialization here
    while True:
        data = self.queue.get(True)
        try:
            if data is None:
                print('ALL FINISHED!!!!', self.conn_number)
                break

            print('Starting: ', data)
            if self.gets_user_id:
                user = TwitterUser(self.api_hook, user_id=data)
            else:
                user = TwitterUser(self.api_hook, screen_name=data)

            user.populate_tweets_from_api(json_output_directory=os.path.join(self.out_dir, "json"))

            if len(user.tweets) == 0:
                if self.to_pickle or self.populate_lists or self.populate_friends or self.populate_followers:
                    print 'pickling and dumping: ', user.screen_name
                    pickle.dump(user, open(os.path.join(self.out_dir, "obj", data), "wb"))
                continue

            if self.populate_lists:
                user.populate_lists_member_of()

            if self.populate_friends:
                print 'populating friends, ', user.screen_name
                user.populate_friends()

            if self.populate_followers:
                print 'populating followers, ', user.screen_name
                user.populate_followers()

            if self.to_pickle or self.populate_lists or self.populate_friends or self.populate_followers:
                # Pickle and dump user
                print 'pickling and dumping (no tweets): ', user.screen_name
                user.tweets = []
                pickle.dump(user, open(os.path.join(self.out_dir, "obj", data), "wb"))
        except Exception:
            print('FAILED:: ', data)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            print("*** print_tb:")
            traceback.print_tb(exc_traceback, limit=30, file=sys.stdout)
            print("*** print_exception:")

        print('finished collecting data for: ', data)
def run(self): print ("Worker started") while True: try: data = self.queue.get(True) if data is None: print "ALL DONE, EXITING!" return user_id, screen_name = data[0], data[1] print ("Starting: ", screen_name, user_id) this_user_network_dir_name = os.path.join(self.network_dir, user_id) mkdir_no_err(this_user_network_dir_name) stored_user_list = set( [os.path.basename(user_pickle) for user_pickle in glob.glob(self.pickle_dir + "*")] ) # Get the ego if user_id in stored_user_list: print ("\tgot pickled: ", user_id) user = pickle.load(open(self.pickle_dir + "/" + str(user_id), "rb")) else: user = TwitterUser(self.api_hook, user_id=user_id) print ("\tgetting tweets for: ", user_id) user.populate_tweets_from_api() print ("\t num tweets received for: ", user_id, " (", screen_name, "): ", len(user.tweets)) if len(user.tweets) > 0: print ("\tgetting lists, friends, followers for: ", user_id) user.populate_lists_member_of() # user.populate_followers() # user.populate_friends() print ("pickling: ", screen_name) pickle.dump(user, open(self.pickle_dir + "/" + user_id, "wb")) self.write_user_network(this_user_network_dir_name, user, user_id, None) if len(user.tweets) == 0: print ("finished collecting data for: ", user_id, ", no tweets") continue # Find the ego network based on retweets, mentions and replies user_network_to_pull = user.get_ego_network_actors() print ("Starting to get ", user.user_id, "'s network of ", len(user_network_to_pull), " actors") restrict_to_users = [u for u in user_network_to_pull] restrict_to_users.append(user_id) self.get_user_network( this_user_network_dir_name, user_network_to_pull, restrict_to_users, stored_user_list ) except Exception: print ("FAILED:: ", data) exc_type, exc_value, exc_traceback = sys.exc_info() print ("*** print_tb:") traceback.print_tb(exc_traceback, limit=50, file=sys.stdout) print ("finished collecting data for: ", screen_name)
from twitter_dm.TwitterUser import TwitterUser
import datetime
from time import mktime

u = TwitterUser()
u.populate_tweets_from_file("/Users/kennyjoseph/git/thesis/twitter_dm/examples/2431225676.json.gz")

for t in u.tweets:
    print mktime(t.created_at.timetuple())
def run(self):
    print('Worker started')
    # do some initialization here
    snow_sample_number = None
    since_tweet_id = None

    while True:
        data = self.queue.get(True)
        try:
            if data is None:
                print 'ALL FINISHED!!!!'
                break

            # work out what form the queue item takes
            if len(data) == 1 or type(data) is str or type(data) is unicode or type(data) is int:
                user_identifier = data
            elif len(data) == 3:
                user_identifier, snow_sample_number, since_tweet_id = data
            elif len(data) == 2:
                if self.step_count:
                    user_identifier, snow_sample_number = data
                elif self.gets_since_tweet_id:
                    user_identifier, since_tweet_id = data

            user_identifier = str(user_identifier)
            print 'Starting: ', data

            pickle_filename = os.path.join(self.out_dir, "obj", user_identifier)
            json_filename = os.path.join(self.out_dir, "json", user_identifier + ".json.gz")

            # Get the user's data
            if os.path.exists(pickle_filename) and os.path.exists(json_filename) and not self.add_to_file:
                print '\tgot existing data for: ', data
                user = pickle.load(open(pickle_filename, "rb"))
                user.populate_tweets_from_file(json_filename)
            else:
                if self.gets_user_id:
                    user = TwitterUser(self.api_hook, user_id=user_identifier)
                else:
                    user = TwitterUser(self.api_hook, screen_name=user_identifier)

                print 'populating tweets', user_identifier
                if self.populate_tweets:
                    if self.save_user_tweets:
                        print 'saving tweets to: ', json_filename
                        of_name, tweet_count = user.populate_tweets_from_api(
                            json_output_filename=json_filename,
                            since_id=since_tweet_id,
                            populate_object_with_tweets=False)
                    else:
                        of_name, tweet_count = user.populate_tweets_from_api(
                            since_id=since_tweet_id,
                            populate_object_with_tweets=False)

                    if self.tweet_count_file:
                        self.tweet_count_file.write(str(user_identifier) + "\t" + str(tweet_count) + "\n")

                if self.populate_lists:
                    print 'populating lists', user.screen_name
                    user.populate_lists_member_of()

                if self.populate_friends:
                    print 'populating friends, ', user.screen_name
                    user.populate_friends()

                if self.populate_followers:
                    print 'populating followers, ', user.screen_name
                    user.populate_followers()

                if self.save_user_data and \
                        (self.always_pickle or self.populate_lists or
                         self.populate_friends or self.populate_followers):
                    # Pickle and dump user
                    # print 'pickling and dumping (no tweets): ', user.screen_name
                    user.tweets = []
                    pickle.dump(user, open(pickle_filename, "wb"))

            # now add to queue if necessary
            if snow_sample_number is not None and snow_sample_number < self.step_count:
                for user_identifier in self.add_users_to_queue_function(user):
                    self.queue.put([str(user_identifier), snow_sample_number + 1])

            if self.post_process_function:
                self.post_process_function(user)
        except KeyboardInterrupt as e:
            print e
            break
        except Exception:
            print('FAILED:: ', data)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            print("*** print_tb:")
            traceback.print_tb(exc_traceback, limit=30, file=sys.stdout)
            print("*** print_exception:")