#user screen names we are interested in user_screenname_id_pairs = [line.strip().split("\t") for line in open(sys.argv[3]).readlines()] print user_screenname_id_pairs[0] pickle_dir = OUTPUT_DIRECTORY +"/obj/" network_dir = OUTPUT_DIRECTORY+"/net/" general_utils.mkdir_no_err(OUTPUT_DIRECTORY) general_utils.mkdir_no_err(pickle_dir) general_utils.mkdir_no_err(network_dir) multiprocess_setup.init_good_sync_manager() ##put data on the queue request_queue = multiprocess_setup.load_request_queue(user_screenname_id_pairs, len(handles)) processes = [] for i in range(len(handles)): p = TwitterEgoNetworkWorker(request_queue, handles[i], network_dir, pickle_dir) p.start() processes.append(p) try: for p in processes: p.join() except KeyboardInterrupt: print 'keyboard interrupt'
# Create the output directory mkdir_no_err(output_dir) # chunk tweets into 100s (the API takes them by 100) i = 0 tweets_chunked = chunk_data(tweet_ids) print tweets_chunked[0] # init a sync manager multiprocess_setup.init_good_sync_manager() # put data on the queue request_queue = multiprocess_setup.load_request_queue(tweets_chunked, len(handles)) # run! processes = [] for i in range(len(handles)): p = TweetDataWorker(request_queue,handles[i],i,output_dir) p.start() processes.append(p) try: for p in processes: p.join() except KeyboardInterrupt: print 'keyboard interrupt'
# get all the handles we have to the api handles = general_utils.get_handles(glob.glob(os.path.join(sys.argv[1],"*.txt"))) print len(handles) print 'n authed users: ', len(handles) # user screen names we are interested in user_sns = ['ManchesterMJC','Aviationeuro','SanteriSanttus','OneworldLover', 'ENORsquawker','plane_spotters','MANSpotter99','BennyPlanespot','PlanespotterGuy','planespotterWal'] user_screenname_id_pairs = get_user_ids_and_sn_data_from_list(user_sns,handles,True) print 'got screen names, ', len(user_screenname_id_pairs) # put data on the queue request_queue = multiprocess_setup.load_request_queue( [(x[1],0) for x in user_screenname_id_pairs], len(handles), add_nones=False) pickle_dir = OUTPUT_DIRECTORY +"/obj/" network_dir = OUTPUT_DIRECTORY+"/json/" general_utils.mkdir_no_err(OUTPUT_DIRECTORY) general_utils.mkdir_no_err(pickle_dir) general_utils.mkdir_no_err(network_dir) multiprocess_setup.init_good_sync_manager() # put data on the queue user_screenname_id_pairs = get_user_ids_and_sn_data_from_list(user_sns,handles,True) print 'got screen names, ', len(user_screenname_id_pairs) # put data on the queue
user_ids = [u for u in user_ids] user_data_chunked = [] i=0 while i < len(user_ids): user_data_chunked.append(user_ids[i:(i+100)]) i += 100 user_data_chunked.append(user_ids[i-100:len(user_ids)]) print 'len chunked: ', len(user_data_chunked) multiprocess_setup.init_good_sync_manager() ##put data on the queue request_queue = multiprocess_setup.load_request_queue([x for x in user_data_chunked], len(handles), add_nones=True) processes = [] for i in range(len(handles)): p = SimpleUserLookupWorker(request_queue,handles[i],i, out_dir) p.start() processes.append(p) try: for p in processes: p.join() except KeyboardInterrupt: print 'keyboard interrupt'
from twitter_dm.utility import general_utils from twitter_dm.utility.general_utils import mkdir_no_err, collect_system_arguments, chunk_data handles, output_dir, tweet_ids, is_ids = collect_system_arguments(sys.argv) # Create the output directory mkdir_no_err(output_dir) # chunk tweets into 100s (the API takes them by 100) i = 0 tweets_chunked = chunk_data(tweet_ids) # init a sync manager multiprocess_setup.init_good_sync_manager() # put data on the queue request_queue = multiprocess_setup.load_request_queue(tweet_ids, len(handles)) # run! processes = [] for i in range(len(handles)): p = TweetDataWorker(request_queue, handles[i], i, output_dir) p.start() processes.append(p) try: for p in processes: p.join() except KeyboardInterrupt: print 'keyboard interrupt'
user_ids = [line.strip().split(",")[0] for line in open(sys.argv[3]).readlines()] print "num users: ", len(user_ids) general_utils.mkdir_no_err(out_dir) general_utils.mkdir_no_err(os.path.join(out_dir, "obj")) general_utils.mkdir_no_err(os.path.join(out_dir, "json")) multiprocess_setup.init_good_sync_manager() # already_done = set([os.path.basename(f) for f in glob.glob(out_dir+"/*")]) print "len already done:", 0 # user_screennames = [u for u in user_screennames if u not in already_done] ##put data on the queue request_queue = multiprocess_setup.load_request_queue(user_ids, len(handles)) processes = [] for i in range(len(handles)): p = UserDataWorker( request_queue, handles[i], out_dir, always_pickle=True, gets_user_id=False, populate_lists=False, populate_friends=True, populate_followers=True, ) p.start() processes.append(p)
print 'n authed users: ', len(handles) # user screen names we are interested in user_sns = [ 'ManchesterMJC', 'Aviationeuro', 'SanteriSanttus', 'OneworldLover', 'ENORsquawker', 'plane_spotters', 'MANSpotter99', 'BennyPlanespot', 'PlanespotterGuy', 'planespotterWal' ] user_screenname_id_pairs = get_user_ids_and_sn_data_from_list( user_sns, handles, True) print 'got screen names, ', len(user_screenname_id_pairs) # put data on the queue request_queue = multiprocess_setup.load_request_queue( [(x[1], 0) for x in user_screenname_id_pairs], len(handles), add_nones=False) pickle_dir = OUTPUT_DIRECTORY + "/obj/" network_dir = OUTPUT_DIRECTORY + "/json/" general_utils.mkdir_no_err(OUTPUT_DIRECTORY) general_utils.mkdir_no_err(pickle_dir) general_utils.mkdir_no_err(network_dir) multiprocess_setup.init_good_sync_manager() # put data on the queue user_screenname_id_pairs = get_user_ids_and_sn_data_from_list( user_sns, handles, True) print 'got screen names, ', len(user_screenname_id_pairs)
# Collect user data incrementally: read (user_id, since_tweet_id) CSV pairs
# from the file given as the third CLI argument, skipping rows whose
# since_id is the literal string 'None', then fan work out over handles.
user_id_and_since_id = []
for line in open(sys.argv[3]).readlines():
    line_spl = line.strip().split(",")
    # 'None' in column 2 means no prior crawl watermark exists for this user.
    if line_spl[1] == 'None':
        continue
    user_id_and_since_id.append((line_spl[0],line_spl[1]))
print 'num users: ', len(user_id_and_since_id)

# Output layout: pickles under obj/, JSON under json/.
general_utils.mkdir_no_err(out_dir)
general_utils.mkdir_no_err(os.path.join(out_dir,"obj"))
general_utils.mkdir_no_err(os.path.join(out_dir,"json"))

multiprocess_setup.init_good_sync_manager()

##put data on the queue
request_queue = multiprocess_setup.load_request_queue(user_id_and_since_id, len(handles))

# One worker per authenticated handle; gets_since_tweet_id=True makes the
# worker consume the (id, since_id) pairs queued above.
processes = []
for i in range(len(handles)):
    p = UserDataWorker(request_queue,handles[i],out_dir,
        always_pickle=True,
        gets_user_id=False,
        populate_lists=False,
        populate_friends=True,
        populate_followers=True,
        gets_since_tweet_id=True)
    p.start()
    processes.append(p)

# NOTE(review): source is truncated here — the try/join block is cut off
# mid-statement (no loop body or except handler is visible in this chunk).
try:
    for p in processes:
# Simple bulk user lookup driver: chunk the inputs, queue them, and fan
# work out over one SimpleUserLookupWorker per authenticated API handle.
from twitter_dm.multiprocess import multiprocess_setup
from twitter_dm.multiprocess.WorkerSimpleUserLookup import SimpleUserLookupWorker
from twitter_dm.utility import general_utils

# is_ids tells the workers whether inputs are numeric user ids or screen names.
handles, out_dir, data_to_collect, is_ids = collect_system_arguments(sys.argv)
general_utils.mkdir_no_err(out_dir)

# Chunk the inputs for batched lookups (chunk size decided by chunk_data).
user_data_chunked = chunk_data(data_to_collect)
print 'len chunked: ', len(user_data_chunked)

# initialize a better sync manager
multiprocess_setup.init_good_sync_manager()

# put data on the queue; add_nones=True appends per-worker stop sentinels
request_queue = multiprocess_setup.load_request_queue(
    [x for x in user_data_chunked], len(handles), add_nones=True)

processes = []
for i in range(len(handles)):
    p = SimpleUserLookupWorker(request_queue,
                               handles[i],
                               i,
                               out_dir,
                               gets_user_id=is_ids)
    p.start()
    processes.append(p)

try:
    for p in processes:
        p.join()
# NOTE(review): source is truncated here — the except handler's body is
# cut off in this chunk (sibling scripts print 'keyboard interrupt').
except KeyboardInterrupt:
import sys from twitter_dm.utility.general_utils import collect_system_arguments from twitter_dm.multiprocess import multiprocess_setup from twitter_dm.multiprocess.WorkerBotOMeter import BotOMeterWorker from twitter_dm.utility import general_utils handles, out_dir, data_to_collect, is_ids, mashape_key = collect_system_arguments( sys.argv, ['mashape_key']) general_utils.mkdir_no_err(out_dir) # initialize a better sync manager multiprocess_setup.init_good_sync_manager() # put data on the queue request_queue = multiprocess_setup.load_request_queue( [x.strip() for x in data_to_collect], len(handles), add_nones=True) processes = [] for i in range(len(handles)): p = BotOMeterWorker(request_queue, handles[i], i, out_dir, mashape_key) p.start() processes.append(p) try: for p in processes: p.join() except KeyboardInterrupt: print 'keyboard interrupt'