def test_client(self):
    nid = node_id()
    logger.info("sending to %s" % nid)
    node_queue = NodeQueue(nid, redis_config=self.config['redis_config'])
    # redis_cmd_queue.clear()

    cmd = {
        "cmd": "CRAWL_FRIENDS",
        "user_id": 1948122342,
        "data_type": "ids",
        "depth": 2,
        "bucket": "friend_ids"
    }

    # cmd = {
    #     "cmd": "CRAWL_USER_TIMELINE",
    #     "user_id": 1948122342,  # 53039176
    #     "bucket": "timelines"
    # }

    node_queue.put(cmd)

    # cmd = {"cmd": "TERMINATE"}
    # node_queue.put(cmd)

    return True
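# A hedged sketch, not part of the original module: enqueue a SEARCH command
# through the same NodeQueue pattern as test_client above. Only -q/--query is
# documented in the CLI help below, so the exact payload fields a SEARCH
# command expects are assumptions; "query" and "bucket" are guesses.
def send_search(self, query):
    nid = node_id()
    node_queue = NodeQueue(nid, redis_config=self.config['redis_config'])
    cmd = {
        "cmd": "SEARCH",
        "query": query,      # assumed field name, mirroring -q/--query
        "bucket": "tweets"   # assumed bucket; "tweets" is one of the buckets start_server creates
    }
    node_queue.put(cmd)
    return True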
        },
        'SEARCH': {
            '-q/--query': dictionary['-q/--query']
        }
    }

    for k, v in cmds.items():  # .items() works on both Python 2 and 3; the original .iteritems() is Python 2 only
        print('')
        print('\t%s:' % k)
        for kk, vv in v.items():
            print('\t\t%s: %s' % (kk, vv))

    print('')


if __name__ == "__main__":
    nid = node_id()

    import json, os

    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument('-c', '--config', help="config.json that contains a) twitter api keys; b) redis connection string;", required=True)
    parser.add_argument('-cmd', '--command', help="the cmd you want to run, e.g., \"CRAWL_FRIENDS\"", required=True)
    parser.add_argument('-uid', '--user_id', help="the user_id", default=0)
    parser.add_argument('-tid', '--tweet_id', help="the tweet_id", default=0)
    parser.add_argument('-dt', '--data_type', help="the data_type (e.g., 'ids' or 'users')", default='ids')
    parser.add_argument('-d', '--depth', help="the depth", default=1)
    parser.add_argument('-j', '--json', help="the location of the json file that has a list of user_ids or screen_names", required=False)
    parser.add_argument('-o', '--output', help="the location of the output json file for storing user_ids", default='user_ids.json')
    parser.add_argument('-nid', '--node_id', help="the node_id you want to interact with", default=nid)
    parser.add_argument('-q', '--query', help="the search query", default=None)

    try:
def start_server(config, proxies):
    check_config(config)
    config = copy.copy(config)

    folders_to_create = []
    buckets = [
        "tweets", "followers", "follower_ids", "friends", "friend_ids",
        "timelines"
    ]

    output_folder = os.path.abspath(config['output'])
    archive_output = os.path.abspath(
        config['archive_output']) if config['archive_output'] else output_folder
    archive_output = os.path.join(archive_output, 'archived')

    folders_to_create.append(output_folder)
    folders_to_create.append(archive_output)

    for bucket in buckets:
        folders_to_create.append(os.path.join(output_folder, bucket))
        folders_to_create.append(os.path.join(archive_output, bucket))

    for folder_to_create in folders_to_create:
        if not os.path.exists(folder_to_create):
            os.makedirs(folder_to_create)

    logger.info("output to %s" % output_folder)
    logger.info("archived to %s" % archive_output)

    this_node_id = node_id()
    node_queue = NodeQueue(this_node_id, redis_config=config['redis_config'])
    node_queue.clear()

    scheduler = Scheduler(this_node_id, config=config, proxies=proxies)

    logger.info('starting node_id: %s' % this_node_id)

    node_coordinator = NodeCoordinator(config['redis_config'])
    # node_coordinator.clear()

    # The main event loop. Strictly speaking we don't need one, since we could
    # just join on the crawlers, which don't stop until a TERMINATE command is
    # issued to each of them; but we need a loop to report the status of each
    # crawler and to perform the periodic tarball tasks...
    last_archive_ts = time.time() + 3600  # the first archive event starts 1 hour later...
    pre_time = time.time()
    last_load_balancing_task_ts = time.time()
    while True:

        if time.time() - pre_time > 120:
            logger.info(pprint.pformat(scheduler.crawler_status()))
            pre_time = time.time()
            if scheduler.is_alive():
                cmd = {'cmd': 'CRAWLER_FLUSH'}
                scheduler.enqueue(cmd)

        if time.time() - last_archive_ts > 3600:
            logger.info("start archive procedure...")
            with concurrent.futures.ProcessPoolExecutor(
                    max_workers=len(buckets)) as executor:

                future_proxies = {
                    executor.submit(tarball_results, output_folder, bucket,
                                    archive_output,
                                    int(time.time()) - 3600): bucket
                    for bucket in buckets
                }

                for future in future_proxies:
                    future.add_done_callback(lambda f: logger.info(
                        "archive created? %s: [%s]" % f.result()))

            last_archive_ts = time.time()

        # block the main process... waiting for a command
        if not scheduler.is_alive():
            logger.info(
                "no crawler is alive... waiting to recreate all crawlers...")
            time.sleep(120)  # sleep for two minutes and retry
            continue

        if time.time() - last_load_balancing_task_ts > 1800:  # try to balance the local queues every 30 mins
            last_load_balancing_task_ts = time.time()
            cmd = {'cmd': 'BALANCING_LOAD'}
            scheduler.enqueue(cmd)

        cmd = node_queue.get(block=True, timeout=360)

        if cmd:
            scheduler.enqueue(cmd)
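# A minimal sketch, not from the original source, of the config dict that
# start_server() reads: 'output', 'archive_output', and 'redis_config' are the
# keys referenced above; the inner structure of 'redis_config' is an assumption
# (the --config help text also mentions twitter api keys, omitted here).
EXAMPLE_CONFIG = {
    "output": "./data",    # root folder for the per-bucket output dirs
    "archive_output": "",  # falls back to 'output' when empty
    "redis_config": {      # assumed shape of the redis connection settings
        "host": "localhost",
        "port": 6379,
        "db": 0
    }
}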
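# A hedged sketch of turning the parsed CLI flags above into the cmd dict that
# NodeQueue.put() expects; the field names follow the CRAWL_FRIENDS example in
# test_client, and the helper name build_cmd is hypothetical:
def build_cmd(args):
    cmd = {
        "cmd": args.command,
        "user_id": int(args.user_id),
        "data_type": args.data_type,
        "depth": int(args.depth)
    }
    if args.query:
        cmd["query"] = args.query  # assumed field name for SEARCH commands
    return cmd

# Example invocation (the script name twitter_crawler.py is an assumption):
#   python twitter_crawler.py -c config.json -cmd CRAWL_FRIENDS -uid 1948122342 -dt ids -d 2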