def cmd(config, args):
    """Dispatch one management command to the crawler cluster.

    Commands that only need the Twitter API (uid/user lookups) are handled
    locally; BATCH_* commands are expanded into one queued command per
    user_id; everything else is pushed onto this node's redis queue.

    Raises:
        Exception: if args.command is unknown or args.json does not exist.
        NotImplementedError: for commands that are not implemented yet.
    """
    if args.command not in avaliable_cmds:
        raise Exception("not a valid command...")

    nid = args.node_id
    logger.info("node_id: %s" % (nid))
    node_queue = NodeQueue(nid, redis_config=config['redis_config'])
    node_coordinator = NodeCoordinator(config['redis_config'])

    if args.command == 'GET_UIDS_FROM_SCREEN_NAMES':
        # this can be done locally without sending the command to the servers...
        # list() so the first-apikey lookup works on both py2 and py3
        apikeys = list(config["apikeys"].values())[0]
        if not os.path.exists(args.json):
            raise Exception("doesn't exist... ")
        with open(os.path.abspath(args.json), 'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
            screen_names = json.load(f)
            user_api = User(apikeys=apikeys)
            user_ids = user_api.get_user_ids_by_screen_names(screen_names)
            json.dump(list(user_ids), o_f)
    elif args.command == 'GET_USERS_FROM_IDS':
        apikeys = list(config["apikeys"].values())[0]
        if not os.path.exists(args.json):
            raise Exception("doesn't exist... ")
        with open(os.path.abspath(args.json), 'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
            user_ids = json.load(f)
            user_api = User(apikeys=apikeys)
            users = user_api.get_users(user_ids)
            json.dump(list(users), o_f)
    elif args.command.startswith('BATCH_'):
        # expand a batch command into one per-user command on the node queue
        command = args.command.replace('BATCH_', '')
        args_dict = copy.copy(args.__dict__)
        if not os.path.exists(args.json):
            raise Exception("doesn't exist... ")
        with open(os.path.abspath(args.json), 'rb') as f:
            user_ids = json.load(f)
            for user_id in user_ids:
                args_dict['user_id'] = user_id
                # renamed from `cmd` to avoid shadowing this function's name
                queued_cmd = new_cmd(command, args_dict)
                node_queue.put(queued_cmd)
    elif args.command == 'LIST_NODES':
        pp.pprint(node_coordinator.list_nodes())
    elif args.command == 'NODE_QSIZES':
        # fixed: NotImplemented is not an exception class (raising it is a
        # TypeError); NotImplementedError is the correct built-in
        raise NotImplementedError("NotImplemented yet...")
    elif args.command == 'SHUTDOWN_NODE':
        raise NotImplementedError("NotImplemented yet...")
    elif args.command == 'CLEAR_NODE_QUEUES':
        node_queue.clear_all_queues()
    else:
        args_dict = copy.copy(args.__dict__)
        queued_cmd = new_cmd(args.command, args_dict)
        node_queue.put(queued_cmd)
        logger.info('sent [%s]' % (queued_cmd))
def flush_cmd(bulk, data_type, template, redis_config):
    """Distribute crawl commands for a bulk of users/ids across nodes.

    For each element in `bulk`, build a command from `template` (depth
    decremented by one) and push it to the node whose queue is currently
    the shortest. Errors are logged, never raised.

    Args:
        bulk: iterable of int user ids (data_type == "ids") or user dicts
            with an "id" key (data_type == "users").
        data_type: "ids" or "users".
        template: dict command template; must contain a "depth" key.
        redis_config: redis connection settings for queues/coordinator.

    Returns:
        True always (best-effort flush).
    """
    try:
        node_coordinator = NodeCoordinator(redis_config=redis_config)
        qsizes = node_coordinator.node_qsizes()
        logger.debug(qsizes)
        node_queues = {}
        for element in bulk:
            if data_type == "ids" and isinstance(element, int):
                user_id = element
            elif data_type == "users" and isinstance(element, dict) and "id" in element:
                user_id = element['id']
            else:
                # fixed: previously a non-matching element fell through and
                # reused the previous iteration's user_id (or raised NameError
                # on the first iteration, aborting the whole flush)
                continue
            t = copy.copy(template)
            t["user_id"] = int(user_id)
            t["depth"] = int(t["depth"]) - 1
            # pick the node with the smallest queue for load balancing
            node_id = get_keys_by_min_value(qsizes)[0]
            if node_id in node_queues:
                node_queue = node_queues[node_id]
            else:
                node_queue = NodeQueue(node_id, redis_config=redis_config)
                node_queues[node_id] = node_queue
            t['cmd_hash'] = hash_cmd(t)
            node_queue.put(t)
            qsizes[node_id] += 1
            logger.debug("send [%s] to node: %s" % (json.dumps(t), node_id))
        # intend to close all redis connections, but not sure yet...
        node_queues.clear()
        del node_coordinator
    except Exception as exc:
        logger.error('error during flush: %s' % exc)
    return True
def flush_cmd(bulk, data_type, template, redis_config):
    """Distribute crawl commands for a bulk of users/ids across nodes.

    For each element in `bulk`, build a command from `template` (depth
    decremented by one) and push it to the node whose queue is currently
    the shortest. Errors are logged, never raised.

    Args:
        bulk: iterable of int user ids (data_type == "ids") or user dicts
            with an "id" key (data_type == "users").
        data_type: "ids" or "users".
        template: dict command template; must contain a "depth" key.
        redis_config: redis connection settings for queues/coordinator.

    Returns:
        True always (best-effort flush).
    """
    try:
        node_coordinator = NodeCoordinator(redis_config=redis_config)
        qsizes = node_coordinator.node_qsizes()
        logger.debug(qsizes)
        node_queues = {}
        for element in bulk:
            if data_type == "ids" and isinstance(element, int):
                user_id = element
            elif data_type == "users" and isinstance(element, dict) and "id" in element:
                user_id = element['id']
            else:
                # fixed: previously a non-matching element fell through and
                # reused the previous iteration's user_id (or raised NameError
                # on the first iteration, aborting the whole flush)
                continue
            t = copy.copy(template)
            t["user_id"] = int(user_id)
            t["depth"] = int(t["depth"]) - 1
            # pick the node with the smallest queue for load balancing
            node_id = get_keys_by_min_value(qsizes)[0]
            if node_id in node_queues:
                node_queue = node_queues[node_id]
            else:
                node_queue = NodeQueue(node_id, redis_config=redis_config)
                node_queues[node_id] = node_queue
            t['cmd_hash'] = hash_cmd(t)
            node_queue.put(t)
            qsizes[node_id] += 1
            logger.debug("send [%s] to node: %s" % (json.dumps(t), node_id))
        # intend to close all redis connections, but not sure yet...
        node_queues.clear()
        del node_coordinator
    except Exception as exc:
        logger.error('error during flush: %s' % exc)
    return True
def __init__(self, node_id, config=None, proxies=None):
    """Create the crawler pool for this node.

    Args:
        node_id: unique id of this node.
        config: dict with at least 'apikeys' and 'redis_config' keys.
        proxies: optional list of proxies; health-checked before use.
    """
    self.node_id = node_id
    # fixed: mutable default arguments ({} / []) are shared across calls;
    # use None sentinels and normalize here
    config = {} if config is None else config
    proxies = [] if proxies is None else proxies
    self.config = config
    if proxies:
        self.proxy_list = proxy_checker(proxies)
        logger.info("number of live proxies: %d" % (len(self.proxy_list)))
        # each process only gets one apikey; if there are more proxies than
        # apikeys, each process gets several proxies it can rotate through
        # when one fails
        number_of_processes = min(len(self.config['apikeys']), len(self.proxy_list))
        self.proxy_generator = self.split(self.proxy_list, number_of_processes)
    else:
        self.proxy_list = None
        self.proxy_generator = None
        number_of_processes = 1

    logger.info("number of crawlers: %d" % (number_of_processes))

    # list() so indexing works on both py2 and py3
    apikey_list = list(self.config['apikeys'].keys())
    self.crawlers = {}
    for idx in range(number_of_processes):
        try:
            self.new_crawler(self.node_id, self.config['apikeys'][apikey_list[idx]], config)
        except Exception as exc:
            # a single failed crawler should not prevent the others from starting
            logger.error(exc)

    self.node_coordinator = NodeCoordinator(config['redis_config'])
    self.node_coordinator.add_node(node_id)
    logger.info("number of crawlers: %d created" % (number_of_processes))
def __init__(self, node_id, config=None, proxies=None):
    """Create the crawler pool for this node.

    Args:
        node_id: unique id of this node.
        config: dict with at least 'apikeys' and 'redis_config' keys.
        proxies: optional list of proxies; health-checked before use.
    """
    self.node_id = node_id
    # fixed: mutable default arguments ({} / []) are shared across calls;
    # use None sentinels and normalize here
    config = {} if config is None else config
    proxies = [] if proxies is None else proxies
    self.config = config
    if proxies:
        self.proxy_list = proxy_checker(proxies)
        logger.info("number of live proxies: %d" % (len(self.proxy_list)))
        # each process only gets one apikey; if there are more proxies than
        # apikeys, each process gets several proxies it can rotate through
        # when one fails
        number_of_processes = min(len(self.config['apikeys']), len(self.proxy_list))
        self.proxy_generator = self.split(self.proxy_list, number_of_processes)
    else:
        self.proxy_list = None
        self.proxy_generator = None
        number_of_processes = 1

    logger.info("number of crawlers: %d" % (number_of_processes))

    # list() so indexing works on both py2 and py3
    apikey_list = list(self.config['apikeys'].keys())
    self.crawlers = {}
    for idx in range(number_of_processes):
        try:
            self.new_crawler(self.node_id, self.config['apikeys'][apikey_list[idx]], config)
        except Exception as exc:
            # a single failed crawler should not prevent the others from starting
            logger.error(exc)

    self.node_coordinator = NodeCoordinator(config['redis_config'])
    self.node_coordinator.add_node(node_id)
    logger.info("number of crawlers: %d created" % (number_of_processes))
class Scheduler(object):
    """Manages a pool of TwitterCrawler subprocesses for one node.

    Spawns one crawler per apikey (capped by the number of live proxies when
    proxies are supplied), tracks each crawler's redis-backed queue,
    rebalances skewed local queues, and redistributes the workload of failed
    crawlers to other nodes.
    """

    def __init__(self, node_id, config=None, proxies=None):
        """Create the crawler pool for this node.

        Args:
            node_id: unique id of this node.
            config: dict with at least 'apikeys' and 'redis_config' keys.
            proxies: optional list of proxies; health-checked before use.
        """
        self.node_id = node_id
        # fixed: mutable default arguments ({} / []) are shared across calls
        config = {} if config is None else config
        proxies = [] if proxies is None else proxies
        self.config = config
        if proxies:
            self.proxy_list = proxy_checker(proxies)
            logger.info("number of live proxies: %d" % (len(self.proxy_list)))
            # each process only gets one apikey; if there are more proxies
            # than apikeys, each process gets several proxies it can rotate
            # through when one fails
            number_of_processes = min(len(self.config['apikeys']), len(self.proxy_list))
            self.proxy_generator = self.split(self.proxy_list, number_of_processes)
        else:
            self.proxy_list = None
            self.proxy_generator = None
            number_of_processes = 1

        logger.info("number of crawlers: %d" % (number_of_processes))

        # list() so indexing works on both py2 and py3
        apikey_list = list(self.config['apikeys'].keys())
        self.crawlers = {}
        for idx in range(number_of_processes):
            try:
                self.new_crawler(self.node_id, self.config['apikeys'][apikey_list[idx]], config)
            except Exception as exc:
                # one failed crawler must not prevent the others from starting
                logger.error(exc)

        self.node_coordinator = NodeCoordinator(config['redis_config'])
        self.node_coordinator.add_node(node_id)
        logger.info("number of crawlers: %d created" % (number_of_processes))

    def new_crawler(self, node_id, apikeys, config, crawler_proxies=None):
        """Start one TwitterCrawler subprocess, keyed by its app_key."""
        file_handler_config = {
            "name": "FileHandler",
            "args": {
                "output_folder": config["output"]
            }
        }
        crawler_id = apikeys['app_key']
        logger.debug('creating a new crawler: %s' % crawler_id)
        if not crawler_proxies:
            crawler_proxies = next(self.proxy_generator) if self.proxy_generator else None
        crawler = TwitterCrawler(
            node_id,
            crawler_id,
            copy.copy(apikeys),
            handlers=[create_handler(file_handler_config)],
            redis_config=copy.copy(config['redis_config']),
            proxies=crawler_proxies)
        if crawler_id in self.crawlers:
            # drop the stale record of a previous (dead) crawler
            del self.crawlers[crawler_id]
        self.crawlers[crawler_id] = {
            'apikeys': apikeys,
            'crawler': crawler,
            'crawler_queue': CrawlerQueue(self.node_id, crawler_id, redis_config=copy.copy(config['redis_config'])),
            'crawler_proxies': crawler_proxies
        }
        crawler.start()

    def is_alive(self):
        """Return True while at least one crawler subprocess is running."""
        return any(self.crawlers[crawler_id]['crawler'].is_alive() for crawler_id in self.crawlers)

    def crawler_status(self):
        """Report liveness/queue size per crawler; recreate dead crawlers.

        A dead crawler is recreated only after a 30-minute back-off (they
        mostly die on Twitter 503 "Over capacity" responses).

        Returns:
            list of dicts with crawler_id, liveness, qsize, and queue key.
        """
        status = []
        for crawler_id in self.crawlers:
            cc = self.crawlers[crawler_id]
            if not cc['crawler'].is_alive():
                if ('retry_timer_start_ts' in cc
                        and (time.time() - cc['retry_timer_start_ts'] > 1800)):
                    self.new_crawler(self.node_id, cc['apikeys'], self.config, cc['crawler_proxies'])
                    cc = self.crawlers[crawler_id]
                    logger.info('[%s] has been recreated...' % (crawler_id))
                else:
                    if 'retry_timer_start_ts' not in cc:
                        cc['retry_timer_start_ts'] = int(time.time())
                    else:
                        # fixed: remaining wait is start_ts + 1800 - now;
                        # the original computed now + 1800 - start_ts, which
                        # grows with elapsed time instead of counting down
                        remaining_mins = (cc['retry_timer_start_ts'] + 1800 - time.time()) / 60.0
                        logger.warning('[%s] failed; waiting to recreate in %f mins...'
                                       % (crawler_id, remaining_mins))
            status.append({
                'crawler_id': crawler_id,
                'alive?': cc['crawler'].is_alive(),
                'qsize': cc['crawler_queue'].qsize(),
                'crawler_queue_key': cc['crawler_queue'].get_key()
            })
        return status

    def balancing_load(self):
        """Move work from the most-loaded local crawler queue to the others.

        Crawlers run in subprocesses, so redistribution goes through the
        redis-backed queues. Control commands pulled during the shuffle are
        pushed back to their original queue.
        """
        sorted_queues = self.sorted_local_queue(False)
        max_crawler_id, max_qsize = sorted_queues[-1]
        min_crawler_id, min_qsize = sorted_queues[0]
        logger.info("crawler with max_qsize: %s (%d)" % (max_crawler_id, max_qsize))
        logger.info("crawler with min_qsize: %s (%d)" % (min_crawler_id, min_qsize))
        logger.info("max_qsize - min_qsize > 0.5 * min_qsize ?: %r"
                    % ((max_qsize - min_qsize > 0.5 * min_qsize)))
        if max_qsize - min_qsize > 0.5 * min_qsize:
            logger.info("load balancing process started...")
            cmds = []
            controls = []
            # take ~30% of the imbalance off the busiest queue
            for _ in range(int(0.3 * (max_qsize - min_qsize))):
                queued = self.crawlers[max_crawler_id]['crawler_queue'].get()
                if queued['cmd'] in control_cmds:
                    controls.append(queued)
                else:
                    cmds.append(queued)
            # push control cmds back..
            for queued in controls:
                self.crawlers[max_crawler_id]['crawler_queue'].put(queued)
            logger.info("redistribute %d cmds" % len(cmds))
            for queued in cmds:
                self.enqueue(queued)

    def redistribute_crawler_queue(self, crawler_id):
        """Hand a failed crawler's queue to other nodes, then arm its retry timer."""
        if crawler_id in self.crawlers:
            logger.warning('%s just failed... redistributing its workload' % (crawler_id))
            try:
                self.node_coordinator.distribute_to_nodes(self.crawlers[crawler_id]['crawler_queue'])
                wait_timer = 180
                # wait until it dies (flushed all the data...)
                while self.crawlers[crawler_id]['crawler'].is_alive() and wait_timer > 0:
                    time.sleep(60)
                    wait_timer -= 60
                self.crawlers[crawler_id]['retry_timer_start_ts'] = int(time.time())
            except Exception:
                logger.error(full_stack())
        else:
            logger.warning("whatever are you trying to do? crawler_id: [%s] is not valid..."
                           % (crawler_id))

    def enqueue(self, cmd):
        """Route one command: broadcast, local control, or least-loaded crawler."""
        if cmd['cmd'] in ('TERMINATE', 'CRAWLER_FLUSH'):
            # broadcast to every local crawler queue
            for crawler_id in self.crawlers:
                self.crawlers[crawler_id]['crawler_queue'].put(cmd)
        elif cmd['cmd'] == 'BALANCING_LOAD':
            self.balancing_load()
        elif cmd['cmd'] == 'CRAWLER_FAILED':
            self.redistribute_crawler_queue(cmd['crawler_id'])
        else:
            # distribute item to the live local crawler with the fewest queued tasks
            for crawler_id, qsize in self.sorted_local_queue(False):
                if self.crawlers[crawler_id]['crawler'].is_alive():
                    self.crawlers[crawler_id]['crawler_queue'].put(cmd)
                    logger.debug("pushed %s to crawler: %s" % (cmd, crawler_id))
                    break

    def check_crawler_qsizes(self):
        """Return {crawler_id: queue size} for all local crawlers."""
        return {
            crawler_id: self.crawlers[crawler_id]['crawler_queue'].qsize()
            for crawler_id in self.crawlers
        }

    def sorted_local_queue(self, reverse=False):
        """Return (crawler_id, qsize) pairs sorted by queue size."""
        local_qsizes = self.check_crawler_qsizes()
        # .items() instead of py2-only .iteritems()
        return sorted(local_qsizes.items(), key=itemgetter(1), reverse=reverse)

    def split(self, lst, n):
        """Yield n evenly-sized sub-lists of lst (greedy min-fill assignment)."""
        lsize = {}
        results = {}
        for i in range(n):
            lsize[i] = 0
            results[i] = []
        for x in lst:
            idx = get_keys_by_min_value(lsize)[0]
            results[idx].append(x)
            lsize[idx] += 1
        for i in range(n):
            yield results[i]
def start_server(config, proxies):
    """Run a crawler node: create output folders, start the Scheduler, and
    loop forever reporting status, archiving output hourly, balancing load
    every 30 minutes, and forwarding commands from this node's redis queue.

    Args:
        config: validated node configuration (output paths, apikeys, redis).
        proxies: optional proxy list handed to the Scheduler.
    """
    check_config(config)
    config = copy.copy(config)

    folders_to_create = []
    buckets = [
        "tweets", "followers", "follower_ids", "friends", "friend_ids",
        "timelines"
    ]

    output_folder = os.path.abspath(config['output'])
    archive_output = os.path.abspath(
        config['archive_output']) if config['archive_output'] else output_folder
    archive_output = os.path.join(archive_output, 'archived')

    folders_to_create.append(output_folder)
    folders_to_create.append(archive_output)

    for bucket in buckets:
        folders_to_create.append(os.path.join(output_folder, bucket))
        folders_to_create.append(os.path.join(archive_output, bucket))

    for folder_to_create in folders_to_create:
        if not os.path.exists(folder_to_create):
            os.makedirs(folder_to_create)

    logger.info("output to %s" % output_folder)
    logger.info("archived to %s" % archive_output)

    this_node_id = node_id()
    node_queue = NodeQueue(this_node_id, redis_config=config['redis_config'])
    node_queue.clear()

    scheduler = Scheduler(this_node_id, config=config, proxies=proxies)
    logger.info('starting node_id: %s' % this_node_id)

    node_coordinator = NodeCoordinator(config['redis_config'])

    # The main event loop; the crawlers run in subprocesses, so this loop only
    # reports each crawler's status and performs the hourly tarball task.
    # fixed comment: +3600s means the first archive event starts 1 hr later
    last_archive_ts = time.time() + 3600
    pre_time = time.time()
    last_load_balancing_task_ts = time.time()
    while True:
        if time.time() - pre_time > 120:
            logger.info(pprint.pformat(scheduler.crawler_status()))
            pre_time = time.time()
            if scheduler.is_alive():
                cmd = {'cmd': 'CRAWLER_FLUSH'}
                scheduler.enqueue(cmd)

        if time.time() - last_archive_ts > 3600:
            logger.info("start archive procedure...")
            with concurrent.futures.ProcessPoolExecutor(
                    max_workers=len(buckets)) as executor:
                future_proxies = {
                    executor.submit(tarball_results, output_folder, bucket,
                                    archive_output,
                                    int(time.time()) - 3600): bucket
                    for bucket in buckets
                }
                for future in future_proxies:
                    future.add_done_callback(lambda f: logger.info(
                        "archive created? %s: [%s]" % f.result()))
            last_archive_ts = time.time()

        if not scheduler.is_alive():
            logger.info(
                "no crawler is alive... waiting to recreate all crawlers...")
            time.sleep(120)  # fixed comment: sleep for 2 minutes and retry
            continue

        if time.time() - last_load_balancing_task_ts > 1800:
            # try to balance the local queues every 30 mins
            last_load_balancing_task_ts = time.time()
            cmd = {'cmd': 'BALANCING_LOAD'}
            scheduler.enqueue(cmd)

        # block the main process waiting for a command (up to 360s)
        cmd = node_queue.get(block=True, timeout=360)
        if cmd:
            scheduler.enqueue(cmd)
class Scheduler(object):
    """Manages a pool of UserRelationshipCrawler subprocesses for one node.

    Spawns one crawler per apikey (capped by the number of live proxies when
    proxies are supplied), tracks each crawler's redis-backed queue,
    rebalances skewed local queues, and redistributes the workload of failed
    crawlers to other nodes.
    """

    def __init__(self, node_id, config=None, proxies=None):
        """Create the crawler pool for this node.

        Args:
            node_id: unique id of this node.
            config: dict with at least 'apikeys' and 'redis_config' keys.
            proxies: optional list of proxies; health-checked before use.
        """
        self.node_id = node_id
        # fixed: mutable default arguments ({} / []) are shared across calls
        config = {} if config is None else config
        proxies = [] if proxies is None else proxies
        self.config = config
        if proxies:
            self.proxy_list = proxy_checker(proxies)
            logger.info("number of live proxies: %d" % (len(self.proxy_list)))
            # each process only gets one apikey; if there are more proxies
            # than apikeys, each process gets several proxies it can rotate
            # through when one fails
            number_of_processes = min(len(self.config['apikeys']), len(self.proxy_list))
            self.proxy_generator = self.split(self.proxy_list, number_of_processes)
        else:
            self.proxy_list = None
            self.proxy_generator = None
            number_of_processes = 1

        logger.info("number of crawlers: %d" % (number_of_processes))

        # list() so indexing works on both py2 and py3
        apikey_list = list(self.config['apikeys'].keys())
        self.crawlers = {}
        for idx in range(number_of_processes):
            try:
                self.new_crawler(self.node_id, self.config['apikeys'][apikey_list[idx]], config)
            except Exception as exc:
                # one failed crawler must not prevent the others from starting
                logger.error(exc)

        self.node_coordinator = NodeCoordinator(config['redis_config'])
        self.node_coordinator.add_node(node_id)
        logger.info("number of crawlers: %d created" % (number_of_processes))

    def new_crawler(self, node_id, apikeys, config, crawler_proxies=None):
        """Start one UserRelationshipCrawler subprocess, keyed by its app_key."""
        file_handler_config = {
            "name": "FileHandler",
            "args": {
                "output_folder": config["output"]
            }
        }
        crawler_id = apikeys['app_key']
        logger.debug('creating a new crawler: %s' % crawler_id)
        if not crawler_proxies:
            crawler_proxies = next(self.proxy_generator) if self.proxy_generator else None
        crawler = UserRelationshipCrawler(
            node_id,
            crawler_id,
            copy.copy(apikeys),
            handlers=[create_handler(file_handler_config)],
            redis_config=copy.copy(config['redis_config']),
            proxies=crawler_proxies)
        if crawler_id in self.crawlers:
            # drop the stale record of a previous (dead) crawler
            del self.crawlers[crawler_id]
        self.crawlers[crawler_id] = {
            'apikeys': apikeys,
            'crawler': crawler,
            'crawler_queue': CrawlerQueue(self.node_id, crawler_id, redis_config=copy.copy(config['redis_config'])),
            'crawler_proxies': crawler_proxies
        }
        crawler.start()

    def is_alive(self):
        """Return True while at least one crawler subprocess is running."""
        return any(self.crawlers[crawler_id]['crawler'].is_alive() for crawler_id in self.crawlers)

    def crawler_status(self):
        """Report liveness/queue size per crawler; recreate dead crawlers.

        A dead crawler is recreated only after a 30-minute back-off (they
        mostly die on Twitter 503 "Over capacity" responses).

        Returns:
            list of dicts with crawler_id, liveness, qsize, and queue key.
        """
        status = []
        for crawler_id in self.crawlers:
            cc = self.crawlers[crawler_id]
            if not cc['crawler'].is_alive():
                if ('retry_timer_start_ts' in cc
                        and (time.time() - cc['retry_timer_start_ts'] > 1800)):
                    self.new_crawler(self.node_id, cc['apikeys'], self.config, cc['crawler_proxies'])
                    cc = self.crawlers[crawler_id]
                    logger.info('[%s] has been recreated...' % (crawler_id))
                else:
                    if 'retry_timer_start_ts' not in cc:
                        cc['retry_timer_start_ts'] = int(time.time())
                    else:
                        # fixed: remaining wait is start_ts + 1800 - now;
                        # the original computed now + 1800 - start_ts, which
                        # grows with elapsed time instead of counting down
                        remaining_mins = (cc['retry_timer_start_ts'] + 1800 - time.time()) / 60.0
                        logger.warning('[%s] failed; waiting to recreate in %f mins...'
                                       % (crawler_id, remaining_mins))
            status.append({
                'crawler_id': crawler_id,
                'alive?': cc['crawler'].is_alive(),
                'qsize': cc['crawler_queue'].qsize(),
                'crawler_queue_key': cc['crawler_queue'].get_key()
            })
        return status

    def balancing_load(self):
        """Move work from the most-loaded local crawler queue to the others.

        Crawlers run in subprocesses, so redistribution goes through the
        redis-backed queues. Control commands pulled during the shuffle are
        pushed back to their original queue.
        """
        sorted_queues = self.sorted_local_queue(False)
        max_crawler_id, max_qsize = sorted_queues[-1]
        min_crawler_id, min_qsize = sorted_queues[0]
        logger.info("crawler with max_qsize: %s (%d)" % (max_crawler_id, max_qsize))
        logger.info("crawler with min_qsize: %s (%d)" % (min_crawler_id, min_qsize))
        logger.info("max_qsize - min_qsize > 0.5 * min_qsize ?: %r"
                    % ((max_qsize - min_qsize > 0.5 * min_qsize)))
        if max_qsize - min_qsize > 0.5 * min_qsize:
            logger.info("load balancing process started...")
            cmds = []
            controls = []
            # take ~30% of the imbalance off the busiest queue
            for _ in range(int(0.3 * (max_qsize - min_qsize))):
                queued = self.crawlers[max_crawler_id]['crawler_queue'].get()
                if queued['cmd'] in control_cmds:
                    controls.append(queued)
                else:
                    cmds.append(queued)
            # push control cmds back..
            for queued in controls:
                self.crawlers[max_crawler_id]['crawler_queue'].put(queued)
            logger.info("redistribute %d cmds" % len(cmds))
            for queued in cmds:
                self.enqueue(queued)

    def redistribute_crawler_queue(self, crawler_id):
        """Hand a failed crawler's queue to other nodes, then arm its retry timer."""
        if crawler_id in self.crawlers:
            logger.warning('%s just failed... redistributing its workload' % (crawler_id))
            try:
                self.node_coordinator.distribute_to_nodes(self.crawlers[crawler_id]['crawler_queue'])
                wait_timer = 180
                # wait until it dies (flushed all the data...)
                while self.crawlers[crawler_id]['crawler'].is_alive() and wait_timer > 0:
                    time.sleep(60)
                    wait_timer -= 60
                self.crawlers[crawler_id]['retry_timer_start_ts'] = int(time.time())
            except Exception:
                logger.error(full_stack())
        else:
            logger.warning("whatever are you trying to do? crawler_id: [%s] is not valid..."
                           % (crawler_id))

    def enqueue(self, cmd):
        """Route one command: broadcast, local control, or least-loaded crawler."""
        if cmd['cmd'] in ('TERMINATE', 'CRAWLER_FLUSH'):
            # broadcast to every local crawler queue
            for crawler_id in self.crawlers:
                self.crawlers[crawler_id]['crawler_queue'].put(cmd)
        elif cmd['cmd'] == 'BALANCING_LOAD':
            self.balancing_load()
        elif cmd['cmd'] == 'CRAWLER_FAILED':
            self.redistribute_crawler_queue(cmd['crawler_id'])
        else:
            # distribute item to the live local crawler with the fewest queued tasks
            for crawler_id, qsize in self.sorted_local_queue(False):
                if self.crawlers[crawler_id]['crawler'].is_alive():
                    self.crawlers[crawler_id]['crawler_queue'].put(cmd)
                    logger.debug("pushed %s to crawler: %s" % (cmd, crawler_id))
                    break

    def check_crawler_qsizes(self):
        """Return {crawler_id: queue size} for all local crawlers."""
        return {
            crawler_id: self.crawlers[crawler_id]['crawler_queue'].qsize()
            for crawler_id in self.crawlers
        }

    def sorted_local_queue(self, reverse=False):
        """Return (crawler_id, qsize) pairs sorted by queue size."""
        local_qsizes = self.check_crawler_qsizes()
        # .items() instead of py2-only .iteritems()
        return sorted(local_qsizes.items(), key=itemgetter(1), reverse=reverse)

    def split(self, lst, n):
        """Yield n evenly-sized sub-lists of lst (greedy min-fill assignment)."""
        lsize = {}
        results = {}
        for i in range(n):
            lsize[i] = 0
            results[i] = []
        for x in lst:
            idx = get_keys_by_min_value(lsize)[0]
            results[idx].append(x)
            lsize[idx] += 1
        for i in range(n):
            yield results[i]
def cmd(config, args): if (args.command not in avaliable_cmds): raise Exception("not a valid command...") nid = args.node_id logger.info("node_id: %s" % (nid)) node_queue = NodeQueue(nid, redis_config=config['redis_config']) node_coordinator = NodeCoordinator(config['redis_config']) # this can be done locally without sending the command to the servers... if (args.command == 'GET_UIDS_FROM_SCREEN_NAMES'): apikeys = config["apikeys"].values()[0] if (not os.path.exists(args.json)): raise Exception("doesn't exist... ") with open(os.path.abspath(args.json), 'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f: screen_names = json.load(f) twitter_api = TwitterAPI(apikeys=apikeys) user_ids = twitter_api.get_user_ids_by_screen_names(screen_names) json.dump(list(user_ids), o_f) elif (args.command == 'GET_USERS_FROM_IDS'): apikeys = config["apikeys"].values()[0] if (not os.path.exists(args.json)): raise Exception("doesn't exist... ") with open(os.path.abspath(args.json), 'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f: user_ids = json.load(f) twitter_api = TwitterAPI(apikeys=apikeys) users = twitter_api.get_users(user_ids) json.dump(list(users), o_f) elif (args.command.startswith('BATCH_')): new_command = args.command.replace('BATCH_', '') args_dict = copy.copy(args.__dict__) if (not os.path.exists(args.json)): raise Exception("doesn't exist... 
") with open(os.path.abspath(args.json), 'rb') as f: if (args.command == 'BATCH_CRAWL_TWEET'): tweet_ids = json.load(f) for tweet_id in tweet_ids: print "Loading Tweet ID: ", tweet_id args_dict['tweet_id'] = tweet_id cmd = new_cmd(new_command, args_dict) node_queue.put(cmd) else: user_ids = json.load(f) for user_id in user_ids: args_dict['user_id'] = user_id cmd = new_cmd(new_command, args_dict) node_queue.put(cmd) elif (args.command == 'LIST_NODES'): pp.pprint(node_coordinator.list_nodes()) elif (args.command == 'NODE_QSIZES'): raise NotImplemented("NotImplemented yet...") #pp.pprint(node_coordinator.list_nodes()) elif (args.command == 'SHUTDOWN_NODE'): #node_coordinator.remove_node(nid) #pp.pprint(node_coordinator.list_nodes()) raise NotImplemented("NotImplemented yet...") elif (args.command == 'CLEAR_NODE_QUEUES'): node_queue.clear_all_queues() else: args_dict = copy.copy(args.__dict__) cmd = new_cmd(args.command, args_dict) node_queue.put(cmd) logger.info('sent [%s]' % (cmd))