def test_client(self):
    nid = node_id()
    logger.info("sending to %s" % (nid))
    node_queue = NodeQueue(nid, redis_config=self.config['redis_config'])
    # redis_cmd_queue.clear()

    cmd = {
        "cmd": "CRAWL_FRIENDS",
        "user_id": 1948122342,
        "data_type": "ids",
        "depth": 2,
        "bucket": "friend_ids"
    }

    # cmd = {
    #     "cmd": "CRAWL_USER_TIMELINE",
    #     "user_id": 1948122342,  # 53039176
    #     "bucket": "timelines"
    # }

    node_queue.put(cmd)

    # cmd = {"cmd": "TERMINATE"}
    # node_queue.put(cmd)

    return True
def __init__(self, node_id, crawler_id, apikeys, handlers, redis_config, proxies=None):
    if handlers is None:
        raise MissingArgs("you need a handler to write the data to...")

    super(UserRelationshipCrawler, self).__init__(node_id, crawler_id, redis_config, handlers)

    self.apikeys = copy.copy(apikeys)
    # maps a command name either to a crawler method name or, for network
    # crawls, to a dict keyed by the requested data_type ("users" vs "ids")
    self.tasks = {
        "TERMINATE": "TERMINATE",
        "CRAWL_FRIENDS": {
            "users": "find_all_friends",
            "ids": "find_all_friend_ids",
            "network_type": "friends"
        },
        "CRAWL_FOLLOWERS": {
            "users": "find_all_followers",
            "ids": "find_all_follower_ids",
            "network_type": "followers"
        },
        "CRAWL_USER_TIMELINE": "fetch_user_timeline",
        "CRAWL_TWEET": "fetch_tweet_by_id"
    }
    self.node_queue = NodeQueue(self.node_id, redis_config=redis_config)
    self.client_args = {"timeout": 300}
    self.proxies = iter(proxies) if proxies else None
    self.user_api = None
    self.init_user_api()
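# A minimal dispatch sketch (an assumption for illustration, not code from the
# original module) showing how the self.tasks table above might be consumed:
# each command name maps either to a crawler method name, or to a dict keyed
# by the requested data_type. The real crawler loop may differ.
def _dispatch_sketch(crawler, cmd):
    task = crawler.tasks.get(cmd['cmd'])
    if task == "TERMINATE":
        return False  # caller breaks out of its work loop
    if isinstance(task, dict):
        # network crawls pick the handler by data_type: "users" or "ids"
        task = task[cmd['data_type']]
    # hypothetical: timeline/network commands carry user_id, CRAWL_TWEET a tweet_id
    target = cmd.get('user_id') or cmd.get('tweet_id')
    getattr(crawler, task)(target)
    return True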
def test_redis_connections(self):
    nodes = {}
    cnt = 0
    while True:
        nodes[cnt] = NodeQueue("node_id", redis_config=self.config['redis_config'])
        cnt += 1
        if cnt % 5 == 0:
            nodes.clear()
        time.sleep(1)
def flush_cmd(bulk, data_type, template, redis_config):
    try:
        node_coordinator = NodeCoordinator(redis_config=redis_config)
        qsizes = node_coordinator.node_qsizes()
        logger.debug(qsizes)

        node_queues = {}
        for element in bulk:
            if data_type == "ids" and isinstance(element, int):
                user_id = element
            elif data_type == "users" and isinstance(element, dict) and "id" in element:
                user_id = element['id']
            else:
                continue  # skip elements that don't match the expected shape

            t = copy.copy(template)
            t["user_id"] = int(user_id)
            t["depth"] = int(t["depth"]) - 1

            # send each command to the node with the smallest queue
            node_id = get_keys_by_min_value(qsizes)[0]
            if node_id in node_queues:
                node_queue = node_queues[node_id]
            else:
                node_queue = NodeQueue(node_id, redis_config=redis_config)
                node_queues[node_id] = node_queue

            t['cmd_hash'] = hash_cmd(t)
            node_queue.put(t)
            qsizes[node_id] += 1

            logger.debug("send [%s] to node: %s" % (json.dumps(t), node_id))

        # intended to close all redis connections, but not sure yet...
        node_queues.clear()
        del node_coordinator
    except Exception as exc:
        logger.error('error during flush: %s' % exc)

    return True
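# flush_cmd leans on get_keys_by_min_value(qsizes) to pick the least-loaded
# node. That helper is defined elsewhere in the package; this is a minimal
# sketch of what it plausibly does (an assumption, kept under a distinct name):
def get_keys_by_min_value_sketch(d):
    """Return all keys whose value equals the minimum value in d."""
    min_value = min(d.values())
    return [k for k, v in d.items() if v == min_value]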
def start_server(config, proxies):
    check_config(config)
    config = copy.copy(config)

    folders_to_create = []
    buckets = ["tweets", "followers", "follower_ids", "friends", "friend_ids", "timelines"]

    output_folder = os.path.abspath(config['output'])
    archive_output = os.path.abspath(config['archive_output']) if config['archive_output'] else output_folder
    archive_output = os.path.join(archive_output, 'archived')

    folders_to_create.append(output_folder)
    folders_to_create.append(archive_output)

    for bucket in buckets:
        folders_to_create.append(os.path.join(output_folder, bucket))
        folders_to_create.append(os.path.join(archive_output, bucket))

    for folder_to_create in folders_to_create:
        if not os.path.exists(folder_to_create):
            os.makedirs(folder_to_create)

    logger.info("output to %s" % output_folder)
    logger.info("archived to %s" % archive_output)

    this_node_id = node_id()
    node_queue = NodeQueue(this_node_id, redis_config=config['redis_config'])
    node_queue.clear()

    scheduler = Scheduler(this_node_id, config=config, proxies=proxies)
    logger.info('starting node_id: %s' % this_node_id)

    node_coordinator = NodeCoordinator(config['redis_config'])
    # node_coordinator.clear()

    # The main event loop. Strictly speaking we don't need one, since we could
    # just join on the crawlers and not stop until a TERMINATE command is
    # issued to each crawler; but we keep one to report the status of each
    # crawler and to perform the periodic tarball tasks...
    last_archive_ts = time.time() + 3600  # the first archive event starts 1 hr later...
    pre_time = time.time()
    last_load_balancing_task_ts = time.time()
    while True:
        if time.time() - pre_time > 120:
            logger.info(pprint.pformat(scheduler.crawler_status()))
            pre_time = time.time()
            if scheduler.is_alive():
                cmd = {'cmd': 'CRAWLER_FLUSH'}
                scheduler.enqueue(cmd)

        if time.time() - last_archive_ts > 3600:
            logger.info("start archive procedure...")
            with concurrent.futures.ProcessPoolExecutor(max_workers=len(buckets)) as executor:
                future_proxies = {
                    executor.submit(tarball_results, output_folder, bucket, archive_output,
                                    int(time.time()) - 3600): bucket
                    for bucket in buckets
                }
                for future in future_proxies:
                    future.add_done_callback(
                        lambda f: logger.info("archive created? %s: [%s]" % f.result()))
            last_archive_ts = time.time()

        # block the main process... for a command
        if not scheduler.is_alive():
            logger.info("no crawler is alive... waiting to recreate all crawlers...")
            time.sleep(120)  # sleep for two minutes and retry
            continue

        if time.time() - last_load_balancing_task_ts > 1800:  # try to balance the local queues every 30 mins
            last_load_balancing_task_ts = time.time()
            cmd = {'cmd': 'BALANCING_LOAD'}
            scheduler.enqueue(cmd)

        cmd = node_queue.get(block=True, timeout=360)
        if cmd:
            scheduler.enqueue(cmd)
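# Hypothetical invocation of start_server; the config layout (output,
# archive_output, redis_config, apikeys) is inferred from the accesses above
# and from check_config's name, so treat the exact keys as assumptions.
if __name__ == '__main__':
    example_config = {
        'output': './data',
        'archive_output': './data/archive',
        'redis_config': {'host': 'localhost', 'port': 6379, 'db': 0},
        'apikeys': {},  # per-crawler Twitter API credentials go here
    }
    start_server(example_config, proxies=None)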
def cmd(config, args):
    if args.command not in avaliable_cmds:
        raise Exception("not a valid command...")

    nid = args.node_id
    logger.info("node_id: %s" % (nid))

    node_queue = NodeQueue(nid, redis_config=config['redis_config'])
    node_coordinator = NodeCoordinator(config['redis_config'])

    # this can be done locally without sending the command to the servers...
    if args.command == 'GET_UIDS_FROM_SCREEN_NAMES':
        apikeys = config["apikeys"].values()[0]
        if not os.path.exists(args.json):
            raise Exception("doesn't exist... ")
        with open(os.path.abspath(args.json), 'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
            screen_names = json.load(f)
            twitter_api = TwitterAPI(apikeys=apikeys)
            user_ids = twitter_api.get_user_ids_by_screen_names(screen_names)
            json.dump(list(user_ids), o_f)
    elif args.command == 'GET_USERS_FROM_IDS':
        apikeys = config["apikeys"].values()[0]
        if not os.path.exists(args.json):
            raise Exception("doesn't exist... ")
        with open(os.path.abspath(args.json), 'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
            user_ids = json.load(f)
            twitter_api = TwitterAPI(apikeys=apikeys)
            users = twitter_api.get_users(user_ids)
            json.dump(list(users), o_f)
    elif args.command.startswith('BATCH_'):
        new_command = args.command.replace('BATCH_', '')
        args_dict = copy.copy(args.__dict__)
        if not os.path.exists(args.json):
            raise Exception("doesn't exist... ")
        with open(os.path.abspath(args.json), 'rb') as f:
            if args.command == 'BATCH_CRAWL_TWEET':
                tweet_ids = json.load(f)
                for tweet_id in tweet_ids:
                    logger.info("loading Tweet ID: %s" % tweet_id)
                    args_dict['tweet_id'] = tweet_id
                    cmd = new_cmd(new_command, args_dict)
                    node_queue.put(cmd)
            else:
                user_ids = json.load(f)
                for user_id in user_ids:
                    args_dict['user_id'] = user_id
                    cmd = new_cmd(new_command, args_dict)
                    node_queue.put(cmd)
    elif args.command == 'LIST_NODES':
        pp.pprint(node_coordinator.list_nodes())
    elif args.command == 'NODE_QSIZES':
        raise NotImplementedError("not implemented yet...")
        # pp.pprint(node_coordinator.list_nodes())
    elif args.command == 'SHUTDOWN_NODE':
        # node_coordinator.remove_node(nid)
        # pp.pprint(node_coordinator.list_nodes())
        raise NotImplementedError("not implemented yet...")
    elif args.command == 'CLEAR_NODE_QUEUES':
        node_queue.clear_all_queues()
    else:
        args_dict = copy.copy(args.__dict__)
        cmd = new_cmd(args.command, args_dict)
        node_queue.put(cmd)
        logger.info('sent [%s]' % (cmd))
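# cmd() builds its queue payloads through new_cmd(command, args_dict), which
# is defined elsewhere in the package. A minimal sketch of the shape it
# plausibly returns, inferred from the fields consumed by the crawlers above
# (cmd, user_id/tweet_id, data_type, depth, bucket); this is an assumption
# for illustration, kept under a distinct name, not the real helper.
def new_cmd_sketch(command, args_dict):
    return {
        'cmd': command,
        'user_id': args_dict.get('user_id'),
        'tweet_id': args_dict.get('tweet_id'),
        'data_type': args_dict.get('data_type'),
        'depth': args_dict.get('depth'),
        'bucket': args_dict.get('bucket'),
    }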