# Module-level dependencies assumed by the excerpts below; NodeCoordinator,
# NodeQueue, hash_cmd and get_keys_by_min_value are project-internal helpers.
import copy
import json
import logging

logger = logging.getLogger(__name__)


def distribute_to_nodes(self, crawler_queue):
    """Drain crawler_queue, handing each command to the currently
    least-loaded node."""
    qsizes = self.node_qsizes()
    cmd = crawler_queue.get(timeout=60)
    # loop until the queue yields a falsy sentinel command
    while cmd:
        # pick the node with the smallest queue, send the command there,
        # and bump the local size counter so the next pick stays balanced
        node_id = get_keys_by_min_value(qsizes)[0]
        node = self.get_node(node_id)
        node.put(cmd)
        qsizes[node_id] += 1
        cmd = crawler_queue.get(timeout=60)
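# distribute_to_nodes() above (and split() / flush_cmd() below) rely on
# get_keys_by_min_value(), whose implementation is not part of this excerpt.
# A minimal sketch consistent with how it is called (callers always take
# element [0] of the result) might look like this -- the name is the
# project's, the body is an assumption:
def get_keys_by_min_value(qsizes):
    """Return the keys of qsizes whose value equals the minimum value
    (hypothetical sketch)."""
    min_size = min(qsizes.values())
    return [node_id for node_id, size in qsizes.items() if size == min_size]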
def split(self, lst, n):
    """Yield n successive, evenly sized sub-lists of lst."""
    lsize = {}
    results = {}
    for i in range(n):
        lsize[i] = 0
        results[i] = []
    for x in lst:
        # append each element to whichever bucket is currently smallest
        idx = get_keys_by_min_value(lsize)[0]
        results[idx].append(x)
        lsize[idx] += 1
    for i in range(n):
        yield results[i]
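# A quick usage sketch (assuming the get_keys_by_min_value() sketched above,
# which scans keys in insertion order): elements are dealt out round-robin,
# so no two buckets ever differ in size by more than one element.
#
#   list(self.split(list(range(10)), 3))
#   -> [[0, 3, 6, 9], [1, 4, 7], [2, 5, 8]]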
def flush_cmd(bulk, data_type, template, redis_config):
    try:
        node_coordinator = NodeCoordinator(redis_config=redis_config)
        qsizes = node_coordinator.node_qsizes()
        logger.debug(qsizes)

        node_queues = {}
        for element in bulk:
            if data_type == "ids" and type(element) == int:
                user_id = element
            elif data_type == "users" and type(element) == dict and "id" in element:
                user_id = element['id']
            else:
                # skip anything that does not match the declared data_type;
                # previously a bad element would silently reuse the last user_id
                continue

            # one command per user, one level shallower than the template
            t = copy.copy(template)
            t["user_id"] = int(user_id)
            t["depth"] = int(t["depth"]) - 1

            # route the command to the least-loaded node, creating that
            # node's queue lazily and caching it for reuse
            node_id = get_keys_by_min_value(qsizes)[0]
            if node_id in node_queues:
                node_queue = node_queues[node_id]
            else:
                node_queue = NodeQueue(node_id, redis_config=redis_config)
                node_queues[node_id] = node_queue

            t['cmd_hash'] = hash_cmd(t)
            node_queue.put(t)
            qsizes[node_id] += 1
            logger.debug("send [%s] to node: %s" % (json.dumps(t), node_id))

        # intend to close all redis connections, but not sure yet...
        node_queues.clear()
        del node_coordinator
    except Exception as exc:
        logger.error('error during flush: %s' % exc)

    return True
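# flush_cmd() stamps each command with hash_cmd(t) before queueing it,
# presumably so downstream consumers can de-duplicate commands. The real
# implementation is not part of this excerpt; a minimal sketch, assuming a
# canonical JSON serialization of the command dict, could be:
import hashlib


def hash_cmd(cmd):
    """Stable digest of a command dict (hypothetical sketch)."""
    return hashlib.md5(
        json.dumps(cmd, sort_keys=True).encode('utf-8')).hexdigest()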