def get(self, *args, **kwargs):
    """list workers that have checked-in, each annotated with online/offline status"""

    def with_status(doc):
        # a worker counts as online if it checked-in within the OFFLINE_DELAY window
        idle_for = getnow() - doc["last_seen"]
        doc["status"] = (
            "online" if idle_for.total_seconds() < OFFLINE_DELAY else "offline"
        )
        return doc

    # pagination params, validated by schema
    pagination = SkipLimitSchema().load(request.args.to_dict())
    skip, limit = pagination["skip"], pagination["limit"]

    query = {}
    nb_workers = Workers().count_documents(query)
    fields = {
        "_id": 0,
        "name": 1,
        "username": 1,
        "offliners": 1,
        "resources": 1,
        "last_seen": 1,
    }
    cursor = (
        Workers()
        .find(query, fields)
        .sort("name", pymongo.ASCENDING)
        .skip(skip)
        .limit(limit)
    )
    return jsonify(
        {
            "meta": {"skip": skip, "limit": limit, "count": nb_workers},
            "items": [with_status(worker) for worker in cursor],
        }
    )
def get(self, token: AccessToken.Payload):
    """requested tasks available for retrieval by workers, auth-only

    Returns at most one task (meta.limit is fixed at 1).
    """
    query_args = request.args.to_dict()
    worker_name = query_args.get("worker")

    # an authenticated worker polling for work counts as a check-in
    if token and worker_name:
        Workers().update_one(
            {"name": worker_name, "username": token.username},
            {"$set": {"last_seen": getnow()}},
        )

    cleaned = WorkerRequestedTaskSchema().load(query_args)
    task = find_requested_task_for(
        token.username,
        worker_name,
        cleaned["avail_cpu"],
        cleaned["avail_memory"],
        cleaned["avail_disk"],
    )

    if task:
        meta = {"skip": 0, "limit": 1, "count": 1}
        items = [task]
    else:
        meta = {"skip": 0, "limit": 1, "count": 0}
        items = []
    return jsonify({"meta": meta, "items": items})
def delete(self, token: AccessToken.Payload, username: str):
    """remove a user and every worker registered under that username"""
    result = Users().delete_one({"username": username})
    if not result.deleted_count:
        # no such user
        raise errors.NotFound()

    # cascade: the user's workers go with them
    Workers().delete_many({"username": username})

    return Response(status=HTTPStatus.NO_CONTENT)
def put(self, name: str, *args, **kwargs):
    """record a worker check-in: upsert its document and broadcast the event

    Raises InvalidRequestJSON if the payload fails schema validation.
    """
    try:
        payload = WorkerCheckInSchema().load(request.get_json())
    except ValidationError as e:
        raise InvalidRequestJSON(e.messages)

    worker_doc = {
        "name": name,
        "username": payload["username"],
        "selfish": payload["selfish"],
        "resources": {key: payload[key] for key in ("cpu", "memory", "disk")},
        "offliners": payload["offliners"],
        "platforms": payload.get("platforms", {}),
        "last_seen": getnow(),
    }
    # upsert so a first-time check-in creates the worker document
    Workers().replace_one({"name": name}, worker_doc, upsert=True)
    BROADCASTER.broadcast_worker_checkin(worker_doc)

    return Response(status=HTTPStatus.NO_CONTENT)
def find_requested_task_for(username, worker_name, avail_cpu, avail_memory, avail_disk): """optimal requested_task to run now for a given worker Accounts for: - longest tasks this worker can do (total resources) - available resources now (sent) - extimated duration to reclaim resources for longest tasks """ # get total resources for that worker worker = Workers().find_one( { "username": username, "name": worker_name }, { "resources": 1, "offliners": 1, "last_seen": 1, "name": 1, "selfish": 1, "platforms": 1, }, ) # worker is not checked-in if worker is None: logger.error(f"worker `{worker_name}` not checked-in") return None # retrieve list of tasks we are currently running with associated resources running_tasks = get_currently_running_tasks(worker_name) # find all requested tasks that this worker can do with its total resources # sorted by priorities # sorted by max durations tasks_worker_could_do = get_reqs_doable_by(worker) # filter-out requested tasks that are not doable now due to platform limitations worker_platform_filter = functools.partial( does_platform_allow_worker_to_run, worker, running_tasks) tasks_worker_could_do = filter(worker_platform_filter, tasks_worker_could_do) # record available resources available_resources = { "cpu": avail_cpu, "memory": avail_memory, "disk": avail_disk } try: # candidate is task[0] candidate = next(tasks_worker_could_do) except StopIteration: logger.debug( f"no request doable by worker (selfish={worker.get('selfish')})") return None # can worker do task[0] ? # if yes -> return task[0] if can_run(candidate, available_resources): logger.debug("first candidate can be run!") return candidate # we don't have enough resources for task[0]. 
# find out missing resources missing_cpu = max([candidate["config"]["resources"]["cpu"] - avail_cpu, 0]) missing_memory = max( [candidate["config"]["resources"]["memory"] - avail_memory, 0]) missing_disk = max( [candidate["config"]["resources"]["disk"] - avail_disk, 0]) logger.debug( f"missing cpu:{missing_cpu}, mem:{missing_memory}, dsk:{missing_disk}") # pile-up all of those which we need to complete to have enough resources preventing_tasks = [] # sorted by ETA as it's the order in which there're gonna complete for task in sorted(running_tasks, key=lambda x: x["eta"]): preventing_tasks.append(task) if (sum([t["config"]["resources"]["cpu"] for t in preventing_tasks]) >= missing_cpu and sum([ t["config"]["resources"]["memory"] for t in preventing_tasks ]) >= missing_memory and sum([ t["config"]["resources"]["disk"] for t in preventing_tasks ]) >= missing_disk): # stop when we'd have reclaimed our missing resources break if not preventing_tasks: # we should not get there: no preventing task yet we don't have our total # resources available? problem. logger.error("we have no preventing tasks. oops") return None logger.debug(f"we have {len(preventing_tasks)} tasks blocking out way") opening_eta = preventing_tasks[-1]["eta"] logger.debug(f"opening_eta:{opening_eta}") # get the number of available seconds from now to that ETA available_time = (opening_eta - getnow()).total_seconds() logger.debug("we have approx. {}mn to reclaim resources".format( available_time / 60)) # loop on task[1+] to find the first task which can fit temp_candidate = get_possible_task_with(tasks_worker_could_do, available_resources, available_time) if temp_candidate: return temp_candidate # if none in the loop are possible, return None (worker will wait) logger.debug( "unable to fit anything, you'll have to wait for task to complete") return None
def list_of_requested_tasks(token: AccessToken.Payload = None):
    """paginated, filterable list of requested tasks"""
    raw_args = request.args.to_dict()
    worker = raw_args.get("worker")

    # an authenticated worker listing tasks counts as a check-in
    if token and worker:
        Workers().update_one(
            {"name": worker, "username": token.username},
            {"$set": {"last_seen": getnow()}},
        )

    # multi-valued params must be collected via getlist (to_dict keeps one value)
    raw_args["matching_offliners"] = request.args.getlist("matching_offliners")
    raw_args["schedule_name"] = request.args.getlist("schedule_name")
    params = RequestedTaskSchema().load(raw_args)

    # unpack query parameters
    skip, limit = params["skip"], params["limit"]
    schedule_names = params["schedule_name"]
    priority = params.get("priority")

    # assemble the mongo query from the provided filters
    # NOTE(review): a priority of 0 is falsy and thus applies no filter — confirm intended
    query = {}
    if schedule_names:
        query["schedule_name"] = {"$in": schedule_names}
    if priority:
        query["priority"] = {"$gte": priority}
    if worker:
        # tasks that are unassigned or already assigned to this worker
        query["worker"] = {"$in": [None, worker]}

    for resource in ("cpu", "memory", "disk"):
        arg_name = f"matching_{resource}"
        if arg_name in params:
            query[f"config.resources.{resource}"] = {"$lte": params[arg_name]}

    offliners = params.get("matching_offliners")
    if offliners:
        query["config.task_name"] = {"$in": offliners}

    projection = {
        "_id": 1,
        "status": 1,
        "schedule_name": 1,
        "config.task_name": 1,
        "config.resources": 1,
        "timestamp.requested": 1,
        "requested_by": 1,
        "priority": 1,
        "worker": 1,
    }
    ordering = [
        ("priority", pymongo.DESCENDING),
        ("timestamp.reserved", pymongo.DESCENDING),
        ("timestamp.requested", pymongo.DESCENDING),
    ]
    cursor = (
        RequestedTasks()
        .find(query, projection)
        .sort(ordering)
        .skip(skip)
        .limit(limit)
    )
    count = RequestedTasks().count_documents(query)

    return jsonify(
        {
            "meta": {"skip": skip, "limit": limit, "count": count},
            "items": list(cursor),
        }
    )