Example #1
0
 def post(self, actor_id):
     """Ensure a certain number of workers are running for an actor.

     Looks up the actor for the current tenant, reads the requested worker
     count from the validated POST arguments (defaulting to 1) and, when
     fewer workers exist, requests the missing ones and puts one start
     command on the command channel.

     :param actor_id: the tenant-local id of the actor.
     :return: an ``ok`` response describing what was scheduled.
     :raises ResourceError: (404) when the actor is not in the actors store.
     """
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[dbid])
     except KeyError:
         raise ResourceError("actor not found: {}".format(actor_id), 404)
     args = self.validate_post()
     # a missing or zero `num` means "make sure there is at least one worker";
     # coerce once so the comparison and the arithmetic below use the same int.
     num = int(args.get('num') or 1)
     workers = Worker.get_workers(dbid)
     current_number_workers = len(workers.items())
     if current_number_workers >= num:
         return ok(result=None,
                   msg="Actor {} already had {} worker(s).".format(
                       actor_id, current_number_workers))
     num_to_add = num - current_number_workers
     worker_ids = [Worker.request_worker(actor_id)
                   for _ in range(num_to_add)]
     ch = CommandChannel()
     try:
         ch.put_cmd(actor_id=actor.db_id,
                    worker_ids=worker_ids,
                    image=actor.image,
                    tenant=g.tenant,
                    num=num_to_add,
                    stop_existing=False)
     finally:
         # release the channel even if the command could not be queued.
         ch.close()
     return ok(
         result=None,
         msg="Scheduled {} new worker(s) to start. There were only {} "
             "worker(s) for actor {}.".format(
                 num_to_add, current_number_workers, actor_id))
Example #2
0
 def get(self, actor_id):
     """Return the display form of every worker registered for an actor.

     :param actor_id: the tenant-local id of the actor.
     :return: an ``ok`` response whose result is a list of worker displays.
     :raises ResourceError: (404) when the actor or its workers are not found.
     """
     logger.debug("top of GET /actors/{}/workers for tenant {}.".format(
         actor_id, g.tenant))
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         # only used to verify that the actor exists.
         Actor.from_db(actors_store[dbid])
     except KeyError:
         logger.debug("did not find actor: {}.".format(actor_id))
         raise ResourceError("No actor found with id: {}.".format(actor_id),
                             404)
     try:
         workers = Worker.get_workers(dbid)
     except WorkerException as e:
         logger.debug(
             "did not find workers for actor: {}.".format(actor_id))
         raise ResourceError(e.msg, 404)
     result = []
     for worker_id, worker in workers.items():
         worker.update({'id': worker_id})
         try:
             result.append(Worker(**worker).display())
         except Exception as e:
             # a malformed worker description must not break the whole
             # listing; record why it was skipped and move on.
             logger.error(
                 "Unable to instantiate worker in workers endpoint from description: {}. "
                 "Exception: {}".format(worker, e))
     return ok(result=result, msg="Workers retrieved successfully.")
Example #3
0
async def jobs_dispatcher():
    """Pair scheduled jobs with available workers and start them.

    Creates an initial worker row when none exists, then, inside one
    IMMEDIATE transaction, marks each paired job as "running" and each
    paired worker as "busy" and schedules ``run_job`` for the pair. When
    there is no scheduled job the coroutine sleeps for three seconds before
    returning; that sleep happens after the transaction has been released.
    """
    # bootstrap: make sure at least one worker row exists.
    if Worker.select().count() == 0:
        Worker.create(state="available")

    available = Worker.select().where(Worker.state == "available")

    # no available workers, nothing can be dispatched on this pass
    if available.count() == 0:
        return

    with db.atomic('IMMEDIATE'):
        workers = list(available)
        jobs = list(Job.select().where(Job.state == "scheduled"))

        # zip() stops at the shorter list, i.e. min(len(workers), len(jobs)).
        for worker, job in zip(workers, jobs):
            job.state = "running"
            job.started_time = datetime.now()
            job.save()

            worker.state = "busy"
            worker.save()

            jobs_in_memory_state[job.id] = {
                "worker": worker.id,
                "task": asyncio.ensure_future(run_job(worker, job)),
            }

    # no jobs to process: wait, but only once the transaction is closed so
    # it is not held open for the duration of the sleep.
    if not jobs:
        await asyncio.sleep(3)
Example #4
0
 def check_new_params(self, cmd):
     """Run the extra validation needed for a new client request.

     :param cmd: mapping read with ``.get`` for 'actor_id' and 'worker_id'.
     :return: ``(valid, msg, owner)``. ``valid`` and ``msg`` come from
         ``check_common``; on a failed actor or worker lookup the result is
         ``(False, <error message>, None)``.
     """
     valid, msg = self.check_common(cmd)
     actor_id = cmd.get('actor_id')
     # the actor has to exist in the actors store
     try:
         actor = Actor.from_db(actors_store[actor_id])
     except KeyError:
         error = "Unable to look up actor with id: {}".format(actor_id)
         logger.error(error)
         return False, error, None
     # the worker has to exist for that actor
     try:
         Worker.get_worker(actor_id=actor_id,
                           worker_id=cmd.get('worker_id'))
     except WorkerException as exc:
         error = "Unable to look up worker: {}".format(exc.msg)
         logger.error(error)
         return False, error, None
     logger.debug("new params were valid.")
     prefix = get_tenant_userstore_prefix(actor.tenant)
     logger.debug(
         f"using owner prefix: {prefix} for tenant: {actor.tenant}")
     # qualify the owner with the prefix only when one was returned
     owner = f"{prefix}/{actor.owner}" if prefix else actor.owner
     logger.debug(f"using owner: {owner}")
     return valid, msg, owner
Example #5
0
 def post(self, actor_id):
     """Ensure a certain number of workers are running for an actor.

     Looks up the actor for the current tenant, reads the requested worker
     count from the validated POST arguments (defaulting to 1) and, when
     fewer workers exist, requests the missing ones and puts one start
     command on the command channel.

     :param actor_id: the tenant-local id of the actor.
     :return: an ``ok`` response describing what was scheduled.
     :raises ResourceError: (404) when the actor or its workers are not found.
     """
     logger.debug("top of POST /actors/{}/workers.".format(actor_id))
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[dbid])
     except KeyError:
         logger.debug("did not find actor: {}.".format(actor_id))
         raise ResourceError("No actor found with id: {}.".format(actor_id),
                             404)
     args = self.validate_post()
     logger.debug(
         "workers POST params validated. actor: {}.".format(actor_id))
     num = args.get('num')
     if not num:
         logger.debug("did not get a num: {}.".format(actor_id))
         num = 1
     # coerce once so the comparison and the arithmetic below agree.
     num = int(num)
     logger.debug("ensuring at least {} workers. actor: {}.".format(
         num, actor_id))
     try:
         workers = Worker.get_workers(dbid)
     except WorkerException as e:
         logger.debug(
             "did not find workers for actor: {}.".format(actor_id))
         raise ResourceError(e.msg, 404)
     current_number_workers = len(workers.items())
     if current_number_workers >= num:
         return ok(result=None,
                   msg="Actor {} already had {} worker(s).".format(
                       actor_id, current_number_workers))
     logger.debug(
         "There were only {} workers for actor: {} so we're adding more."
         .format(current_number_workers, actor_id))
     num_to_add = num - current_number_workers
     logger.info("adding {} more workers for actor {}".format(
         num_to_add, actor_id))
     worker_ids = [
         Worker.request_worker(tenant=g.tenant, actor_id=actor_id)
         for _ in range(num_to_add)
     ]
     logger.info("New worker ids: {}".format(worker_ids))
     ch = CommandChannel()
     try:
         ch.put_cmd(actor_id=actor.db_id,
                    worker_ids=worker_ids,
                    image=actor.image,
                    tenant=g.tenant,
                    num=num_to_add,
                    stop_existing=False)
     finally:
         # release the channel even if the command could not be queued.
         ch.close()
     logger.info(
         "Message put on command channel for new worker ids: {}".format(
             worker_ids))
     return ok(
         result=None,
         msg="Scheduled {} new worker(s) to start. There were only {} "
             "worker(s) for actor {}.".format(
                 num_to_add, current_number_workers, actor_id))
Example #6
0
def process_worker_ch(tenant, worker_ch, actor_id, worker_id, actor_ch,
                      ag_client):
    """Target for a thread that listens on the worker channel for control messages.

    Polls ``worker_ch`` with a two second timeout. A dict message whose
    'value' is 'status' is answered with 'ok' on its 'reply_to' channel. The
    string 'stop' triggers shutdown: the associated client (if any) is
    requested to be deleted, the worker record is deleted, ``keep_running``
    is set to False, ``actor_ch`` is closed and ``sys.exit()`` is called.

    :param tenant: tenant passed along with the client delete request.
    :param worker_ch: channel to read control messages from.
    :param actor_id: id of the actor this worker belongs to.
    :param worker_id: id of this worker.
    :param actor_ch: actor channel closed on shutdown.
    :param ag_client: client object exposing ``api_key``; may be falsy.
    :return: does not return normally; exits via ``sys.exit()`` on 'stop'.
    """
    global keep_running
    logger.info("Worker subscribing to worker channel...")
    while True:
        try:
            msg = worker_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        logger.debug("Received message in worker channel: {}".format(msg))
        logger.debug("Type(msg)={}".format(type(msg)))
        if isinstance(msg, dict):
            value = msg.get('value', '')
            if value == 'status':
                # this is a health check, return 'ok' to the reply_to channel.
                logger.debug("received health check. returning 'ok'.")
                ch = msg['reply_to']
                ch.put('ok')
        elif msg == 'stop':
            logger.info("Received stop message, stopping worker...")
            # first, delete an associated client
            # its possible this worker was not passed a client,
            # but if so, we need to delete it before shutting down.
            if ag_client:
                logger.info("Requesting client {} be deleted.".format(
                    ag_client.api_key))
                secret = os.environ.get('_abaco_secret')
                clients_ch = ClientsChannel()
                # keep the response separate from the control message `msg`.
                response = clients_ch.request_delete_client(
                    tenant=tenant,
                    actor_id=actor_id,
                    worker_id=worker_id,
                    client_id=ag_client.api_key,
                    secret=secret)

                if response['status'] == 'ok':
                    logger.info("Delete request completed successfully.")
                else:
                    logger.error("Error deleting client. Message: {}".format(
                        response['message']))
            else:
                logger.info(
                    "Did not receive client. Not issuing delete. Exiting.")
            try:
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                logger.info(
                    "Got WorkerException from delete_worker(). Exception: {}".
                    format(e))
            keep_running = False
            logger.info("Closing actor channel for actor: {}".format(actor_id))
            actor_ch.close()
            logger.info("Worker is now exiting.")
            sys.exit()
Example #7
0
def create_worker(username, password, role,):
    """Create a user account and its Worker record; return the worker's id.

    The user is first created without a password, then the given password
    is set and the user is saved again before the Worker row is stored.
    """
    account = User.objects.create_user(username, password=None)
    account.set_password(password)
    account.save()

    record = Worker(user=account, role=role)
    record.save()
    return record.id
Example #8
0
    def test_boss_workers_sequence(self):
        """Workers are kept by the boss in the order in which they were added."""
        expected = [
            Worker(1, 'w1', ''),
            Worker(1, 'w3', ''),
            Worker(1, 'w2', ''),
        ]

        for worker in expected:
            self.boss.add_worker(worker)

        # assertEqual reports both sequences on failure, unlike assertTrue(a == b).
        self.assertEqual(self.boss.workers, expected)
Example #9
0
 def kill_worker(self, actor_id, worker_id):
     """Delete the worker record, logging (never raising) any failure.

     :param actor_id: id of the actor that owns the worker.
     :param worker_id: id of the worker to delete.
     """
     try:
         Worker.delete_worker(actor_id, worker_id)
     except WorkerException as e:
         logger.info("Got WorkerException from delete_worker(). "
                     "worker_id: {}; "
                     "Exception: {}".format(worker_id, e))
     except Exception as e:
         # best effort: this runs during cleanup, so report and carry on.
         logger.error("Got an unexpected exception from delete_worker(). "
                      "worker_id: {}; "
                      "Exception: {}".format(worker_id, e))
Example #10
0
 def kill_worker(self, actor_id, worker_id):
     """Delete the worker record, logging (never raising) any failure.

     :param actor_id: id of the actor that owns the worker.
     :param worker_id: id of the worker to delete.
     """
     logger.debug(f"top of kill_worker: {actor_id}_{worker_id}")
     try:
         Worker.delete_worker(actor_id, worker_id)
         logger.debug(f"worker deleted; {actor_id}_{worker_id}")
     except WorkerException as e:
         logger.info("Got WorkerException from delete_worker(). "
                     "worker_id: {}; "
                     "Exception: {}".format(worker_id, e))
     except Exception as e:
         # best effort: this runs during cleanup, so report and carry on.
         logger.error("Got an unexpected exception from delete_worker(). "
                      "worker_id: {}; "
                      "Exception: {}".format(worker_id, e))
Example #11
0
 def check_new_params(self, cmd):
     """Validate a new-client command beyond the common checks.

     :param cmd: mapping read with ``.get`` for 'actor_id' and 'worker_id'.
     :return: ``(valid, msg, owner)``; ``(False, <error>, None)`` when the
         actor or the worker cannot be looked up.
     """
     valid, msg = self.check_common(cmd)
     actor_id = cmd.get('actor_id')
     # the actor must be present in the actors store
     try:
         actor = Actor.from_db(actors_store[actor_id])
     except KeyError:
         error = "Unable to look up actor with id: {}".format(actor_id)
         return False, error, None
     # the worker must be known for that actor
     try:
         Worker.get_worker(actor_id=actor_id, worker_id=cmd.get('worker_id'))
     except WorkerException as exc:
         error = "Unable to look up worker: {}".format(exc.msg)
         return False, error, None
     return valid, msg, actor.owner
Example #12
0
 def check_new_params(self, cmd):
     """Validate a new-client command beyond the common checks.

     :param cmd: mapping read with ``.get`` for 'actor_id' and 'worker_id'.
     :return: ``(valid, msg, owner)``; ``(False, <error>, None)`` when the
         actor or the worker cannot be looked up.
     """
     valid, msg = self.check_common(cmd)
     actor_id = cmd.get('actor_id')
     # the actor must be present in the actors store
     try:
         actor = Actor.from_db(actors_store[actor_id])
     except KeyError:
         error = "Unable to look up actor with id: {}".format(actor_id)
         return False, error, None
     # the worker is looked up by its channel name
     try:
         Worker.get_worker(actor_id=actor_id, ch_name=cmd.get('worker_id'))
     except WorkerException as exc:
         error = "Unable to look up worker: {}".format(exc.msg)
         return False, error, None
     return valid, msg, actor.owner
Example #13
0
def handle_signup_worker():
    """Register a worker so that it is included in the DB.

    Reads the JSON request body, builds a ``Worker`` from the required
    fields, stores it and returns its serialized form.

    :return: ``(serialized worker, 201)``.
    :raises APIException: (400) when the body is not a JSON object or when
        any required field is missing.
    """
    required_fields = (
        'init_date',
        'Consultor',
        'candidate',
        'cedula',
        'status',
        'phone_number',
        'email',
        'catchment_source',
        'managment',
        'vacant',
        'Observations',
        'interview_date',
        'actual_charge',
        'company',
        'sector',
        'coin',
        'basic_salary',
        'variable_salary',
        'cesta_ticket',
        'Profit_Days',
        'vacations',
        'Vacation_Bonus',
        'Factor',
        'Estimated_annual_package',
        'Production_bonus',
        'Transport_bonus',
        'Savings_Bank',
        'food_bags',
        'parking_payment',
        'partial_HCM_Emp_Family',
        'Vehicle_insurance',
        'life_insurance',
        'dinning_room',
        'full_HCM_Emp_Family',
    )

    data = request.get_json()

    if data is None or not isinstance(data, dict):
        raise APIException(
            "You need to specify the request body as a json object",
            status_code=400)

    # report every missing field at once instead of failing with a KeyError.
    missing = [field for field in required_fields if field not in data]
    if missing:
        raise APIException(
            "Missing required field(s): {}".format(", ".join(missing)),
            status_code=400)

    new_worker = Worker(**{field: data[field] for field in required_fields})
    db.session.add(new_worker)
    db.session.commit()
    return new_worker.serialize(), 201
Example #14
0
File: worker.py Project: TACC/abaco
def process_worker_ch(tenant, worker_ch, actor_id, actor_ch, ag_client):
    """Target for a thread that listens on the worker channel for control messages.

    Polls ``worker_ch`` with a two second timeout. A dict message whose
    'value' is 'status' is answered with 'ok' on its 'reply_to' channel. The
    string 'stop' triggers shutdown: the associated client (if any) is
    requested to be deleted, the worker record (keyed by ``worker_ch.name``)
    is deleted, ``keep_running`` is set to False, ``actor_ch`` is closed and
    ``sys.exit()`` is called.

    :param tenant: tenant passed along with the client delete request.
    :param worker_ch: channel to read control messages from.
    :param actor_id: id of the actor this worker belongs to.
    :param actor_ch: actor channel closed on shutdown.
    :param ag_client: client object exposing ``api_key``; may be falsy.
    :return: does not return normally; exits via ``sys.exit()`` on 'stop'.
    """
    global keep_running
    print("Worker subscribing to worker channel...")
    while True:
        try:
            msg = worker_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        print("Received message in worker channel: {}".format(msg))
        print("Type(msg)={}".format(type(msg)))
        if isinstance(msg, dict):
            value = msg.get('value', '')
            if value == 'status':
                # this is a health check, return 'ok' to the reply_to channel.
                ch = msg['reply_to']
                ch.put('ok')
        elif msg == 'stop':
            print("Received stop message, stopping worker...")
            # first, delete an associated client
            # its possible this worker was not passed a client,
            # but if so, we need to delete it before shutting down.
            if ag_client:
                print("Requesting client {} be deleted.".format(ag_client.api_key))
                secret = os.environ.get('_abaco_secret')
                clients_ch = ClientsChannel()
                # keep the response separate from the control message `msg`.
                response = clients_ch.request_delete_client(tenant=tenant,
                                                            actor_id=actor_id,
                                                            worker_id=worker_ch.name,
                                                            client_id=ag_client.api_key,
                                                            secret=secret)

                if response['status'] == 'ok':
                    print("Delete request completed successfully.")
                else:
                    print("Error deleting client. Message: {}".format(response['message']))
            else:
                print("Did not receive client. Not issuing delete. Exiting.")
            try:
                Worker.delete_worker(actor_id, worker_ch.name)
            except WorkerException as e:
                # still best effort, but no longer silent.
                print("Got WorkerException from delete_worker(). Exception: {}".format(e))
            keep_running = False
            actor_ch.close()
            sys.exit()
Example #15
0
def forget_password():
    """Send a confirmation email to the worker registered under an email.

    Reads 'email' from the JSON request body, looks up the matching worker,
    generates a confirmation token and mails a rendered confirmation page.

    :return: ``(json, 200)`` on success, ``(json, 400)`` when no email was
        supplied, ``(json, 404)`` when no worker has that email.
    """
    payload = request.get_json(silent=True)
    email = payload.get('email') if isinstance(payload, dict) else None
    if not email:
        # a view must return a response; None would surface as a server error.
        return jsonify({"msg": "Email is required"}), 400
    worker = Worker.query.filter_by(email=email).first()
    if not worker:
        return jsonify({"msg": "This email is not registered"}), 404
    # the token is a credential: it is only ever sent by mail, never printed.
    token = generate_confirmation_token(worker.email)
    # NOTE(review): the confirmation host is hard-coded to a local dev server.
    confirm_url = 'http://localhost:3000/confirmation/' + token
    html = render_template('email_confirmation.html', confirm_url=confirm_url)
    subject = "Por favor, Confirmar su email."
    sendMail(subject, '', '', worker.email, html)
    return jsonify({"success": "Email send successfully"}), 200
Example #16
0
def scale_down(actor_id):
    """Shut down every worker of an actor whose status is 'READY'.

    Workers in any other status are left untouched. Failures are logged and
    never propagated to the caller.

    :param actor_id: id passed to ``Worker.get_workers``.
    """
    workers = Worker.get_workers(actor_id)
    logger.debug('METRICS NUMBER OF WORKERS: {}'.format(len(workers)))
    try:
        while workers:
            logger.debug('METRICS made it STATUS check')
            _, worker = workers.popitem()
            logger.debug('METRICS SCALE DOWN current worker: {}'.format(
                worker['status']))
            # only workers that are ready are scaled down
            if worker['status'] != 'READY':
                continue
            try:
                shutdown_worker(worker['id'], delete_actor_ch=False)
            except Exception as e:
                logger.debug(
                    'METRICS ERROR shutting down worker: {} - {} - {}'.
                    format(type(e), e, e.args))
            else:
                # report success only when the shutdown call did not raise.
                logger.debug('METRICS shut down worker {}'.format(
                    worker['id']))
    except Exception as e:
        logger.debug("METRICS SCALE DOWN FAILED: {}".format(e))
Example #17
0
 def start_worker(self, image, tenant, worker_id):
     """Start a worker and wait for its acknowledgement message.

     :param image: image passed to ``run_worker``.
     :param tenant: tenant recorded on the created ``Worker``.
     :param worker_id: id of the worker to start.
     :return: ``(ch, result['reply_to'], worker)`` once the worker reports
         an 'ok' status.
     :raises SpawnerException: when the worker reports an error or sends an
         unexpected message.
     """
     ch = SpawnerWorkerChannel(worker_id=worker_id)
     # start an actor executor container and wait for a confirmation that image was pulled.
     worker_dict = run_worker(image, worker_id)
     worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
     worker = Worker(tenant=tenant, **worker_dict)
     logger.info(
         "worker started successfully, waiting on ack that image was pulled..."
     )
     result = ch.get()
     logger.debug(
         "Got response back from worker. Response: {}".format(result))
     if result.get('status') == 'error':
         # there was a problem pulling the image; put the actor in an error state:
         msg = "Got an error back from the worker. Message: {}".format(
             result)
         logger.info(msg)
         if 'msg' in result:
             raise SpawnerException(message=result['msg'])
         logger.error(
             "Spawner received invalid message from worker. 'msg' field missing. Message: {}"
             .format(result))
         raise SpawnerException(
             message="Internal error starting worker process.")
     elif result['value']['status'] == 'ok':
         logger.debug("received ack from worker.")
         return ch, result['reply_to'], worker
     else:
         msg = "Got an error status from worker: {}. Raising an exception.".format(
             str(result))
         logger.error(
             "Spawner received an invalid message from worker. Message: {}".
             format(result))
         raise SpawnerException(msg)
Example #18
0
 def put(self, actor_id):
     """Update an actor's definition, restarting workers when the image changed.

     :param actor_id: the tenant-local id of the actor.
     :return: an ``ok`` response carrying the updated actor's display form.
     :raises ResourceError: (404) when the actor is not in the actors store.
     """
     logger.debug("top of PUT /actors/{}".format(actor_id))
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[dbid])
     except KeyError:
         logger.debug("did not find actor {} in store.".format(dbid))
         raise ResourceError(
             "No actor found with id: {}.".format(actor_id), 404)
     previous_image = actor.image
     args = self.validate_put(actor)
     logger.debug("PUT args validated successfully.")
     args['tenant'] = g.tenant
     update_image = args['image'] != previous_image
     if update_image:
         args['status'] = SUBMITTED
         logger.debug("new image is different. updating actor.")
     else:
         logger.debug("new image is the same. not updating actor.")
         args['status'] = actor.status
     args['api_server'] = g.api_server
     args['owner'] = g.user
     actor = Actor(**args)
     actors_store[actor.db_id] = actor.to_db()
     logger.info("updated actor {} stored in db.".format(actor_id))
     if update_image:
         # request a worker only when a command is actually sent; otherwise
         # the request would never be picked up.
         # NOTE(review): other callers pass a list as `worker_ids` - confirm
         # what request_worker returns here.
         worker_ids = Worker.request_worker(actor.db_id)
         ch = CommandChannel()
         ch.put_cmd(actor_id=actor.db_id, worker_ids=worker_ids, image=actor.image, tenant=args['tenant'])
         logger.debug("put new command on command channel to update actor.")
     return ok(result=actor.display(),
               msg="Actor updated successfully.")
Example #19
0
 def start_worker(self, image, tenant, worker_id):
     """Start a worker and wait for its acknowledgement message.

     :param image: image passed to ``run_worker``.
     :param tenant: tenant recorded on the created ``Worker``.
     :param worker_id: id of the worker to start.
     :return: ``(ch, result['reply_to'], worker)`` once the worker reports
         an 'ok' status.
     :raises SpawnerException: when the worker reports an error or sends an
         unexpected message.
     """
     ch = WorkerChannel()
     # start an actor executor container and wait for a confirmation that image was pulled.
     worker_dict = run_worker(image, ch.name, worker_id)
     worker = Worker(tenant=tenant, **worker_dict)
     print(
         "worker started successfully, waiting on ack that image was pulled..."
     )
     result = ch.get()
     if result.get('status') == 'error':
         # there was a problem pulling the image; put the actor in an error state:
         msg = "got an error back from the worker. Message: {}".format(
             result)
         print(msg)
         if 'msg' in result:
             raise SpawnerException(message=result['msg'])
         raise SpawnerException(
             message="Internal error starting worker process.")
     elif result['value']['status'] == 'ok':
         print("received ack from worker.")
         return ch, result['reply_to'], worker
     else:
         msg = "Got an error status from worker: {}. Raising an exception.".format(
             str(result))
         print(msg)
         raise SpawnerException(msg)
Example #20
0
 def put(self, actor_id):
     """Update an actor's definition, restarting workers when the image changed.

     :param actor_id: the tenant-local id of the actor.
     :return: an ``ok`` response carrying the updated actor's display form.
     :raises ResourceError: (404) when the actor is not in the actors store.
     """
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[dbid])
     except KeyError:
         raise ResourceError("actor not found: {}".format(actor_id), 404)
     previous_image = actor.image
     args = self.validate_put(actor)
     args['tenant'] = g.tenant
     update_image = args['image'] != previous_image
     # an unchanged image keeps the current status; a new one restarts it.
     args['status'] = SUBMITTED if update_image else actor.status
     args['api_server'] = g.api_server
     args['owner'] = g.user
     actor = Actor(**args)
     actors_store[actor.db_id] = actor.to_db()
     if update_image:
         # request a worker only when a command is actually sent; otherwise
         # the request would never be picked up.
         # NOTE(review): other callers pass a list as `worker_ids` - confirm
         # what request_worker returns here.
         worker_ids = Worker.request_worker(actor.db_id)
         ch = CommandChannel()
         ch.put_cmd(actor_id=actor.db_id,
                    worker_ids=worker_ids,
                    image=actor.image,
                    tenant=args['tenant'])
     return ok(result=actor.display(), msg="Actor updated successfully.")
def add_worker():
    """Create a worker from the submitted form in the current agent's workshop.

    The workshop is taken from the Worker row whose ``worker_id`` equals the
    session's 'user_id'. Any failure (missing or non-numeric form values,
    unknown agent, database error) rolls the session back and re-renders the
    form with a failure message.

    :return: the rendered 'agent/agent_add.html' template.
    """
    try:
        new_worker_id = int(request.form.get('worker_id'))
        name = request.form.get('worker_name')
        id_card = request.form.get('id_card_no')
        age = int(request.form.get('age'))
        sex = request.form.get('sex')
        tel = request.form.get('telephone')
        address = request.form.get('address')
        salary = int(request.form.get('salary'))
        worker_type = request.form.get('worker_type')
        user_id = session['user_id']
        agent = Worker.query.filter(Worker.worker_id == user_id).first()
        workshop_id = agent.workshop_id
        worker = Worker(worker_id=new_worker_id,
                        name=name,
                        id_card_NO=id_card,
                        age=age,
                        sex=sex,
                        contart_info=tel,
                        address=address,
                        salary=salary,
                        work_type=worker_type,
                        workshop_id=workshop_id)
        db.session.add(worker)
        db.session.commit()
        return render_template('agent/agent_add.html',
                               success=True,
                               msg="添加成功~")
    except Exception:
        # deliberately broad, but no longer traps SystemExit/KeyboardInterrupt.
        db.session.rollback()
        return render_template('agent/agent_add.html',
                               success=False,
                               msg="添加失败,请确保输入信息无误~")
Example #22
0
def scale_up(actor_id):
    """Request one additional worker for an actor and queue its start command.

    :param actor_id: bytes of the form ``b'<tenant>_<id>'``; it is decoded
        as utf8 to extract the tenant and used as-is for the store lookup.
    :return: the name of the command channel used (the actor's queue, or
        'default'), or ``None`` when anything failed.
    """
    # only the tenant part is needed; the second part is discarded.
    tenant, _ = actor_id.decode('utf8').split('_')
    logger.debug(
        'METRICS Attempting to create a new worker for {}'.format(actor_id))
    try:
        # create a worker & add to this actor
        actor = Actor.from_db(actors_store[actor_id])
        worker_id = Worker.request_worker(tenant=tenant, actor_id=actor_id)
        logger.info("New worker id: {}".format(worker_id))
        channel_name = actor.queue or 'default'
        ch = CommandChannel(name=channel_name)
        try:
            ch.put_cmd(actor_id=actor.db_id,
                       worker_id=worker_id,
                       image=actor.image,
                       tenant=tenant,
                       stop_existing=False)
        finally:
            # release the channel even when the command could not be queued.
            ch.close()
        logger.debug(
            'METRICS Added worker successfully for {}'.format(actor_id))
        return channel_name
    except Exception as e:
        logger.debug("METRICS - SOMETHING BROKE: {} - {} - {}".format(
            type(e), e, e.args))
        return None
Example #23
0
    def error_out_actor(self, actor_id, worker_id, message):
        """In case of an error, put the actor in error state and clean up its worker(s).

        Sets the actor status to ERROR (only if the actor is still in the
        store), then reads the worker's status and either calls
        ``stop_workers`` (worker already up) or ``kill_worker`` (worker never
        fully started, or ``stop_workers`` raised).

        :param actor_id: id of the actor; used as the key into ``actors_store``.
        :param worker_id: id of the worker that hit the error.
        :param message: passed as ``status_message`` when setting the status.
        :return: None.
        """
        logger.debug("top of error_out_actor for worker: {}_{}".format(
            actor_id, worker_id))
        # it is possible the actor was deleted already -- only set the actor status to ERROR if
        # it still exists in the store
        actor = actors_store.get(actor_id)
        if actor:
            Actor.set_status(actor_id, ERROR, status_message=message)
        # check to see how far the spawner got setting up the worker:
        try:
            worker = Worker.get_worker(actor_id, worker_id)
            worker_status = worker.get('status')
            logger.debug(
                f"got worker status for {actor_id}_{worker_id}; status: {worker_status}"
            )
        except Exception as e:
            logger.debug(
                f"got exception in error_out_actor trying to determine worker status for {actor_id}_{worker_id}; "
                f"e:{e};")
            # the worker status could not be determined, so all worker processing is skipped.
            return

        if worker_status == UPDATING_STORE or worker_status == READY or worker_status == BUSY:
            logger.debug(
                f"worker status was: {worker_status}; trying to stop_worker")
            # for workers whose containers are running, we first try to stop workers using the "graceful" approach -
            try:
                # NOTE(review): an empty worker_ids list is passed here, not [worker_id];
                # presumably it makes stop_workers act on every worker - confirm against stop_workers.
                self.stop_workers(actor_id, worker_ids=[])
                logger.info(
                    "Spawner just stopped worker {}_{} in error_out_actor".
                    format(actor_id, worker_id))
                return
            except Exception as e:
                logger.error(
                    "spawner got exception trying to run stop_workers. Exception: {}"
                    .format(e))
                logger.info(
                    "setting worker_status to ERROR so that kill_worker will run."
                )
                # fall through to the kill_worker branch below.
                worker_status = ERROR

        # if the spawner was never able to start the worker container, we need to simply delete the worker record
        if worker_status == REQUESTED or worker_status == SPAWNER_SETUP or worker_status == PULLING_IMAGE or \
            worker_status == ERROR:
            logger.debug(
                f"worker status was: {worker_status}; trying to kill_worker")
            try:
                self.kill_worker(actor_id, worker_id)
                logger.info(
                    "Spawner just killed worker {}_{} in error_out_actor".
                    format(actor_id, worker_id))
            except DockerError as e:
                # only DockerError is tolerated here; anything else propagates to the caller.
                logger.info(
                    "Received DockerError trying to kill worker: {}. Exception: {}"
                    .format(worker_id, e))
                logger.info(
                    "Spawner will continue on since this is exception processing."
                )
Example #24
0
 def delete(self, actor_id, worker_id):
     """Schedule one worker of an actor to be stopped.

     :raises ResourceError: (404) when the worker lookup fails.
     """
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         record = Worker.get_worker(dbid, worker_id)
     except WorkerException as exc:
         raise ResourceError(exc.msg, 404)
     # the shutdown request is addressed by the worker's channel name.
     channel_name = record['ch_name']
     shutdown_worker(channel_name)
     return ok(msg="Worker scheduled to be stopped.", result=None)
Example #25
0
def make_new_worker(amz_dict, session):
    """Build a Worker from the given values, persist it and return it.

    :param amz_dict: mapping providing 'workerId', 'assignmentId', 'hitId'
        and 'list'.
    :param session: session object used to add and commit the new row.
    """
    # Worker keyword -> key in amz_dict, looked up in the original order.
    column_sources = (
        ('workerid', 'workerId'),
        ('assignmentid', 'assignmentId'),
        ('hitid', 'hitId'),
        ('list_id', 'list'),
    )
    new_worker = Worker(
        **{column: amz_dict[key] for column, key in column_sources})
    session.add(new_worker)
    session.commit()
    return new_worker
Example #26
0
 def delete(self, actor_id, ch_name):
     """Schedule the worker with the given channel name to be stopped.

     :raises APIException: (404) when the worker lookup fails.
     """
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         # the lookup is only an existence check; its result is not needed.
         Worker.get_worker(dbid, ch_name)
     except WorkerException as exc:
         raise APIException(exc.msg, 404)
     shutdown_worker(ch_name)
     return ok(msg="Worker scheduled to be stopped.", result=None)
Example #27
0
def shutdown_workers(actor_id):
    """Gracefully shut down every worker of an actor.

    Pass the db_id as the `actor_id` argument. Workers that have a channel
    name are told to shut themselves down; the others are removed from the
    db directly.
    """
    # @TODO - this code is not thread safe. we need to update the workers state in a transaction:
    for _, record in Worker.get_workers(actor_id).items():
        if 'ch_name' in record:
            # there is already a channel: let the worker stop itself
            shutdown_worker(record['ch_name'])
            continue
        # no channel yet: just drop the record
        try:
            worker_id = record.get('id')
            Worker.delete_worker(actor_id, worker_id)
        except WorkerException as exc:
            print(
                "Got a WorkerException trying to delete worker with id: {}; exception: {}"
                .format(worker_id, exc))
Example #28
0
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor that run on this host.

    Each local worker is sent a synchronous 'status' message. Workers that
    do not answer within 5 seconds, or answer with anything other than
    'ok', have their container removed and their record deleted. Healthy
    workers in READY status whose last execution is older than `ttl` are
    asked to shut down.

    Args:
        actor_id: id of the actor, as used by Worker.get_workers and
            Worker.delete_worker.
        ttl: idle time in seconds (compared against time.time()) after which
            a READY worker is shut down; a negative value means workers are
            never shut down for being idle.
    """
    print("Checking health for actors: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    # loop-invariant: the host id configured for this spawner
    local_host_id = Config.get('spawner', 'host_id')
    for _, worker in workers.items():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not local_host_id == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond, removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}, deleting worker.".format(result))
            # container removal is best effort, exactly as in the timeout case,
            # so that the worker record is still deleted and the loop goes on
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            # the worker is gone; do not run the ttl check against it
            continue
        print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life; keep going so that the remaining
            # workers still get their health check
            print("Infinite ttl configured; leaving worker")
            continue
        if worker['status'] == codes.READY and \
                worker['last_execution'] + ttl < time.time():
            # shutdown worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
Example #29
0
File: health.py Project: TACC/abaco
def manage_workers(actor_id):
    """Scale workers for an actor based on message queue size and policy.

    The part of the function visible here only loads the actor and its
    workers; it returns None early when the actor is not in the store.
    """
    print("Entering manage_workers for {}".format(actor_id))
    try:
        actor_record = actors_store[actor_id]
        actor = Actor.from_db(actor_record)
    except KeyError:
        # nothing to manage for an unknown actor
        print("Did not find actor; returning.")
        return
    workers = Worker.get_workers(actor_id)
Example #30
0
def shutdown_worker(actor_id, worker_id, delete_actor_ch=True):
    """Gracefully shut down a single worker.

    The worker's status is first set to SHUTDOWN_REQUESTED (best effort:
    a failure is logged and otherwise ignored), then a stop command is put
    on the worker's channel.

    Args:
        actor_id (str): the dbid of the associated actor.
        worker_id: id of the worker; also used to open its WorkerChannel.
        delete_actor_ch (bool): when False the command "stop-no-delete" is
            sent instead of "stop".
    """
    logger.debug("top of shutdown_worker for worker_id: {}".format(worker_id))
    # set the worker status to SHUTDOWN_REQUESTED:
    try:
        Worker.update_worker_status(actor_id, worker_id, SHUTDOWN_REQUESTED)
    except Exception as e:
        logger.error(f"worker got exception trying to update status to SHUTDOWN_REQUESTED. actor_id: {actor_id}; "
                     f"worker_id: {worker_id}; exception: {e}")
    command = "stop" if delete_actor_ch else "stop-no-delete"
    ch = WorkerChannel(worker_id=worker_id)
    try:
        ch.put(command)
        logger.info("A '{}' message was sent to worker: {}".format(command, worker_id))
    finally:
        # always release the channel, even when the put fails
        ch.close()
Example #31
0
    def render(self):
        """Print the "Add" screen and ask for confirmation of the pending worker.

        When no pending worker is stored on `self._tempWorker`, name and age
        are read from standard input and a new Worker is stored there first.
        """
        print("===Add===")

        pending = self._tempWorker
        if not pending:
            name = input("\nName: ")
            age = input("Age: ")
            pending = Worker(name, age, "")
            self._tempWorker = pending

        print("\nAdd {0}? (y/n)".format(pending.name))
Example #32
0
 def get(self, actor_id, worker_id):
     """Return the display form of a single worker of an actor.

     Raises ResourceError (404) when the actor is not in the actors store
     or when the worker lookup raises a WorkerException.
     """
     logger.debug("top of GET /actors/{}/workers/{}.".format(actor_id, worker_id))
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         Actor.from_db(actors_store[dbid])
     except KeyError:
         logger.debug("Did not find actor: {}.".format(actor_id))
         raise ResourceError("No actor found with id: {}.".format(actor_id), 404)
     try:
         worker_data = Worker.get_worker(dbid, worker_id)
     except WorkerException as err:
         logger.debug("Did not find worker: {}. actor: {}.".format(worker_id, actor_id))
         raise ResourceError(err.msg, 404)
     # the stored record is a plain dictionary without its own id; add the id
     # and wrap the record in a Worker object so it can be displayed
     worker_data['id'] = worker_id
     return ok(result=Worker(**worker_data).display(), msg="Worker retrieved successfully.")
Example #33
0
def manage_workers(actor_id):
    """Scale workers for an actor based on message queue size and policy.

    The part of the function visible here only loads the actor and its
    workers; it returns None early when the actor is not in the store.
    """
    logger.info("Entering manage_workers for {}".format(actor_id))
    try:
        actor_record = actors_store[actor_id]
        actor = Actor.from_db(actor_record)
    except KeyError:
        # nothing to manage for an unknown actor
        logger.info("Did not find actor; returning.")
        return
    workers = Worker.get_workers(actor_id)
Example #34
0
 def get(self, actor_id, worker_id):
     """Return the stored record of a single worker of an actor.

     Raises WorkerException when the actor is not in the actors store and
     ResourceError (404) when the worker lookup fails.
     """
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         # only performed to verify that the actor exists
         Actor.from_db(actors_store[dbid])
     except KeyError:
         raise WorkerException("actor not found: {}'".format(actor_id))
     try:
         worker_record = Worker.get_worker(dbid, worker_id)
     except WorkerException as err:
         raise ResourceError(err.msg, 404)
     return ok(result=worker_record, msg="Worker retrieved successfully.")
Example #35
0
 def get(self, actor_id, ch_name):
     """Return the stored record of the worker identified by `ch_name`.

     Raises WorkerException when the actor is not in the actors store and
     APIException (404) when the worker lookup fails.
     """
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         # only performed to verify that the actor exists
         Actor.from_db(actors_store[dbid])
     except KeyError:
         raise WorkerException("actor not found: {}'".format(actor_id))
     try:
         worker_record = Worker.get_worker(dbid, ch_name)
     except WorkerException as err:
         raise APIException(err.msg, 404)
     return ok(result=worker_record, msg="Worker retrieved successfully.")
Example #36
0
File: health.py Project: TACC/abaco
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor that run on this host.

    Each local worker is sent a synchronous 'status' message. Workers that
    do not answer within 5 seconds, or answer with anything other than
    'ok', have their container removed and their record deleted. Healthy
    workers in READY status whose last execution is older than `ttl` are
    asked to shut down.

    Args:
        actor_id: id of the actor, as used by Worker.get_workers and
            Worker.delete_worker.
        ttl: idle time in seconds (compared against time.time()) after which
            a READY worker is shut down; a negative value means workers are
            never shut down for being idle.
    """
    print("Checking health for actors: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    # loop-invariant: the host id configured for this spawner
    local_host_id = Config.get('spawner', 'host_id')
    for _, worker in workers.items():
        # a record without a host_id cannot be matched to this host; skip it
        # instead of failing the whole check with a KeyError
        if 'host_id' not in worker:
            continue
        # ignore workers on different hosts
        if not local_host_id == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually kill
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond, removing container and deleting worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}, deleting worker.".format(result))
            # container removal is best effort, exactly as in the timeout case,
            # so that the worker record is still deleted and the loop goes on
            try:
                rm_container(worker['cid'])
            except DockerError:
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            # the worker is gone; do not run the ttl check against it
            continue
        print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life; keep going so that the remaining
            # workers still get their health check
            print("Infinite ttl configured; leaving worker")
            continue
        if worker['status'] == codes.READY and \
                worker['last_execution'] + ttl < time.time():
            # shutdown worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
Example #37
0
def shutdown_workers(actor_id):
    """Graceful shutdown of all workers for an actor. Pass db_id as the `actor_id` argument.

    A failure to fetch the workers is logged and ends the call; nothing is
    shut down in that case. Otherwise shutdown_worker is called with the
    'id' of every worker record.
    """
    logger.debug("shutdown_workers() called for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception from get_workers: {}".format(e))
        # without a worker list there is nothing to iterate over; returning
        # here avoids a NameError on the unbound `workers` below
        return
    if not workers:
        logger.info("shutdown_workers did not receive any workers from Worker.get_worker for actor: {}".format(actor_id))
        return
    # @TODO - this code is not thread safe. we need to update the workers state in a transaction:
    for _, worker in workers.items():
        shutdown_worker(worker['id'])
Example #38
0
    def post(self, actor_id):
        """Queue a message for an actor and register a new execution for it.

        The query parameters (except 'message') plus a few request-context
        values are sent along with the message. If the actor has no workers,
        a command to start one is issued. Returns the execution id, the
        message and hypermedia links, camel-cased when the 'web'/'case'
        config option is 'camel'.
        """
        def get_hypermedia(actor, exc):
            base = actor.api_server
            links = {
                'self': '{}/actors/v2/{}/executions/{}'.format(base, actor.id, exc),
                'owner': '{}/profiles/v2/{}'.format(base, actor.owner),
                'messages': '{}/actors/v2/{}/messages'.format(base, actor.id),
            }
            return {'_links': links}

        args = self.validate_post()
        # build a dictionary of k:v pairs from the query parameters, and pass a single
        # additional object 'message' from within the post payload. Note that 'message'
        # need not be JSON data.
        d = {key: value for key, value in request.args.items() if key != 'message'}
        # copy over whichever of these request-context attributes are present
        # (forwarding of g.jwt / g.jwt_server is currently disabled)
        forwarded = (('user', '_abaco_username'),
                     ('api_server', '_abaco_api_server'),
                     ('jwt_header_name', '_abaco_jwt_header_name'))
        for g_attr, env_key in forwarded:
            if hasattr(g, g_attr):
                d[env_key] = getattr(g, g_attr)
        dbid = Actor.get_dbid(g.tenant, actor_id)
        # create an execution
        initial_stats = {'cpu': 0,
                         'io': 0,
                         'runtime': 0,
                         'status': SUBMITTED,
                         'executor': g.user}
        execution_id = Execution.add_execution(dbid, initial_stats)
        d['_abaco_execution_id'] = execution_id
        d['_abaco_Content-Type'] = args.get('_abaco_Content-Type', '')
        msg_channel = ActorMsgChannel(actor_id=dbid)
        msg_channel.put_msg(message=args['message'], d=d)
        # make sure at least one worker is available
        workers = Worker.get_workers(dbid)
        actor = Actor.from_db(actors_store[dbid])
        if len(workers.items()) == 0:
            CommandChannel().put_cmd(actor_id=dbid, image=actor.image, tenant=g.tenant, num=1, stop_existing=False)
        result = {'execution_id': execution_id, 'msg': args['message']}
        result.update(get_hypermedia(actor, execution_id))
        if Config.get('web', 'case') == 'camel':
            return ok(dict_to_camel(result))
        return ok(result)
Example #39
0
 def get(self, actor_id):
     """List all workers of an actor, each record extended with its 'id'.

     Raises APIException with status 400 when the actor is not in the
     actors store and with status 404 when fetching the workers fails.
     """
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         # only performed to verify that the actor exists
         Actor.from_db(actors_store[dbid])
     except KeyError:
         raise APIException("actor not found: {}'".format(actor_id), 400)
     try:
         workers = Worker.get_workers(dbid)
     except WorkerException as err:
         raise APIException(err.msg, 404)
     listing = []
     for worker_id, record in workers.items():
         # the records are keyed by worker id but do not carry it themselves
         record['id'] = worker_id
         listing.append(record)
     return ok(result=listing, msg="Workers retrieved successfully.")
Example #40
0
def _collect_logs_and_remove(cli, container):
    """Return the container's logs, then remove it; removal errors are only printed."""
    logs = cli.logs(container.get('Id'))
    # remove container, ignore errors
    try:
        cli.remove_container(container=container)
        print("Container removed.")
    except Exception as e:
        print("Exception trying to remove actor: {}".format(e))
    return logs


def execute_actor(actor_id, worker_ch, image, msg, d=None, privileged=False):
    """Run one container for an actor message and gather usage statistics.

    Args:
        actor_id: id of the actor; used to mark the worker BUSY.
        worker_ch: the worker's channel; only its `name` is used here.
        image: docker image to create the container from.
        msg: the message, passed to the container as environment variable MSG.
        d: optional mapping of additional environment variables. It is
            copied, so neither the caller's mapping nor a shared default is
            modified.
        privileged: when true the container runs privileged and gets the
            docker daemon socket mounted so it can start further containers.

    Returns:
        tuple: `(result, logs)` where `result` is a dict with the keys
        'cpu', 'io' and 'runtime' and `logs` is what `cli.logs` returned.

    Raises:
        DockerStartContainerError: if the container could not be started.
    """
    result = {'cpu': 0,
              'io': 0,
              'runtime': 0}
    cli = docker.AutoVersionClient(base_url=dd)
    environment = dict(d) if d else {}
    environment['MSG'] = msg
    binds = {}
    volumes = []
    # if container is privileged, mount the docker daemon so that additional
    # containers can be started.
    if privileged:
        binds = {'/var/run/docker.sock': {
                    'bind': '/var/run/docker.sock',
                    'ro': False}}
        volumes = ['/var/run/docker.sock']
    host_config = cli.create_host_config(binds=binds, privileged=privileged)
    container = cli.create_container(image=image, environment=environment, volumes=volumes,
                                     host_config=host_config)
    container_id = container.get('Id')
    try:
        cli.start(container=container_id)
    except Exception as e:
        # if there was an error starting the container, user will need to debug
        raise DockerStartContainerError(
            "Could not start container {}. Exception {}".format(container_id, str(e)))
    start = timeit.default_timer()
    Worker.update_worker_status(actor_id, worker_ch.name, BUSY)
    # create a separate cli for checking stats objects since these should be fast and we don't want to wait
    stats_cli = docker.AutoVersionClient(base_url=dd, timeout=1)
    try:
        stats_obj = stats_cli.stats(container=container_id, decode=True)
    except ReadTimeout:
        # if the container execution is so fast that the initial stats object cannot be created,
        # we skip the running loop and return a minimal stats object. The return value keeps
        # the (result, logs) shape of the normal path and the container is cleaned up as well.
        result['cpu'] = 1
        result['runtime'] = 1
        return result, _collect_logs_and_remove(cli, container)
    running = True
    while running:
        try:
            print("waiting on a stats obj: {}".format(timeit.default_timer()))
            stats = next(stats_obj)
        except ReadTimeoutError:
            print("next(stats) just timed out: {}".format(timeit.default_timer()))
            # container stopped before another stats record could be read, just ignore and move on
            break
        print("got a stats obj: {}".format(timeit.default_timer()))
        # as of docker 1.9, the stats object can be bytes that must be decoded
        if isinstance(stats, bytes):
            stats = json.loads(stats.decode("utf-8"))
        # count the cpu usage of a sample exactly once, whichever network key is present
        result['cpu'] += stats['cpu_stats']['cpu_usage']['total_usage']
        try:
            result['io'] += stats['network']['rx_bytes']
        except KeyError:
            # as of docker 1.9 the network key is 'networks' with multiple subkeys. there seems
            # to be a race condition where the 'networks' key doesn't always get populated.
            try:
                result['io'] += stats['networks']['eth0']['rx_bytes']
            except KeyError:
                pass
        print("Recorded a stats obj: {}".format(timeit.default_timer()))
        try:
            print("waiting on cli.wait: {}".format(timeit.default_timer()))
            cli.wait(container=container_id, timeout=1)
            print("container finished: {}".format(timeit.default_timer()))
            running = False
        except ReadTimeout:
            print("cli.wait just timed out: {}".format(timeit.default_timer()))
            # the wait timed out so check if we are beyond the max_run_time
            runtime = timeit.default_timer() - start
            if 0 < max_run_time < runtime:
                print("hit runtime limit: {}".format(timeit.default_timer()))
                cli.stop(container_id)
                running = False
    print("container stopped:{}".format(timeit.default_timer()))
    stop = timeit.default_timer()
    # get logs from container and remove it
    logs = _collect_logs_and_remove(cli, container)
    result['runtime'] = int(stop - start)
    return result, logs
Example #41
0
File: worker.py Project: TACC/abaco
def subscribe(tenant,
              actor_id,
              api_server,
              client_id,
              client_secret,
              access_token,
              refresh_token,
              worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also starts a thread running `process_worker_ch` for future
    communications on the worker channel.

    An Agave client is only created when api_server, client_id, client_secret, access_token and
    refresh_token are all truthy; in that case a refreshed access token is passed to each
    container as `_abaco_access_token`. Tokens are never written to the output.

    The loop runs while the module-level flag `keep_running` is true and exits the process when
    the actor channel is closed.
    :return:
    """
    actor_ch = ActorMsgChannel(actor_id)
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret)
    else:
        print("Not creating agave client.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, actor_ch, ag))
    t.start()
    print("Worker subscribing to actor channel...")
    global keep_running
    while keep_running:
        Worker.update_worker_status(actor_id, worker_ch.name, READY)
        try:
            msg = actor_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        except channelpy.ChannelClosedException:
            print("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        print("Received message {}. Starting actor container...".format(str(msg)))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        privileged = actor['privileged'] == 'TRUE'
        environment = actor['default_environment']
        print("Actor default environment: {}".format(environment))
        print("Actor privileged: {}".format(privileged))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                environment['_abaco_access_token'] = ag.token.token_info['access_token']
                # the token is a credential, so only report that it was set
                print("Refreshed the tokens. Passed a new access token to the environment.")
            except Exception as e:
                print("Got an exception trying to get an access token: {}".format(e))
        else:
            print("Agave client `ag` is None -- not passing access token.")
        # print a copy with the access token masked so it does not end up in the output
        printable_env = dict(environment)
        if printable_env['_abaco_access_token']:
            printable_env['_abaco_access_token'] = '<redacted>'
        print("Passing update environment: {}".format(printable_env))
        try:
            # NOTE(review): `image` is not defined in this function; presumably it is a
            # module-level name -- confirm.
            stats, logs = execute_actor(actor_id, worker_ch, image, message,
                                        environment, privileged)
        except DockerStartContainerError as e:
            print("Got DockerStartContainerError: {}".format(str(e)))
            Actor.set_status(actor_id, ERROR)
            continue
        # add the execution to the actor store
        print("Actor container finished successfully. Got stats object:{}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats)
        print("Added execution: {}".format(execution_id))
        Execution.set_logs(execution_id, logs)
        Worker.update_worker_execution_time(actor_id, worker_ch.name)
Example #42
0
File: worker.py Project: TACC/abaco
def shutdown_workers(actor_id):
    """Ask every worker of an actor to shut down gracefully.

    `actor_id` must be the actor's db_id. Each worker is addressed through
    the channel name stored in its record.
    """
    for record in Worker.get_workers(actor_id).values():
        shutdown_worker(record['ch_name'])
Example #43
0
    def process(self, cmd):
        """Start new workers for an actor and hand each of them a client.

        `cmd` is a dictionary with the keys 'actor_id', 'image' and 'tenant'
        and the optional keys 'stop_existing' (default True) and 'num'
        (default `self.num_workers`).

        When `stop_existing` is true the existing workers are stopped and the
        actor's entry in the workers store is replaced by the new workers;
        otherwise the new workers are added to the store. Every anonymous
        channel then receives a message telling the worker whether a client
        could be generated for it. Client tokens are never written to the
        output.

        Returns None; a SpawnerException from `start_workers` ends the call
        early.
        """
        print("Processing cmd:{}".format(str(cmd)))
        actor_id = cmd['actor_id']
        image = cmd['image']
        tenant = cmd['tenant']
        stop_existing = cmd.get('stop_existing', True)
        num_workers = cmd.get('num', self.num_workers)
        print("Actor id:{}".format(actor_id))
        try:
            _, anon_channels, new_workers = self.start_workers(actor_id, image, tenant, num_workers)
        except SpawnerException:
            # for now, start_workers will do clean up for a SpawnerException, so we just need
            # to return back to the run loop.
            return
        print("Created new workers: {}".format(str(new_workers)))

        # the worker records are stored before the workers are told to subscribe so that
        # the records will be there when the workers go to update their status
        if stop_existing:
            # stop any existing workers and replace their records:
            self.stop_workers(actor_id)
            workers_store[actor_id] = new_workers
        else:
            for _, worker in new_workers.items():
                Worker.add_worker(actor_id, worker)
        # new_workers is a dictionary of dictionaries; list(d) creates a list of keys for a
        # dictionary d. hence, the idx^th entry of worker_keys is the key of the idx^th worker.
        worker_keys = list(new_workers)
        # send new workers their clients and tell them to subscribe to the actor channel.
        for idx, channel in enumerate(anon_channels):
            print("Getting client for worker {}".format(idx))
            client_ch = ClientsChannel()
            client_msg = client_ch.request_client(tenant=tenant,
                                                  actor_id=actor_id,
                                                  worker_id=new_workers[worker_keys[idx]]['ch_name'],
                                                  secret=self.secret)
            reply = {'status': 'ok',
                     'actor_id': actor_id,
                     'tenant': tenant}
            # we need to ignore errors when generating clients because it's possible it is not set up for a specific
            # tenant. we log it instead.
            if client_msg.get('status') == 'error':
                print("Error generating client: {}".format(client_msg.get('message')))
                reply['client'] = 'no'
            # else, client was generated successfully:
            else:
                # only the client id is printed; the tokens are credentials
                print("Got a client: {}".format(client_msg['client_id']))
                reply.update({'client': 'yes',
                              'client_id': client_msg['client_id'],
                              'client_secret': client_msg['client_secret'],
                              'access_token': client_msg['access_token'],
                              'refresh_token': client_msg['refresh_token'],
                              'api_server': client_msg['api_server'],
                              })
            channel.put(reply)
        print("Done processing command.")