Exemple #1
0
    def _schedule_job(self, job_id):
        """Try to launch a WAITING job on Kubernetes.

        Returns the job's state after the attempt. A job is launched only
        when it is WAITING, its local service reports it ready, and — for
        federated jobs — the peer side is ready as well.
        """
        job = Job.query.get(job_id)
        assert job is not None, f'Job {job_id} not found'
        if job.state != JobState.WAITING:
            # Only WAITING jobs are eligible for scheduling.
            return job.state

        with get_session(self._db_engine) as session:
            job_service = JobService(session)
            if not job_service.is_ready(job):
                return job.state
            config = job.get_config()
            if config.is_federated:
                # Federated jobs must additionally wait for the peer side.
                if not job_service.is_peer_ready(job):
                    return job.state

        try:
            yaml = generate_job_run_yaml(job)
            k8s_client.create_flapp(yaml)
        except Exception as e:  # pylint: disable=broad-except
            # Lazy %-style args: the message is only formatted when the
            # record is actually emitted (was an eagerly-built f-string).
            logging.error('Start job %d has error msg: %s', job_id, e.args)
            # Persist the failure reason so the UI can surface it.
            job.error_message = str(e)
            db.session.commit()
            return job.state
        job.error_message = None
        job.start()
        db.session.commit()

        return job.state
Exemple #2
0
    def _schedule_job(self, job_id):
        """Start a WAITING job once all of its upstream jobs are complete.

        Returns the job's state after the scheduling attempt.
        """
        job = Job.query.get(job_id)
        assert job is not None, 'Job %d not found' % job_id
        if job.state != JobState.WAITING:
            # Only WAITING jobs are candidates for scheduling.
            return job.state

        # Every upstream dependency must be complete before launching.
        dependencies = JobDependency.query.filter(
            JobDependency.dst_job_id == job.id).all()
        for dependency in dependencies:
            upstream = Job.query.get(dependency.src_job_id)
            assert upstream is not None, \
                'Job %d not found' % dependency.src_job_id
            if not upstream.is_complete():
                return job.state

        client = get_client()
        run_yaml = generate_job_run_yaml(job)
        try:
            client.create_or_replace_custom_object(CrdKind.FLAPP, run_yaml)
        except RuntimeError as e:
            logging.error('Start job %d has Runtime error msg: %s', job_id,
                          e.args)
            return job.state
        job.start()
        db.session.commit()

        return job.state
Exemple #3
0
 def get_checkpoint_path(job):
     """Best-effort lookup of the job's checkpoint output directory.

     Returns the OUTPUT_BASE_DIR entry of the generated run yaml, or an
     empty string when the yaml cannot be generated or lacks the key.
     """
     try:
         run_yaml = generate_job_run_yaml(job)
         # TODO: OUTPUT_BASE_DIR may not be accurate
         output_base_dir = run_yaml['OUTPUT_BASE_DIR']
     except Exception as e:  # pylint: disable=broad-except
         # Message fixed: it previously said "Error building metrics",
         # a copy-paste from a metrics helper unrelated to this function.
         logging.warning('Error getting checkpoint path: %s', repr(e))
         output_base_dir = ''
     return output_base_dir
Exemple #4
0
    def patch(self, workflow_id):
        """Partially update a workflow.

        Accepted fields: target_state (state-machine transition), state
        (only 'INVALID', for invalidation), forkable, metric_is_public,
        and config (only editable while READY/STOPPED with no pending
        transition). Returns the updated workflow dict and HTTP 200.

        Raises InvalidArgumentException for bad values and
        NoAccessException when editing the config of a running workflow.
        """
        parser = reqparse.RequestParser()
        parser.add_argument('target_state', type=str, required=False,
                            default=None, help='target_state is empty')
        parser.add_argument('state', type=str, required=False,
                            default=None, help='state is empty')
        parser.add_argument('forkable', type=bool)
        parser.add_argument('metric_is_public', type=bool)
        parser.add_argument('config', type=dict, required=False,
                            default=None, help='updated config')
        data = parser.parse_args()

        workflow = _get_workflow(workflow_id)

        forkable = data['forkable']
        if forkable is not None:
            workflow.forkable = forkable
            db.session.flush()

        metric_is_public = data['metric_is_public']
        if metric_is_public is not None:
            workflow.metric_is_public = metric_is_public
            db.session.flush()

        target_state = data['target_state']
        if target_state:
            try:
                if WorkflowState[target_state] == WorkflowState.RUNNING:
                    # Dry-run yaml generation for every owned job so an
                    # invalid variable is reported before transitioning.
                    for job in workflow.owned_jobs:
                        try:
                            generate_job_run_yaml(job)
                        # TODO: check if peer variables is valid
                        except RuntimeError as e:
                            raise ValueError(
                                f'Invalid Variable when try '
                                f'to format the job {job.name}:{str(e)}')
                workflow.update_target_state(WorkflowState[target_state])
                db.session.flush()
                logging.info('updated workflow %d target_state to %s',
                             workflow.id, workflow.target_state)
                scheduler.wakeup(workflow.id)
            # WorkflowState[...] raises KeyError for an unknown name;
            # previously only ValueError was caught, so a bad name leaked
            # as an unhandled server error instead of a 400.
            except (KeyError, ValueError) as e:
                raise InvalidArgumentException(details=str(e)) from e

        state = data['state']
        if state:
            try:
                # Explicit ValueError instead of assert: AssertionError was
                # not caught by the handler below (unhandled 500) and
                # asserts are stripped entirely under `python -O`.
                if state != 'INVALID':
                    raise ValueError(
                        'Can only set state to INVALID for invalidation')
                workflow.invalidate()
                db.session.flush()
                logging.info('invalidate workflow %d', workflow.id)
            except ValueError as e:
                raise InvalidArgumentException(details=str(e)) from e

        config = data['config']
        if config:
            try:
                # Config is editable only when no transition is pending
                # (target_state INVALID) and the workflow is READY/STOPPED.
                if workflow.target_state != WorkflowState.INVALID or \
                        workflow.state not in \
                        [WorkflowState.READY, WorkflowState.STOPPED]:
                    raise NoAccessException('Cannot edit running workflow')
                config_proto = dict_to_workflow_definition(data['config'])
                workflow.set_config(config_proto)
                db.session.flush()
            except ValueError as e:
                raise InvalidArgumentException(details=str(e)) from e

        db.session.commit()
        return {'data': workflow.to_dict()}, HTTPStatus.OK
Exemple #5
0
    def patch(self, workflow_id):
        """Partially update a workflow.

        Accepted fields: target_state (state-machine transition), state
        (only 'INVALID', for invalidation), forkable, metric_is_public,
        config (only editable while READY/STOPPED with no pending
        transition), create_job_flags (one flag per job definition) and
        batch_update_interval (minutes between cron restarts). Returns
        the updated workflow dict and HTTP 200, then wakes the scheduler.

        Raises InvalidArgumentException for bad values and
        NoAccessException when editing the config of a running workflow.
        """
        parser = reqparse.RequestParser()
        parser.add_argument('target_state', type=str, required=False,
                            default=None, help='target_state is empty')
        parser.add_argument('state',
                            type=str,
                            required=False,
                            help='state is empty')
        parser.add_argument('forkable', type=bool)
        parser.add_argument('metric_is_public', type=bool)
        parser.add_argument('config',
                            type=dict,
                            required=False,
                            help='updated config')
        parser.add_argument('create_job_flags', type=list, required=False,
                            location='json',
                            help='flags in common.CreateJobFlag')
        parser.add_argument('batch_update_interval',
                            type=int,
                            required=False,
                            help='interval for restart workflow in minute')
        data = parser.parse_args()

        workflow = _get_workflow(workflow_id)

        # start workflow every interval time
        batch_update_interval = data['batch_update_interval']
        # NOTE(review): truthiness check means 0 is treated like "absent";
        # confirm whether 0 should instead stop an existing cronjob.
        if batch_update_interval:
            start_or_stop_cronjob(batch_update_interval, workflow)

        forkable = data['forkable']
        if forkable is not None:
            workflow.forkable = forkable
            db.session.flush()

        metric_is_public = data['metric_is_public']
        if metric_is_public is not None:
            workflow.metric_is_public = metric_is_public
            db.session.flush()

        target_state = data['target_state']
        if target_state:
            try:
                if WorkflowState[target_state] == WorkflowState.RUNNING:
                    # Dry-run yaml generation for every owned job so an
                    # invalid variable is reported before transitioning.
                    for job in workflow.owned_jobs:
                        try:
                            generate_job_run_yaml(job)
                        # TODO: check if peer variables is valid
                        except Exception as e:  # pylint: disable=broad-except
                            raise ValueError(
                                f'Invalid Variable when try '
                                f'to format the job {job.name}:{str(e)}')
                workflow.update_target_state(WorkflowState[target_state])
                db.session.flush()
                logging.info('updated workflow %d target_state to %s',
                             workflow.id, workflow.target_state)
            # WorkflowState[...] raises KeyError for an unknown name;
            # previously only ValueError was caught, so a bad name leaked
            # as an unhandled server error instead of a 400.
            except (KeyError, ValueError) as e:
                raise InvalidArgumentException(details=str(e)) from e

        state = data['state']
        if state:
            try:
                # Explicit ValueError instead of assert: AssertionError was
                # not caught by the handler below (unhandled 500) and
                # asserts are stripped entirely under `python -O`.
                if state != 'INVALID':
                    raise ValueError(
                        'Can only set state to INVALID for invalidation')
                workflow.invalidate()
                db.session.flush()
                logging.info('invalidate workflow %d', workflow.id)
            except ValueError as e:
                raise InvalidArgumentException(details=str(e)) from e

        config = data['config']
        if config:
            try:
                # Config is editable only when no transition is pending
                # (target_state INVALID) and the workflow is READY/STOPPED.
                if workflow.target_state != WorkflowState.INVALID or \
                        workflow.state not in \
                        [WorkflowState.READY, WorkflowState.STOPPED]:
                    raise NoAccessException('Cannot edit running workflow')
                config_proto = dict_to_workflow_definition(data['config'])
                workflow.set_config(config_proto)
                db.session.flush()
            except ValueError as e:
                raise InvalidArgumentException(details=str(e)) from e

        create_job_flags = data['create_job_flags']
        if create_job_flags:
            jobs = workflow.get_jobs()
            if len(create_job_flags) != len(jobs):
                raise InvalidArgumentException(
                    details='Number of job defs does not match number '
                            f'of create_job_flags {len(jobs)} '
                            f'vs {len(create_job_flags)}')
            workflow.set_create_job_flags(create_job_flags)
            # Re-read the normalized flags and mirror the DISABLED flag
            # onto each job owned by this workflow.
            flags = workflow.get_create_job_flags()
            for i, job in enumerate(jobs):
                if job.workflow_id == workflow.id:
                    job.is_disabled = flags[i] == \
                                      common_pb2.CreateJobFlag.DISABLED

        db.session.commit()
        scheduler.wakeup(workflow.id)
        return {'data': workflow.to_dict()}, HTTPStatus.OK