def do_train(self, subscription, model_id, model_dir, parameters):
    """Submit a training request via ``self.magaclient`` and poll until it settles.

    The request carries ``subscription`` in the ``apim-subscription-id`` header
    and the output of ``self.prepare_training_data(parameters)`` as its body.
    While the remote status is ``CREATED`` or ``RUNNING`` the stored state is
    refreshed and the job is polled again after five seconds.

    Args:
        subscription: Subscription id sent in the request header and used as
            the key for state updates.
        model_id: Local model id whose state record is updated.
        model_dir: Not used by this method.
        parameters: Passed to ``self.prepare_training_data``.

    Returns:
        ``(STATUS_SUCCESS, <final state as JSON>)`` when the remote status ends
        as ``READY``; ``(STATUS_FAIL, <final state as JSON>)`` for any other
        final status; ``(STATUS_FAIL, result['message'])`` when the train
        response carries no ``modelId``.
    """
    request = Request()
    request.headers['apim-subscription-id'] = subscription
    request.data = self.prepare_training_data(parameters)
    result = self.magaclient.train(request)

    # The train call did not hand back a remote model id: record the failure.
    if 'modelId' not in result:
        update_state(self.config, subscription, model_id, ModelState.Failed,
                     json.dumps(result), result['message'])
        return STATUS_FAIL, result['message']

    update_state(self.config, subscription, model_id, ModelState.Training,
                 json.dumps(result), None)

    # NOTE(review): there is no upper bound on how long this loop may poll.
    while True:
        state = self.magaclient.state(request, result['modelId'])
        status = state['summary']['status']
        if status not in ('CREATED', 'RUNNING'):
            break
        update_state(self.config, subscription, model_id, ModelState.Training,
                     json.dumps(state), None)
        time.sleep(5)

    if status == 'READY':
        final_state, outcome = ModelState.Ready, STATUS_SUCCESS
    else:
        final_state, outcome = ModelState.Failed, STATUS_FAIL
    update_state(self.config, subscription, model_id, final_state,
                 json.dumps(state), None)
    return outcome, json.dumps(state)
def train(self, request):
    """Validate a training request and schedule the training task.

    The request body is parsed as JSON and must contain
    ``instance.instanceId``. The subscription is read from the
    ``apim-subscription-id`` header, defaulting to ``'Official'``.

    Args:
        request: Incoming request exposing ``data`` (JSON text) and ``headers``.

    Returns:
        A response built with ``make_response(jsonify(...), code)``:
        * 400 when ``self.do_verify`` does not return ``STATUS_SUCCESS``;
        * 400 when the number of models in the Training state for this
          instance has reached
          ``self.config.models_in_training_limit_per_instance``;
        * 201 once the task has been handed to the executor;
        * 400 when creating the task raised an exception.
    """
    request_body = json.loads(request.data)
    instance_id = request_body['instance']['instanceId']
    subscription = request.headers.get('apim-subscription-id', 'Official')

    result, message = self.do_verify(subscription, request_body)
    if result != STATUS_SUCCESS:
        return make_response(jsonify(dict(instanceId=instance_id, modelId='', result=STATUS_FAIL,
                                          message='Verify failed! ' + message,
                                          modelState=ModelState.Deleted.name)), 400)

    # Models of this instance that are still in the Training state.
    models_in_train = [
        model['model_id']
        for model in get_model_list(self.config, subscription)
        if 'inst_id' in model
        and model['inst_id'] == instance_id
        and model['state'] == ModelState.Training.name
    ]
    if len(models_in_train) >= self.config.models_in_training_limit_per_instance:
        return make_response(jsonify(dict(instanceId=instance_id, modelId='', result=STATUS_FAIL,
                                          message='Models in training limit reached! Abort training this time.',
                                          modelState=ModelState.Deleted.name)), 400)

    log.info('Create training task')
    # Bind these before the try block: the except handler reads both, and it
    # previously hit UnboundLocalError when the failure happened before they
    # were assigned (e.g. insert_meta raising or get_meta returning None).
    model_id = str(uuid.uuid1())
    timekey = None
    try:
        insert_meta(self.config, subscription, model_id, request_body)
        meta = get_meta(self.config, subscription, model_id)
        timekey = meta['timekey']
        asyncio.ensure_future(loop.run_in_executor(executor, self.train_wrapper, subscription,
                                                   model_id, request_body, timekey,
                                                   self.train_callback))
        return make_response(jsonify(dict(instanceId=instance_id, modelId=model_id, result=STATUS_SUCCESS,
                                          message='Training task created',
                                          modelState=ModelState.Training.name)), 201)
    except Exception as e:
        meta = get_meta(self.config, subscription, model_id)
        # model_id was generated for this request, so when the timekey was
        # never read the record (if any) can only be the one inserted above.
        # Otherwise only touch the record if it still carries our timekey.
        if meta is not None and (timekey is None or meta['timekey'] == timekey):
            update_state(self.config, subscription, model_id, ModelState.Failed, None, str(e))
        return make_response(jsonify(dict(instanceId=instance_id, modelId=model_id, result=STATUS_FAIL,
                                          message='Fail to create new task ' + str(e),
                                          modelState=ModelState.Failed.name)), 400)
def delete(self, request, model_id):
    """Delete a model and report the outcome as an HTTP-style response.

    The subscription is read from the ``apim-subscription-id`` header,
    defaulting to ``'Official'``, and the work is delegated to
    ``self.do_delete``.

    Args:
        request: Incoming request exposing ``headers``.
        model_id: Id of the model to delete.

    Returns:
        A 200 response with ``modelState`` set to Deleted when
        ``self.do_delete`` returns ``STATUS_SUCCESS``; otherwise a 400
        response whose ``message`` is the text of the failure. Any exception
        raised along the way is also reported as a 400 response.
    """
    try:
        subscription = request.headers.get('apim-subscription-id', 'Official')
        outcome, detail = self.do_delete(subscription, model_id)
        if outcome != STATUS_SUCCESS:
            # Route the failure through the shared error response below.
            raise Exception(detail)

        update_state(self.config, subscription, model_id, ModelState.Deleted)
        payload = dict(instanceId='',
                       modelId=model_id,
                       result=STATUS_SUCCESS,
                       message='Model {} has been deleted'.format(model_id),
                       modelState=ModelState.Deleted.name)
        return make_response(jsonify(payload), 200)
    except Exception as err:
        failure = dict(instanceId='',
                       modelId=model_id,
                       result=STATUS_FAIL,
                       message=str(err),
                       modelState=ModelState.Failed.name)
        return make_response(jsonify(failure), 400)
def train_callback(self, subscription, model_id, parameters, model_state, timekey, last_error=None):
    """Finalize a training run: store the model if ready, then record and report the state.

    Args:
        subscription: Subscription id the model belongs to.
        model_id: Id of the model whose training finished.
        parameters: Forwarded to ``self.tsanaclient.save_training_result``.
        model_state: ``ModelState`` reached by the training run.
        timekey: Forwarded to ``copy_tree_and_zip_and_update_remote``.
        last_error: Optional error text stored with the state.

    Returns:
        ``(STATUS_FAIL, 'Model is not found! ')`` when the model has no meta
        record or is already marked Deleted; otherwise whatever
        ``self.tsanaclient.save_training_result`` returns.
    """
    log.info("Training callback %s by %s , state = %s" % (model_id, subscription, model_state))

    meta = get_meta(self.config, subscription, model_id)
    model_missing = meta is None or meta['state'] == ModelState.Deleted.name
    if model_missing:
        return STATUS_FAIL, 'Model is not found! '

    # Training finished: store the model, downgrading to Failed if that fails.
    if model_state == ModelState.Ready:
        storage_status, _ = copy_tree_and_zip_and_update_remote(self.config, subscription,
                                                                model_id, timekey)
        if storage_status != STATUS_SUCCESS:
            model_state, last_error = ModelState.Failed, 'Model storage failed!'

    update_state(self.config, subscription, model_id, model_state, None, last_error)
    return self.tsanaclient.save_training_result(parameters, model_id, model_state.name, last_error)
def train_callback(self, subscription, model_id, parameters, model_state, timekey, last_error=None):
    """Record the final training state and report it through ``self.tsanaclient``.

    Args:
        subscription: Subscription id the model belongs to.
        model_id: Id of the model whose training finished.
        parameters: Forwarded to ``self.tsanaclient.save_training_result``.
        model_state: ``ModelState`` reached by the training run.
        timekey: Not used by this implementation.
        last_error: Optional error text stored with the state.

    Returns:
        ``(STATUS_FAIL, 'Model is not found! ')`` when the model has no meta
        record or is already marked Deleted; otherwise whatever
        ``self.tsanaclient.save_training_result`` returns.
    """
    log.info("Training callback %s by %s , state = %s" % (model_id, subscription, model_state))

    record = get_meta(self.config, subscription, model_id)
    if record is not None and record['state'] != ModelState.Deleted.name:
        update_state(self.config, subscription, model_id, model_state, None, last_error)
        return self.tsanaclient.save_training_result(parameters, model_id,
                                                     model_state.name, last_error)

    # No record, or the model was deleted in the meantime.
    return STATUS_FAIL, 'Model is not found! '
def state(self, request, model_id):
    """Report the current state of a model, refreshing it from ``self.magaclient``.

    The subscription is read from the ``apim-subscription-id`` header,
    defaulting to ``'Official'``.

    Args:
        request: Incoming request exposing ``headers``; also forwarded to
            ``self.magaclient.state``.
        model_id: Local model id to look up.

    Returns:
        A response built with ``make_response(jsonify(...), code)``:
        * 200 with ``modelState`` Deleted when no meta record exists;
        * 200 echoing the stored context when the record is in the Training
          state (no remote query is made);
        * 200 with the freshly queried remote state otherwise, after storing
          it via ``update_state``;
        * 400 with ``modelState`` Failed when anything raised; the stored
          state is set to Failed as well.
    """
    # Resolved before the try block so the except handler can always use it;
    # previously a failure here surfaced as UnboundLocalError in the handler.
    subscription = request.headers.get('apim-subscription-id', 'Official')
    try:
        meta = get_meta(self.config, subscription, model_id)
        if meta is None:
            return make_response(
                jsonify(dict(instanceId='', modelId=model_id, result=STATUS_SUCCESS,
                             message='Model is not found!',
                             modelState=ModelState.Deleted.name)), 200)

        if meta['state'] == ModelState.Training.name:
            return make_response(
                jsonify(dict(instanceId='', modelId=model_id, result=STATUS_SUCCESS,
                             message=meta['context'] if 'context' in meta else '',
                             modelState=meta['state'])), 200)

        # Without a stored context holding the remote model id there is
        # nothing to query; surface the recorded error instead.
        if 'context' not in meta:
            raise Exception(meta['last_error'])
        context = json.loads(meta['context'])
        if 'modelId' not in context:
            raise Exception(meta['last_error'])

        actual_model_id = context['modelId']
        state = self.magaclient.state(request, actual_model_id)

        # Map the remote status onto the local ModelState.
        status = state['summary']['status']
        if status in ('CREATED', 'RUNNING'):
            model_state = ModelState.Training
        elif status == 'READY':
            model_state = ModelState.Ready
        elif status == 'DELETED':
            model_state = ModelState.Deleted
        else:
            model_state = ModelState.Failed

        update_state(self.config, subscription, model_id, model_state, json.dumps(state), None)
        return make_response(
            jsonify(dict(instanceId='', modelId=model_id, result=STATUS_SUCCESS,
                         message=json.dumps(state),
                         modelState=model_state.name)), 200)
    except Exception as e:
        update_state(self.config, subscription, model_id, ModelState.Failed, None, str(e))
        return make_response(
            jsonify(dict(instanceId='', modelId=model_id, result=STATUS_FAIL,
                         message=str(e),
                         modelState=ModelState.Failed.name)), 400)