def start(logger, full_id, fetch=True, env=None, volumes=None, cpus=None, memory=None, gpu_devices=None, offline=False):
    """
    Starts the job with all logging of a job_id.

    :param logger: logger used for local progress/error output
    :param full_id: full job id in the form owner/modelName/jobId
    :param fetch: when True, fetch the job from the server before restarting it
    :param env: additional environment variables passed through to start_command
    :param volumes: additional volumes passed through to start_command
    :param cpus: cpu count override; falls back to the job model's configured value
    :param memory: memory override; falls back to the job model's configured value
    :param gpu_devices: explicit gpu device ids; defaults to the first N ids where
                        N is the job model's configured gpu count
    :param offline: when True, run without contacting the server
    """
    owner, name, id = unpack_full_job_id(full_id)

    if isinstance(sys.stdout, GeneralLogger):
        # we don't want to have stuff written to stdout before in job's log
        sys.stdout.clear_buffer()

    job_backend = JobBackend(model_name=owner + '/' + name)

    if fetch:
        job_backend.fetch(id)

    job_backend.restart(id)
    job_backend.start(collect_system=False, offline=offline)
    job_backend.set_status('PREPARE', add_section=False)

    job = job_backend.get_job_model()

    if not cpus:
        cpus = job.get_cpu()

    if not memory:
        memory = job.get_memory()

    if not gpu_devices and job.get_gpu():
        # if requested 2 GPUs and we have 3 GPUs with id [0,1,2], gpus should be [0,1]
        gpu_devices = list(range(job.get_gpu()))

    start_command(logger, job_backend, env, volumes, cpus=cpus, memory=memory,
                  gpu_devices=gpu_devices, offline=offline)
def start(logger, full_id, fetch=True, env=None, volumes=None, gpu_devices=None):
    """
    Starts the training process with all logging of a job_id.

    Restarts the referenced job on this machine and hands it over to
    start_command with the given environment, volumes and gpu devices.
    """
    owner, name, id = unpack_full_job_id(full_id)

    if isinstance(sys.stdout, GeneralLogger):
        # drop anything already buffered so it does not end up in the job's log
        sys.stdout.clear_buffer()

    backend = JobBackend(model_name=owner + '/' + name)

    if fetch:
        backend.fetch(id)

    backend.restart(id)
    backend.start()
    backend.set_status('PREPARE')

    # this runner controls the job's lifetime itself, so the monitoring
    # thread must not enforce the configured max time
    backend.monitoring_thread.handle_max_time = False

    start_command(logger, backend, env, volumes, gpu_devices=gpu_devices)
def start(logger, full_id, hyperparameter=None, dataset_id=None, server='local', insights=False):
    """
    Starts the training process with all logging of a job_id

    :type id: string : job id or model name
    """
    # full_id is either owner/modelName (create a new job) or
    # owner/modelName/jobId (restart an existing job)
    id = None
    if full_id.count('/') == 1:
        owner, name = full_id.split('/')
    elif full_id.count('/') >= 2:
        owner, name, id = unpack_full_job_id(full_id)
    else:
        logger.error(
            "Invalid id %s given. Supported formats: owner/modelName or owner/modelName/jobId." % (full_id, ))
        sys.exit(1)

    job_backend = JobBackend(model_name=owner + '/' + name)

    if id:
        # an explicit job id was given: restart that job
        job_backend.restart(id)
    else:
        # no job id: ask the server for the information needed to create one
        try:
            create_info = api.create_job_info(full_id, hyperparameter, dataset_id)
        except api.ApiError as e:
            if 'Connection refused' in e.reason:
                logger.error("You are offline")

            logger.error(
                "Can not start new job without knowing what model type it is. "
                "Use your script directly if its a Python model.")
            raise

        if not create_info:
            raise Exception(
                'Could not fetch model information. Are you online and have access to the given model?'
            )

        job_backend.create(create_info=create_info, hyperparameter=hyperparameter,
                           server=server, insights=insights)

    # a job without configuration cannot be executed; it must have been
    # created through AETROS Trainer
    if not len(job_backend.get_job_model().config):
        raise Exception(
            'Job does not have a configuration. Make sure you created the job via AETROS Trainer.'
        )

    # simple (designed) models run through the Keras integration,
    # everything else through the custom-script runner
    if job_backend.is_simple_model():
        start_keras(logger, job_backend)
    else:
        start_custom(logger, job_backend)
def start(logger, full_id, fetch=True, env=None, volumes=None, cpus=None, memory=None, gpu_devices=None, offline=False):
    """
    Starts the job with all logging of a job_id.

    :param logger: logger used for local progress/error output
    :param full_id: full job id in the form owner/modelName/jobId
    :param fetch: when True, fetch the job from the server before restarting it
    :param env: additional environment variables passed through to start_command
    :param volumes: additional volumes passed through to start_command
    :param cpus: cpu count override; falls back to the job model's configured value
    :param memory: memory override; falls back to the job model's configured value
    :param gpu_devices: explicit gpu device ids; defaults to the first N ids where
                        N is the job model's configured gpu count
    :param offline: when True, run without contacting the server
    """
    owner, name, id = unpack_full_job_id(full_id)

    if isinstance(sys.stdout, GeneralLogger):
        # we don't want to have stuff written to stdout before in job's log
        sys.stdout.clear_buffer()

    job_backend = JobBackend(model_name=owner + '/' + name)

    if fetch:
        job_backend.fetch(id)

    job_backend.restart(id)
    job_backend.start(collect_system=False, offline=offline)
    job_backend.set_status('PREPARE', add_section=False)

    job = job_backend.get_job_model()

    if not cpus:
        cpus = job.get_cpu()

    if not memory:
        memory = job.get_memory()

    if not gpu_devices and job.get_gpu():
        # if requested 2 GPUs and we have 3 GPUs with id [0,1,2], gpus should be [0,1]
        gpu_devices = list(range(job.get_gpu()))

    start_command(logger, job_backend, env, volumes, cpus=cpus, memory=memory,
                  gpu_devices=gpu_devices, offline=offline)
def predict(logger, job_id, file_paths, weights_path=None):
    """
    Runs the model of a finished job on the given input files and prints the
    prediction result as JSON to stdout.

    :param logger: logger for local output
    :param job_id: full job id in the form owner/modelName/jobId
    :param file_paths: input files, one per model input node (by position)
    :param weights_path: optional weights file; defaults to the job's latest weights
    """
    owner, name, id = unpack_full_job_id(job_id)

    job_backend = JobBackend(model_name=owner + '/' + name)
    job_backend.load(id)
    job_model = job_backend.get_job_model()

    # all job-relative paths resolve against the job's git work tree
    os.chdir(job_backend.git.work_tree)

    weights_path = weights_path or job_model.get_weights_filepath_latest()

    from .Trainer import Trainer
    trainer = Trainer(job_backend)
    job_model.set_input_shape(trainer)

    import keras.backend
    kb = keras.backend
    # support both old and new Keras backend APIs
    if hasattr(kb, 'set_image_dim_ordering'):
        kb.set_image_dim_ordering('tf')
    if hasattr(kb, 'set_image_data_format'):
        kb.set_image_data_format('channels_last')

    job_backend.logger.info("Load model and compile ...")
    model = job_model.get_built_model(trainer)
    trainer.model = model

    from aetros.keras import load_weights
    logger.info('Load weights from ' + weights_path)
    load_weights(model, weights_path)

    # convert each file into the input node it positionally belongs to
    inputs = [
        job_model.convert_file_to_input_node(path, job_model.get_input_node(idx))
        for idx, path in enumerate(file_paths)
    ]

    job_backend.logger.info("Start prediction ...")
    prediction = job_model.predict(trainer, np.array(inputs))

    print(json.dumps(prediction, indent=4, default=invalid_json_values))
def predict(logger, job_id, file_paths, weights_path=None):
    """
    Fetches and loads a job, runs its model on the given input files and
    prints the prediction result as JSON to stdout.

    :param logger: logger for local output
    :param job_id: full job id in the form owner/modelName/jobId
    :param file_paths: input files, one per model input node (by position)
    :param weights_path: optional weights file; defaults to the job's latest weights
    """
    owner, name, id = unpack_full_job_id(job_id)

    job_backend = JobBackend(model_name=owner + '/' + name)
    job_backend.fetch(id)
    job_backend.load(id)
    job_model = job_backend.get_job_model()

    # all job-relative paths resolve against the job's git work tree
    os.chdir(job_backend.git.work_tree)

    weights_path = weights_path or job_model.get_weights_filepath_latest()

    from .Trainer import Trainer
    trainer = Trainer(job_backend)
    job_model.set_input_shape(trainer)

    import keras.backend
    kb = keras.backend
    # support both old and new Keras backend APIs
    if hasattr(kb, 'set_image_dim_ordering'):
        kb.set_image_dim_ordering('tf')
    if hasattr(kb, 'set_image_data_format'):
        kb.set_image_data_format('channels_last')

    job_backend.logger.info("Load model and compile ...")
    model = job_model.get_built_model(trainer)
    trainer.model = model

    from aetros.keras import load_weights
    logger.info('Load weights from ' + weights_path)
    load_weights(model, weights_path)

    # convert each file into the input node it positionally belongs to
    inputs = [
        job_model.convert_file_to_input_node(path, job_model.get_input_node(idx))
        for idx, path in enumerate(file_paths)
    ]

    job_backend.logger.info("Start prediction ...")
    prediction = job_model.predict(trainer, np.array(inputs))

    print(simplejson.dumps(prediction, indent=4, default=invalid_json_values))
def main(self, args):
    """
    Entry point for the `run` subcommand: parses the job id, optionally
    fetches the job from the server, loads it and starts it via Keras.
    """
    import aetros.const

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        prog=aetros.const.__prog__ + ' run',
        description="Internal usage.")
    parser.add_argument(
        'id',
        nargs='?',
        help='Job id, e.g. user/modelname/0db75a64acb74c27bd72c22e359de7a4c44a20e5 to start a pre-created job.')
    parser.add_argument('--fetch', action='store_true', help="Fetch job from server.")

    parsed = parser.parse_args(args)

    # the job id is mandatory; bail out with usage info if it is missing
    if not parsed.id:
        parser.print_help()
        sys.exit(1)

    owner, name, id = unpack_full_job_id(parsed.id)

    job_backend = JobBackend(model_name=owner + '/' + name)
    job_backend.section('checkout')

    if parsed.fetch:
        job_backend.fetch(id)

    job_backend.load(id)
    job_backend.start()

    start_keras(self.logger, job_backend)