예제 #1
0
    def main(self, args):
        """Upload a weights file to an existing (or newly created) AETROS job.

        Parses the CLI arguments, resolves the target job, verifies the
        weights file exists on disk and uploads it, optionally attaching a
        KPI value.
        """
        import aetros.const
        from aetros import keras_model_utils
        from aetros.backend import JobBackend
        from aetros.Trainer import Trainer

        parser = argparse.ArgumentParser(
            prog=aetros.const.__prog__ + ' upload-weights',
            formatter_class=argparse.RawTextHelpFormatter)
        parser.add_argument('id', help='model name or job id')
        parser.add_argument('weights', help="Weights path")
        parser.add_argument(
            '--api-key',
            help="Secure key. Alternatively use API_KEY environment variable.")
        parser.add_argument(
            '--kpi', help="You can overwrite or set the KPI for this job")
        parser.add_argument(
            '--latest',
            action="store_true",
            help="Instead of best epoch we upload latest weights.")
        opts = parser.parse_args(args)

        # Both positionals are required for a meaningful upload; bail out
        # with usage info otherwise.
        if not (opts.id and opts.weights):
            parser.print_help()
            return

        backend = JobBackend(api_key=opts.api_key)

        # A "model/name" reference (without a "@job" suffix) means the job
        # does not exist yet — create it first.
        if '/' in opts.id and '@' not in opts.id:
            backend.create(opts.id)

        backend.load(opts.id)

        if backend.job is None:
            raise Exception("Job not found")

        path = opts.weights

        if not os.path.exists(path):
            raise Exception('Weights file does not exist in ' + path)

        print("Uploading weights to %s of %s ..." %
              (backend.job_id, backend.model_id))

        # Attach the KPI as a float only when one was supplied.
        kpi = float(opts.kpi) if opts.kpi else None
        backend.upload_weights('weights.hdf5', path, kpi)
예제 #2
0
    def main(self, args):
        """Create an AETROS job from CLI arguments and run it locally or on the cluster.

        Reads the project configuration, merges in command-line overrides
        (image, servers, resources, hyperparameters, time/epoch limits),
        registers the job files in the local Git store, creates the job and
        either executes it on this machine (--local) or pushes it to the
        cluster. Exits the process with a non-zero status on fatal
        configuration errors.
        """
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                         prog=aetros.const.__prog__ + ' run')
        parser.add_argument('command', nargs='?', help="The command to run. Default read in configuration file")
        parser.add_argument('-i', '--image', help="Which Docker image to use for the command. Default read in configuration file. If not specified, command is executed on the host.")
        parser.add_argument('--no-image', action='store_true', help="Forces not to use docker, even when image is defined in the configuration file.")

        parser.add_argument('-s', '--server', action='append', help="Limits the server pool to this server. Default not limitation or read in configuration file. Multiple --server allowed.")
        parser.add_argument('-m', '--model', help="Under which model this job should be listed. Default read in configuration file")
        parser.add_argument('-l', '--local', action='store_true', help="Start the job immediately on the current machine.")
        parser.add_argument('-c', '--config', help="Default aetros.yml in current working directory.")
        parser.add_argument('--priority', help="Increases or decreases priority. Default is 0.")

        parser.add_argument('--cpu', help="How many CPU cores should be assigned to job. Docker only.")
        parser.add_argument('--memory', help="How much memory should be assigned to job. Docker only.")
        parser.add_argument('--gpu', help="How many GPU cards should be assigned to job. Docker only.")
        parser.add_argument('--gpu_memory', help="Memory requirement for the GPU. Docker only.")

        parser.add_argument('--offline', '-o', action='store_true', help="Whether the execution should happen offline.")

        parser.add_argument('--rebuild-image', action='store_true', help="Makes sure the Docker image is re-built without cache.")

        parser.add_argument('--max-time', help="Limit execution time in seconds. Sends SIGINT to the process group when reached.")
        parser.add_argument('--max-epochs', help="Limit execution epochs. Sends SIGINT to the process group when reached.")

        parser.add_argument('--gpu-device', action='append', help="Which device id should be mapped into the NVIDIA docker container. Only when --local")

        parser.add_argument('--volume', '-v', action='append', help="Volume into docker. Only when --local")
        parser.add_argument('-e', action='append', help="Sets additional environment variables. '-e name=value' to set value, or '-e name' to read from current env")

        parser.add_argument('-p', '--param', action='append', help="Sets a hyperparameter, example '--param name=value'. Multiple --param allowed.")

        parsed_args = parser.parse_args(args)

        if parsed_args.config and not os.path.exists(parsed_args.config):
            self.logger.error("fatal: file %s does not exist." % (parsed_args.config,))
            sys.exit(2)

        config = find_config(parsed_args.config)
        home_config = read_home_config()

        if config['model'] and not parsed_args.model:
            parsed_args.model = config['model']

        if not parsed_args.model:
            print("fatal: no model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
            sys.exit(2)

        if not parsed_args.local and parsed_args.volume:
            print("fatal: can not use volume with jobs on the cluster. Use datasets instead.")
            sys.exit(1)

        if parsed_args.local and parsed_args.priority:
            print("fatal: the priority can only be set for jobs in the cluster.")
            sys.exit(1)

        if config['image']:
            ensure_docker_installed(self.logger)

        env = {}
        if parsed_args.e:
            for item in parsed_args.e:
                if '=' in item:
                    # Split on the first '=' only, so values may themselves
                    # contain '=' (e.g. '-e TOKEN=a=b').
                    k, v = item.split('=', 1)
                else:
                    # '-e name' without a value: inherit from the current env.
                    k = item
                    v = os.getenv(k)
                env[k] = v

        if ('command' not in config or not config['command']) and not parsed_args.command:
            self.logger.error('No command given. Define the command in aetros.yml or use command argument.')
            sys.exit(1)

        job_backend = JobBackend(parsed_args.model, self.logger)

        ignore = []
        if 'ignore' in config:
            ignore = config['ignore']
        job_backend.job = {'config': {'ignore': ignore}}

        adding_files = loading_text("- Adding job files to index ... ")
        files_added, size_added = job_backend.add_files(config['root'], report=False)
        adding_files("done with %d file%s added (%s)."
                     % (files_added, 's' if files_added != 1 else '', human_size(size_added, 2)))

        create_info = {
            'type': 'custom',
            'config': config
        }

        incoming_hyperparameter = {}
        if parsed_args.param:
            for param in parsed_args.param:
                if '=' not in param:
                    raise Exception('--param ' + param + ' does not contain a `=`. Please use "--param name=value"')

                # First '=' separates name from value; the value may contain '='.
                name, value = param.split('=', 1)
                incoming_hyperparameter[name] = value

        # first transform simple format in the full definition with parameter types
        # (string, number, group, choice_group, etc)
        full_hyperparameters = lose_parameters_to_full(config['parameters'])

        # now extract hyperparameters from full definition, and overwrite stuff using
        # incoming_hyperparameter if available
        hyperparameter = extract_parameters(full_hyperparameters, incoming_hyperparameter)

        create_info['config']['parameters'] = hyperparameter

        if parsed_args.rebuild_image:
            create_info['config']['rebuild_image'] = True

        if parsed_args.max_epochs:
            create_info['config']['maxEpochs'] = int(parsed_args.max_epochs)

        create_info['config']['priority'] = 0
        if parsed_args.priority:
            create_info['config']['priority'] = float(parsed_args.priority)

        if parsed_args.max_time:
            create_info['config']['maxTime'] = float(parsed_args.max_time)

        if parsed_args.command:
            create_info['config']['command'] = parsed_args.command

        if parsed_args.image:
            # reset install options, since we can't make sure if the base image still fits
            if 'image' in config and config['image'] and config['image'] != parsed_args.image:
                create_info['config']['install'] = None

            # reset dockerfile, since we specified manually an image
            create_info['config']['dockerfile'] = None
            create_info['config']['image'] = parsed_args.image

        if parsed_args.no_image:
            create_info['config']['image'] = None

        if parsed_args.server:
            create_info['config']['servers'] = []
            for name in parsed_args.server:
                create_info['config']['servers'].append(name)

        create_info['config']['resources'] = create_info['config'].get('resources', {})
        resources = create_info['config']['resources']

        # With Docker we default to 1 CPU/1GB, on the bare host to 0 (= unlimited).
        default_cpu_and_memory = 1 if create_info['config']['image'] else 0
        resources['cpu'] = int(parsed_args.cpu or resources.get('cpu', default_cpu_and_memory))
        resources['memory'] = int(parsed_args.memory or resources.get('memory', default_cpu_and_memory))
        resources['gpu'] = int(parsed_args.gpu or resources.get('gpu', 0))
        resources['gpu_memory'] = int(parsed_args.gpu_memory or resources.get('gpu_memory', 0))

        if parsed_args.local:
            create_info['server'] = 'local'

            # make sure we do not limit the resources to something that is not available on the local machine
            warning = []
            cpu = cpuinfo.get_cpu_info()
            mem = psutil.virtual_memory().total
            gpu = 0
            try:
                gpu = len(get_ordered_devices())
            except CudaNotImplementedException:
                pass

            if not create_info['config']['image'] and not all([x == 0 for x in six.itervalues(resources)]):
                self.logger.warning("! No Docker virtualization since no `image` defined, resources limitation ignored.")

            if create_info['config']['image'] and resources['gpu'] > 0:
                if not (sys.platform == "linux" or sys.platform == "linux2"):
                    self.logger.warning("! Your operating system does not support GPU allocation for "
                                        "Docker virtualization. "
                                        "NVIDIA-Docker2 is only supported on Linux.")

            local_max_resources = {'cpu': cpu['count'], 'memory': ceil(mem / 1024 / 1024 / 1024), 'gpu': gpu}

            if create_info['config']['image']:
                # read max hardware within Docker
                out = docker_call(['run', 'alpine', 'sh', '-c', 'nproc && cat /proc/meminfo | grep MemTotal'])
                cpus, memory = out.decode('utf-8').strip().split('\n')
                local_max_resources['cpu'] = int(cpus)

                memory = memory.replace('MemTotal:', '').replace('kB', '').strip()
                local_max_resources['memory'] = ceil(int(memory) / 1024 / 1024)

            if local_max_resources['cpu'] < resources['cpu']:
                warning.append('CPU cores %d -> %d' % (resources['cpu'], local_max_resources['cpu']))
                resources['cpu'] = local_max_resources['cpu']

            if local_max_resources['memory'] < resources['memory']:
                warning.append('memory %dGB -> %dGB' % (resources['memory'], local_max_resources['memory']))
                resources['memory'] = local_max_resources['memory']

            if local_max_resources['gpu'] < resources['gpu']:
                warning.append('GPU cards %d -> %d' % (resources['gpu'], local_max_resources['gpu']))
                resources['gpu'] = local_max_resources['gpu']

            if warning:
                self.logger.warning("! Resources downgrade due to missing hardware: %s." % (', '.join(warning),))

        if parsed_args.config and not create_info['config']['configPath']:
            create_info['config']['configPath'] = parsed_args.config

        create_info['config']['sourcesAttached'] = True

        creating_git_job = loading_text("- Create job in local Git ... ")
        if aetros.utils.git.get_current_commit_hash():
            # Record where the sources came from when run inside a Git checkout.
            create_info['origin_git_source'] = {
                'origin': aetros.utils.git.get_current_remote_url(),
                'author': aetros.utils.git.get_current_commit_author(),
                'message': aetros.utils.git.get_current_commit_message(),
                'branch': aetros.utils.git.get_current_branch(),
                'commit': aetros.utils.git.get_current_commit_hash(),
            }

        job_backend.create(create_info=create_info, server=None)
        creating_git_job("created %s in %s." % (job_backend.job_id[0:9], job_backend.model_name))

        summary = "➤ Summary: Job running "
        if parsed_args.local:
            summary += 'locally'
        else:
            summary += 'on the cluster'

        if create_info['config']['image']:
            summary += ' in Docker using image %s with %d CPU cores, %dGB memory and %d GPUs.' \
                       % (create_info['config']['image'], resources['cpu'], resources['memory'], resources['gpu'])
        else:
            summary += ' on host using all available resources.'

        print(summary)

        if parsed_args.offline:
            if not parsed_args.local:
                self.logger.warning("Can not create a remote job in offline mode.")
                sys.exit(1)

            self.logger.warning("Execution started offline.")
        else:
            adding_files = loading_text("- Connecting to "+home_config['host']+" ... ")
            if job_backend.connect():
                adding_files("connected.")
            else:
                # Connection failed: fall back to offline mode instead of aborting.
                parsed_args.offline = True
                adding_files("failed. Continue in offline mode.")

        if not parsed_args.offline:
            sys.stdout.write("- Uploading job data ... ")
            job_backend.git.push()
            job_backend.client.wait_until_queue_empty(['files'], clear_end=False)

            sys.stdout.write(" done.\n")

            link = "%smodel/%s/job/%s" % (home_config['url'], job_backend.model_name, job_backend.job_id)
            sys.__stdout__.write(u"➤ Monitor job at %s\n" % (link))

        if parsed_args.local:
            job_backend.start(collect_system=False, offline=parsed_args.offline, push=False)

            if not parsed_args.offline:
                job_backend.git.start_push_sync()

            cpus = create_info['config']['resources']['cpu']
            memory = create_info['config']['resources']['memory']

            if not parsed_args.gpu_device and create_info['config']['resources']['gpu'] > 0:
                # if requested 2 GPUs and we have 3 GPUs with id [0,1,2], gpus should be [0,1]
                parsed_args.gpu_device = []
                for i in range(0, create_info['config']['resources']['gpu']):
                    parsed_args.gpu_device.append(i)

            start_command(self.logger, job_backend, env, parsed_args.volume, cpus=cpus, memory=memory, gpu_devices=parsed_args.gpu_device,
                offline=parsed_args.offline)
예제 #3
0
    def main(self, args):
        """Create an AETROS job from CLI arguments and run it locally or on the cluster.

        Older variant of the `run` command: reads aetros.yml, merges in CLI
        overrides (image, servers, resources, hyperparameters, limits),
        creates the job and either starts it on this machine (--local) or
        pushes it to the cluster.
        """
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                         prog=aetros.const.__prog__ + ' run')
        parser.add_argument('command', nargs='?', help="The command to run. Default read in aetros.yml")
        parser.add_argument('-i', '--image', help="Which Docker image to use for the command. Default read in aetros.yml. If not specified, command is executed on the host.")
        parser.add_argument('-s', '--server', action='append', help="Limits the server pool to this server. Default not limitation or read in aetros.yml. Multiple --server allowed.")
        parser.add_argument('-m', '--model', help="Under which model this job should be listed. Default read in aetros.yml")
        parser.add_argument('-l', '--local', action='store_true', help="Start the job immediately on the current machine.")
        parser.add_argument('-c', '--config', help="Default aetros.yml in current working directory.")
        parser.add_argument('--priority', help="Increases or decreases priority. Default is 0.")

        parser.add_argument('--cpu', help="How many CPU cores should be assigned to job. Docker only.")
        parser.add_argument('--memory', help="How much memory should be assigned to job. Docker only.")
        parser.add_argument('--gpu', help="How many GPU cards should be assigned to job. Docker only.")
        parser.add_argument('--gpu_memory', help="Memory requirement for the GPU. Docker only.")

        parser.add_argument('--max-time', help="Limit execution time in seconds. Sends SIGINT to the process group when reached.")
        parser.add_argument('--max-epochs', help="Limit execution epochs. Sends SIGINT to the process group when reached.")

        parser.add_argument('--gpu-device', action='append', help="Which device id should be mapped into the NVIDIA docker container.")

        parser.add_argument('--volume', '-v', action='append', help="Volume into docker")
        parser.add_argument('-e', action='append', help="Sets additional environment variables. '-e name=value' to set value, or '-e name' to read from current env")

        parser.add_argument('-p', '--param', action='append', help="Sets a hyperparameter, example '--param name=value'. Multiple --param allowed.")

        parsed_args = parser.parse_args(args)

        config = read_config(parsed_args.config or 'aetros.yml')

        env = {}
        if parsed_args.e:
            for item in parsed_args.e:
                if '=' in item:
                    # Split on the first '=' only, so values may themselves
                    # contain '=' (e.g. '-e TOKEN=a=b').
                    k, v = item.split('=', 1)
                else:
                    # '-e name' without a value: inherit from the current env.
                    k = item
                    v = os.getenv(k)
                env[k] = v

        if 'command' not in config and not parsed_args.command:
            self.logger.error('No "command" given in aetros.yml or as argument.')
            sys.exit(1)

        job = JobBackend(parsed_args.model, self.logger, parsed_args.config or 'aetros.yml')
        ignore = []
        if 'ignore' in config:
            ignore = config['ignore']
        job.job = {'config': {'ignore': ignore}}

        files_added, size_added = job.add_files()

        print("%d files added (%s)" % (files_added, human_size(size_added, 2)))

        create_info = {
            'type': 'custom',
            'config': config
        }

        incoming_hyperparameter = {}
        if parsed_args.param:
            for param in parsed_args.param:
                if '=' not in param:
                    raise Exception('--param ' + param + ' does not contain a `=`. Please use "--param name=value"')

                # First '=' separates name from value; the value may contain '='.
                name, value = param.split('=', 1)
                incoming_hyperparameter[name] = value

        # first transform simple format in the full definition with parameter types
        # (string, number, group, choice_group, etc)
        full_hyperparameters = lose_parameters_to_full(config['parameters'])

        # now extract hyperparameters from full definition, and overwrite stuff using
        # incoming_hyperparameter if available
        hyperparameter = extract_parameters(full_hyperparameters, incoming_hyperparameter)

        create_info['config']['parameters'] = hyperparameter

        if parsed_args.max_epochs:
            create_info['config']['maxEpochs'] = int(parsed_args.max_epochs)

        create_info['config']['priority'] = 0
        if parsed_args.priority:
            create_info['config']['priority'] = float(parsed_args.priority)

        if parsed_args.max_time:
            create_info['config']['maxTime'] = float(parsed_args.max_time)

        if parsed_args.command:
            create_info['config']['command'] = parsed_args.command

        if parsed_args.image:

            # reset install options, since we can't make sure if the base image still fits
            if 'image' in config and config['image'] and config['image'] != parsed_args.image:
                create_info['config']['install'] = None

            # reset dockerfile, since we specified manually an image
            create_info['config']['dockerfile'] = None
            create_info['config']['image'] = parsed_args.image

        if parsed_args.server:
            create_info['config']['servers'] = []
            for name in parsed_args.server:
                create_info['config']['servers'].append(name)

        if 'resources' not in create_info['config']:
            create_info['config']['resources'] = {}

        if parsed_args.cpu or parsed_args.memory or parsed_args.gpu is not None or parsed_args.gpu_memory:
            if parsed_args.cpu: create_info['config']['resources']['cpu'] = float(parsed_args.cpu)
            if parsed_args.memory: create_info['config']['resources']['memory'] = float(parsed_args.memory)
            if parsed_args.gpu is not None: create_info['config']['resources']['gpu'] = float(parsed_args.gpu)
            if parsed_args.gpu_memory: create_info['config']['resources']['gpu_memory'] = float(parsed_args.gpu_memory)

        if parsed_args.local:
            # usually, the aetros server would assign resources at job root level from the assigned server
            # but since it's started locally, we just use the requested one. User should know what they do.
            # start.py will use 'config' stuff anyone for docker limitation, so we should make sure it is
            # being displayed.

            # Only meaningful when an actual Docker image is configured
            # (originally tested the always-truthy config dict itself).
            if 'image' in create_info['config'] and create_info['config']['image']:
                resources = create_info['config']['resources']
                create_info['resources_assigned'] = {'cpus': 1, 'memory': 1, 'gpus': []}

                if 'gpu' in resources and resources['gpu'] > 0:
                    # gpu may be a float (from --gpu parsing); list repetition needs an int.
                    create_info['resources_assigned']['gpus'] = [1] * int(resources['gpu'])
                if 'cpu' in resources:
                    create_info['resources_assigned']['cpus'] = resources['cpu']
                if 'memory' in resources:
                    create_info['resources_assigned']['memory'] = resources['memory']
            else:
                # since this runs on the host, extract machine hardware and put int resources_assigned
                # so we see it at the job.
                pass

        if parsed_args.local:
            create_info['server'] = 'local'

        create_info['config']['sourcesAttached'] = True

        if aetros.utils.git.get_current_commit_hash():
            # Record where the sources came from when run inside a Git checkout.
            create_info['origin_git_source'] = {
                'origin': aetros.utils.git.get_current_remote_url(),
                'author': aetros.utils.git.get_current_commit_author(),
                'message': aetros.utils.git.get_current_commit_message(),
                'branch': aetros.utils.git.get_current_branch(),
                'commit': aetros.utils.git.get_current_commit_hash(),
            }

        job.create(create_info=create_info, server=None)

        print("Job %s/%s created." % (job.model_name, job.job_id))

        if parsed_args.local:
            start(self.logger, job.model_name + '/' + job.job_id, fetch=False, env=env, volumes=parsed_args.volume, gpu_devices=parsed_args.gpu_device)
        else:
            if parsed_args.volume:
                print("Can not use volume with jobs on the cluster. Use datasets instead.")
                sys.exit(1)

            #todo, make it visible
            job.git.push()
            print("Open http://%s/model/%s/job/%s to monitor it." % (job.host, job.model_name, job.job_id))
예제 #4
0
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' run')
        parser.add_argument(
            'command',
            nargs='?',
            help="The command to run. Default read in configuration file")
        parser.add_argument(
            '-i',
            '--image',
            help=
            "Which Docker image to use for the command. Default read in configuration file. If not specified, command is executed on the host."
        )
        parser.add_argument(
            '--no-image',
            action='store_true',
            help=
            "Forces not to use docker, even when image is defined in the configuration file."
        )

        parser.add_argument(
            '-s',
            '--server',
            action='append',
            help=
            "Limits the server pool to this server. Default not limitation or read in configuration file. Multiple --server allowed."
        )
        parser.add_argument(
            '-m',
            '--model',
            help=
            "Under which model this job should be listed. Default read in configuration file"
        )
        parser.add_argument(
            '-l',
            '--local',
            action='store_true',
            help="Start the job immediately on the current machine.")
        parser.add_argument(
            '-c',
            '--config',
            help="Default aetros.yml in current working directory.")
        parser.add_argument(
            '--priority',
            help="Increases or decreases priority. Default is 0.")

        parser.add_argument(
            '--cpu',
            help="How many CPU cores should be assigned to job. Docker only.")
        parser.add_argument(
            '--memory',
            help="How much memory should be assigned to job. Docker only.")
        parser.add_argument(
            '--gpu',
            help="How many GPU cards should be assigned to job. Docker only.")
        parser.add_argument(
            '--gpu_memory',
            help="Memory requirement for the GPU. Docker only.")

        parser.add_argument(
            '--offline',
            '-o',
            action='store_true',
            help="Whether the execution should happen offline.")

        parser.add_argument(
            '--rebuild-image',
            action='store_true',
            help="Makes sure the Docker image is re-built without cache.")

        parser.add_argument(
            '--max-time',
            help=
            "Limit execution time in seconds. Sends SIGINT to the process group when reached."
        )
        parser.add_argument(
            '--max-epochs',
            help=
            "Limit execution epochs. Sends SIGINT to the process group when reached."
        )

        parser.add_argument(
            '--gpu-device',
            action='append',
            help=
            "Which device id should be mapped into the NVIDIA docker container. Only when --local"
        )

        parser.add_argument('--volume',
                            '-v',
                            action='append',
                            help="Volume into docker. Only when --local")
        parser.add_argument(
            '-e',
            action='append',
            help=
            "Sets additional environment variables. '-e name=value' to set value, or '-e name' to read from current env"
        )

        parser.add_argument(
            '-p',
            '--param',
            action='append',
            help=
            "Sets a hyperparameter, example '--param name=value'. Multiple --param allowed."
        )

        parsed_args = parser.parse_args(args)

        if parsed_args.config and not os.path.exists(parsed_args.config):
            self.logger.error("fatal: file %s does not exist." %
                              (parsed_args.config, ))
            sys.exit(2)

        config = find_config(parsed_args.config)
        home_config = read_home_config()

        if config['model'] and not parsed_args.model:
            parsed_args.model = config['model']

        if not parsed_args.model:
            print(
                "fatal: no model defined. Use --model or switch into a directory where you executed 'aetros init model-name'."
            )
            sys.exit(2)

        if not parsed_args.local and parsed_args.volume:
            print(
                "fatal: can not use volume with jobs on the cluster. Use datasets instead."
            )
            sys.exit(1)

        if parsed_args.local and parsed_args.priority:
            print(
                "fatal: the priority can only be set for jobs in the cluster.")
            sys.exit(1)

        if config['image']:
            ensure_docker_installed(self.logger)

        env = {}
        if parsed_args.e:
            for item in parsed_args.e:
                if '=' in item:
                    k, v = item.split('=')
                else:
                    k = item
                    v = os.getenv(k)
                env[k] = v

        if ('command' not in config
                or not config['command']) and not parsed_args.command:
            self.logger.error(
                'No command given. Define the command in aetros.yml or use command argument.'
            )
            sys.exit(1)

        job_backend = JobBackend(parsed_args.model, self.logger)

        ignore = []
        if 'ignore' in config:
            ignore = config['ignore']
        job_backend.job = {'config': {'ignore': ignore}}

        adding_files = loading_text("- Adding job files to index ... ")
        files_added, size_added = job_backend.add_files(config['root'],
                                                        report=False)
        adding_files("done with %d file%s added (%s)." %
                     (files_added, 's' if files_added != 1 else '',
                      human_size(size_added, 2)))

        create_info = {'type': 'custom', 'config': config}

        incoming_hyperparameter = {}
        if parsed_args.param:
            for param in parsed_args.param:
                if '=' not in param:
                    raise Exception(
                        '--param ' + param +
                        ' does not contain a `=`. Please use "--param name=value"'
                    )

                name, value = param.split('=')
                incoming_hyperparameter[name] = value

        # first transform simple format in the full definition with parameter types
        # (string, number, group, choice_group, etc)
        full_hyperparameters = lose_parameters_to_full(config['parameters'])

        # now extract hyperparameters from full definition, and overwrite stuff using
        # incoming_hyperparameter if available
        hyperparameter = extract_parameters(full_hyperparameters,
                                            incoming_hyperparameter)

        create_info['config']['parameters'] = hyperparameter

        if parsed_args.rebuild_image:
            create_info['config']['rebuild_image'] = True

        if parsed_args.max_epochs:
            create_info['config']['maxEpochs'] = int(parsed_args.max_epochs)

        create_info['config']['priority'] = 0
        if parsed_args.priority:
            create_info['config']['priority'] = float(parsed_args.priority)

        if parsed_args.max_time:
            create_info['config']['maxTime'] = float(parsed_args.max_time)

        if parsed_args.command:
            create_info['config']['command'] = parsed_args.command

        if parsed_args.image:
            # reset install options, since we can't make sure if the base image still fits
            if 'image' in config and config[
                    'image'] and config['image'] != parsed_args.image:
                create_info['config']['install'] = None

            # reset dockerfile, since we specified manually an image
            create_info['config']['dockerfile'] = None
            create_info['config']['image'] = parsed_args.image

        if parsed_args.no_image:
            create_info['config']['image'] = None

        if parsed_args.server:
            create_info['config']['servers'] = []
            for name in parsed_args.server:
                create_info['config']['servers'].append(name)

        create_info['config']['resources'] = create_info['config'].get(
            'resources', {})
        resources = create_info['config']['resources']

        default_cpu_and_memory = 1 if create_info['config']['image'] else 0
        resources['cpu'] = int(parsed_args.cpu
                               or resources.get('cpu', default_cpu_and_memory))
        resources['memory'] = int(
            parsed_args.memory
            or resources.get('memory', default_cpu_and_memory))
        resources['gpu'] = int(parsed_args.gpu or resources.get('gpu', 0))
        resources['gpu_memory'] = int(parsed_args.gpu_memory
                                      or resources.get('gpu_memory', 0))

        if parsed_args.local:
            create_info['server'] = 'local'

            # make sure we do not limit the resources to something that is not available on the local machine
            warning = []
            cpu = cpuinfo.get_cpu_info()
            mem = psutil.virtual_memory().total
            gpu = 0
            try:
                gpu = len(get_ordered_devices())
            except CudaNotImplementedException:
                pass

            if not create_info['config']['image'] and not all(
                [x == 0 for x in six.itervalues(resources)]):
                self.logger.warning(
                    "! No Docker virtualization since no `image` defined, resources limitation ignored."
                )

            if create_info['config']['image'] and resources['gpu'] > 0:
                if not (sys.platform == "linux" or sys.platform == "linux2"):
                    self.logger.warning(
                        "! Your operating system does not support GPU allocation for "
                        "Docker virtualization. "
                        "NVIDIA-Docker2 is only supported on Linux.")

            local_max_resources = {
                'cpu': cpu['count'],
                'memory': ceil(mem / 1024 / 1024 / 1024),
                'gpu': gpu
            }

            if create_info['config']['image']:
                # read max hardware within Docker
                out = docker_call([
                    'run', 'alpine', 'sh', '-c',
                    'nproc && cat /proc/meminfo | grep MemTotal'
                ])
                cpus, memory = out.decode('utf-8').strip().split('\n')
                local_max_resources['cpu'] = int(cpus)

                memory = memory.replace('MemTotal:', '').replace('kB',
                                                                 '').strip()
                local_max_resources['memory'] = ceil(int(memory) / 1024 / 1024)

            if local_max_resources['cpu'] < resources['cpu']:
                warning.append('CPU cores %d -> %d' %
                               (resources['cpu'], local_max_resources['cpu']))
                resources['cpu'] = local_max_resources['cpu']

            if local_max_resources['memory'] < resources['memory']:
                warning.append(
                    'memory %dGB -> %dGB' %
                    (resources['memory'], local_max_resources['memory']))
                resources['memory'] = local_max_resources['memory']

            if local_max_resources['gpu'] < resources['gpu']:
                warning.append('GPU cards %d -> %d' %
                               (resources['gpu'], local_max_resources['gpu']))
                resources['gpu'] = local_max_resources['gpu']

            if warning:
                self.logger.warning(
                    "! Resources downgrade due to missing hardware: %s." %
                    (', '.join(warning), ))

        if parsed_args.config and not create_info['config']['configPath']:
            create_info['config']['configPath'] = parsed_args.config

        create_info['config']['sourcesAttached'] = True

        creating_git_job = loading_text("- Create job in local Git ... ")
        if aetros.utils.git.get_current_commit_hash():
            create_info['origin_git_source'] = {
                'origin': aetros.utils.git.get_current_remote_url(),
                'author': aetros.utils.git.get_current_commit_author(),
                'message': aetros.utils.git.get_current_commit_message(),
                'branch': aetros.utils.git.get_current_branch(),
                'commit': aetros.utils.git.get_current_commit_hash(),
            }

        job_backend.create(create_info=create_info, server=None)
        creating_git_job("created %s in %s." %
                         (job_backend.job_id[0:9], job_backend.model_name))

        summary = "➤ Summary: Job running "
        if parsed_args.local:
            summary += 'locally'
        else:
            summary += 'on the cluster'

        if create_info['config']['image']:
            summary += ' in Docker using image %s with %d CPU cores, %dGB memory and %d GPUs.' \
                       % (create_info['config']['image'], resources['cpu'], resources['memory'], resources['gpu'])
        else:
            summary += ' on host using all available resources.'

        print(summary)

        # tasks = []
        #
        # if 'tasks' in config:
        #     for name, task_config in six.iteritems(config['tasks']):
        #         replica = 1
        #         if 'replica' in task_config:
        #             replica = int(task_config['replica'])
        #         for index in range(0, replica):
        #             tasks.append(job_backend.create_task(job_id, task_config, name, index))

        if parsed_args.offline:
            if not parsed_args.local:
                self.logger.warning(
                    "Can not create a remote job in offline mode.")
                sys.exit(1)

            self.logger.warning("Execution started offline.")
        else:
            adding_files = loading_text("- Connecting to " +
                                        home_config['host'] + " ... ")
            if job_backend.connect():
                adding_files("connected.")
            else:
                parsed_args.offline = True
                adding_files("failed. Continue in offline mode.")

        if not parsed_args.offline:
            sys.stdout.write("- Uploading job data ... ")
            job_backend.git.push()
            job_backend.client.wait_until_queue_empty(['files'],
                                                      clear_end=False)

            sys.stdout.write(" done.\n")

            link = "%s/model/%s/job/%s" % (
                home_config['url'], job_backend.model_name, job_backend.job_id)
            sys.__stdout__.write(u"➤ Monitor job at %s\n" % (link))

        if parsed_args.local:
            job_backend.start(collect_system=False,
                              offline=parsed_args.offline,
                              push=False)

            if not parsed_args.offline:
                job_backend.git.start_push_sync()

            cpus = create_info['config']['resources']['cpu']
            memory = create_info['config']['resources']['memory']

            if not parsed_args.gpu_device and create_info['config'][
                    'resources']['gpu'] > 0:
                # if requested 2 GPUs and we have 3 GPUs with id [0,1,2], gpus should be [0,1]
                parsed_args.gpu_device = []
                for i in range(0, create_info['config']['resources']['gpu']):
                    parsed_args.gpu_device.append(i)

            start_command(self.logger,
                          job_backend,
                          env,
                          parsed_args.volume,
                          cpus=cpus,
                          memory=memory,
                          gpu_devices=parsed_args.gpu_device,
                          offline=parsed_args.offline)
예제 #5
0
class KerasIntegration():
    """Hooks a Keras model into AETROS Trainer.

    Monkey-patches ``model.fit`` and ``model.fit_generator`` so every
    training run creates an AETROS job, streams progress through a
    ``KerasLogger`` callback, and syncs the weights when training ends.
    Targets the Keras 1.x API (``nb_epoch``, ``inbound_nodes``, etc.).
    """

    def __init__(self,
                 id,
                 model,
                 api_key,
                 insights=False,
                 confusion_matrix=False,
                 insight_sample=None):
        """

        :type id: basestring The actual model name available in AETROS Trainer. Example peter/mnist-cnn
        :type insights: bool
        :type confusion_matrix: bool
        :type insight_sample: basestring|None A path to a sample which is being used for the insights. Default is first sample of data_validation.
        """
        self.confusion_matrix = confusion_matrix
        self.model = model

        # A Sequential model only knows its layer shapes once built.
        if isinstance(model, Sequential) and not model.built:
            raise Exception('Sequential model is not built.')

        self.insight_sample = insight_sample
        self.id = id
        self.insights = insights
        self.model_type = 'custom'

        self.job_backend = JobBackend(api_token=api_key)

        # Keep references to the unpatched methods so the wrappers below can
        # delegate to the real Keras implementation.
        original = {
            'fit': self.model.fit,
            'fit_generator': self.model.fit_generator
        }

        def overwritten_fit(x,
                            y,
                            batch_size=32,
                            nb_epoch=10,
                            verbose=1,
                            callbacks=None,
                            validation_split=0.,
                            validation_data=None,
                            shuffle=True,
                            class_weight=None,
                            sample_weight=None,
                            **kwargs):
            # BUGFIX: `callbacks=[]` was a shared mutable default, so the
            # AETROS callback accumulated across repeated fit() calls. Work
            # on a copy so the caller's list is never mutated either.
            callbacks = list(callbacks) if callbacks else []

            callback = self.setup(x, nb_epoch, batch_size)
            callbacks.append(callback)
            # BUGFIX: forward the caller's `shuffle` flag (was hard-coded
            # to True, silently ignoring shuffle=False).
            original['fit'](x, y, batch_size, nb_epoch, verbose, callbacks,
                            validation_split, validation_data, shuffle,
                            class_weight, sample_weight, **kwargs)

            self.end()

        def overwritten_fit_generator(generator,
                                      samples_per_epoch,
                                      nb_epoch,
                                      verbose=1,
                                      callbacks=None,
                                      validation_data=None,
                                      nb_val_samples=None,
                                      class_weight=None,
                                      max_q_size=10,
                                      nb_worker=1,
                                      pickle_safe=False):
            # Same mutable-default fixes as overwritten_fit above.
            callbacks = list(callbacks) if callbacks else []
            if class_weight is None:
                class_weight = {}

            callback = self.setup(generator, nb_epoch)
            self.trainer.nb_val_samples = nb_val_samples
            self.trainer.data_validation = validation_data
            callbacks.append(callback)

            original['fit_generator'](generator, samples_per_epoch, nb_epoch,
                                      verbose, callbacks, validation_data,
                                      nb_val_samples, class_weight, max_q_size,
                                      nb_worker, pickle_safe)
            self.end()

        self.model.fit = overwritten_fit
        self.model.fit_generator = overwritten_fit_generator

    def setup(self, x=None, nb_epoch=1, batch_size=16):
        """Create and start an AETROS job for this model.

        Registers the model graph with the backend, starts the monitoring
        thread, and returns the ``KerasLogger`` callback that must be
        handed to Keras for the training run.
        """
        graph = self.model_to_graph(self.model)

        from keras.preprocessing.image import Iterator

        # Data generators carry their own batch size; prefer it.
        if isinstance(x, Iterator):
            batch_size = x.batch_size

        settings = {
            'epochs': nb_epoch,
            'batchSize': batch_size,
            'optimizer': type(self.model.optimizer).__name__
            if hasattr(self.model, 'optimizer') else ''
        }

        self.job_backend.ensure_model(self.id,
                                      self.model.to_json(),
                                      settings=settings,
                                      type=self.model_type,
                                      graph=graph)

        job_id = self.job_backend.create(self.id, insights=self.insights)
        self.job_backend.start()

        print(
            "AETROS job '%s' created and started. Open http://%s/trainer/app#/job=%s to monitor the training."
            % (job_id, self.job_backend.host, job_id))

        job = self.job_backend.load_light_job()
        general_logger = GeneralLogger(job, job_backend=self.job_backend)
        self.trainer = Trainer(self.job_backend, general_logger)

        self.monitoringThread = MonitoringThread(self.job_backend,
                                                 self.trainer)
        self.monitoringThread.daemon = True
        self.monitoringThread.start()

        self.trainer.model = self.model
        self.trainer.data_train = {'x': x}

        self.callback = KerasLogger(self.trainer, self.job_backend,
                                    general_logger)
        self.callback.log_epoch = False
        self.callback.model = self.model
        self.callback.confusion_matrix = self.confusion_matrix

        return self.callback

    def publish(self):
        """Register (or update) the model definition and graph with AETROS
        without starting a job."""
        graph = self.model_to_graph(self.model)
        self.job_backend.ensure_model(self.id,
                                      self.model.to_json(),
                                      type=self.model_type,
                                      graph=graph)

    def start(self, nb_epoch=1, nb_sample=1, title="TRAINING"):
        """
        Starts $title
        :return:
        """

        # BUGFIX: pass nb_epoch by keyword. The old positional call
        # `self.setup(nb_epoch)` bound it to setup()'s first parameter `x`
        # and left the epoch count at its default of 1.
        self.setup(nb_epoch=nb_epoch)
        self.callback.params['nb_epoch'] = nb_epoch
        self.callback.params['nb_sample'] = nb_sample
        self.callback.on_train_begin()

        return self.callback

    def batch_begin(self, batch, size):
        # NOTE(review): this fires on_batch_end, not on_batch_begin — looks
        # like a copy/paste slip, but confirm KerasLogger implements
        # on_batch_begin before changing it.
        logs = {
            'batch': batch,
            'size': size,
        }
        self.callback.on_batch_end(batch, logs)

    def batch_end(self, batch, size, loss=0, acc=0):
        """Report a finished batch with its loss/accuracy to the callback."""
        logs = {
            'loss': loss,
            'acc': acc,
            'batch': batch,
            'size': size,
        }
        self.callback.on_batch_end(batch, logs)

    def epoch_end(self, epoch, loss=0, val_loss=0, acc=0, val_acc=0):
        """

        :type epoch: integer starting with 0
        """
        logs = {
            'loss': loss,
            'val_loss': val_loss,
            'acc': acc,
            'val_acc': val_acc,
            'epoch': epoch
        }
        self.callback.on_epoch_end(epoch, logs)

    def end(self):
        """Stop monitoring, sync the final weights and mark the job DONE."""
        self.monitoringThread.stop()
        self.job_backend.sync_weights()
        self.job_backend.set_status('DONE')

    def model_to_graph(self, model):
        """Convert a Keras model into the nodes/links graph structure the
        AETROS Trainer UI expects."""
        graph = {'nodes': [], 'links': [], 'groups': []}

        # Bookkeeping while walking the layer graph (renamed from `map`,
        # which shadowed the builtin).
        state = {'idx': {}, 'flatten': [], 'group_pointer': -1}

        def layer_to_dict(layer):
            # Collect per-layer-type display info for the UI.
            info = {}

            if isinstance(layer, Dropout):
                info['dropout'] = layer.p

            if isinstance(layer, Dense):
                info['neurons'] = layer.output_dim
                # NOTE: the 'activaton' key spelling is part of the payload
                # the server consumes — do not "fix" it locally.
                info['activaton'] = layer.activation.__name__

            if isinstance(layer, Convolution2D):
                info['receptiveField'] = [layer.nb_col, layer.nb_row]
                info['features'] = layer.nb_filter

            if isinstance(layer, MaxPooling2D):
                info['poolingArea'] = [layer.pool_size[0], layer.pool_size[1]]

            if isinstance(layer, Embedding):
                info['inputDim'] = layer.input_dim
                info['outputDim'] = layer.output_dim
                info['dropout'] = layer.dropout

            if isinstance(layer, Activation):
                info['activaton'] = layer.activation.__name__

            if isinstance(layer, Merge):
                info['mode'] = layer.mode

            if isinstance(layer, RepeatVector):
                info['n'] = layer.n

            if isinstance(layer, InputLayer):
                info['inputShape'] = layer.input_shape

            info['outputShape'] = layer.output_shape

            return {
                'name': layer.name,
                'class': type(layer).__name__,
                'width': 60,
                'height': 40,
                'info': info
            }

        def add_layer(layer):
            graph['nodes'].append(layer_to_dict(layer))
            state['flatten'].append(layer)
            state['idx'][layer.name] = len(graph['nodes']) - 1

        def get_idx(layer):
            return state['idx'][layer.name]

        def extract_layers(layers):
            # Depth-first walk; container layers (with a `layers` list) are
            # expanded, plain layers are followed through inbound nodes.
            for layer in layers:
                if layer not in state['flatten']:
                    add_layer(layer)
                    if hasattr(layer, 'layers') and isinstance(
                            layer.layers, list):
                        extract_layers(layer.layers)
                    else:
                        for inbound_node in layer.inbound_nodes:
                            extract_layers(inbound_node.inbound_layers)

        extract_layers(model.layers)

        # Build edges from each layer's inbound connections.
        for layer in state['flatten']:

            for inbound_node in layer.inbound_nodes:
                for inbound_layer in inbound_node.inbound_layers:
                    graph['links'].append({
                        'source': get_idx(inbound_layer),
                        'target': get_idx(layer),
                    })

            # A container layer is linked from its own last sub-layer.
            if hasattr(layer, 'layers') and isinstance(layer.layers, list):
                graph['links'].append({
                    'source': get_idx(layer.layers[-1]),
                    'target': get_idx(layer),
                })

        return graph

    def model_to_layers(self, model):
        """Convert a Keras model into the flat per-layer description list
        used by the AETROS model editor."""
        layers = []

        # Map Keras class names onto AETROS layer type identifiers;
        # unknown classes pass through under their own name.
        layer_type_map = {
            'InputLayer': 'fc',
            'Dense': 'fc',
            'Convolution2D': 'conv',
            'MaxPooling2D': 'pool',
            'AveragePooling2D': 'pool_average',
            'ZeroPadding2D': 'zero_padding',
            'UpSampling2D': 'upsampling',
            'Flatten': 'flatten',
            'Merge': 'merge',
        }

        def get_input_layer(layer):
            # Skip over Activation/Dropout wrappers to the real source layer.
            if isinstance(layer, Activation) or isinstance(layer, Dropout):
                return get_input_layer(
                    layer.inbound_nodes[0].inbound_layers[0])

            return layer

        for keras_layer in model.layers:
            name = type(keras_layer).__name__

            if name in layer_type_map:
                typeStr = layer_type_map[name]
            else:
                typeStr = name

            layer = {
                'id': keras_layer.name,
                'name': keras_layer.name,
                'type': typeStr,
                'connectedTo': [],
                'receptiveField': {
                    'width': 0,
                    'height': 0
                },
                'poolingArea': {
                    'width': 0,
                    'height': 0
                },
                'padding': [],
                'features': 0,
            }

            if isinstance(keras_layer, Convolution2D):
                layer['receptiveField']['width'] = keras_layer.nb_col
                layer['receptiveField']['height'] = keras_layer.nb_row
                layer['features'] = keras_layer.nb_filter
            if isinstance(keras_layer, MaxPooling2D):
                layer['poolingArea']['width'] = keras_layer.pool_size[0]
                layer['poolingArea']['height'] = keras_layer.pool_size[1]

            if isinstance(keras_layer, InputLayer):
                if len(keras_layer.input_shape) == 4:

                    # channels-first assumed: shape[1] == 1 means grayscale
                    if keras_layer.input_shape[1] == 1:
                        layer['inputType'] = 'image'
                        layer['width'] = keras_layer.input_shape[2]
                        layer['height'] = keras_layer.input_shape[3]

                    elif keras_layer.input_shape[1] == 3:
                        layer['inputType'] = 'image_rgb'
                        layer['width'] = keras_layer.input_shape[2]
                        layer['height'] = keras_layer.input_shape[3]

                elif len(keras_layer.input_shape) == 2:
                    layer['inputType'] = 'list'
                    layer['width'] = keras_layer.input_shape[1]
                    layer['height'] = 1
                else:
                    layer['inputType'] = 'custom'
                    layer['shape'] = keras_layer.input_shape

            if isinstance(keras_layer, Dense):
                layer['weight'] = keras_layer.output_dim

            # Dropout/Activation are folded into the previous layer's entry.
            # NOTE(review): this assumes they never appear first in
            # model.layers, otherwise layers[-1] raises IndexError.
            if isinstance(keras_layer, Dropout):
                layers[-1][0]['dropout'] = keras_layer.p

                continue

            if isinstance(keras_layer, Activation):
                activation_function = str(keras_layer.activation)
                layers[-1][0][
                    'activationFunction'] = activation_function.split(' ')[1]

                continue

            for inbound_node in keras_layer.inbound_nodes:
                for inbound_layer in inbound_node.inbound_layers:
                    inbound_layer = get_input_layer(inbound_layer)
                    layer['connectedTo'].append(inbound_layer.name)

            layers.append([layer])

        return layers
예제 #6
0
    def main(self, args):
        """Validate and upload a job's weights file to AETROS.

        Loads the job's model, compiles it, loads the weights into it to
        prove they match the architecture, then uploads them as
        ``best.hdf5`` together with an optional accuracy value.

        :param args: command-line argument list (without the program name)
        """
        from aetros import keras_model_utils

        import aetros.const
        from aetros.backend import JobBackend
        from aetros.logger import GeneralLogger
        from aetros.Trainer import Trainer

        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' upload-weights')
        parser.add_argument('id', nargs='?', help='model name or job id')
        parser.add_argument(
            '--secure-key',
            help="Secure key. Alternatively use API_KEY environment variable.")
        parser.add_argument(
            '--weights',
            help=
            "Weights path. Per default we try to find it in the ./weights/ folder."
        )
        parser.add_argument(
            '--accuracy',
            help=
            "If you specified model name, you should also specify the accuracy this weights got."
        )
        # NOTE(review): --latest is parsed but never read in this command.
        parser.add_argument(
            '--latest',
            action="store_true",
            help="Instead of best epoch we upload latest weights.")

        parsed_args = parser.parse_args(args)

        # BUGFIX: `id` is optional (nargs='?'), so it may be None, and the
        # membership tests below would raise TypeError. Show usage instead,
        # mirroring the other sub-commands.
        if not parsed_args.id:
            parser.print_help()
            return

        job_backend = JobBackend(api_token=parsed_args.secure_key)

        # "user/model" (no @) names a model -> create a fresh job for it;
        # anything else is treated as an existing job id.
        if '/' in parsed_args.id and '@' not in parsed_args.id:
            job_backend.create(parsed_args.id)

        job_backend.load(parsed_args.id)

        if job_backend.job is None:
            raise Exception("Job not found")

        job_model = job_backend.get_job_model()

        # Default to the best epoch's weights unless a path was given.
        weights_path = job_model.get_weights_filepath_best()

        if parsed_args.weights:
            weights_path = parsed_args.weights

        print(("Validate weights in %s ..." % (weights_path, )))

        keras_model_utils.job_prepare(job_model)

        general_logger = GeneralLogger()
        trainer = Trainer(job_backend, general_logger)

        job_model.set_input_shape(trainer)

        print("Loading model ...")
        model_provider = job_model.get_model_provider()
        model = model_provider.get_model(trainer)

        loss = model_provider.get_loss(trainer)
        optimizer = model_provider.get_optimizer(trainer)

        print("Compiling ...")
        model_provider.compile(trainer, model, loss, optimizer)

        # Loading the weights into the compiled model validates that they
        # match the architecture before anything is uploaded.
        print(("Validate weights %s ..." % (weights_path, )))
        job_model.load_weights(model, weights_path)
        print("Validated.")

        print("Uploading weights to %s of %s ..." %
              (job_backend.job_id, job_backend.model_id))

        job_backend.upload_weights(
            'best.hdf5', weights_path,
            float(parsed_args.accuracy) if parsed_args.accuracy else None)

        print("Done")