def testHyperParameters12(self):
        """Check that overrides replace defaults and are coerced to the declared type.

        Two parameters are declared ('num' of type number, 'string' of type
        string). Each call overrides one of them; the other is expected to keep
        its ``defaultValue``.
        """
        items = [
            {
                'name': 'num',
                'type': 'number',
                'defaultValue': 0.5
            },
            {
                'name': 'string',
                'type': 'string',
                'defaultValue': 'hi'
            }
        ]

        # a numeric string given for a 'number' parameter is expected back as a float
        container = extract_parameters(items, {'num': '0.11'})
        self.assertEqual({'num': 0.11, 'string': 'hi'}, container)

        # a float override is expected to pass through unchanged
        container = extract_parameters(items, {'num': 0.111})
        self.assertEqual({'num': 0.111, 'string': 'hi'}, container)

        # an int given for a 'string' parameter is expected back as a str
        container = extract_parameters(items, {'string': 2})
        self.assertEqual({'num': 0.5, 'string': '2'}, container)

        container = extract_parameters(items, {'string': 'asdasd'})
        self.assertEqual({'num': 0.5, 'string': 'asdasd'}, container)
Example #2
0
    def testHyperParameters11(self):
        """Without overrides, every parameter is expected to resolve to its ``defaultValue``."""
        items = [{
            'name': 'num',
            'type': 'number',
            'defaultValue': 0.5
        }, {
            'name': 'string',
            'type': 'string',
            'defaultValue': 'hi'
        }]

        # no second argument: nothing is overridden
        container = extract_parameters(items)
        self.assertEqual({'num': 0.5, 'string': 'hi'}, container)
Example #3
0
    def testHyperParameters12(self):
        """Check that overrides replace defaults and are coerced to the declared type.

        Each call overrides one of the two declared parameters; the other is
        expected to keep its ``defaultValue``.
        """
        items = [{
            'name': 'num',
            'type': 'number',
            'defaultValue': 0.5
        }, {
            'name': 'string',
            'type': 'string',
            'defaultValue': 'hi'
        }]

        # a numeric string given for a 'number' parameter is expected back as a float
        container = extract_parameters(items, {'num': '0.11'})
        self.assertEqual({'num': 0.11, 'string': 'hi'}, container)

        # a float override is expected to pass through unchanged
        container = extract_parameters(items, {'num': 0.111})
        self.assertEqual({'num': 0.111, 'string': 'hi'}, container)

        # an int given for a 'string' parameter is expected back as a str
        container = extract_parameters(items, {'string': 2})
        self.assertEqual({'num': 0.5, 'string': '2'}, container)

        container = extract_parameters(items, {'string': 'asdasd'})
        self.assertEqual({'num': 0.5, 'string': 'asdasd'}, container)
    def testHyperParameters(self):
        """A 'choice_string' without default or override is expected to yield its first child value."""
        items = [
            {
                'name': 'choice',
                'type': 'choice_string',
                'children': [
                    {'value': 'a'},
                    {'value': 'b'},
                    {'value': 'c'},
                ]
            }
        ]

        container = extract_parameters(items)
        self.assertEqual({'choice': 'a'}, container)
    def testHyperParametersNumber(self):
        """A 'choice_number' without default or override is expected to yield its first child value."""
        items = [
            {
                'name': 'choice',
                'type': 'choice_number',
                'children': [
                    {'value': 2},
                    {'value': 3},
                    {'value': 4},
                ]
            }
        ]

        # an empty list is passed as the overrides argument; no value is overridden
        container = extract_parameters(items, [])
        self.assertEqual({'choice': 2}, container)
    def testHyperParameters11(self):
        """Without overrides, every parameter is expected to resolve to its ``defaultValue``."""
        items = [
            {
                'name': 'num',
                'type': 'number',
                'defaultValue': 0.5
            },
            {
                'name': 'string',
                'type': 'string',
                'defaultValue': 'hi'
            }
        ]

        # no second argument: nothing is overridden
        container = extract_parameters(items)
        self.assertEqual({'num': 0.5, 'string': 'hi'}, container)
    def testHyperParametersNumber3(self):
        """An explicit override is expected to win over the ``defaultValue`` of a choice parameter."""
        items = [
            {
                'name': 'choice',
                'type': 'choice_string',
                'defaultValue': 2,
                'children': [
                    {'value': 2},
                    {'value': 3},
                    {'value': 4},
                ]
            }
        ]

        # default is 2, override selects 3; the int is expected back unchanged
        container = extract_parameters(items, {'choice': 3})
        self.assertEqual({'choice': 3}, container)
Example #8
0
    def testHyperParameters(self):
        """A 'choice_string' without default or override is expected to yield its first child value."""
        items = [{
            'name': 'choice',
            'type': 'choice_string',
            'children': [
                {
                    'value': 'a'
                },
                {
                    'value': 'b'
                },
                {
                    'value': 'c'
                },
            ]
        }]

        container = extract_parameters(items)
        self.assertEqual({'choice': 'a'}, container)
Example #9
0
    def testHyperParametersNumber(self):
        """A 'choice_number' without default or override is expected to yield its first child value."""
        items = [{
            'name': 'choice',
            'type': 'choice_number',
            'children': [
                {
                    'value': 2
                },
                {
                    'value': 3
                },
                {
                    'value': 4
                },
            ]
        }]

        # an empty list is passed as the overrides argument; no value is overridden
        container = extract_parameters(items, [])
        self.assertEqual({'choice': 2}, container)
Example #10
0
    def testHyperParametersNumber3(self):
        """An explicit override is expected to win over the ``defaultValue`` of a choice parameter."""
        items = [{
            'name': 'choice',
            'type': 'choice_string',
            'defaultValue': 2,
            'children': [
                {
                    'value': 2
                },
                {
                    'value': 3
                },
                {
                    'value': 4
                },
            ]
        }]

        # default is 2, override selects 3; the int is expected back unchanged
        container = extract_parameters(items, {'choice': 3})
        self.assertEqual({'choice': 3}, container)
Example #11
0
    def main(self, args):
        """Entry point of the ``run`` sub-command: create a job and optionally start it locally.

        Steps, in order:
          1. parse ``args`` and load the project configuration and the home configuration,
          2. validate option combinations (exits via ``sys.exit`` on fatal errors),
          3. add the job files to the index and build ``create_info`` from the
             configuration merged with the command line overrides,
          4. with ``--local``: cap the requested resources to what the machine
             (or Docker, when an image is used) reports,
          5. create the job, connect/push unless offline, and with ``--local``
             start the command on this machine.

        :param args: list of command line arguments handed to ``argparse``.
        :raises Exception: when a ``--param`` value does not contain ``=``.
        """
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                         prog=aetros.const.__prog__ + ' run')
        parser.add_argument('command', nargs='?', help="The command to run. Default read in configuration file")
        parser.add_argument('-i', '--image', help="Which Docker image to use for the command. Default read in configuration file. If not specified, command is executed on the host.")
        parser.add_argument('--no-image', action='store_true', help="Forces not to use docker, even when image is defined in the configuration file.")

        parser.add_argument('-s', '--server', action='append', help="Limits the server pool to this server. Default not limitation or read in configuration file. Multiple --server allowed.")
        parser.add_argument('-m', '--model', help="Under which model this job should be listed. Default read in configuration file")
        parser.add_argument('-l', '--local', action='store_true', help="Start the job immediately on the current machine.")
        parser.add_argument('-c', '--config', help="Default aetros.yml in current working directory.")
        parser.add_argument('--priority', help="Increases or decreases priority. Default is 0.")

        parser.add_argument('--cpu', help="How many CPU cores should be assigned to job. Docker only.")
        parser.add_argument('--memory', help="How much memory should be assigned to job. Docker only.")
        parser.add_argument('--gpu', help="How many GPU cards should be assigned to job. Docker only.")
        parser.add_argument('--gpu_memory', help="Memory requirement for the GPU. Docker only.")

        parser.add_argument('--offline', '-o', action='store_true', help="Whether the execution should happen offline.")

        parser.add_argument('--rebuild-image', action='store_true', help="Makes sure the Docker image is re-built without cache.")

        parser.add_argument('--max-time', help="Limit execution time in seconds. Sends SIGINT to the process group when reached.")
        parser.add_argument('--max-epochs', help="Limit execution epochs. Sends SIGINT to the process group when reached.")

        parser.add_argument('--gpu-device', action='append', help="Which device id should be mapped into the NVIDIA docker container. Only when --local")

        parser.add_argument('--volume', '-v', action='append', help="Volume into docker. Only when --local")
        parser.add_argument('-e', action='append', help="Sets additional environment variables. '-e name=value' to set value, or '-e name' to read from current env")

        parser.add_argument('-p', '--param', action='append', help="Sets a hyperparameter, example '--param name=value'. Multiple --param allowed.")

        parsed_args = parser.parse_args(args)

        if parsed_args.config and not os.path.exists(parsed_args.config):
            self.logger.error("fatal: file %s does not exist." % (parsed_args.config,))
            sys.exit(2)

        config = find_config(parsed_args.config)
        home_config = read_home_config()

        # the command line --model wins over the model named in the configuration
        if config['model'] and not parsed_args.model:
            parsed_args.model = config['model']

        if not parsed_args.model:
            print("fatal: no model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
            sys.exit(2)

        if not parsed_args.local and parsed_args.volume:
            print("fatal: can not use volume with jobs on the cluster. Use datasets instead.")
            sys.exit(1)

        if parsed_args.local and parsed_args.priority:
            print("fatal: the priority can only be set for jobs in the cluster.")
            sys.exit(1)

        if config['image']:
            ensure_docker_installed(self.logger)

        # collect environment variables given via -e
        env = {}
        if parsed_args.e:
            for item in parsed_args.e:
                if '=' in item:
                    # split at the first '=' only, so the value may itself contain '='
                    k, v = item.split('=', 1)
                else:
                    # '-e name' without value: read it from the current environment
                    k = item
                    v = os.getenv(k)
                env[k] = v

        if ('command' not in config or not config['command']) and not parsed_args.command:
            self.logger.error('No command given. Define the command in aetros.yml or use command argument.')
            sys.exit(1)

        job_backend = JobBackend(parsed_args.model, self.logger)

        ignore = []
        if 'ignore' in config:
            ignore = config['ignore']
        # provisional job dict carrying only the ignore list, set before add_files() is called
        job_backend.job = {'config': {'ignore': ignore}}

        adding_files = loading_text("- Adding job files to index ... ")
        files_added, size_added = job_backend.add_files(config['root'], report=False)
        adding_files("done with %d file%s added (%s)."
                     % (files_added, 's' if files_added != 1 else '', human_size(size_added, 2)))

        # NOTE: create_info['config'] is the very same object as `config`, so all
        # assignments below also modify `config`.
        create_info = {
            'type': 'custom',
            'config': config
        }

        incoming_hyperparameter = {}
        if parsed_args.param:
            for param in parsed_args.param:
                if '=' not in param:
                    raise Exception('--param ' + param + ' does not contain a `=`. Please use "--param name=value"')

                # split at the first '=' only, so the value may itself contain '='
                name, value = param.split('=', 1)
                incoming_hyperparameter[name] = value

        # first transform simple format in the full definition with parameter types
        # (string, number, group, choice_group, etc)
        full_hyperparameters = lose_parameters_to_full(config['parameters'])

        # now extract hyperparameters from full definition, and overwrite stuff using
        # incoming_hyperparameter if available
        hyperparameter = extract_parameters(full_hyperparameters, incoming_hyperparameter)

        create_info['config']['parameters'] = hyperparameter

        if parsed_args.rebuild_image:
            create_info['config']['rebuild_image'] = True

        if parsed_args.max_epochs:
            create_info['config']['maxEpochs'] = int(parsed_args.max_epochs)

        create_info['config']['priority'] = 0
        if parsed_args.priority:
            create_info['config']['priority'] = float(parsed_args.priority)

        if parsed_args.max_time:
            create_info['config']['maxTime'] = float(parsed_args.max_time)

        if parsed_args.command:
            create_info['config']['command'] = parsed_args.command

        if parsed_args.image:
            # reset install options, since we can't make sure if the base image still fits
            if 'image' in config and config['image'] and config['image'] != parsed_args.image:
                create_info['config']['install'] = None

            # reset dockerfile, since we specified manually an image
            create_info['config']['dockerfile'] = None
            create_info['config']['image'] = parsed_args.image

        if parsed_args.no_image:
            create_info['config']['image'] = None

        if parsed_args.server:
            create_info['config']['servers'] = list(parsed_args.server)

        create_info['config']['resources'] = create_info['config'].get('resources', {})
        resources = create_info['config']['resources']

        # precedence per resource: command line, then configuration, then default
        # (1 cpu / 1 memory when an image is used, otherwise 0)
        default_cpu_and_memory = 1 if create_info['config']['image'] else 0
        resources['cpu'] = int(parsed_args.cpu or resources.get('cpu', default_cpu_and_memory))
        resources['memory'] = int(parsed_args.memory or resources.get('memory', default_cpu_and_memory))
        resources['gpu'] = int(parsed_args.gpu or resources.get('gpu', 0))
        resources['gpu_memory'] = int(parsed_args.gpu_memory or resources.get('gpu_memory', 0))

        if parsed_args.local:
            create_info['server'] = 'local'

            # make sure we do not limit the resources to something that is not available on the local machine
            warning = []
            cpu = cpuinfo.get_cpu_info()
            mem = psutil.virtual_memory().total
            gpu = 0
            try:
                gpu = len(get_ordered_devices())
            except CudaNotImplementedException:
                # deliberate best effort: the GPU count stays 0 in this case
                pass

            if not create_info['config']['image'] and not all([x == 0 for x in six.itervalues(resources)]):
                self.logger.warning("! No Docker virtualization since no `image` defined, resources limitation ignored.")

            if create_info['config']['image'] and resources['gpu'] > 0:
                if not (sys.platform == "linux" or sys.platform == "linux2"):
                    self.logger.warning("! Your operating system does not support GPU allocation for "
                                        "Docker virtualization. "
                                        "NVIDIA-Docker2 is only supported on Linux.")

            # total memory is in bytes; convert to GB, rounded up
            local_max_resources = {'cpu': cpu['count'], 'memory': ceil(mem / 1024 / 1024 / 1024), 'gpu': gpu}

            if create_info['config']['image']:
                # read max hardware within Docker
                out = docker_call(['run', 'alpine', 'sh', '-c', 'nproc && cat /proc/meminfo | grep MemTotal'])
                # expects exactly two output lines: the nproc value and the MemTotal line
                cpus, memory = out.decode('utf-8').strip().split('\n')
                local_max_resources['cpu'] = int(cpus)

                # MemTotal is in kB; convert to GB, rounded up
                memory = memory.replace('MemTotal:', '').replace('kB', '').strip()
                local_max_resources['memory'] = ceil(int(memory) / 1024 / 1024)

            if local_max_resources['cpu'] < resources['cpu']:
                warning.append('CPU cores %d -> %d' % (resources['cpu'], local_max_resources['cpu']))
                resources['cpu'] = local_max_resources['cpu']

            if local_max_resources['memory'] < resources['memory']:
                warning.append('memory %dGB -> %dGB' % (resources['memory'], local_max_resources['memory']))
                resources['memory'] = local_max_resources['memory']

            if local_max_resources['gpu'] < resources['gpu']:
                warning.append('GPU cards %d -> %d' % (resources['gpu'], local_max_resources['gpu']))
                resources['gpu'] = local_max_resources['gpu']

            if warning:
                self.logger.warning("! Resources downgrade due to missing hardware: %s." % (', '.join(warning),))

        if parsed_args.config and not create_info['config']['configPath']:
            create_info['config']['configPath'] = parsed_args.config

        create_info['config']['sourcesAttached'] = True

        creating_git_job = loading_text("- Create job in local Git ... ")
        if aetros.utils.git.get_current_commit_hash():
            create_info['origin_git_source'] = {
                'origin': aetros.utils.git.get_current_remote_url(),
                'author': aetros.utils.git.get_current_commit_author(),
                'message': aetros.utils.git.get_current_commit_message(),
                'branch': aetros.utils.git.get_current_branch(),
                'commit': aetros.utils.git.get_current_commit_hash(),
            }

        job_backend.create(create_info=create_info, server=None)
        creating_git_job("created %s in %s." % (job_backend.job_id[0:9], job_backend.model_name))

        summary = "➤ Summary: Job running "
        if parsed_args.local:
            summary += 'locally'
        else:
            summary += 'on the cluster'

        if create_info['config']['image']:
            summary += ' in Docker using image %s with %d CPU cores, %dGB memory and %d GPUs.' \
                       % (create_info['config']['image'], resources['cpu'], resources['memory'], resources['gpu'])
        else:
            summary += ' on host using all available resources.'

        print(summary)

        # tasks = []
        #
        # if 'tasks' in config:
        #     for name, task_config in six.iteritems(config['tasks']):
        #         replica = 1
        #         if 'replica' in task_config:
        #             replica = int(task_config['replica'])
        #         for index in range(0, replica):
        #             tasks.append(job_backend.create_task(job_id, task_config, name, index))

        if parsed_args.offline:
            if not parsed_args.local:
                self.logger.warning("Can not create a remote job in offline mode.")
                sys.exit(1)

            self.logger.warning("Execution started offline.")
        else:
            adding_files = loading_text("- Connecting to "+home_config['host']+" ... ")
            if job_backend.connect():
                adding_files("connected.")
            else:
                # connection failed: fall back to offline mode for the rest of this run
                parsed_args.offline = True
                adding_files("failed. Continue in offline mode.")

        if not parsed_args.offline:
            sys.stdout.write("- Uploading job data ... ")
            job_backend.git.push()
            job_backend.client.wait_until_queue_empty(['files'], clear_end=False)

            sys.stdout.write(" done.\n")

            link = "%smodel/%s/job/%s" % (home_config['url'], job_backend.model_name, job_backend.job_id)
            sys.__stdout__.write(u"➤ Monitor job at %s\n" % (link))

        if parsed_args.local:
            job_backend.start(collect_system=False, offline=parsed_args.offline, push=False)

            if not parsed_args.offline:
                job_backend.git.start_push_sync()

            cpus = create_info['config']['resources']['cpu']
            memory = create_info['config']['resources']['memory']

            if not parsed_args.gpu_device and create_info['config']['resources']['gpu'] > 0:
                # if requested 2 GPUs and we have 3 GPUs with id [0,1,2], gpus should be [0,1]
                parsed_args.gpu_device = list(range(create_info['config']['resources']['gpu']))

            start_command(self.logger, job_backend, env, parsed_args.volume, cpus=cpus, memory=memory, gpu_devices=parsed_args.gpu_device,
                offline=parsed_args.offline)
Example #12
0
    def main(self, args):
        """Entry point of the ``run`` sub-command: create a job and start or push it.

        Parses ``args``, reads the configuration (``--config`` or ``aetros.yml``),
        adds the job files, merges command line overrides into the job
        configuration and creates the job. With ``--local`` the job is started
        on this machine; otherwise it is pushed and a monitoring URL is printed.

        :param args: list of command line arguments handed to ``argparse``.
        :raises Exception: when a ``--param`` value does not contain ``=``.
        """
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                         prog=aetros.const.__prog__ + ' run')
        parser.add_argument('command', nargs='?', help="The command to run. Default read in aetros.yml")
        parser.add_argument('-i', '--image', help="Which Docker image to use for the command. Default read in aetros.yml. If not specified, command is executed on the host.")
        parser.add_argument('-s', '--server', action='append', help="Limits the server pool to this server. Default not limitation or read in aetros.yml. Multiple --server allowed.")
        parser.add_argument('-m', '--model', help="Under which model this job should be listed. Default read in aetros.yml")
        parser.add_argument('-l', '--local', action='store_true', help="Start the job immediately on the current machine.")
        parser.add_argument('-c', '--config', help="Default aetros.yml in current working directory.")
        parser.add_argument('--priority', help="Increases or decreases priority. Default is 0.")

        parser.add_argument('--cpu', help="How many CPU cores should be assigned to job. Docker only.")
        parser.add_argument('--memory', help="How much memory should be assigned to job. Docker only.")
        parser.add_argument('--gpu', help="How many GPU cards should be assigned to job. Docker only.")
        parser.add_argument('--gpu_memory', help="Memory requirement for the GPU. Docker only.")

        parser.add_argument('--max-time', help="Limit execution time in seconds. Sends SIGINT to the process group when reached.")
        parser.add_argument('--max-epochs', help="Limit execution epochs. Sends SIGINT to the process group when reached.")

        parser.add_argument('--gpu-device', action='append', help="Which device id should be mapped into the NVIDIA docker container.")

        parser.add_argument('--volume', '-v', action='append', help="Volume into docker")
        parser.add_argument('-e', action='append', help="Sets additional environment variables. '-e name=value' to set value, or '-e name' to read from current env")

        parser.add_argument('-p', '--param', action='append', help="Sets a hyperparameter, example '--param name=value'. Multiple --param allowed.")

        parsed_args = parser.parse_args(args)

        config = read_config(parsed_args.config or 'aetros.yml')

        # collect environment variables given via -e
        env = {}
        if parsed_args.e:
            for item in parsed_args.e:
                if '=' in item:
                    # split at the first '=' only, so the value may itself contain '='
                    k, v = item.split('=', 1)
                else:
                    # '-e name' without value: read it from the current environment
                    k = item
                    v = os.getenv(k)
                env[k] = v

        if 'command' not in config and not parsed_args.command:
            self.logger.error('No "command" given in aetros.yml or as argument.')
            sys.exit(1)

        job = JobBackend(parsed_args.model, self.logger, parsed_args.config or 'aetros.yml')
        ignore = []
        if 'ignore' in config:
            ignore = config['ignore']
        # provisional job dict carrying only the ignore list, set before add_files() is called
        job.job = {'config': {'ignore': ignore}}

        files_added, size_added = job.add_files()

        print("%d files added (%s)" % (files_added, human_size(size_added, 2)))

        # NOTE: create_info['config'] is the very same object as `config`, so all
        # assignments below also modify `config`.
        create_info = {
            'type': 'custom',
            'config': config
        }

        incoming_hyperparameter = {}
        if parsed_args.param:
            for param in parsed_args.param:
                if '=' not in param:
                    raise Exception('--param ' + param + ' does not contain a `=`. Please use "--param name=value"')

                # split at the first '=' only, so the value may itself contain '='
                name, value = param.split('=', 1)
                incoming_hyperparameter[name] = value

        # first transform simple format in the full definition with parameter types
        # (string, number, group, choice_group, etc)
        full_hyperparameters = lose_parameters_to_full(config['parameters'])

        # now extract hyperparameters from full definition, and overwrite stuff using
        # incoming_hyperparameter if available
        hyperparameter = extract_parameters(full_hyperparameters, incoming_hyperparameter)

        create_info['config']['parameters'] = hyperparameter

        if parsed_args.max_epochs:
            create_info['config']['maxEpochs'] = int(parsed_args.max_epochs)

        create_info['config']['priority'] = 0
        if parsed_args.priority:
            create_info['config']['priority'] = float(parsed_args.priority)

        if parsed_args.max_time:
            create_info['config']['maxTime'] = float(parsed_args.max_time)

        if parsed_args.command:
            create_info['config']['command'] = parsed_args.command

        if parsed_args.image:

            # reset install options, since we can't make sure if the base image still fits
            if 'image' in config and config['image'] and config['image'] != parsed_args.image:
                create_info['config']['install'] = None

            # reset dockerfile, since we specified manually an image
            create_info['config']['dockerfile'] = None
            create_info['config']['image'] = parsed_args.image

        if parsed_args.server:
            create_info['config']['servers'] = list(parsed_args.server)

        if 'resources' not in create_info['config']:
            create_info['config']['resources'] = {}

        # command line resource options overwrite the values from the configuration
        if parsed_args.cpu:
            create_info['config']['resources']['cpu'] = float(parsed_args.cpu)
        if parsed_args.memory:
            create_info['config']['resources']['memory'] = float(parsed_args.memory)
        if parsed_args.gpu is not None:
            create_info['config']['resources']['gpu'] = float(parsed_args.gpu)
        if parsed_args.gpu_memory:
            create_info['config']['resources']['gpu_memory'] = float(parsed_args.gpu_memory)

        if parsed_args.local:
            # usually, the aetros server would assign resources at job root level from the assigned server
            # but since it's started locally, we just use the requested one. User should know what they do.
            # start.py will use 'config' stuff anyone for docker limitation, so we should make sure it is
            # being displayed.

            # only when an image is actually set; the original tested the config dict
            # itself, which is always truthy once it contains the 'image' key
            if 'image' in create_info['config'] and create_info['config']['image']:
                resources = create_info['config']['resources']
                create_info['resources_assigned'] = {'cpus': 1, 'memory': 1, 'gpus': []}

                if 'gpu' in resources and resources['gpu'] > 0:
                    # --gpu is stored as float; list repetition requires an int
                    create_info['resources_assigned']['gpus'] = [1] * int(resources['gpu'])
                if 'cpu' in resources:
                    create_info['resources_assigned']['cpus'] = resources['cpu']
                if 'memory' in resources:
                    create_info['resources_assigned']['memory'] = resources['memory']
            else:
                # since this runs on the host, extract machine hardware and put int resources_assigned
                # so we see it at the job.
                pass

        if parsed_args.local:
            create_info['server'] = 'local'

        create_info['config']['sourcesAttached'] = True

        if aetros.utils.git.get_current_commit_hash():
            create_info['origin_git_source'] = {
                'origin': aetros.utils.git.get_current_remote_url(),
                'author': aetros.utils.git.get_current_commit_author(),
                'message': aetros.utils.git.get_current_commit_message(),
                'branch': aetros.utils.git.get_current_branch(),
                'commit': aetros.utils.git.get_current_commit_hash(),
            }

        job.create(create_info=create_info, server=None)

        print("Job %s/%s created." % (job.model_name, job.job_id))

        if parsed_args.local:
            start(self.logger, job.model_name + '/' + job.job_id, fetch=False, env=env, volumes=parsed_args.volume, gpu_devices=parsed_args.gpu_device)
        else:
            if parsed_args.volume:
                print("Can not use volume with jobs on the cluster. Use datasets instead.")
                sys.exit(1)

            #todo, make it visible
            job.git.push()
            print("Open http://%s/model/%s/job/%s to monitor it." % (job.host, job.model_name, job.job_id))
Example #13
0
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' run')
        parser.add_argument(
            'command',
            nargs='?',
            help="The command to run. Default read in configuration file")
        parser.add_argument(
            '-i',
            '--image',
            help=
            "Which Docker image to use for the command. Default read in configuration file. If not specified, command is executed on the host."
        )
        parser.add_argument(
            '--no-image',
            action='store_true',
            help=
            "Forces not to use docker, even when image is defined in the configuration file."
        )

        parser.add_argument(
            '-s',
            '--server',
            action='append',
            help=
            "Limits the server pool to this server. Default not limitation or read in configuration file. Multiple --server allowed."
        )
        parser.add_argument(
            '-m',
            '--model',
            help=
            "Under which model this job should be listed. Default read in configuration file"
        )
        parser.add_argument(
            '-l',
            '--local',
            action='store_true',
            help="Start the job immediately on the current machine.")
        parser.add_argument(
            '-c',
            '--config',
            help="Default aetros.yml in current working directory.")
        parser.add_argument(
            '--priority',
            help="Increases or decreases priority. Default is 0.")

        parser.add_argument(
            '--cpu',
            help="How many CPU cores should be assigned to job. Docker only.")
        parser.add_argument(
            '--memory',
            help="How much memory should be assigned to job. Docker only.")
        parser.add_argument(
            '--gpu',
            help="How many GPU cards should be assigned to job. Docker only.")
        parser.add_argument(
            '--gpu_memory',
            help="Memory requirement for the GPU. Docker only.")

        parser.add_argument(
            '--offline',
            '-o',
            action='store_true',
            help="Whether the execution should happen offline.")

        parser.add_argument(
            '--rebuild-image',
            action='store_true',
            help="Makes sure the Docker image is re-built without cache.")

        parser.add_argument(
            '--max-time',
            help=
            "Limit execution time in seconds. Sends SIGINT to the process group when reached."
        )
        parser.add_argument(
            '--max-epochs',
            help=
            "Limit execution epochs. Sends SIGINT to the process group when reached."
        )

        parser.add_argument(
            '--gpu-device',
            action='append',
            help=
            "Which device id should be mapped into the NVIDIA docker container. Only when --local"
        )

        parser.add_argument('--volume',
                            '-v',
                            action='append',
                            help="Volume into docker. Only when --local")
        parser.add_argument(
            '-e',
            action='append',
            help=
            "Sets additional environment variables. '-e name=value' to set value, or '-e name' to read from current env"
        )

        parser.add_argument(
            '-p',
            '--param',
            action='append',
            help=
            "Sets a hyperparameter, example '--param name=value'. Multiple --param allowed."
        )

        parsed_args = parser.parse_args(args)

        if parsed_args.config and not os.path.exists(parsed_args.config):
            self.logger.error("fatal: file %s does not exist." %
                              (parsed_args.config, ))
            sys.exit(2)

        config = find_config(parsed_args.config)
        home_config = read_home_config()

        if config['model'] and not parsed_args.model:
            parsed_args.model = config['model']

        if not parsed_args.model:
            print(
                "fatal: no model defined. Use --model or switch into a directory where you executed 'aetros init model-name'."
            )
            sys.exit(2)

        if not parsed_args.local and parsed_args.volume:
            print(
                "fatal: can not use volume with jobs on the cluster. Use datasets instead."
            )
            sys.exit(1)

        if parsed_args.local and parsed_args.priority:
            print(
                "fatal: the priority can only be set for jobs in the cluster.")
            sys.exit(1)

        if config['image']:
            ensure_docker_installed(self.logger)

        env = {}
        if parsed_args.e:
            for item in parsed_args.e:
                if '=' in item:
                    k, v = item.split('=')
                else:
                    k = item
                    v = os.getenv(k)
                env[k] = v

        if ('command' not in config
                or not config['command']) and not parsed_args.command:
            self.logger.error(
                'No command given. Define the command in aetros.yml or use command argument.'
            )
            sys.exit(1)

        job_backend = JobBackend(parsed_args.model, self.logger)

        ignore = []
        if 'ignore' in config:
            ignore = config['ignore']
        job_backend.job = {'config': {'ignore': ignore}}

        adding_files = loading_text("- Adding job files to index ... ")
        files_added, size_added = job_backend.add_files(config['root'],
                                                        report=False)
        adding_files("done with %d file%s added (%s)." %
                     (files_added, 's' if files_added != 1 else '',
                      human_size(size_added, 2)))

        create_info = {'type': 'custom', 'config': config}

        incoming_hyperparameter = {}
        if parsed_args.param:
            for param in parsed_args.param:
                if '=' not in param:
                    raise Exception(
                        '--param ' + param +
                        ' does not contain a `=`. Please use "--param name=value"'
                    )

                name, value = param.split('=')
                incoming_hyperparameter[name] = value

        # first transform simple format in the full definition with parameter types
        # (string, number, group, choice_group, etc)
        full_hyperparameters = lose_parameters_to_full(config['parameters'])

        # now extract hyperparameters from full definition, and overwrite stuff using
        # incoming_hyperparameter if available
        hyperparameter = extract_parameters(full_hyperparameters,
                                            incoming_hyperparameter)

        create_info['config']['parameters'] = hyperparameter

        if parsed_args.rebuild_image:
            create_info['config']['rebuild_image'] = True

        if parsed_args.max_epochs:
            create_info['config']['maxEpochs'] = int(parsed_args.max_epochs)

        create_info['config']['priority'] = 0
        if parsed_args.priority:
            create_info['config']['priority'] = float(parsed_args.priority)

        if parsed_args.max_time:
            create_info['config']['maxTime'] = float(parsed_args.max_time)

        if parsed_args.command:
            create_info['config']['command'] = parsed_args.command

        if parsed_args.image:
            # reset install options, since we can't make sure if the base image still fits
            if 'image' in config and config[
                    'image'] and config['image'] != parsed_args.image:
                create_info['config']['install'] = None

            # reset dockerfile, since we specified manually an image
            create_info['config']['dockerfile'] = None
            create_info['config']['image'] = parsed_args.image

        if parsed_args.no_image:
            create_info['config']['image'] = None

        if parsed_args.server:
            create_info['config']['servers'] = []
            for name in parsed_args.server:
                create_info['config']['servers'].append(name)

        create_info['config']['resources'] = create_info['config'].get(
            'resources', {})
        resources = create_info['config']['resources']

        default_cpu_and_memory = 1 if create_info['config']['image'] else 0
        resources['cpu'] = int(parsed_args.cpu
                               or resources.get('cpu', default_cpu_and_memory))
        resources['memory'] = int(
            parsed_args.memory
            or resources.get('memory', default_cpu_and_memory))
        resources['gpu'] = int(parsed_args.gpu or resources.get('gpu', 0))
        resources['gpu_memory'] = int(parsed_args.gpu_memory
                                      or resources.get('gpu_memory', 0))

        if parsed_args.local:
            create_info['server'] = 'local'

            # make sure we do not limit the resources to something that is not available on the local machine
            warning = []
            cpu = cpuinfo.get_cpu_info()
            mem = psutil.virtual_memory().total
            gpu = 0
            try:
                gpu = len(get_ordered_devices())
            except CudaNotImplementedException:
                pass

            if not create_info['config']['image'] and not all(
                [x == 0 for x in six.itervalues(resources)]):
                self.logger.warning(
                    "! No Docker virtualization since no `image` defined, resources limitation ignored."
                )

            if create_info['config']['image'] and resources['gpu'] > 0:
                if not (sys.platform == "linux" or sys.platform == "linux2"):
                    self.logger.warning(
                        "! Your operating system does not support GPU allocation for "
                        "Docker virtualization. "
                        "NVIDIA-Docker2 is only supported on Linux.")

            local_max_resources = {
                'cpu': cpu['count'],
                'memory': ceil(mem / 1024 / 1024 / 1024),
                'gpu': gpu
            }

            if create_info['config']['image']:
                # read max hardware within Docker
                out = docker_call([
                    'run', 'alpine', 'sh', '-c',
                    'nproc && cat /proc/meminfo | grep MemTotal'
                ])
                cpus, memory = out.decode('utf-8').strip().split('\n')
                local_max_resources['cpu'] = int(cpus)

                memory = memory.replace('MemTotal:', '').replace('kB',
                                                                 '').strip()
                local_max_resources['memory'] = ceil(int(memory) / 1024 / 1024)

            if local_max_resources['cpu'] < resources['cpu']:
                warning.append('CPU cores %d -> %d' %
                               (resources['cpu'], local_max_resources['cpu']))
                resources['cpu'] = local_max_resources['cpu']

            if local_max_resources['memory'] < resources['memory']:
                warning.append(
                    'memory %dGB -> %dGB' %
                    (resources['memory'], local_max_resources['memory']))
                resources['memory'] = local_max_resources['memory']

            if local_max_resources['gpu'] < resources['gpu']:
                warning.append('GPU cards %d -> %d' %
                               (resources['gpu'], local_max_resources['gpu']))
                resources['gpu'] = local_max_resources['gpu']

            if warning:
                self.logger.warning(
                    "! Resources downgrade due to missing hardware: %s." %
                    (', '.join(warning), ))

        if parsed_args.config and not create_info['config']['configPath']:
            create_info['config']['configPath'] = parsed_args.config

        create_info['config']['sourcesAttached'] = True

        creating_git_job = loading_text("- Create job in local Git ... ")
        if aetros.utils.git.get_current_commit_hash():
            create_info['origin_git_source'] = {
                'origin': aetros.utils.git.get_current_remote_url(),
                'author': aetros.utils.git.get_current_commit_author(),
                'message': aetros.utils.git.get_current_commit_message(),
                'branch': aetros.utils.git.get_current_branch(),
                'commit': aetros.utils.git.get_current_commit_hash(),
            }

        job_backend.create(create_info=create_info, server=None)
        creating_git_job("created %s in %s." %
                         (job_backend.job_id[0:9], job_backend.model_name))

        summary = "➤ Summary: Job running "
        if parsed_args.local:
            summary += 'locally'
        else:
            summary += 'on the cluster'

        if create_info['config']['image']:
            summary += ' in Docker using image %s with %d CPU cores, %dGB memory and %d GPUs.' \
                       % (create_info['config']['image'], resources['cpu'], resources['memory'], resources['gpu'])
        else:
            summary += ' on host using all available resources.'

        print(summary)

        # tasks = []
        #
        # if 'tasks' in config:
        #     for name, task_config in six.iteritems(config['tasks']):
        #         replica = 1
        #         if 'replica' in task_config:
        #             replica = int(task_config['replica'])
        #         for index in range(0, replica):
        #             tasks.append(job_backend.create_task(job_id, task_config, name, index))

        if parsed_args.offline:
            if not parsed_args.local:
                self.logger.warning(
                    "Can not create a remote job in offline mode.")
                sys.exit(1)

            self.logger.warning("Execution started offline.")
        else:
            adding_files = loading_text("- Connecting to " +
                                        home_config['host'] + " ... ")
            if job_backend.connect():
                adding_files("connected.")
            else:
                parsed_args.offline = True
                adding_files("failed. Continue in offline mode.")

        if not parsed_args.offline:
            sys.stdout.write("- Uploading job data ... ")
            job_backend.git.push()
            job_backend.client.wait_until_queue_empty(['files'],
                                                      clear_end=False)

            sys.stdout.write(" done.\n")

            link = "%s/model/%s/job/%s" % (
                home_config['url'], job_backend.model_name, job_backend.job_id)
            sys.__stdout__.write(u"➤ Monitor job at %s\n" % (link))

        if parsed_args.local:
            job_backend.start(collect_system=False,
                              offline=parsed_args.offline,
                              push=False)

            if not parsed_args.offline:
                job_backend.git.start_push_sync()

            cpus = create_info['config']['resources']['cpu']
            memory = create_info['config']['resources']['memory']

            if not parsed_args.gpu_device and create_info['config'][
                    'resources']['gpu'] > 0:
                # if requested 2 GPUs and we have 3 GPUs with id [0,1,2], gpus should be [0,1]
                parsed_args.gpu_device = []
                for i in range(0, create_info['config']['resources']['gpu']):
                    parsed_args.gpu_device.append(i)

            start_command(self.logger,
                          job_backend,
                          env,
                          parsed_args.volume,
                          cpus=cpus,
                          memory=memory,
                          gpu_devices=parsed_args.gpu_device,
                          offline=parsed_args.offline)