Exemple #1
0
def restart(ctx, job_name, data, open_notebook, env, message, gpu, cpu, gpup,
            cpup, command):
    """
    Restart a given job as a new job.
    """
    # NOTE(review): the docstring above is intentionally left short and
    # unchanged -- it may be surfaced to users as the command's help text
    # (confirm against the decorators, which are not visible here).
    #
    # Flow:
    #   1. Fetch the original job (normalized name first, raw name as a
    #      fallback when the normalized lookup raises FloydException).
    #   2. Collect the overrides requested on the command line into
    #      `parameters`; anything not overridden is left out of the dict.
    #   3. Ask the server to restart the job with those overrides and print
    #      the name of the newly created job.
    #
    # The process exits with status 1 when env/data validation fails or when
    # the restart call returns a falsy result.
    parameters = {}

    expt_client = ExperimentClient()

    try:
        job = expt_client.get(normalize_job_name(job_name))
    except FloydException:
        job = expt_client.get(job_name)

    # Instance flags are checked in priority order; with no flag the original
    # job's instance type is reused.
    if gpup:
        instance_type = G1P_INSTANCE_TYPE
    elif cpup:
        instance_type = C1P_INSTANCE_TYPE
    elif gpu:
        instance_type = G1_INSTANCE_TYPE
    elif cpu:
        instance_type = C1_INSTANCE_TYPE
    else:
        instance_type = job.instance_type

    # Only send an instance type when one is actually known.
    if instance_type is not None:
        parameters['instance_type'] = instance_type

    if env is not None:
        if not validate_env(env, instance_type):
            sys.exit(1)
        parameters['env'] = env

    success, data_ids = process_data_ids(data)
    if not success:
        sys.exit(1)
    # Forward the validated datasets; previously they were resolved and then
    # dropped, so the data option had no effect on the restarted job.
    if data_ids:
        parameters['data_ids'] = data_ids

    if message:
        parameters['message'] = message

    if command:
        # `command` arrives as a sequence of tokens; the server gets one string.
        parameters['command'] = ' '.join(command)

    floyd_logger.info('Restarting job %s...', job_name)

    new_job_info = expt_client.restart(job.id, parameters=parameters)
    if not new_job_info:
        floyd_logger.error("Failed to restart job")
        sys.exit(1)

    floyd_logger.info('New job created:')
    table_output = [["JOB NAME"], [new_job_info['name']]]
    floyd_logger.info('\n' + tabulate(table_output, headers="firstrow") + '\n')

    show_new_job_info(expt_client, new_job_info['name'], new_job_info,
                      job.mode, open_notebook)
Exemple #2
0
def run(ctx, gpu, env, message, data, mode, open, tensorboard, command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.
    """
    # NOTE(review): the docstring above is left unchanged because it may be
    # surfaced to users as the command's help text (confirm against the
    # decorators, which are not visible here).
    #
    # Flow:
    #   1. Check that the configured project exists.
    #   2. Resolve every `data` argument to a dataset id (at most 5).
    #   3. Validate `env` against the environments available for the
    #      selected architecture.
    #   4. Create the module, then the experiment (job) that runs it.
    #   5. For 'jupyter' / 'serve' modes, wait for the job to come up and
    #      print its URL; otherwise print how to view the logs.
    #
    # Validation failures log an error and return; a rejected module creation
    # exits the process with status 1.
    #
    # `open` shadows the builtin, but it is part of the public signature and
    # therefore kept.
    experiment_config = ExperimentConfigManager.get_config()
    if not ProjectClient().exists(experiment_config.family_id):
        floyd_logger.error(
            'Invalid project id, please run '
            '"floyd init PROJECT_NAME" before scheduling a job.')
        return

    access_token = AuthConfigManager.get_access_token()
    experiment_name = "{}/{}".format(access_token.username,
                                     experiment_config.name)

    if len(data) > 5:
        floyd_logger.error("Cannot attach more than 5 datasets to an job")
        return

    # Get the data entity from the server to:
    # 1. Confirm that the data id or uri exists and has the right permissions
    # 2. If uri is used, get the id of the dataset
    data_ids = []
    for data_spec in data:
        # A spec is either "<name-or-id>" or "<name-or-id>:<path>".
        data_name_or_id, _, path = data_spec.partition(':')
        if ':' in path:
            # More than one separator used to crash with an unpacking
            # ValueError; report it like every other invalid input instead.
            floyd_logger.error(
                "Invalid data argument, expected at most one ':' "
                "separator: {}".format(data_spec))
            return
        data_obj = DataClient().get(data_name_or_id)
        if not data_obj:
            floyd_logger.error(
                "Data not found for name or id: {}".format(data_name_or_id))
            return
        data_ids.append(
            "{}:{}".format(data_obj.id, path) if path else data_obj.id)

    # A single dataset gets the default input name 'input'; with several
    # datasets no default is supplied to get_data_name.
    default_name = 'input' if len(data_ids) <= 1 else None
    module_inputs = [{
        'name': get_data_name(data_str, default_name),
        'type': 'dir'
    } for data_str in data_ids]

    if gpu:
        arch = 'gpu'
        instance_type = GPU_INSTANCE_TYPE
    else:
        arch = 'cpu'
        instance_type = CPU_INSTANCE_TYPE

    # The environment must be one of those listed for the architecture.
    env_map = EnvClient().get_all()
    envs = env_map.get(arch)
    if not envs:
        floyd_logger.error("{} is not a supported architecture".format(arch))
        return
    if env not in envs:
        floyd_logger.error(
            "{} is not in the list of supported environments: {}".format(
                env, ', '.join(envs.keys())))
        return

    # Create module
    command_str = ' '.join(command)
    module = Module(name=experiment_name,
                    description=message or '',
                    command=command_str,
                    mode=get_mode_parameter(mode),
                    enable_tensorboard=tensorboard,
                    family_id=experiment_config.family_id,
                    inputs=module_inputs,
                    env=env,
                    arch=arch)

    from floyd.exceptions import BadRequestException
    try:
        module_id = ModuleClient().create(module)
    except BadRequestException as e:
        if 'Project not found, ID' in e.message:
            floyd_logger.error(
                'ERROR: Please run "floyd init PROJECT_NAME" before scheduling a job.'
            )
        else:
            floyd_logger.error('ERROR: %s', e.message)
        sys.exit(1)
    floyd_logger.debug("Created module with id : {}".format(module_id))

    # Create experiment request
    # Get the actual command entered in the command line
    full_command = get_command_line(gpu, env, message, data, mode, open,
                                    tensorboard, command)
    experiment_request = ExperimentRequest(
        name=experiment_name,
        description=message,
        full_command=full_command,
        module_id=module_id,
        data_ids=data_ids,
        family_id=experiment_config.family_id,
        instance_type=instance_type)
    expt_cli = ExperimentClient()
    expt_info = expt_cli.create(experiment_request)
    floyd_logger.debug("Created job : {}".format(expt_info['id']))

    table_output = [["JOB NAME"], [expt_info['name']]]
    floyd_logger.info(tabulate(table_output, headers="firstrow"))
    floyd_logger.info("")

    if mode not in ['jupyter', 'serve']:
        floyd_logger.info("To view logs enter:")
        floyd_logger.info("   floyd logs {}".format(expt_info['name']))
        return

    # Wait for the experiment / task instances to become available. There is
    # deliberately no upper bound here: the loop polls every 3 seconds until
    # the job reports task instances.
    while True:
        try:
            experiment = expt_cli.get(expt_info['id'])
            if experiment.task_instances:
                break
        except Exception:
            # Best-effort polling: a failed lookup is treated the same as
            # "not ready yet" and retried after the sleep below.
            pass

        floyd_logger.debug("Job not available yet: {}".format(
            expt_info['id']))
        sleep(3)

    if mode == 'jupyter':
        # Print the path to jupyter notebook
        jupyter_url = experiment.service_url
        print(
            "Setting up your instance and waiting for Jupyter notebook to become available ...",
            end='')
        notebook_ready = wait_for_url(jupyter_url,
                                      sleep_duration_seconds=2,
                                      iterations=900)
        # The URL is printed whether or not the notebook answered in time.
        floyd_logger.info(
            "\nPath to jupyter notebook: {}".format(jupyter_url))
        if notebook_ready:
            if open:
                webbrowser.open(jupyter_url)
        else:
            floyd_logger.info(
                "Notebook is still loading. View logs to track progress")
            floyd_logger.info("   floyd logs {}".format(expt_info['name']))
    elif mode == 'serve':
        # Print the path to serving endpoint
        floyd_logger.info("Path to service endpoint: {}".format(
            experiment.service_url))

    # Timeouts shorter than four hours trigger the trial-account notice.
    if experiment.timeout_seconds < 4 * 60 * 60:
        floyd_logger.info(
            "\nYour job timeout is currently set to {} seconds".format(
                experiment.timeout_seconds))
        floyd_logger.info(
            "This is because you are in a trial account. Paid users will have longer timeouts. "
            "See https://www.floydhub.com/pricing for details")
Exemple #3
0
def restart(ctx, job_name, data, open_notebook, env, message, gpu, cpu, gpup,
            cpup, command):
    """
    Restart a given job as a new job.
    """
    # Flow:
    #   1. Reduce the `env` sequence to a single value (or None).
    #   2. Fetch the original job (normalized name first, raw name as a
    #      fallback when the normalized lookup raises FloydException).
    #   3. Collect the overrides requested on the command line into
    #      `parameters`; anything not overridden is left out of the dict.
    #   4. Ask the server to restart the job with those overrides and show
    #      the new job's details.
    #
    # The process exits with status 1 when more than one env is given, when
    # env/data validation fails, or when the restart call returns a falsy
    # result.

    # Error early if more than one --env is passed. Then get the first/only
    # --env out of the list so all other operations work normally (they don't
    # expect an iterable). For details on this approach, see the comment above
    # the --env click option
    if env and len(env) > 1:
        floyd_logger.error(
            "You passed more than one environment: {}. Please specify a single environment."
            .format(env))
        sys.exit(1)
    # An empty (or missing) sequence means "no environment override"; indexing
    # it unconditionally would raise IndexError.
    env = env[0] if env else None

    parameters = {}

    expt_client = ExperimentClient()

    try:
        job = expt_client.get(normalize_job_name(job_name))
    except FloydException:
        job = expt_client.get(job_name)

    # Instance flags are checked in priority order; with no flag the original
    # job's instance type is reused.
    if gpup:
        instance_type = G1P_INSTANCE_TYPE
    elif cpup:
        instance_type = C1P_INSTANCE_TYPE
    elif gpu:
        instance_type = G1_INSTANCE_TYPE
    elif cpu:
        instance_type = C1_INSTANCE_TYPE
    else:
        instance_type = job.instance_type

    # Only send an instance type when one is actually known.
    if instance_type is not None:
        parameters['instance_type'] = instance_type

    if env is not None:
        # Environments are validated per architecture, looked up from the
        # instance type. NOTE(review): an instance type missing from
        # INSTANCE_ARCH_MAP (including None) raises KeyError here.
        arch = INSTANCE_ARCH_MAP[instance_type]
        if not validate_env(env, arch):
            sys.exit(1)
        parameters['env'] = env

    success, data_ids = process_data_ids(data)
    if not success:
        sys.exit(1)
    if data_ids:
        parameters['data_ids'] = data_ids

    if message:
        parameters['description'] = message

    if command:
        # `command` arrives as a sequence of tokens; the server gets one string.
        parameters['command'] = ' '.join(command)

    floyd_logger.info('Restarting job %s...', job_name)

    new_job_info = expt_client.restart(job.id, parameters=parameters)
    if not new_job_info:
        floyd_logger.error("Failed to restart job")
        sys.exit(1)

    show_new_job_info(expt_client, new_job_info['name'], new_job_info,
                      job.mode, open_notebook)