Exemple #1
0
def logs(id, url, tail, sleep_duration=1):
    """
    Print the logs of the run.
    """
    experiment = ExperimentClient().get(id)
    task_instance = TaskInstanceClient().get(
        get_module_task_instance_id(experiment.task_instances))
    log_url = "{}/api/v1/resources/{}?content=true".format(
        floyd.floyd_host, task_instance.log_id)
    if url:
        floyd_logger.info(log_url)
        return
    if tail:
        floyd_logger.info("Launching job ...")
        current_shell_output = ""
        while True:
            # Get the logs in a loop and log the new lines
            log_file_contents = get_url_contents(log_url)
            print_output = log_file_contents[len(current_shell_output):]
            if len(print_output.strip()):
                floyd_logger.info(print_output)
            current_shell_output = log_file_contents
            sleep(sleep_duration)
    else:
        log_file_contents = get_url_contents(log_url)
        if len(log_file_contents.strip()):
            floyd_logger.info(log_file_contents)
        else:
            floyd_logger.info("Launching job now. Try after a few seconds.")
Exemple #2
0
def status(id):
    """
    View status of all jobs in a project.

    The command also accepts a specific job name.
    """
    if id:
        try:
            experiment = ExperimentClient().get(normalize_job_name(id))
        except FloydException:
            experiment = ExperimentClient().get(id)

        print_experiments([experiment])
    else:
        experiments = ExperimentClient().get_all()
        print_experiments(experiments)
Exemple #3
0
def stop(id):
    """
    Stop a run before it can finish.
    """
    experiment = ExperimentClient().get(id)
    if experiment.state not in ["queued", "running"]:
        floyd_logger.info("Experiment in {} state cannot be stopped".format(
            experiment.state))
        return

    if ExperimentClient().stop(id):
        floyd_logger.info(
            "Experiment shutdown request submitted. Check status to confirm shutdown"
        )
    else:
        floyd_logger.error("Failed to stop experiment")
Exemple #4
0
def output(id, url):
    """
    Shows the output url of the run.
    By default opens the output page in your default browser.
    """
    try:
        experiment = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        experiment = ExperimentClient().get(id)

    output_dir_url = "%s/%s/output" % (floyd.floyd_web_host, experiment.name)
    if url:
        floyd_logger.info(output_dir_url)
    else:
        floyd_logger.info("Opening output path in your browser ...")
        webbrowser.open(output_dir_url)
Exemple #5
0
def get_log_id(job_id):
    log_msg_printed = False
    while True:
        try:
            experiment = ExperimentClient().get(normalize_job_name(job_id))
        except FloydException:
            experiment = ExperimentClient().get(job_id)

        instance_log_id = experiment.instance_log_id
        if instance_log_id:
            break
        elif not log_msg_printed:
            floyd_logger.info("Waiting for logs ...\n")
            log_msg_printed = True

        sleep(1)

    return instance_log_id
Exemple #6
0
def clone(id):
    """
    Download the code for the experiment to the current path
    """
    experiment = ExperimentClient().get(id)
    task_instance_id = get_module_task_instance_id(experiment.task_instances)
    task_instance = TaskInstanceClient().get(
        task_instance_id) if task_instance_id else None
    if not task_instance:
        sys.exit(
            "Cannot clone this version of the job. Try a different version.")
    module = ModuleClient().get(
        task_instance.module_id) if task_instance else None
    code_url = "{}/api/v1/resources/{}?content=true&download=true".format(
        floyd.floyd_host, module.resource_id)
    ExperimentClient().download_tar(url=code_url,
                                    untar=True,
                                    delete_after_untar=True)
def logs(id, url, tail, follow, sleep_duration=1):
    """
    Print the logs of the run.
    """
    tail = tail or follow

    log_msg_printed = False
    while True:
        try:
            experiment = ExperimentClient().get(normalize_job_name(id))
        except FloydException:
            experiment = ExperimentClient().get(id)

        instance_log_id = experiment.instance_log_id
        if instance_log_id:
            break
        elif not log_msg_printed:
            floyd_logger.info("Waiting for logs ...\n")
            log_msg_printed = True

        sleep(1)

    log_url = "{}/api/v1/resources/{}?content=true".format(
        floyd.floyd_host, instance_log_id)
    if url:
        floyd_logger.info(log_url)
        return
    if tail:
        floyd_logger.info("Launching job ...")
        current_shell_output = ""
        while True:
            # Get the logs in a loop and log the new lines
            log_file_contents = ResourceClient().get_content(instance_log_id)
            print_output = log_file_contents[len(current_shell_output):]
            if len(print_output.strip()):
                floyd_logger.info(print_output)
            current_shell_output = log_file_contents
            sleep(sleep_duration)
    else:
        log_file_contents = ResourceClient().get_content(instance_log_id)
        if len(log_file_contents.strip()):
            floyd_logger.info(log_file_contents)
        else:
            floyd_logger.info("Launching job now. Try after a few seconds.")
def stop(id):
    """
    Stop a run before it can finish.
    """
    try:
        experiment = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        experiment = ExperimentClient().get(id)

    if experiment.state not in ["queued", "running"]:
        floyd_logger.info("Job in {} state cannot be stopped".format(
            experiment.state))
        return

    if ExperimentClient().stop(experiment.id):
        floyd_logger.info(
            "Experiment shutdown request submitted. Check status to confirm shutdown"
        )
    else:
        floyd_logger.error("Failed to stop job")
Exemple #9
0
def clone(id, path):
    """
    - Download all files from a job

    Eg: alice/projects/mnist/1/

    Note: This will download the files that were originally uploaded at
    the start of the job.

    - Download files in a specific path from a job

    Specify the path to a directory and download all its files and subdirectories.

    Eg: --path models/checkpoint1
    """
    try:
        experiment = ExperimentClient().get(
            normalize_job_name(id, use_config=False))
    except FloydException:
        experiment = ExperimentClient().get(id)

    task_instance_id = get_module_task_instance_id(experiment.task_instances)
    task_instance = TaskInstanceClient().get(
        task_instance_id) if task_instance_id else None
    if not task_instance:
        sys.exit(
            "Cannot clone this version of the job. Try a different version.")
    module = ModuleClient().get(
        task_instance.module_id) if task_instance else None

    if path:
        # Download a directory from Code
        code_url = "{}/api/v1/download/artifacts/code/{}?is_dir=true&path={}".format(
            floyd.floyd_host, experiment.id, path)
    else:
        # Download the full Code
        code_url = "{}/api/v1/resources/{}?content=true&download=true".format(
            floyd.floyd_host, module.resource_id)
    ExperimentClient().download_tar(url=code_url,
                                    untar=True,
                                    delete_after_untar=True)
Exemple #10
0
def stop(id):
    """
    Stop a running job.
    """
    try:
        experiment = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        experiment = ExperimentClient().get(id)

    if experiment.state not in ["queued", "queue_scheduled", "running"]:
        floyd_logger.info("Job in {} state cannot be stopped".format(
            experiment.state))
        sys.exit(1)

    if not ExperimentClient().stop(experiment.id):
        floyd_logger.error("Failed to stop job")
        sys.exit(1)

    floyd_logger.info(
        "Experiment shutdown request submitted. Check status to confirm shutdown"
    )
Exemple #11
0
def logs(id, url, tail, sleep_duration=1):
    """
    Print the logs of the run.
    """
    try:
        experiment = ExperimentClient().get(normalize_job_name(id))
    except FloydException:
        experiment = ExperimentClient().get(id)

    if experiment.state == 'queued':
        floyd_logger.info("Job is currently in a queue")
        return

    instance_log_id = experiment.instance_log_id
    if not instance_log_id:
        floyd_logger.info("Job not started yet, no log to show.")
        sys.exit(1)

    log_url = "{}/api/v1/resources/{}?content=true".format(
        floyd.floyd_host, instance_log_id)
    if url:
        floyd_logger.info(log_url)
        return
    if tail:
        floyd_logger.info("Launching job ...")
        current_shell_output = ""
        while True:
            # Get the logs in a loop and log the new lines
            log_file_contents = ResourceClient().get_content(instance_log_id)
            print_output = log_file_contents[len(current_shell_output):]
            if len(print_output.strip()):
                floyd_logger.info(print_output)
            current_shell_output = log_file_contents
            sleep(sleep_duration)
    else:
        log_file_contents = ResourceClient().get_content(instance_log_id)
        if len(log_file_contents.strip()):
            floyd_logger.info(log_file_contents)
        else:
            floyd_logger.info("Launching job now. Try after a few seconds.")
Exemple #12
0
def delete(ids, yes):
    """
    Delete project runs
    """
    failures = False
    for id in ids:
        experiment = ExperimentClient().get(id)
        if not experiment:
            failures = True
            continue

        if not yes and not click.confirm("Delete Run: {}?".format(experiment.name),
                                         abort=False,
                                         default=False):
            floyd_logger.info("Job {}: Skipped.".format(experiment.name))
            continue

        if not ExperimentClient().delete(experiment.id):
            failures = True

    if failures:
        sys.exit(1)
Exemple #13
0
def output(id, url, download):
    """
    Shows the output url of the run.
    By default opens the output page in your default browser.
    """
    experiment = ExperimentClient().get(id)
    task_instance = TaskInstanceClient().get(get_module_task_instance_id(experiment.task_instances))
    if "output" in task_instance.output_ids:
        resource = ResourceClient().get(task_instance.output_ids["output"])
        output_dir_url = "{}/viewer/{}".format(floyd.floyd_host, resource.uri)
        if url:
            floyd_logger.info(output_dir_url)
        else:
            if download:
                output_dir_url = "{}&download=true".format(output_dir_url)
                ExperimentClient().download_tar(url=output_dir_url,
                                                untar=True,
                                                delete_after_untar=True)
            else:
                floyd_logger.info("Opening output directory in your browser ...")
                webbrowser.open(output_dir_url)
    else:
        floyd_logger.error("Output directory not available")
Exemple #14
0
def output(id, url):
    """
    Shows the output url of the run.
    By default opens the output page in your default browser.
    """
    experiment = ExperimentClient().get(id)
    task_instance = TaskInstanceClient().get(get_module_task_instance_id(experiment.task_instances))
    if "output" in task_instance.output_ids:
        output_dir_url = "{}/api/v1/resources/{}?content=true".format(floyd.floyd_host,
                                                                      task_instance.output_ids["output"])
        if url:
            floyd_logger.info(output_dir_url)
        else:
            floyd_logger.info("Opening output directory in your browser ...")
            webbrowser.open(output_dir_url)
    else:
        floyd_logger.error("Output directory not available")
Exemple #15
0
def info(id):
    """
    Prints detailed info for the run
    """
    experiment = ExperimentClient().get(id)
    task_instance_id = get_module_task_instance_id(experiment.task_instances)
    task_instance = TaskInstanceClient().get(task_instance_id) if task_instance_id else None
    table = [["Job name", normalize_job_name(experiment.name)],
             ["Output name", normalize_data_name(experiment.name + '/output') if task_instance else None],
             ["Created", experiment.created_pretty],
             ["Status", experiment.state], ["Duration(s)", experiment.duration_rounded],
             ["Instance", experiment.instance_type_trimmed],
             ["Description", experiment.description]]
    if task_instance and task_instance.mode in ['jupyter', 'serving']:
        table.append(["Mode", task_instance.mode])
        table.append(["Url", experiment.service_url])
    if experiment.tensorboard_url:
        table.append(["Tensorboard", experiment.tensorboard_url])
    floyd_logger.info(tabulate(table))
Exemple #16
0
def info(id):
    """
    Prints detailed info for the run
    """
    experiment = ExperimentClient().get(id)
    task_instance = TaskInstanceClient().get(get_module_task_instance_id(experiment.task_instances))
    mode = url = None
    if experiment.state == "running":
        if task_instance.mode in ['jupyter', 'serving']:
            mode = task_instance.mode
            url = get_task_url(task_instance.id)
    table = [["Run ID", experiment.id], ["Name", experiment.name], ["Created", experiment.created_pretty],
             ["Status", experiment.state], ["Duration(s)", experiment.duration_rounded],
             ["Output ID", task_instance.id], ["Instance", experiment.instance_type_trimmed],
             ["Version", experiment.description]]
    if mode:
        table.append(["Mode", mode])
    if url:
        table.append(["Url", url])
    floyd_logger.info(tabulate(table))
Exemple #17
0
def run(ctx, gpu, env, data, mode, command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.
    """
    command_str = ' '.join(command)
    experiment_config = ExperimentConfigManager.get_config()
    access_token = AuthConfigManager.get_access_token()
    version = experiment_config.version
    experiment_name = "{}/{}:{}".format(access_token.username,
                                        experiment_config.name,
                                        version)

    # Create module
    module = Module(name=experiment_name,
                    description=version,
                    command=command_str,
                    mode=get_mode_parameter(mode),
                    family_id=experiment_config.family_id,
                    default_container=get_docker_image(env, gpu),
                    version=version)
    module_id = ModuleClient().create(module)
    floyd_logger.debug("Created module with id : {}".format(module_id))

    # Create experiment request
    instance_type = GPU_INSTANCE_TYPE if gpu else CPU_INSTANCE_TYPE
    experiment_request = ExperimentRequest(name=experiment_name,
                                           description=version,
                                           module_id=module_id,
                                           data_id=data,
                                           predecessor=experiment_config.experiment_predecessor,
                                           family_id=experiment_config.family_id,
                                           version=version,
                                           instance_type=instance_type)
    experiment_id = ExperimentClient().create(experiment_request)
    floyd_logger.debug("Created experiment : {}".format(experiment_id))

    # Update expt config including predecessor
    experiment_config.increment_version()
    experiment_config.set_module_predecessor(module_id)
    experiment_config.set_experiment_predecessor(experiment_id)
    ExperimentConfigManager.set_config(experiment_config)

    table_output = [["RUN ID", "NAME", "VERSION"],
                    [experiment_id, experiment_name, version]]
    floyd_logger.info(tabulate(table_output, headers="firstrow"))
    floyd_logger.info("")

    if mode != 'default':
        while True:
            # Wait for the experiment to become available
            try:
                experiment = ExperimentClient().get(experiment_id)
                break
            except Exception:
                floyd_logger.debug("Experiment not available yet: {}".format(experiment_id))
                sleep(1)
                continue

        # Print the path to jupyter notebook
        if mode == 'jupyter':
            jupyter_url = get_task_url(get_module_task_instance_id(experiment.task_instances))
            floyd_logger.info("Waiting for Jupyter notebook to become available ...")
            if wait_for_url(jupyter_url):
                floyd_logger.info("\nPath to jupyter notebook: {}".format(jupyter_url))
            else:
                floyd_logger.info("Problem starting the notebook. View logs for more information")

        # Print the path to serving endpoint
        if mode == 'serve':
            floyd_logger.info("Path to service endpoint: {}".format(
                get_task_url(get_module_task_instance_id(experiment.task_instances))))

    floyd_logger.info("""
To view logs enter:
    floyd logs {}
        """.format(experiment_id))
Exemple #18
0
def run(ctx, gpu, env, message, data, mode, open, tensorboard, command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.
    """
    experiment_config = ExperimentConfigManager.get_config()
    if not ProjectClient().exists(experiment_config.family_id):
        floyd_logger.error(
            'Invalid project id, please run '
            '"floyd init PROJECT_NAME" before scheduling a job.')
        return

    access_token = AuthConfigManager.get_access_token()
    experiment_name = "{}/{}".format(access_token.username,
                                     experiment_config.name)

    # Create module
    if len(data) > 5:
        floyd_logger.error("Cannot attach more than 5 datasets to an job")
        return

    # Get the data entity from the server to:
    # 1. Confirm that the data id or uri exists and has the right permissions
    # 2. If uri is used, get the id of the dataset
    data_ids = []
    for data_name_or_id in data:
        path = None
        if ':' in data_name_or_id:
            data_name_or_id, path = data_name_or_id.split(':')
        data_obj = DataClient().get(data_name_or_id)
        if not data_obj:
            floyd_logger.error(
                "Data not found for name or id: {}".format(data_name_or_id))
            return
        data_ids.append(
            "{}:{}".format(data_obj.id, path) if path else data_obj.id)

    default_name = 'input' if len(data_ids) <= 1 else None
    module_inputs = [{
        'name': get_data_name(data_str, default_name),
        'type': 'dir'
    } for data_str in data_ids]

    if gpu:
        arch = 'gpu'
        instance_type = GPU_INSTANCE_TYPE
    else:
        arch = 'cpu'
        instance_type = CPU_INSTANCE_TYPE

    env_map = EnvClient().get_all()
    envs = env_map.get(arch)
    if envs:
        if env not in envs:
            floyd_logger.error(
                "{} is not in the list of supported environments: {}".format(
                    env, ', '.join(envs.keys())))
            return
    else:
        floyd_logger.error("{} is not a supported architecture".format(arch))
        return

    command_str = ' '.join(command)
    module = Module(name=experiment_name,
                    description=message or '',
                    command=command_str,
                    mode=get_mode_parameter(mode),
                    enable_tensorboard=tensorboard,
                    family_id=experiment_config.family_id,
                    inputs=module_inputs,
                    env=env,
                    arch=arch)

    from floyd.exceptions import BadRequestException
    try:
        module_id = ModuleClient().create(module)
    except BadRequestException as e:
        if 'Project not found, ID' in e.message:
            floyd_logger.error(
                'ERROR: Please run "floyd init PROJECT_NAME" before scheduling a job.'
            )
        else:
            floyd_logger.error('ERROR: %s', e.message)
        sys.exit(1)
    floyd_logger.debug("Created module with id : {}".format(module_id))

    # Create experiment request
    # Get the actual command entered in the command line
    full_command = get_command_line(gpu, env, message, data, mode, open,
                                    tensorboard, command)
    experiment_request = ExperimentRequest(
        name=experiment_name,
        description=message,
        full_command=full_command,
        module_id=module_id,
        data_ids=data_ids,
        family_id=experiment_config.family_id,
        instance_type=instance_type)
    expt_cli = ExperimentClient()
    expt_info = expt_cli.create(experiment_request)
    floyd_logger.debug("Created job : {}".format(expt_info['id']))

    table_output = [["JOB NAME"], [expt_info['name']]]
    floyd_logger.info(tabulate(table_output, headers="firstrow"))
    floyd_logger.info("")

    if mode in ['jupyter', 'serve']:
        while True:
            # Wait for the experiment / task instances to become available
            try:
                experiment = expt_cli.get(expt_info['id'])
                if experiment.task_instances:
                    break
            except Exception:
                floyd_logger.debug("Job not available yet: {}".format(
                    expt_info['id']))

            floyd_logger.debug("Job not available yet: {}".format(
                expt_info['id']))
            sleep(3)
            continue

        # Print the path to jupyter notebook
        if mode == 'jupyter':
            jupyter_url = experiment.service_url
            print(
                "Setting up your instance and waiting for Jupyter notebook to become available ...",
                end='')
            if wait_for_url(jupyter_url,
                            sleep_duration_seconds=2,
                            iterations=900):
                floyd_logger.info(
                    "\nPath to jupyter notebook: {}".format(jupyter_url))
                if open:
                    webbrowser.open(jupyter_url)
            else:
                floyd_logger.info(
                    "\nPath to jupyter notebook: {}".format(jupyter_url))
                floyd_logger.info(
                    "Notebook is still loading. View logs to track progress")
                floyd_logger.info("   floyd logs {}".format(expt_info['name']))

        # Print the path to serving endpoint
        if mode == 'serve':
            floyd_logger.info("Path to service endpoint: {}".format(
                experiment.service_url))

        if experiment.timeout_seconds < 4 * 60 * 60:
            floyd_logger.info(
                "\nYour job timeout is currently set to {} seconds".format(
                    experiment.timeout_seconds))
            floyd_logger.info(
                "This is because you are in a trial account. Paid users will have longer timeouts. "
                "See https://www.floydhub.com/pricing for details")

    else:
        floyd_logger.info("To view logs enter:")
        floyd_logger.info("   floyd logs {}".format(expt_info['name']))
Exemple #19
0
def restart(ctx, job_name, data, open_notebook, env, message, gpu, cpu, gpup,
            cpup, command):
    """
    Restart a given job as a new job.
    """
    # Error early if more than one --env is passed. Then get the first/only
    # --env out of the list so all other operations work normally (they don't
    # expect an iterable). For details on this approach, see the comment above
    # the --env click option
    if len(env) > 1:
        floyd_logger.error(
            "You passed more than one environment: {}. Please specify a single environment."
            .format(env))
        sys.exit(1)
    env = env[0]

    parameters = {}

    expt_client = ExperimentClient()

    try:
        job = expt_client.get(normalize_job_name(job_name))
    except FloydException:
        job = expt_client.get(job_name)

    if gpup:
        instance_type = G1P_INSTANCE_TYPE
    elif cpup:
        instance_type = C1P_INSTANCE_TYPE
    elif gpu:
        instance_type = G1_INSTANCE_TYPE
    elif cpu:
        instance_type = C1_INSTANCE_TYPE
    else:
        instance_type = job.instance_type

    if instance_type is not None:
        parameters['instance_type'] = instance_type
    else:
        instance_type = job.instance_type

    if env is not None:
        arch = INSTANCE_ARCH_MAP[instance_type]
        if not validate_env(env, arch):
            sys.exit(1)
        parameters['env'] = env

    success, data_ids = process_data_ids(data)
    if not success:
        sys.exit(1)
    if data_ids:
        parameters['data_ids'] = data_ids

    if message:
        parameters['description'] = message

    if command:
        parameters['command'] = ' '.join(command)

    floyd_logger.info('Restarting job %s...', job_name)

    new_job_info = expt_client.restart(job.id, parameters=parameters)
    if not new_job_info:
        floyd_logger.error("Failed to restart job")
        sys.exit(1)

    show_new_job_info(expt_client, new_job_info['name'], new_job_info,
                      job.mode, open_notebook)
Exemple #20
0
def run(ctx, cpu, gpu, env, message, data, mode, open_notebook, follow,
        tensorboard, gpup, cpup, gpu2, cpu2, max_runtime, task, command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.
    """
    # cli_default is used for any option that has default value
    cli_default = {'description': '', 'command': ''}
    # Error early if more than one --env is passed.  Then get the first/only
    # --env out of the list so all other operations work normally (they don't
    # expect an iterable). For details on this approach, see the comment above
    # the --env click option
    if not env:
        cli_default['env'] = DEFAULT_ENV
        env = None
    elif len(env) > 1:
        floyd_logger.error(
            "You passed more than one environment: {}. Please specify a single environment."
            .format(env))
        sys.exit(1)
    else:
        env = env[0]

    if not mode:
        cli_default['mode'] = 'command'

    experiment_config = ExperimentConfigManager.get_config()
    access_token = AuthConfigManager.get_access_token()
    namespace = experiment_config.namespace or access_token.username

    if not ProjectClient().exists(experiment_config.name, namespace=namespace):
        floyd_logger.error(
            'Invalid project id, please run '
            '"floyd init PROJECT_NAME" before scheduling a job.')
        sys.exit(1)

    experiment_name = "{}/{}".format(namespace, experiment_config.name)

    success, data_ids = process_data_ids(data)
    if not success:
        sys.exit(2)

    # Create module
    default_name = 'input' if len(data_ids) <= 1 else None
    module_inputs = [{
        'name': get_data_name(data_str, default_name),
        'type': 'dir'
    } for data_str in data_ids]

    instance_type = None
    if gpu2:
        instance_type = G2_INSTANCE_TYPE
    elif cpu2:
        instance_type = C2_INSTANCE_TYPE
    elif gpup:
        instance_type = G1P_INSTANCE_TYPE
    elif cpup:
        instance_type = C1P_INSTANCE_TYPE
    elif gpu:
        instance_type = G1_INSTANCE_TYPE
    elif cpu:
        instance_type = C1_INSTANCE_TYPE

    if not instance_type:
        cli_default['instance_type'] = C1_INSTANCE_TYPE

    yaml_config = read_yaml_config()
    arch = INSTANCE_ARCH_MAP[resolve_final_instance_type(
        instance_type, yaml_config, task, cli_default)]
    if not validate_env(env or cli_default['env'], arch):
        sys.exit(3)

    command_str = ' '.join(command)
    if command_str and mode in ('jupyter', 'serve'):
        floyd_logger.error(
            'Command argument "%s" cannot be used with mode: %s.\nSee http://docs.floydhub.com/guides/run_a_job/#mode for more information about run modes.',
            command_str, mode)  # noqa
        sys.exit(3)
    if command_str == '':
        # set to none so it won't override floyd config
        command_str = None

    module = Module(name=experiment_name,
                    description=message or '',
                    command=command_str,
                    mode=mode,
                    enable_tensorboard=tensorboard,
                    family_id=experiment_config.family_id,
                    inputs=module_inputs,
                    env=env,
                    instance_type=instance_type,
                    yaml_config=yaml_config,
                    task=task)

    try:
        module_id = ModuleClient().create(module, cli_default)
    except BadRequestException as e:
        if 'Project not found, ID' in e.message:
            floyd_logger.error(
                'ERROR: Please run "floyd init PROJECT_NAME" before scheduling a job.'
            )
        else:
            floyd_logger.error('ERROR: %s', e.message)
        sys.exit(4)
    floyd_logger.debug("Created module with id : %s", module_id)

    # Create experiment request
    # Get the actual command entered in the command line
    if max_runtime:
        max_runtime = int(max_runtime)
    full_command = get_command_line(instance_type, env, message, data, mode,
                                    open_notebook, tensorboard, command_str)
    experiment_request = ExperimentRequest(
        name=experiment_name,
        description=message,
        full_command=full_command,
        module_id=module_id,
        max_runtime=max_runtime,
        env=env,
        data_ids=data_ids,
        family_id=experiment_config.family_id,
        instance_type=instance_type,
        yaml_config=yaml_config,
        task=task)
    expt_client = ExperimentClient()
    expt_info = expt_client.create(experiment_request, cli_default)
    floyd_logger.debug("Created job : %s", expt_info['id'])

    job_name = expt_info['name']
    if not follow:
        show_new_job_info(expt_client, job_name, expt_info, mode,
                          open_notebook)
    else:
        # If the user specified --follow, we assume they're only interested in
        # log output and not in anything that would be displayed by
        # show_new_job_info.
        floyd_logger.info("Opening logs ...")
        instance_log_id = instance_log_id = get_log_id(job_name)
        follow_logs(instance_log_id)
Exemple #21
0
def run(ctx, gpu, env, message, data, mode, open, tensorboard, gpup, cpup,
        command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.
    """
    experiment_config = ExperimentConfigManager.get_config()
    if not ProjectClient().exists(experiment_config.family_id):
        floyd_logger.error(
            'Invalid project id, please run '
            '"floyd init PROJECT_NAME" before scheduling a job.')
        sys.exit(1)

    access_token = AuthConfigManager.get_access_token()
    experiment_name = "{}/{}".format(access_token.username,
                                     experiment_config.name)

    success, data_ids = process_data_ids(data)
    if not success:
        sys.exit(2)

    # Create module
    default_name = 'input' if len(data_ids) <= 1 else None
    module_inputs = [{
        'name': get_data_name(data_str, default_name),
        'type': 'dir'
    } for data_str in data_ids]

    if gpup:
        instance_type = G1P_INSTANCE_TYPE
    elif cpup:
        instance_type = C1P_INSTANCE_TYPE
    elif gpu:
        instance_type = G1_INSTANCE_TYPE
    else:
        instance_type = C1_INSTANCE_TYPE

    if not validate_env(env, instance_type):
        sys.exit(3)

    command_str = ' '.join(command)
    if command_str and mode in ('jupyter', 'serve'):
        floyd_logger.error(
            'Command argument "%s" cannot be used with mode: %s.\nSee http://docs.floydhub.com/guides/run_a_job/#mode for more information about run modes.',
            command_str, mode)
        sys.exit(3)

    module = Module(name=experiment_name,
                    description=message or '',
                    command=command_str,
                    mode=get_mode_parameter(mode),
                    enable_tensorboard=tensorboard,
                    family_id=experiment_config.family_id,
                    inputs=module_inputs,
                    env=env,
                    arch=INSTANCE_ARCH_MAP[instance_type])

    try:
        module_id = ModuleClient().create(module)
    except BadRequestException as e:
        if 'Project not found, ID' in e.message:
            floyd_logger.error(
                'ERROR: Please run "floyd init PROJECT_NAME" before scheduling a job.'
            )
        else:
            floyd_logger.error('ERROR: %s', e.message)
        sys.exit(4)
    floyd_logger.debug("Created module with id : %s", module_id)

    # Create experiment request
    # Get the actual command entered in the command line
    full_command = get_command_line(instance_type, env, message, data, mode,
                                    open, tensorboard, command_str)
    experiment_request = ExperimentRequest(
        name=experiment_name,
        description=message,
        full_command=full_command,
        module_id=module_id,
        data_ids=data_ids,
        family_id=experiment_config.family_id,
        instance_type=instance_type)
    expt_client = ExperimentClient()
    expt_info = expt_client.create(experiment_request)
    floyd_logger.debug("Created job : %s", expt_info['id'])

    job_name = normalize_job_name(expt_info['name'])
    floyd_logger.info("")
    table_output = [["JOB NAME"], [job_name]]
    floyd_logger.info(tabulate(table_output, headers="firstrow"))
    floyd_logger.info("")
    show_new_job_info(expt_client, job_name, expt_info, mode)
Exemple #22
0
def run(ctx, gpu, env, message, data, mode, open_notebook, tensorboard, gpup,
        cpup, gpu2, cpu2, command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.
    """
    # Error early if more than one --env is passed.  Then get the first/only
    # --env out of the list so all other operations work normally (they don't
    # expect an iterable). For details on this approach, see the comment above
    # the --env click option
    if len(env) > 1:
        floyd_logger.error(
            "You passed more than one environment: {}. Please specify a single environment."
            .format(env))
        sys.exit(1)
    env = env[0]
    experiment_config = ExperimentConfigManager.get_config()
    access_token = AuthConfigManager.get_access_token()
    namespace = experiment_config.namespace or access_token.username

    if not ProjectClient().exists(experiment_config.name, namespace=namespace):
        floyd_logger.error(
            'Invalid project id, please run '
            '"floyd init PROJECT_NAME" before scheduling a job.')
        sys.exit(1)

    experiment_name = "{}/{}".format(namespace, experiment_config.name)

    success, data_ids = process_data_ids(data)
    if not success:
        sys.exit(2)

    # Create module
    default_name = 'input' if len(data_ids) <= 1 else None
    module_inputs = [{
        'name': get_data_name(data_str, default_name),
        'type': 'dir'
    } for data_str in data_ids]

    if gpu2:
        instance_type = G2_INSTANCE_TYPE
    elif cpu2:
        instance_type = C2_INSTANCE_TYPE
    elif gpup:
        instance_type = G1P_INSTANCE_TYPE
    elif cpup:
        instance_type = C1P_INSTANCE_TYPE
    elif gpu:
        instance_type = G1_INSTANCE_TYPE
    else:
        instance_type = C1_INSTANCE_TYPE

    if not validate_env(env, instance_type):
        sys.exit(3)

    command_str = ' '.join(command)
    if command_str and mode in ('jupyter', 'serve'):
        floyd_logger.error(
            'Command argument "%s" cannot be used with mode: %s.\nSee http://docs.floydhub.com/guides/run_a_job/#mode for more information about run modes.',
            command_str, mode)  # noqa
        sys.exit(3)

    module = Module(name=experiment_name,
                    description=message or '',
                    command=command_str,
                    mode=get_mode_parameter(mode),
                    enable_tensorboard=tensorboard,
                    family_id=experiment_config.family_id,
                    inputs=module_inputs,
                    env=env,
                    arch=INSTANCE_ARCH_MAP[instance_type])

    try:
        module_id = ModuleClient().create(module)
    except BadRequestException as e:
        if 'Project not found, ID' in e.message:
            floyd_logger.error(
                'ERROR: Please run "floyd init PROJECT_NAME" before scheduling a job.'
            )
        else:
            floyd_logger.error('ERROR: %s', e.message)
        sys.exit(4)
    floyd_logger.debug("Created module with id : %s", module_id)

    # Create experiment request
    # Get the actual command entered in the command line
    full_command = get_command_line(instance_type, env, message, data, mode,
                                    open_notebook, tensorboard, command_str)
    experiment_request = ExperimentRequest(
        name=experiment_name,
        description=message,
        full_command=full_command,
        module_id=module_id,
        env=env,
        data_ids=data_ids,
        family_id=experiment_config.family_id,
        instance_type=instance_type)
    expt_client = ExperimentClient()
    expt_info = expt_client.create(experiment_request)
    floyd_logger.debug("Created job : %s", expt_info['id'])

    job_name = expt_info['name']
    show_new_job_info(expt_client, job_name, expt_info, mode, open_notebook)
"""View experiments by job."""

import streamlit as st
import numpy as np
import pandas as pd
import subprocess

from floyd.client.experiment import ExperimentClient
from floyd.client.data import DataClient
from pathlib import Path
from torch import tensor

from metalearn import plotting

experiment_client = ExperimentClient()
data_client = DataClient()

cache_dir = Path.home() / "floyd_cache"

EXPERIMENT_LIMIT = 10000
SUCCESS_STATE = "success"
METRICS_FILE = "rnn_metalearn_controller_experiment.csv"


@st.cache
def get_experiments():
    return {
        exp.name: exp
        for exp in experiment_client.get_all(limit=EXPERIMENT_LIMIT)
        if exp.state == SUCCESS_STATE
    }