Example #1
def initialize_new_upload(data_config, access_token, description=None, source_dir='.'):
    # TODO: hit upload server to check for liveness before moving on
    data_config.set_tarball_path(None)
    data_config.set_data_endpoint(None)
    data_config.set_resource_id(None)

    namespace = data_config.namespace or access_token.username
    data_name = "{}/{}".format(namespace, data_config.name)

    # Create a tarball of the data in a temporary directory
    # TODO: allow users to change the directory used for compression
    temp_dir = tempfile.mkdtemp()
    tarball_path = os.path.join(temp_dir, "floydhub_data.tar.gz")

    floyd_logger.debug("Creating tarfile with contents of current directory: %s",
                       tarball_path)
    floyd_logger.info("Compressing data...")

    # TODO: purge tarball on Ctrl-C
    create_tarfile(source_dir=source_dir, filename=tarball_path)

    # If starting a new upload fails for some reason down the line, we don't
    # want to re-tar, so save off the tarball path now
    data_config.set_tarball_path(tarball_path)
    DataConfigManager.set_config(data_config)

    # Create data object using API
    data = DataRequest(name=data_name,
                       description=description,
                       family_id=data_config.family_id,
                       data_type='gzip')
    data_info = DataClient().create(data)
    if not data_info:
        rmtree(temp_dir)
        sys.exit(1)

    data_config.set_data_id(data_info['id'])
    data_config.set_data_name(data_info['name'])
    DataConfigManager.set_config(data_config)

    # fetch auth token for upload server
    creds = DataClient().new_tus_credentials(data_info['id'])
    if not creds:
        # TODO: delete module from server?
        rmtree(temp_dir)
        sys.exit(1)

    data_resource_id = creds[0]
    data_endpoint = TusDataClient().initialize_upload(
        tarball_path,
        metadata={"filename": data_resource_id},
        auth=creds)
    if not data_endpoint:
        # TODO: delete module from server?
        floyd_logger.error("Failed to get upload URL from Floydhub!")
        rmtree(temp_dir)
        sys.exit(1)

    data_config.set_data_endpoint(data_endpoint)
    DataConfigManager.set_config(data_config)
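Taken together with complete_upload (Example #24 below), this is the first half of the resumable upload flow. A minimal sketch of how a caller might chain the two, assuming the config handling shown in Example #10; the wrapper name and resume flag are illustrative, not part of the original code:

def upload_with_resume(resume=False):
    # Hypothetical wrapper around the two halves of the upload flow
    data_config = DataConfigManager.get_config()
    access_token = AuthConfigManager.get_access_token()

    # Start a fresh upload unless resuming an interrupted one
    if not resume or not data_config.data_endpoint:
        initialize_new_upload(data_config, access_token)

    # Pushes the tarball via tus and waits for the server-side unpack
    complete_upload(DataConfigManager.get_config())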
Example #2
def process_data_ids(data):
    if len(data) > 5:
        floyd_logger.error("Cannot attach more than 5 datasets to a job")
        return False, None

    # Get the data entity from the server to:
    # 1. Confirm that the data id or uri exists and has the right permissions
    # 2. If uri is used, get the id of the dataset
    data_ids = []
    for data_name_or_id in data:
        path = None
        if ':' in data_name_or_id:
            data_name_or_id, path = data_name_or_id.split(':')
            data_name_or_id = normalize_data_name(data_name_or_id,
                                                  use_data_config=False)

        data_obj = DataClient().get(
            normalize_data_name(data_name_or_id, use_data_config=False))

        if not data_obj:
            # Try with the raw ID
            data_obj = DataClient().get(data_name_or_id)

        if not data_obj:
            floyd_logger.error(
                "Data not found for name or id: {}".format(data_name_or_id))
            return False, None
        if path:
            data_ids.append("%s:%s" % (data_obj.id, path))
        else:
            data_ids.append(data_obj.id)
    return True, data_ids
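The (success, data_ids) pair returned here feeds directly into job creation. A short sketch of a caller, following the pattern in Example #22 (get_data_name and the module_inputs shape come from there; the surrounding command is assumed):

success, data_ids = process_data_ids(data)
if not success:
    sys.exit(1)

# One directory input per attached dataset, as in Example #22
default_name = 'input' if len(data_ids) <= 1 else None
module_inputs = [{'name': get_data_name(data_str, default_name), 'type': 'dir'}
                 for data_str in data_ids]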
Example #3
def delete(ids, yes):
    """
    Delete data sets.
    """
    failures = False

    for id in ids:
        data_source = DataClient().get(id)
        if not data_source:
            failures = True
            continue

        data_name = normalize_data_name(data_source.name)
        suffix = data_name.split('/')[-1]
        if not suffix.isdigit():
            failures = True
            floyd_logger.error('%s is not a dataset, skipped.', id)
            if suffix == 'output':
                floyd_logger.error(
                    'To delete job output, please delete the job itself.')
            continue

        if not yes and not click.confirm("Delete Data: {}?".format(data_name),
                                         abort=False,
                                         default=False):
            floyd_logger.info("Data %s: Skipped", data_name)
            continue

        if not DataClient().delete(data_source.id):
            failures = True
        else:
            floyd_logger.info("Data %s: Deleted", data_name)

    if failures:
        sys.exit(1)
Example #4
def status(id):
    """
    Show the status of a run with the given id.
    It can also list the status of all runs in the project.
    """
    if id:
        data_source = DataClient().get(id)
        print_data([data_source])
    else:
        data_sources = DataClient().get_all()
        print_data(data_sources)
Example #5
def delete(id, yes):
    """
    Delete data set.
    """
    data_source = DataClient().get(id)
    if not data_source:
        floyd_logger.error("Data not found for id: {}".format(id))
        sys.exit(1)

    if not yes:
        click.confirm('Delete Data: {}?'.format(data_source.name), abort=True, default=False)

    if DataClient().delete(id):
        floyd_logger.info("Data deleted")
    else:
        floyd_logger.error("Failed to delete data")
Example #6
def get_data_object(data_id, use_data_config=True):
    """
    Normalize the data_id and query the server.
    If that is unavailable try the raw ID
    """
    normalized_data_reference = normalize_data_name(data_id, use_data_config=use_data_config)
    client = DataClient()
    data_obj = client.get(normalized_data_reference)

    # Try with the raw ID
    if not data_obj and data_id != normalized_data_reference:
        data_obj = client.get(data_id)

    return data_obj
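This helper is the lookup used by most of the dataset commands below (Examples #9, #14, #17, #19 and #20). A typical call, with a made-up dataset reference for illustration:

# 'alice/datasets/mnist-data/1' is a hypothetical reference, not a real dataset
data_source = get_data_object('alice/datasets/mnist-data/1', use_data_config=False)
if not data_source:
    sys.exit()
print(data_source.resource_id)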
Example #7
def clone(id):
    """
    Download all files in the dataset to the current path
    """
    data_source = DataClient().get(id)

    if not data_source:
        sys.exit()

    data_url = "{}/api/v1/resources/{}?content=true&download=true".format(
        floyd.floyd_host, data_source.resource_id)
    DataClient().download_tar(url=data_url,
                              untar=True,
                              delete_after_untar=True)
Example #8
def status(id):
    """
    Show the status of a run with the given id or a friendly name.
    It can also list the status of all runs in the project.
    """
    if id:
        data_source = DataClient().get(normalize_data_name(id))

        if not data_source:
            # Try with the raw ID
            data_source = DataClient().get(id)

        print_data([data_source] if data_source else [])
    else:
        data_sources = DataClient().get_all()
        print_data(data_sources)
Example #9
def listfiles(data_name):
    """
    List files in a dataset.
    """

    data_source = get_data_object(data_name, use_data_config=False)

    if not data_source:
        if 'output' in data_name:
            floyd_logger.info("Note: You cannot clone the output of a running job. You need to wait for it to finish.")
        sys.exit()

    # Depth-first search
    dirs = ['']
    paths = []
    while dirs:
        cur_dir = dirs.pop()
        url = "/resources/{}/{}?content=true".format(data_source.resource_id, cur_dir)
        response = DataClient().request("GET", url).json()

        if response['skipped_files'] > 0:
            floyd_logger.info("Warning: in directory '%s', %s/%s files skipped (too many files)", cur_dir, response['skipped_files'], response['total_files'])

        files = response['files']
        files.sort(key=lambda f: f['name'])
        for f in files:
            path = os.path.join(cur_dir, f['name'])
            if f['type'] == 'directory':
                path += os.sep
            paths.append(path)

            if f['type'] == 'directory':
                dirs.append(os.path.join(cur_dir, f['name']))
    for path in paths:
        floyd_logger.info(path)
Example #10
def upload():
    """
    Upload data in the current dir to Floyd.
    """
    data_config = DataConfigManager.get_config()
    access_token = AuthConfigManager.get_access_token()
    version = data_config.version

    # Create data object
    data_name = "{}/{}:{}".format(access_token.username, data_config.name,
                                  version)
    data = DataRequest(name=data_name, description=version, version=version)
    data_id = DataClient().create(data)
    floyd_logger.debug("Created data with id : {}".format(data_id))
    floyd_logger.info("Upload finished")

    # Update expt config including predecessor
    data_config.increment_version()
    data_config.set_data_predecessor(data_id)
    DataConfigManager.set_config(data_config)

    # Print output
    table_output = [["DATA ID", "NAME", "VERSION"],
                    [data_id, data_name, version]]
    floyd_logger.info(tabulate(table_output, headers="firstrow"))
Example #11
def add(source):
    """
    Create a new dataset version from the contents of a job.

    This will create a new dataset version with the job output.
    Use the full job name: foo/projects/bar/1/code, foo/projects/bar/1/files or foo/projects/bar/1/output
    """
    new_data = DatasetClient().add_data(source)
    print_data([DataClient().get(new_data['data_id'])])
Example #12
def clone(id):
    """
    Download all files in the dataset to the current path
    """
    data_source = DataClient().get(id)

    if not data_source:
        if 'output' in id:
            floyd_logger.info(
                "Note: You cannot clone the output of a running job. You need to wait for it to finish."
            )
        sys.exit()

    data_url = "{}/api/v1/resources/{}?content=true&download=true".format(
        floyd.floyd_host, data_source.resource_id)
    DataClient().download_tar(url=data_url,
                              untar=True,
                              delete_after_untar=True)
Example #13
def output(id, url):
    """
    Shows the url of the dataset. You can use id or a friendly URI.
    By default opens the output page in your default browser.
    """
    data_source = DataClient().get(normalize_data_name(id))
    if id and not data_source:
        # Try with the raw ID
        data_source = DataClient().get(id)

    if not data_source:
        sys.exit()

    data_url = "%s/%s" % (floyd.floyd_web_host, data_source.name)
    if url:
        floyd_logger.info(data_url)
    else:
        floyd_logger.info("Opening output directory in your browser ...")
        webbrowser.open(data_url)
Example #14
def status(id):
    """
    View status of all versions in a dataset.

    The command also accepts a specific dataset version.
    """
    if id:
        data_source = get_data_object(id, use_data_config=False)
        print_data([data_source] if data_source else [])
    else:
        data_sources = DataClient().get_all()
        print_data(data_sources)
Example #15
def output(id, url):
    """
    Shows the output url of the run.
    By default opens the output page in your default browser.
    """
    data_source = DataClient().get(id)
    if not data_source:
        sys.exit()

    data_url = "{}/api/v1/resources/{}?content=true".format(
        floyd.floyd_host, data_source.resource_id)
    if url:
        floyd_logger.info(data_url)
    else:
        floyd_logger.info("Opening output directory in your browser ...")
        webbrowser.open(data_url)
Example #16
def getfile(data_name, path):
    """
    Get the specified individual file from a dataset
    """

    data_source = DataClient().get(
        normalize_data_name(data_name, use_data_config=False))
    if data_name and not data_source:
        # Try with the raw ID
        data_source = DataClient().get(data_name)

    if not data_source:
        if 'output' in data_name:
            floyd_logger.info(
                "Note: You cannot clone the output of a running job. You need to wait for it to finish."
            )
        sys.exit()

    url = "{}/api/v1/resources/{}/{}?content=true".format(
        floyd.floyd_host, data_source.resource_id, path)
    fname = os.path.basename(path)
    DataClient().download(url, filename=fname)
Example #17
def get_output(id, path, untar, delete_after_untar):
    """
    - Download all files in a dataset or from a Job output
    Eg: alice/projects/mnist/1/files, alice/projects/mnist/1/output or
    alice/dataset/mnist-data/1/

    Using /output will download the files that are saved at the end of the job.
    Note: This will download the files that are saved at
    the end of the job.
    - Download a directory from a dataset or from Job output
    Specify the path to a directory and download all its files and
    subdirectories.
    Eg: --path models/checkpoint1
    """
    data_source = get_data_object(id, use_data_config=False)

    if not data_source:
        if "output" in id:
            floyd_logger.info(
                "Note: You cannot clone the output of a running job. You need "
                "to wait for it to finish.")
        sys.exit()

    if path:
        # Download a directory from Dataset or Files
        # Get the type of data resource from the id
        # (foo/projects/bar/ or foo/datasets/bar/)
        if "/datasets/" in id:
            resource_type = "data"
            resource_id = data_source.id
        else:
            resource_type = "files"
            try:
                experiment = ExperimentClient().get(
                    normalize_job_name(id, use_config=False))
            except FloydException:
                experiment = ExperimentClient().get(id)
            resource_id = experiment.id

        data_url = "{}/api/v1/download/artifacts/{}/{}?is_dir=true&path={}" \
            .format(floyd.floyd_host, resource_type, resource_id, path)
    else:
        # Download the full Dataset
        data_url = "{}/api/v1/resources/{}?content=true&download=true".format(
            floyd.floyd_host, data_source.resource_id)

    DataClient().download_tar(
        url=data_url,
        untar=untar,
        delete_after_untar=untar and delete_after_untar,
    )
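As a concrete illustration of the two branches (the names come from the docstring above; the host placeholder is an assumption):

# With --path models/checkpoint1 on alice/projects/mnist/1/files, the job is
# resolved via ExperimentClient and the artifact URL takes the form:
#   <floyd_host>/api/v1/download/artifacts/files/<experiment.id>?is_dir=true&path=models/checkpoint1
# Without --path, the whole resource is fetched as a single tarball:
#   <floyd_host>/api/v1/resources/<data_source.resource_id>?content=true&download=true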
Example #18
def delete(ids, yes):
    """
    Delete data sets.
    """
    failures = False

    for id in ids:
        data_source = DataClient().get(id)
        if not data_source:
            failures = True
            continue

        if not yes and not click.confirm("Delete Data: {}?".format(
                data_source.name),
                                         abort=False,
                                         default=False):
            floyd_logger.info("Data {}: Skipped".format(data_source.name))
            continue

        if not DataClient().delete(id):
            failures = True

    if failures:
        sys.exit(1)
Example #19
def clone(id):
    """
    Download all files in a dataset.
    """
    data_source = get_data_object(id, use_data_config=False)

    if not data_source:
        if 'output' in id:
            floyd_logger.info("Note: You cannot clone the output of a running job. You need to wait for it to finish.")
        sys.exit()

    data_url = "{}/api/v1/resources/{}?content=true&download=true".format(floyd.floyd_host,
                                                                          data_source.resource_id)
    DataClient().download_tar(url=data_url,
                              untar=True,
                              delete_after_untar=True)
Example #20
def getfile(data_name, path):
    """
    Download a specific file from a dataset.
    """

    data_source = get_data_object(data_name, use_data_config=False)

    if not data_source:
        if 'output' in data_name:
            floyd_logger.info("Note: You cannot clone the output of a running job. You need to wait for it to finish.")
        sys.exit()

    url = "{}/api/v1/resources/{}/{}?content=true".format(floyd.floyd_host, data_source.resource_id, path)
    fname = os.path.basename(path)
    DataClient().download(url, filename=fname)
    floyd_logger.info("Download finished")
Example #21
def output(id, url):
    """
    Shows the url of the dataset. You can use id or a friendly URI.
    By default opens the output page in your default browser.
    """
    data_source = DataClient().get(id)

    if not data_source:
        sys.exit()

    resource = ResourceClient().get(data_source.resource_id)
    data_url = "{}/viewer/{}".format(floyd.floyd_host, resource.uri)
    if url:
        floyd_logger.info(data_url)
    else:
        floyd_logger.info("Opening output directory in your browser ...")
        webbrowser.open(data_url)
Example #22
def run(ctx, gpu, env, message, data, mode, open, tensorboard, command):
    """
    Run a command on Floyd. Floyd will upload contents of the
    current directory and run your command remotely.
    This command will generate a run id for reference.
    """
    experiment_config = ExperimentConfigManager.get_config()
    if not ProjectClient().exists(experiment_config.family_id):
        floyd_logger.error(
            'Invalid project id, please run '
            '"floyd init PROJECT_NAME" before scheduling a job.')
        return

    access_token = AuthConfigManager.get_access_token()
    experiment_name = "{}/{}".format(access_token.username,
                                     experiment_config.name)

    # Create module
    if len(data) > 5:
        floyd_logger.error("Cannot attach more than 5 datasets to an job")
        return

    # Get the data entity from the server to:
    # 1. Confirm that the data id or uri exists and has the right permissions
    # 2. If uri is used, get the id of the dataset
    data_ids = []
    for data_name_or_id in data:
        path = None
        if ':' in data_name_or_id:
            data_name_or_id, path = data_name_or_id.split(':')
        data_obj = DataClient().get(data_name_or_id)
        if not data_obj:
            floyd_logger.error(
                "Data not found for name or id: {}".format(data_name_or_id))
            return
        data_ids.append(
            "{}:{}".format(data_obj.id, path) if path else data_obj.id)

    default_name = 'input' if len(data_ids) <= 1 else None
    module_inputs = [{
        'name': get_data_name(data_str, default_name),
        'type': 'dir'
    } for data_str in data_ids]

    if gpu:
        arch = 'gpu'
        instance_type = GPU_INSTANCE_TYPE
    else:
        arch = 'cpu'
        instance_type = CPU_INSTANCE_TYPE

    env_map = EnvClient().get_all()
    envs = env_map.get(arch)
    if envs:
        if env not in envs:
            floyd_logger.error(
                "{} is not in the list of supported environments: {}".format(
                    env, ', '.join(envs.keys())))
            return
    else:
        floyd_logger.error("{} is not a supported architecture".format(arch))
        return

    command_str = ' '.join(command)
    module = Module(name=experiment_name,
                    description=message or '',
                    command=command_str,
                    mode=get_mode_parameter(mode),
                    enable_tensorboard=tensorboard,
                    family_id=experiment_config.family_id,
                    inputs=module_inputs,
                    env=env,
                    arch=arch)

    from floyd.exceptions import BadRequestException
    try:
        module_id = ModuleClient().create(module)
    except BadRequestException as e:
        if 'Project not found, ID' in e.message:
            floyd_logger.error(
                'ERROR: Please run "floyd init PROJECT_NAME" before scheduling a job.'
            )
        else:
            floyd_logger.error('ERROR: %s', e.message)
        sys.exit(1)
    floyd_logger.debug("Created module with id : {}".format(module_id))

    # Create experiment request
    # Get the actual command entered in the command line
    full_command = get_command_line(gpu, env, message, data, mode, open,
                                    tensorboard, command)
    experiment_request = ExperimentRequest(
        name=experiment_name,
        description=message,
        full_command=full_command,
        module_id=module_id,
        data_ids=data_ids,
        family_id=experiment_config.family_id,
        instance_type=instance_type)
    expt_cli = ExperimentClient()
    expt_info = expt_cli.create(experiment_request)
    floyd_logger.debug("Created job : {}".format(expt_info['id']))

    table_output = [["JOB NAME"], [expt_info['name']]]
    floyd_logger.info(tabulate(table_output, headers="firstrow"))
    floyd_logger.info("")

    if mode in ['jupyter', 'serve']:
        while True:
            # Wait for the experiment / task instances to become available
            try:
                experiment = expt_cli.get(expt_info['id'])
                if experiment.task_instances:
                    break
            except Exception:
                floyd_logger.debug("Job not available yet: {}".format(
                    expt_info['id']))

            floyd_logger.debug("Job not available yet: {}".format(
                expt_info['id']))
            sleep(3)
            continue

        # Print the path to jupyter notebook
        if mode == 'jupyter':
            jupyter_url = experiment.service_url
            print(
                "Setting up your instance and waiting for Jupyter notebook to become available ...",
                end='')
            if wait_for_url(jupyter_url,
                            sleep_duration_seconds=2,
                            iterations=900):
                floyd_logger.info(
                    "\nPath to jupyter notebook: {}".format(jupyter_url))
                if open:
                    webbrowser.open(jupyter_url)
            else:
                floyd_logger.info(
                    "\nPath to jupyter notebook: {}".format(jupyter_url))
                floyd_logger.info(
                    "Notebook is still loading. View logs to track progress")
                floyd_logger.info("   floyd logs {}".format(expt_info['name']))

        # Print the path to serving endpoint
        if mode == 'serve':
            floyd_logger.info("Path to service endpoint: {}".format(
                experiment.service_url))

        if experiment.timeout_seconds < 4 * 60 * 60:
            floyd_logger.info(
                "\nYour job timeout is currently set to {} seconds".format(
                    experiment.timeout_seconds))
            floyd_logger.info(
                "This is because you are in a trial account. Paid users will have longer timeouts. "
                "See https://www.floydhub.com/pricing for details")

    else:
        floyd_logger.info("To view logs enter:")
        floyd_logger.info("   floyd logs {}".format(expt_info['name']))
Example #23
def add(source):
    """
    Create data for current dataset from a given source, for example: foo/projects/bar/1/output
    """
    new_data = DatasetClient().add_data(source)
    print_data([DataClient().get(new_data['data_id'])])
Example #24
def complete_upload(data_config):
    data_endpoint = data_config.data_endpoint
    data_id = data_config.data_id
    tarball_path = data_config.tarball_path

    if not data_id:
        floyd_logger.error("Corrupted upload state, please start a new one.")
        sys.exit(1)

    # check for tarball upload, upload to server if not done
    if not data_config.resource_id and (tarball_path and data_endpoint):
        floyd_logger.debug("Getting fresh upload credentials")
        creds = DataClient().new_tus_credentials(data_id)
        if not creds:
            sys.exit(1)

        file_size = os.path.getsize(tarball_path)
        # check against the upload size limit
        if file_size > MAX_UPLOAD_SIZE:
            try:
                floyd_logger.info("Removing compressed data...")
                rmtree(os.path.dirname(tarball_path))
            except (OSError, TypeError):
                pass

            sys.exit(("Data size too large to upload, please keep it under %s.\n") %
                     (sizeof_fmt(MAX_UPLOAD_SIZE)))

        floyd_logger.info("Uploading compressed data. Total upload size: %s",
                          sizeof_fmt(file_size))
        tus_client = TusDataClient()
        if not tus_client.resume_upload(tarball_path, data_endpoint, auth=creds):
            floyd_logger.error("Failed to finish upload!")
            return

        try:
            floyd_logger.info("Removing compressed data...")
            rmtree(os.path.dirname(tarball_path))
        except (OSError, TypeError):
            pass

        floyd_logger.debug("Created data with id : %s", data_id)
        floyd_logger.info("Upload finished.")

        # Update data config
        data_config.set_tarball_path(None)
        data_config.set_data_endpoint(None)
        data_source = DataClient().get(data_id)
        data_config.set_resource_id(data_source.resource_id)
        DataConfigManager.set_config(data_config)

    # data tarball uploaded, check for server untar
    if data_config.resource_id:
        floyd_logger.info(
            "Waiting for server to unpack data.\n"
            "You can exit at any time and come back to check the status with:\n"
            "\tfloyd data upload -r")
        try:
            for i in dots(ResourceWaitIter(data_config.resource_id),
                          label='Waiting for unpack...'):
                pass
        except WaitTimeoutException:
            clint_STREAM.write('\n')
            clint_STREAM.flush()
            floyd_logger.info(
                "Looks like it is going to take longer for Floydhub to unpack "
                "your data. Please check back later.")
            sys.exit(1)
        else:
            data_config.set_resource_id(None)
            data_config.set_tarball_path(None)
            data_config.set_data_endpoint(None)
            data_config.set_data_id(None)
            DataConfigManager.set_config(data_config)

    # Print output
    table_output = [["NAME"],
                    [normalize_data_name(data_config.data_name)]]
    floyd_logger.info('')
    floyd_logger.info(tabulate(table_output, headers="firstrow"))
"""View experiments by job."""

import streamlit as st
import numpy as np
import pandas as pd
import subprocess

from floyd.client.experiment import ExperimentClient
from floyd.client.data import DataClient
from pathlib import Path
from torch import tensor

from metalearn import plotting

experiment_client = ExperimentClient()
data_client = DataClient()

cache_dir = Path.home() / "floyd_cache"

EXPERIMENT_LIMIT = 10000
SUCCESS_STATE = "success"
METRICS_FILE = "rnn_metalearn_controller_experiment.csv"


@st.cache
def get_experiments():
    return {
        exp.name: exp
        for exp in experiment_client.get_all(limit=EXPERIMENT_LIMIT)
        if exp.state == SUCCESS_STATE
    }
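A hedged sketch of how the cached experiments might then be surfaced in the Streamlit view (the widgets are assumptions; only get_experiments and the experiment attributes come from the code above):

# Hypothetical continuation of the view
experiments = get_experiments()
selected_name = st.selectbox("Job", sorted(experiments))
st.write("Job id: %s" % experiments[selected_name].id)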