Code example #1
def local_files(ctx, role, name, compute_hash: bool, export: bool,
                imported: bool, path: str):
    """Add a local file directory (not managed by git) to the workspace. Subcommand of ``add``"""
    ns = ctx.obj
    if role is None:
        if imported:
            role = ResourceRoles.SOURCE_DATA_SET
        elif ns.batch:
            raise BatchModeError("--role")
        else:
            role = click.prompt(
                "Please enter a role for this resource, one of [s]ource-data, [i]ntermediate-data, [c]ode, or [r]esults",
                type=ROLE_PARAM,
            )
    path = abspath(expanduser(path))
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    if export and role in (ResourceRoles.SOURCE_DATA_SET, ResourceRoles.CODE):
        raise click.BadOptionUsage(
            message="Cannot export a source data or code resource",
            option_name="export")
    if export and imported:
        raise click.BadOptionUsage(
            message="Cannot specify both --export and --imported",
            option_name="imported")
    if imported and role != ResourceRoles.SOURCE_DATA_SET:
        raise click.BadOptionUsage(
            message="--imported only for source-data roles",
            option_name="imported")
    add_command("file", role, name, workspace, path, compute_hash, export,
                imported)
Code example #2
    def _init_dws_state(self):
        workspace = find_and_load_workspace(
            batch=True,
            verbose=self.verbose,
            uri_or_local_path=self.workspace_dir)
        self._dws_state = _DwsModelState(workspace, self.input_resource,
                                         self.results_resource)
Code example #3
def pull(ctx, workspace_dir: str, only: Optional[str], skip: Optional[str],
         only_workspace: bool):
    """Pull the latest state of the workspace and its resources from their origins."""
    ns = ctx.obj
    option_cnt = ((1 if only is not None else 0) +
                  (1 if skip is not None else 0) +
                  (1 if only_workspace else 0))
    if option_cnt > 1:
        raise click.BadOptionUsage(
            message=
            "Please specify at most one of --only, --skip, or --only-workspace",
            option_name="--only")  # type: ignore
    if workspace_dir is None:
        if ns.batch:
            raise BatchModeError("--workspace-dir")
        else:
            workspace_dir = click.prompt("Please enter the workspace root dir",
                                         type=WORKSPACE_PARAM)
    workspace = find_and_load_workspace(ns.batch, ns.verbose, workspace_dir)
    pull_command(
        workspace,
        only=only.split(",") if only else None,
        skip=skip.split(",") if skip else None,
        only_workspace=only_workspace,
    )
Code example #4
def restore(
    ctx,
    workspace_dir: str,
    only: Optional[str],
    leave: Optional[str],
    strict: bool,
    tag_or_hash: str,
):
    """Restore the workspace to a prior state"""
    ns = ctx.obj
    if (only is not None) and (leave is not None):
        raise click.BadOptionUsage(
            option_name="--only",
            message="Please specify either --only or --leave, but not both"
        )  # type: ignore
    if workspace_dir is None:
        if ns.batch:
            raise BatchModeError("--workspace-dir")
        else:
            workspace_dir = click.prompt("Please enter the workspace root dir",
                                         type=WORKSPACE_PARAM)
    workspace = find_and_load_workspace(ns.batch, ns.verbose, workspace_dir)
    restore_command(
        workspace,
        tag_or_hash,
        only=only.split(",") if only else None,
        leave=leave.split(",") if leave else None,
        strict=strict,
    )
Code example #5
def get_snapshot_history(
    workspace_uri_or_path: Optional[str] = None,
    reverse: bool = False,
    max_count: Optional[int] = None,
    verbose: bool = False,
) -> Iterable[SnapshotInfo]:
    """Get the history of snapshots, starting with the oldest first (unless :reverse: is True).
    Returns a list of SnapshotInfo instances, containing the snapshot number,
    hash, tag, timestamp, and message. If :max_count: is specified, returns at most that many snapshots.

    """
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    assert isinstance(workspace, SnapshotWorkspaceMixin)
    if not reverse:
        return [
            SnapshotInfo(
                snapshot_idx + 1, md.hashval, md.tags, md.timestamp, md.message, md.metrics
            )
            for (snapshot_idx, md) in enumerate(
                workspace.list_snapshots(reverse=False, max_count=max_count)
            )
        ]
    else:
        last_snapshot_no = workspace.get_next_snapshot_number() - 1
        return [
            SnapshotInfo(
                last_snapshot_no - i, md.hashval, md.tags, md.timestamp, md.message, md.metrics
            )
            for (i, md) in enumerate(workspace.list_snapshots(reverse=True, max_count=max_count))
        ]
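
A minimal usage sketch for get_snapshot_history, assuming it is exported from dataworkspaces.api (the api.py file names later in this listing suggest so) and that SnapshotInfo's field names match the constructor arguments above:

# Hypothetical usage; the import path and field names are assumptions.
from dataworkspaces.api import get_snapshot_history

# Print the three most recent snapshots, newest first.
for info in get_snapshot_history(reverse=True, max_count=3):
    print(info.snapshot_number, info.hashval, info.tags, info.message)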
Code example #6
    def __init__(
        self,
        model_name: str,
        monitor: str = "val_loss",
        save_best_only: bool = False,
        mode: str = "auto",
        save_freq: Union[str, int] = "epoch",
        results_resource: Optional[Union[str, ResourceRef]] = None,
        workspace_dir: Optional[str] = None,
        verbose: Union[int, bool] = 0,
    ):
        """
        model_name is used to create the checkpoint filenames. The checkpoints
        will be saved as MODEL_NAME_{epoch}.

        Currently, only supports save_weights_only option.

        verbose can be either 0,1 in the style of tensorflow or a True,False
        in the style of Data Workspaces.

        """
        self.dws_model_name = model_name
        if not verbose:
            tf_verbose = 0
            dws_verbose = False
        else:
            tf_verbose = 1
            dws_verbose = True

        self.workspace = find_and_load_workspace(
            batch=True, verbose=dws_verbose, uri_or_local_path=workspace_dir)

        results_ref = _find_resource(self.workspace, ResourceRoles.RESULTS,
                                     results_resource)
        self.results_resource = self.workspace.get_resource(results_ref.name)
        if not isinstance(self.results_resource, FileResourceMixin):
            raise ConfigurationError(
                "Resource %s is not a file-based resource" % results_ref.name)
        self.results_subdir = results_ref.subpath  # type: Optional[str]
        scratch_dir = self.workspace.get_scratch_directory()
        assert isdir(scratch_dir), "missing scratch directory %s" % scratch_dir
        self.dws_checkpoint_path = join(scratch_dir,
                                        "checkpoints")  # type: str
        if not isdir(self.dws_checkpoint_path):
            os.mkdir(self.dws_checkpoint_path)
        self.checkpoint_filepath_template = join(self.dws_checkpoint_path,
                                                 model_name + "_{epoch}")
        super().__init__(
            filepath=self.checkpoint_filepath_template,
            monitor=monitor,
            save_best_only=save_best_only,
            mode=mode,
            save_freq=save_freq,
            save_weights_only=True,
            verbose=tf_verbose,
        )
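
A minimal sketch of using this checkpoint callback with a small Keras model. The module path is an assumption, and a workspace with a results resource is assumed to already exist:

import tensorflow as tf
from dataworkspaces.kits.tensorflow import DwsModelCheckpoint  # assumed module path

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer="adam", loss="mse")
x = tf.random.normal((32, 4))
y = tf.random.normal((32, 1))
# Checkpoints are written under the workspace's scratch directory as my_model_{epoch}.
checkpoint_cb = DwsModelCheckpoint("my_model", monitor="loss", save_best_only=True)
model.fit(x, y, epochs=2, callbacks=[checkpoint_cb])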
Code example #7
def snapshot(ctx, workspace_dir, message, tag):
    """Take a snapshot of the current workspace's state"""
    ns = ctx.obj
    if workspace_dir is None:
        if ns.batch:
            raise BatchModeError("--workspace-dir")
        else:
            workspace_dir = click.prompt("Please enter the workspace root dir",
                                         type=WORKSPACE_PARAM)
    workspace = find_and_load_workspace(ns.batch, ns.verbose, workspace_dir)
    snapshot_command(workspace, tag, message)
Code example #8
def status(ctx, workspace_dir, history, limit):
    """NOTE: this command is DEPRECATED. Please use ``dws report status`` and ``dws report history`` instead."""
    ns = ctx.obj
    if workspace_dir is None:
        if ns.batch:
            raise BatchModeError("--workspace-dir")
        else:
            workspace_dir = click.prompt("Please enter the workspace root dir",
                                         type=WORKSPACE_PARAM)
    workspace = find_and_load_workspace(ns.batch, ns.verbose, workspace_dir)
    status_command(workspace, history, limit)
Code example #9
def diff(ctx, workspace_dir, snapshot_or_tag1, snapshot_or_tag2):
    """List differences between two snapshots"""
    ns = ctx.obj
    if workspace_dir is None:
        if ns.batch:
            raise BatchModeError("--workspace-dir")
        else:
            workspace_dir = click.prompt("Please enter the workspace root dir",
                                         type=WORKSPACE_PARAM)
    workspace = find_and_load_workspace(ns.batch, ns.verbose, workspace_dir)
    diff_command(workspace, snapshot_or_tag1, snapshot_or_tag2)
Code example #10
def take_snapshot(
    workspace_uri_or_path: Optional[str] = None,
    tag: Optional[str] = None,
    message: str = "",
    verbose: bool = False,
) -> str:
    """Take a snapshot of the workspace, using the tag and message,
    if provided. Returns the snapshot hash (which can be used to restore to
    this point).
    """
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    return snapshot_command(workspace, tag=tag, message=message)
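
A minimal sketch, assuming take_snapshot is exported from dataworkspaces.api and is called from inside a workspace:

from dataworkspaces.api import take_snapshot  # assumed import path

# tag and message are optional; the returned hash can later be passed to restore.
snapshot_hash = take_snapshot(tag="v1", message="baseline results")
print("created snapshot", snapshot_hash)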
Code example #11
def get_local_path_for_resource(
        name: str,
        workspace_uri_or_path: Optional[str] = None,
        verbose: bool = False
    ) -> Optional[str]:
    """If a local path is available for this resource, return it. Otherwise,
    return None."""
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    r = workspace.get_resource(name)
    if isinstance(r, LocalStateResourceMixin):
        return cast(LocalStateResourceMixin, r).get_local_path_if_any()
    else:
        return None
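
A minimal sketch, assuming the function is exported from dataworkspaces.api; the resource name "source-data" is hypothetical:

from dataworkspaces.api import get_local_path_for_resource  # assumed import path

path = get_local_path_for_resource("source-data")  # hypothetical resource name
if path is not None:
    print("resource is available locally at", path)
else:
    print("resource has no local copy")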
Code example #12
def s3(ctx, role, name, bucket_name: str):
    """Add a S3 resource to the workspace. Subcommand of ``add``"""
    ns = ctx.obj
    if role is None:
        if ns.batch:
            raise BatchModeError("--role")
        else:
            role = click.prompt(
                "Please enter a role for this resource, one of [s]ource-data, [i]ntermediate-data, [c]ode, or [r]esults",
                type=ROLE_PARAM,
            )
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    add_command("s3", role, name, workspace, bucket_name)
Code example #13
def deploy_build(
    ctx,
    image_name: Optional[str],
    force_rebuild: bool,
    git_user_email: Optional[str],
    git_user_name: Optional[str],
):
    """Build a docker image containing this workspace. This command is EXERIMENTAL
    and subject to change."""
    ns = ctx.obj
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    deploy_build_command(workspace, image_name, force_rebuild, git_user_email,
                         git_user_name)
Code example #14
def get_filesystem_for_resource(name: str,
        workspace_uri_or_path: Optional[str] = None,
        verbose: bool = False
    ) -> Optional[ResourceFileSystem]:
    """Get the a filesystem-like object for the named resource.
    If it isn't a FileResource, returns None.
    """
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    resource = workspace.get_resource(name)
    if isinstance(resource, FileResourceMixin):
        return ResourceFileSystem(resource)
    else:
        return None
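
A minimal sketch, assuming the function is exported from dataworkspaces.api; both the resource name and the exists() method on ResourceFileSystem are assumptions:

from dataworkspaces.api import get_filesystem_for_resource  # assumed import path

fs = get_filesystem_for_resource("results")  # hypothetical resource name
if fs is not None:
    # exists() is assumed to be part of the filesystem-like interface.
    print(fs.exists("results.json"))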
Code example #15
def graph(
    ctx,
    resource: Optional[str],
    snapshot: Optional[str],
    format: str,
    width: int,
    height: int,
    output_file: str,
):
    """Graph the lineage of a resource, writing the graph to an HTML file. Subcommand of ``lineage``"""
    ns = ctx.obj
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    lineage_graph_command(workspace, output_file, resource, snapshot, format,
                          width, height)
Code example #16
def api_resource(ctx, role, name):
    """Resource to represent data obtained via an API. Use this when there is
    no file-based representation of your data that can be versioned and captured
    more directly. Subcommand of ``add``"""
    ns = ctx.obj
    if role is None:
        if ns.batch:
            raise BatchModeError("--role")
        else:
            role = click.prompt(
                "Please enter a role for this resource, either [s]ource-data or [i]ntermediate-data",
                type=DATA_ROLE_PARAM,
            )
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    add_command("api-resource", role, name, workspace)
Code example #17
def delete_snapshot(ctx, workspace_dir: str, no_include_resources: bool,
                    tag_or_hash: str):
    """Delete the specified snapshot. This includes the metadata and lineage
    data for the snapshot. Unless --no-include-resources is specified, this
    also deletes any results data saved for the snapshot (under the
    snapshots subdirectory of a results resource)."""
    ns = ctx.obj
    if workspace_dir is None:
        if ns.batch:
            raise BatchModeError("--workspace-dir")
        else:
            workspace_dir = click.prompt("Please enter the workspace root dir",
                                         type=WORKSPACE_PARAM)
    workspace = find_and_load_workspace(ns.batch, ns.verbose, workspace_dir)
    delete_snapshot_command(workspace, tag_or_hash, no_include_resources)
Code example #18
def publish(ctx, workspace_dir, skip: str, remote_repository):
    """Add a remote Git repository as the origin for the workspace and
    do the initial push of the workspace and any other resources.
    """
    ns = ctx.obj
    if workspace_dir is None:
        if ns.batch:
            raise BatchModeError("--workspace-dir")
        else:
            workspace_dir = click.prompt("Please enter the workspace root dir",
                                         type=WORKSPACE_PARAM)
    workspace = find_and_load_workspace(ns.batch, ns.verbose, workspace_dir)
    publish_command(workspace, remote_repository)
    push_command(workspace,
                 only=None,
                 skip=skip.split(",") if skip else None,
                 only_workspace=False)
Code example #19
File: api.py Project: kisdma/data-workspaces-core
def get_resource_info(workspace_uri_or_path: Optional[str] = None, verbose: bool = False):
    """Returns a list of ResourceInfo instances, describing the resources
    defined for this workspace.
    """
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)

    return [
        ResourceInfo(
            r.name,
            r.role,
            r.resource_type,
            cast(LocalStateResourceMixin, r).get_local_path_if_any()
            if isinstance(r, LocalStateResourceMixin)
            else None,
        )
        for r in workspace.get_resources()
    ]
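
A minimal usage sketch; the ResourceInfo field names are inferred from the constructor arguments above:

from dataworkspaces.api import get_resource_info

for info in get_resource_info():
    # Field names (name, role, resource_type, local_path) are assumptions.
    print(info.name, info.role, info.resource_type, info.local_path)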
Code example #20
File: dws.py Project: kisdma/data-workspaces-core
def rclone(
    ctx,
    role,
    name,
    config: str,
    compute_hash: bool,
    export: bool,
    imported: bool,
    source: str,
    dest: str,
):
    """Add an rclone-d repository as a resource to the workspace. Subcommand of ``add``"""
    ns = ctx.obj
    if role is None:
        if imported:
            role = ResourceRoles.SOURCE_DATA_SET
        elif ns.batch:
            raise BatchModeError("--role")
        else:
            role = click.prompt(
                "Please enter a role for this resource, one of [s]ource-data, [i]ntermediate-data, [c]ode, or [r]esults",
                type=ROLE_PARAM,
            )
    rclone_re = r".*:.*"
    if re.match(rclone_re, source) is None:
        raise click.BadOptionUsage(
            message=
            "Source in rclone should be specified as remotename:filepath",
            option_name="source",
        )
    if export and role in (ResourceRoles.SOURCE_DATA_SET, ResourceRoles.CODE):
        raise click.BadOptionUsage(
            message="Cannot export a source data or code resource",
            option_name="export")
    if export and imported:
        raise click.BadOptionUsage(
            message="Cannot specify both --export and --imported",
            option_name="imported")
    if imported and role != ResourceRoles.SOURCE_DATA_SET:
        raise click.BadOptionUsage(
            message="--imported only for source-data roles",
            option_name="imported")
    dest = abspath(expanduser(dest))
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    add_command("rclone", role, name, workspace, source, dest, config,
                compute_hash, export, imported)
Code example #21
def get_results(
    workspace_uri_or_path: Optional[str] = None,
    tag_or_hash: Optional[str] = None,
    resource_name: Optional[str] = None,
    verbose: bool = False,
) -> Optional[Tuple[JSONDict, str]]:
    """Get a results file as a parsed json dict. If no resource or snapshot
    is specified, searches all the results resources for a file. If a snapshot
    is specified, we look in the subdirectory where the results have been moved.
    If no snapshot is specified, and we don't find a file, we look in the most
    recent snapshot.

    Returns a tuple with the results and the logical path (resource:/subpath) to
    the results. If nothing is found, returns None.
    """
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    return _get_results(workspace, tag_or_hash, resource_name)
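
A minimal sketch, assuming get_results is exported from dataworkspaces.api; the "metrics" key is an assumption about the layout of the results file:

from dataworkspaces.api import get_results  # assumed import path

rv = get_results()  # search the workspace's results resources
if rv is not None:
    results, logical_path = rv
    print("results found at", logical_path)
    print(results.get("metrics"))  # "metrics" key is an assumption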
Code example #22
def restore(
    tag_or_hash: str,
    workspace_uri_or_path: Optional[str] = None,
    only: Optional[List[str]] = None,
    leave: Optional[List[str]] = None,
    verbose: bool = False,
) -> int:
    """Restore to a previous snapshot, identified by either its hash
    or its tag (if one was specified). Parameters:

    * ``only`` - an optional list of resources to restore. If specified,
      all other resources will be left as-is.
    * ``leave`` - an optional list of resources to leave as-is. ``only``
      and ``leave`` should not both be specified.

    Returns the number of resources changed.
    """
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    return restore_command(workspace, tag_or_hash=tag_or_hash, only=only, leave=leave)
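
A minimal sketch, assuming this restore function is exported from dataworkspaces.api; the tag and resource name are hypothetical:

from dataworkspaces.api import restore  # assumed import path

# Restore to the snapshot tagged "v1", leaving the "results"
# resource (hypothetical name) untouched.
num_changed = restore("v1", leave=["results"])
print(num_changed, "resources changed")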
Code example #23
def config(ctx, workspace_dir, resource, param_name, param_value):
    """Get or set configuration parameters. Local parameters are only for this
    copy of the workspace, while global parameters are stored centrally and
    affect all copies.

    If neither PARAMETER_NAME nor PARAMETER_VALUE are specified, this command
    prints a table of all parameters and their information (scope, value, default or not,
    and help text). If just PARAMETER_NAME is specified, it prints the specified parameter's
    information. Finally, if both the parameter name and value are specified, the parameter
    is set to the specified value."""
    ns = ctx.obj

    if workspace_dir is None:
        if ns.batch:
            raise BatchModeError("--workspace-dir")
        else:
            workspace_dir = click.prompt("Please enter the workspace root dir",
                                         type=WORKSPACE_PARAM)
    workspace = find_and_load_workspace(ns.batch, ns.verbose, workspace_dir)
    config_command(workspace, param_name, param_value, resource)
Code example #24
def make_lineage_graph(
    output_file: str,
    workspace_uri_or_path: Optional[str] = None,
    resource_name: Optional[str] = None,
    tag_or_hash: Optional[str] = None,
    width: int = 1024,
    height: int = 800,
    verbose: bool = False,
) -> None:
    """Write a lineage graph as an html/javascript page to the specified file.
    """
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    lineage_graph_command(
        workspace,
        output_file,
        resource_name=resource_name,
        snapshot=tag_or_hash,
        width=width,
        height=height,
    )
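
A minimal sketch, assuming make_lineage_graph is exported from dataworkspaces.api and the workspace supports lineage:

from dataworkspaces.api import make_lineage_graph  # assumed import path

# Write the lineage graph for the current state to an HTML file.
make_lineage_graph("/tmp/lineage.html", width=1280, height=900)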
Code example #25
def make_lineage_table(
    workspace_uri_or_path: Optional[str] = None,
    tag_or_hash: Optional[str] = None,
    verbose: bool = False,
) -> Iterable[Tuple[str, str, str, Optional[List[str]]]]:
    """Make a table of the lineage for each resource.
    The columns are: ref, lineage type, details, inputs
    """
    workspace = find_and_load_workspace(True, verbose, workspace_uri_or_path)
    if not isinstance(workspace, SnapshotWorkspaceMixin) or not workspace.supports_lineage():
        raise ConfigurationError("Workspace %s does not support lineage" % workspace.name)
    snapshot_hash = None  # type: Optional[str]
    if tag_or_hash is not None:
        md = workspace.get_snapshot_by_tag_or_hash(tag_or_hash)
        snapshot_hash = md.hashval
    return lu.make_lineage_table(
        workspace.get_instance(), workspace.get_lineage_store(), snapshot_hash
    )
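
A minimal sketch, assuming make_lineage_table is exported from dataworkspaces.api; the column order follows the docstring above:

from dataworkspaces.api import make_lineage_table  # assumed import path

for ref, lineage_type, details, inputs in make_lineage_table():
    print(ref, lineage_type, details, inputs or [])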
Code example #26
def git(ctx, role, name, branch, read_only, export, imported, path):
    """Add a local git repository as a resource. Subcommand of ``add``"""
    ns = ctx.obj
    if role is None:
        if imported:
            role = ResourceRoles.SOURCE_DATA_SET
        elif ns.batch:
            raise BatchModeError("--role")
        else:
            role = click.prompt(
                "Please enter a role for this resource, one of [s]ource-data, [i]ntermediate-data, [c]ode, or [r]esults",
                type=ROLE_PARAM,
            )
    if export and role in (ResourceRoles.SOURCE_DATA_SET, ResourceRoles.CODE):
        raise click.BadOptionUsage(
            message="Cannot export a source data or code resource",
            option_name="export")
    if export and imported:
        raise click.BadOptionUsage(
            message="Cannot specify both --export and --imported",
            option_name="imported")
    if imported and role != ResourceRoles.SOURCE_DATA_SET:
        raise click.BadOptionUsage(
            message="--imported only for source-data roles",
            option_name="imported")
    if imported:
        read_only = True
    if path.startswith("git@") or path.startswith("https://"):
        raise click.BadOptionUsage(
            message="It looks like you tried to specify a git URL (%s)." % path
            + " Currently, git resources only accept a local path." +
            " Try cloning your repository and then pasing the local path to that repository.",
            option_name="path",
        )

    path = abspath(expanduser(path))
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    add_command("git", role, name, workspace, path, branch, read_only, export,
                imported)
Code example #27
def load_dataset_from_resource(resource_name: str,
                               subpath: Optional[str] = None,
                               workspace_dir: Optional[str] = None) -> Bunch:
    """
    Load a dataset (data and targets) from the specified resource, returning an
    sklearn-style Bunch (a dictionary-like object). The bunch will include at least
    three attributes:

    * ``data`` - a NumPy array of shape number_samples * number_features
    * ``target`` - a NumPy array of length number_samples
    * ``resource`` - a :class:`~dataworkspaces.workspace.ResourceRef` that provides the resource name and
      subpath (if any) for the data

    Some other attributes that may also be present, depending on the data set:

    * ``DESCR`` - text containing a full description of the data set (for humans)
    * ``feature_names`` - an array of length number_features containing the name
      of each feature.
    * ``target_names`` - an array containing the name of each target class

    Data sets may define their own attributes as well (see below).

    **Parameters**

    resource_name
        The name of the resource containing the dataset.

    subpath
        Optional subpath within the resource where this specific dataset is located.
        If not specified, the root of the resource is used.

    workspace_dir
       The root directory of your workspace in the local file system. Usually,
       this can be left unspecified and inferred by DWS, which will search up
       from the current working directory.

    **Creating a Dataset**

    To create a dataset in your resource that is suitable for importing by this function,
    you simply need to create a file for each attribute you want in the bunch and place
    all these files in the same directory within your resource.
    The names of the files should be ``ATTRIBUTE.extn`` where ``ATTRIBUTE`` is the
    attribute name (e.g. ``data`` or ``DESCR``) and ``.extn`` is a file extension
    indicating the format. Supported file extensions are:

    * ``.txt`` or ``.rst`` - text files
    * ``.csv`` - csv files. These are read in using ``numpy.loadtxt()``. If this
      fails because the csv does not contain all numeric data, pandas is used to read
      in the file. It is then converted back to a numpy array.
    * ``.csv.gz`` or ``.csv.bz2`` - these are compressed csv files which are treated
      the same way as csv files (numpy and pandas will automatically uncompress before parsing).
    * ``.npy`` - this is a file containing a serialized NumPy array saved via ``numpy.save()``.
      It is loaded using ``numpy.load()``.
    """

    workspace = find_and_load_workspace(True, False, workspace_dir)
    workspace.validate_resource_name(resource_name, subpath)
    dataset_name = ("Resource " + resource_name + " subpath " +
                    subpath if subpath is not None else "Resource " +
                    resource_name)
    r = workspace.get_resource(resource_name)
    if not isinstance(
            r, LocalStateResourceMixin) or (r.get_local_path_if_any() is None):
        # TODO: Support a data access api
        raise ConfigurationError(
            "Unable to instantiate a data set for resource '%s': currently not supported for non-local resources"
            % resource_name)
    local_path = r.get_local_path_if_any()
    assert local_path is not None
    dataset_path = join(local_path,
                        subpath) if subpath is not None else local_path
    result = {}  # this will be the args to the result Bunch
    # First load data and target files, which are required
    data_file = join(dataset_path, "data.csv")
    if exists(data_file):
        pass
    elif exists(data_file + ".gz"):
        data_file += ".gz"
    elif exists(data_file + ".bz2"):
        data_file += ".bz2"
    else:
        raise ConfigurationError("Did not find data file for %s at '%s'" %
                                 (dataset_name, data_file))
    result["data"] = np.loadtxt(data_file, delimiter=",")
    target_file = join(dataset_path, "target.csv")
    if exists(target_file):
        pass
    elif exists(target_file + ".gz"):
        target_file += ".gz"
    elif exists(target_file + ".bz2"):
        target_file += ".bz2"
    else:
        raise ConfigurationError("Did not find target file for %s at '%s'" %
                                 (dataset_name, target_file))
    result["target"] = np.loadtxt(target_file, delimiter=",")
    if result["data"].shape[0] != result["target"].shape[0]:
        raise ConfigurationError(
            "Data matrix at '%s' has %d rows, but target at '%s' has %d rows" %
            (data_file, result["data"].shape[0], target_file,
             result["target"].shape[0]))
    result["resource"] = ResourceRef(resource_name, subpath)
    # check for and load any other attributes
    for fname in os.listdir(dataset_path):
        for extn in (".txt", ".rst", ".csv", ".csv.gz", ".csv.bz2", ".npy"):
            if fname.endswith(extn):
                result[fname[:-len(extn)]] = _load_dataset_file(dataset_path, fname)
                break
    return Bunch(**result)
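
A minimal sketch of loading such a dataset and fitting a model on it. The module path is an assumption, and a resource named "source-data" containing data.csv and target.csv (as described above) is assumed to exist:

from sklearn.linear_model import LogisticRegression
from dataworkspaces.kits.scikit_learn import load_dataset_from_resource  # assumed module path

dataset = load_dataset_from_resource("source-data")  # hypothetical resource name
model = LogisticRegression().fit(dataset.data, dataset.target)
print("training accuracy:", model.score(dataset.data, dataset.target))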
Code example #28
def deploy_run(ctx, image_name: Optional[str], no_mount_ssh_keys: bool):
    """Build a docker image containing this workspace. This command is EXERIMENTAL
    and subject to change."""
    ns = ctx.obj
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    deploy_run_command(workspace, image_name, no_mount_ssh_keys)
Code example #29
def add_lineage_to_keras_model_class(
    Cls: type,
    input_resource: Optional[Union[str, ResourceRef]] = None,
    results_resource: Optional[Union[str, ResourceRef]] = None,
    workspace_dir: Optional[str] = None,
    checkpoint_config: Optional[CheckpointConfig] = None,
    verbose: bool = False,
) -> type:
    """This function wraps a Keras model class with a subclass that overwrites
    key methods to make calls to the data lineage API.

    **Parameters:**

    * ``Cls`` -- the class being wrapped
    * ``input_resource`` -- optional input resource for this model. The
      resource may be specified by name, by a local file path, or via a
      ``ResourceRef``. If no input is specified, will try to infer from the
      workspace.
    * ``results_resource`` -- optional resource where the results are to be stored.
      May be specified by name, by a local file path, or via a ``ResourceRef``.
      If not specified, will try to infer from the workspace.
    * ``workspace_dir`` -- optional directory specifying the workspace. Usually can be
      inferred from the current directory.
    * ``checkpoint_config`` -- Optional instance of :class:`~CheckpointConfig`, which
      is used to enable checkpointing on fit and fit_generator()
    * ``verbose`` -- If True, print extra debugging information.

    The following methods are wrapped:

    * :func:`~__init__` - loads the workspace and adds dws-specific class members
    * :func:`~compile` - captures the ``optimizer`` and ``loss_function`` parameter values
    * :func:`~fit` - captures the ``epochs`` and ``batch_size`` parameter values;
      if input is an API resource, captures hash values of training data, otherwise captures
      the input resource name. If the input is an API resource, and it is either a Keras Sequence
      or a generator, wraps the generator and captures the hashes of returned values as it
      is iterated through.
    * :func:`~evaluate` - captures the ``batch_size`` parameter value; if input is an
      API resource, captures hash values of test data, otherwise captures the input resource
      name; captures metrics and writes them to the results resource. If the input is an API
      resource, and it is either a Keras Sequence or a generator, wraps the generator and
      captures the hashes of returned values as it is iterated through.
    """
    if hasattr(
            Cls,
            "_dws_model_wrap") and Cls._dws_model_wrap is True:  # type: ignore
        print("dws>> %s or a superclass is already wrapped" % Cls.__name__)
        return Cls  # already wrapped
    workspace = find_and_load_workspace(batch=True,
                                        verbose=verbose,
                                        uri_or_local_path=workspace_dir)

    class WrappedModel(Cls):  # type: ignore
        _dws_model_wrap = True

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self._dws_state = _DwsModelState(workspace, input_resource,
                                             results_resource)
            if checkpoint_config is not None:
                self.checkpoint_cb = DwsModelCheckpoint(
                    checkpoint_config.model_name,
                    monitor=checkpoint_config.monitor,
                    save_best_only=checkpoint_config.save_best_only,
                    mode=checkpoint_config.mode,
                    save_freq=checkpoint_config.save_freq,
                    results_resource=results_resource,
                    workspace_dir=workspace_dir,
                    verbose=verbose,
                )  # type: Optional[DwsModelCheckpoint]
            else:
                self.checkpoint_cb = None

        def compile(
            self,
            optimizer,
            loss=None,
            metrics=None,
            loss_weights=None,
            sample_weight_mode=None,
            weighted_metrics=None,
            target_tensors=None,
            distribute=None,
            **kwargs,
        ):
            if isinstance(optimizer, str):
                self._dws_state.lineage.add_param("optimizer", optimizer)
            elif isinstance(optimizer, optimizers.Optimizer):
                self._dws_state.lineage.add_param("optimizer",
                                                  optimizer.__class__.__name__)
            if isinstance(loss, str):
                self._dws_state.lineage.add_param("loss_function", loss)
            elif isinstance(loss, losses.Loss):
                self._dws_state.lineage.add_param("loss_function",
                                                  loss.__class__.__name__)
            if tensorflow.__version__ < "2.2.":  # type: ignore
                return super().compile(
                    optimizer,
                    loss,
                    metrics,
                    loss_weights,
                    sample_weight_mode,
                    weighted_metrics,
                    target_tensors,
                    distribute,
                    **kwargs,
                )
            else:  # starting in 2.2, tensorflow removed the target_tensors and distribute args
                return super().compile(
                    optimizer,
                    loss,
                    metrics,
                    loss_weights,
                    sample_weight_mode,
                    weighted_metrics,
                    **kwargs,
                )

        def fit(self, x, y=None, **kwargs):
            """x, y can be arrays or x can be a generator.
            """
            if "epochs" in kwargs:
                self._dws_state.lineage.add_param("fit.epochs",
                                                  kwargs["epochs"])
            else:
                self._dws_state.lineage.add_param("fit.epochs", 1)
            if "batch_size" in kwargs:
                self._dws_state.lineage.add_param("fit.batch_size",
                                                  kwargs["batch_size"])
            else:
                self._dws_state.lineage.add_param("fit.batch_size", None)
            api_resource = self._dws_state.find_input_resources_and_return_if_api(
                x, y)
            if api_resource is not None:
                _verify_eager_if_dataset(x, y, api_resource)
                api_resource.init_hash_state()
                hash_state = api_resource.get_hash_state()
                if isinstance(x, kerasutils.Sequence):
                    if y is not None:
                        raise NotSupportedError(
                            "fit() method does not suppport a generator for x AND a y value"
                        )
                    x = _TfKerasSequenceWrapper(x, hash_state)
                elif isinstance(x, GeneratorType):
                    if y is not None:
                        raise NotSupportedError(
                            "fit() method does not suppport a generator for x AND a y value"
                        )
                    x = _wrap_generator(x, hash_state)
                else:  # x and y are provided as full arrays
                    _add_to_hash(x, hash_state)
                    if y is not None:
                        _add_to_hash(y, hash_state)
                    api_resource.save_current_hash(
                    )  # in case we evaluate in a separate process
            if self.checkpoint_cb:
                if "callbacks" in kwargs:
                    kwargs["callbacks"].append(self.checkpoint_cb)
                else:
                    kwargs["callbacks"] = [
                        self.checkpoint_cb,
                    ]
            return super().fit(x, y, **kwargs)

        def fit_generator(
            self,
            generator,
            steps_per_epoch=None,
            epochs=1,
            verbose=1,
            callbacks=None,
            validation_data=None,
            validation_steps=None,
            validation_freq=1,
            class_weight=None,
            max_queue_size=10,
            workers=1,
            use_multiprocessing=False,
            shuffle=True,
            initial_epoch=0,
        ):
            self._dws_state.lineage.add_param("fit_generator.epochs", epochs)
            self._dws_state.lineage.add_param("fit_generator.steps_per_epoch",
                                              steps_per_epoch)
            api_resource = self._dws_state.find_input_resources_and_return_if_api(
                generator)
            if api_resource is not None:
                # wrap the generator to capture each entry as it is returned
                api_resource.init_hash_state()
                hash_state = api_resource.get_hash_state()
                if isinstance(generator, kerasutils.Sequence):
                    generator = _TfKerasSequenceWrapper(generator, hash_state)
                else:
                    generator = _wrap_generator(generator, hash_state)
            if self.checkpoint_cb:
                if callbacks is not None:
                    callbacks.append(self.checkpoint_cb)
                else:
                    callbacks = [
                        self.checkpoint_cb,
                    ]
            results = super().fit_generator(
                generator,
                steps_per_epoch,
                epochs,
                verbose,
                callbacks,
                validation_data,
                validation_steps,
                validation_freq,
                class_weight,
                max_queue_size,
                workers,
                use_multiprocessing,
                shuffle,
                initial_epoch,
            )
            if api_resource is not None:
                api_resource.save_current_hash()
            return results

        def evaluate(self, x, y=None, **kwargs):
            if "batch_size" in kwargs:
                self._dws_state.lineage.add_param("evaluate.batch_size",
                                                  kwargs["batch_size"])
            else:
                self._dws_state.lineage.add_param("evaluate.batch_size", None)
            api_resource = self._dws_state.find_input_resources_and_return_if_api(
                x, y)
            if api_resource is not None:
                _verify_eager_if_dataset(x, y, api_resource)
                api_resource.dup_hash_state()
                hash_state = api_resource.get_hash_state()
                if isinstance(x, kerasutils.Sequence):
                    if y is not None:
                        raise NotSupportedError(
                            "evaluate() method does not suppport a generator for x AND a y value"
                        )
                    x = _TfKerasSequenceWrapper(x, hash_state)
                elif isinstance(x, GeneratorType):
                    if y is not None:
                        raise NotSupportedError(
                            "evaluate() method does not suppport a generator for x AND a y value"
                        )
                    x = _wrap_generator(x, hash_state)
                else:
                    _add_to_hash(x, hash_state)
                    if y is not None:
                        _add_to_hash(y, hash_state)
            results = super().evaluate(x, y, **kwargs)
            assert len(results) == len(self.metrics_names)
            if api_resource is not None:
                api_resource.save_current_hash()
                api_resource.pop_hash_state()
            self._dws_state.write_metrics_and_complete(
                {n: v
                 for (n, v) in zip(self.metrics_names, results)})
            return results

        def evaluate_generator(
            self,
            generator,
            steps=None,
            callbacks=None,
            max_queue_size=10,
            workers=1,
            use_multiprocessing=False,
            verbose=0,
        ):
            self._dws_state.lineage.add_param("evaluate_generator.steps",
                                              steps)
            api_resource = self._dws_state.find_input_resources_and_return_if_api(
                generator)
            if api_resource is not None:
                # wrap the generator to capture each entry as it is returned
                api_resource.dup_hash_state()
                hash_state = api_resource.get_hash_state()
                if isinstance(generator, kerasutils.Sequence):
                    generator = _TfKerasSequenceWrapper(generator, hash_state)
                else:
                    generator = _wrap_generator(generator, hash_state)
            results = super().evaluate_generator(generator, steps, callbacks,
                                                 max_queue_size, workers,
                                                 use_multiprocessing, verbose)
            if api_resource is not None:
                api_resource.save_current_hash()
                api_resource.pop_hash_state()
            assert len(results) == len(self.metrics_names)
            self._dws_state.write_metrics_and_complete(
                {n: v
                 for (n, v) in zip(self.metrics_names, results)})
            return results

    WrappedModel.__name__ = Cls.__name__  # this is to fake things out for the reporting
    if workspace.verbose:
        print("dws>> Wrapped model class %s" % Cls.__name__)
    return WrappedModel
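
A minimal sketch of wrapping a stock Keras model class. The module path is an assumption, and the workspace's input and results resources are assumed to be inferable:

import tensorflow as tf
from dataworkspaces.kits.tensorflow import add_lineage_to_keras_model_class  # assumed module path

# Wrap Sequential so that compile/fit/evaluate record lineage.
Sequential = add_lineage_to_keras_model_class(tf.keras.Sequential)
model = Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
x = tf.random.normal((32, 4))
y = tf.random.normal((32, 1))
model.fit(x, y, epochs=1)
print(model.evaluate(x, y))  # metrics are also written to the results resource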
Code example #30
def rclone(
    ctx,
    role,
    name,
    config: str,
    compute_hash: bool,
    export: bool,
    imported: bool,
    master: str,
    sync_mode: str,
    size_only: bool,
    remote: str,
    local_path: str,
):
    """Add an rclone-d repository as a resource to the workspace. Subcommand of ``add``.
    This is designed for uni-directional synchronization between a remote and a local_path.
    The remote has the form remote_name:remote_path, where remote_name is an entry in your
    rclone config file.
    """
    ns = ctx.obj
    if role is None:
        if imported:
            role = ResourceRoles.SOURCE_DATA_SET
        elif ns.batch:
            raise BatchModeError("--role")
        else:
            role = click.prompt(
                "Please enter a role for this resource, one of [s]ource-data, [i]ntermediate-data, [c]ode, or [r]esults",
                type=ROLE_PARAM,
            )
    rclone_re = r".*:.*"
    if re.match(rclone_re, remote) is None:
        raise click.BadOptionUsage(
            message=
            "Remote in rclone should be specified as remotename:filepath",
            option_name="remote",
        )
    if export and role in (ResourceRoles.SOURCE_DATA_SET, ResourceRoles.CODE):
        raise click.BadOptionUsage(
            message="Cannot export a source data or code resource",
            option_name="export")
    if export and imported:
        raise click.BadOptionUsage(
            message="Cannot specify both --export and --imported",
            option_name="imported")
    if imported and role != ResourceRoles.SOURCE_DATA_SET:
        raise click.BadOptionUsage(
            message="--imported only for source-data roles",
            option_name="imported")
    local_path = abspath(expanduser(local_path))
    workspace = find_and_load_workspace(ns.batch, ns.verbose, ns.workspace_dir)
    add_command(
        "rclone",
        role,
        name,
        workspace,
        remote,
        local_path,
        config,
        compute_hash,
        export,
        imported,
        master,
        sync_mode,
        size_only,
    )