Example #1
0
def resolve_datasource_commits(info, source_id):
    """Resolve a 'DATASOURCE[:COMMIT]' reference into concrete IDs.

    Args:
        info: CLI context object used to build the API client.
        source_id: either 'DATASOURCE_ID' or 'DATASOURCE_ID:COMMIT_ID',
            where both parts may be partial UUIDs.

    Returns:
        Tuple (ds_id, c_id) with the resolved datasource and commit IDs.
        When no commit part is given, the most recent commit is used.

    Raises:
        ValueError: if source_id contains more than one ':' separator.
    """
    ds_api = ce_api.DatasourcesApi(api_client(info))

    # Split once instead of re-splitting in every branch.
    parts = source_id.split(':')
    if len(parts) not in (1, 2):
        raise ValueError('Unresolvable datasource')

    # The datasource lookup is identical for both forms, so do it once.
    datasources = api_call(ds_api.get_datasources_api_v1_datasources_get)
    ds_id = find_closest_uuid(parts[0], datasources)

    commits = api_call(
        ds_api.get_commits_api_v1_datasources_ds_id_commits_get, ds_id)

    if len(parts) == 2:
        # Explicit commit reference: fuzzy-match it against the commits.
        c_id = find_closest_uuid(parts[1], commits)
    else:
        # No commit given: fall back to the most recent one.
        commits.sort(key=lambda x: x.created_at)
        c_id = commits[-1].id

    return ds_id, c_id
Example #2
0
    def commit_datasource(self,
                          datasource_id: Text,
                          message: Text = None,
                          schema: Dict[Text, Any] = None,
                          orchestration_backend: Text = None,
                          orchestration_args: Dict = None,
                          processing_backend: Text = None,
                          processing_args: Dict = None) -> DatasourceCommit:
        """ Create a new commit for a datasource.

        Args:
            datasource_id: ID of the datasource to commit to.
            message: optional commit message.
            schema: optional schema dict; defaults to an empty dict.
            orchestration_backend: optional orchestration backend name.
            orchestration_args: optional args for the orchestration backend.
            processing_backend: optional processing backend name.
            processing_args: optional args for the processing backend.

        Returns:
            The created DatasourceCommit.
        """
        api = ce_api.DatasourcesApi(self.client)

        # Normalize to an empty dict here rather than using a mutable
        # default argument.
        if schema is None:
            schema = {}
        else:
            assert isinstance(schema, dict), 'The schema should be a dict.'

        commit = api_utils.api_call(
            api.create_datasource_commit_api_v1_datasources_ds_id_commits_post,
            DatasourceCommit.creator(
                message=message,
                schema=schema,
                orchestration_backend=orchestration_backend,
                orchestration_args=orchestration_args,
                processing_backend=processing_backend,
                processing_args=processing_args),
            ds_id=datasource_id,
        )

        return DatasourceCommit(**commit.to_dict())
Example #3
0
def create_datasource(info,
                      name,
                      ds_type,
                      source,
                      provider_id,
                      args):
    """Create a datasource"""
    click.echo('Registering datasource {}...'.format(name))

    extra_args = parse_unknown_options(args)

    ds_api = ce_api.DatasourcesApi(api_client(info))
    provider_api = ce_api.ProvidersApi(api_client(info))

    # Resolve the (possibly partial) provider UUID against the full list.
    providers = api_call(provider_api.get_loggedin_provider_api_v1_providers_get)
    provider_uuid = find_closest_uuid(provider_id, providers)

    payload = DatasourceCreate(
        name=name,
        type=ds_type,
        source=source,
        provider_id=provider_uuid,
        args=extra_args,
    )
    created = api_call(ds_api.create_datasource_api_v1_datasources_post,
                       payload)

    declare('Datasource registered with ID: {}'.format(
        format_uuid(created.id)))
Example #4
0
def template_pipeline(info, datasource, output_path, no_docs, no_datasource):
    """Copy the configuration of a registered pipeline"""
    # TODO: with the info we can do datasource specific templates later on
    from ce_cli.pretty_yaml import TEMPLATE_CONFIG
    if not no_datasource:
        active_user = info[constants.ACTIVE_USER]
        # Resolve the datasource commit: an explicit argument wins,
        # otherwise fall back to the user's active datasource commit.
        if datasource is not None:
            from ce_cli.utils import resolve_datasource_commits
            ds_id, c_id = resolve_datasource_commits(info, datasource)
        elif constants.ACTIVE_DATASOURCE_COMMIT in info[active_user]:
            ds_id, c_id = info[active_user][
                constants.ACTIVE_DATASOURCE_COMMIT].split(':')
        else:
            raise AssertionError('Please either select an active datasource '
                                 'commit to work on or explicitly define it.')

        # Fetch the commit's schema so the template can list its features.
        api = ce_api.DatasourcesApi(utils.api_client(info))
        schema = utils.api_call(
            api.get_datasource_commit_schema_api_v1_datasources_ds_id_commits_commit_id_schema_get,
            ds_id=ds_id,
            commit_id=c_id)

        from ce_standards.standard_experiment import GlobalKeys
        # Pre-populate the features section with every feature in the schema.
        TEMPLATE_CONFIG[GlobalKeys.FEATURES] = {f: {} for f in schema}

    utils.save_config(TEMPLATE_CONFIG, output_path, no_docs)
Example #5
0
def list_pipelines(info, pipeline_id, ignore_empty):
    """List of registered pipelines"""
    utils.notice('Fetching pipeline(s). This might take a few seconds... \n')
    active_user = info[constants.ACTIVE_USER]
    ws = info[active_user][constants.ACTIVE_WORKSPACE]
    ws_api = ce_api.WorkspacesApi(utils.api_client(info))
    p_api = ce_api.PipelinesApi(utils.api_client(info))
    d_api = ce_api.DatasourcesApi(utils.api_client(info))

    pipelines = utils.api_call(
        ws_api.get_workspaces_pipelines_api_v1_workspaces_workspace_id_pipelines_get,
        ws)

    # Narrow down to a single pipeline if a (possibly partial) ID was given.
    if pipeline_id is not None:
        pipeline_id = utils.find_closest_uuid(pipeline_id, pipelines)

    pipelines.sort(key=lambda x: x.created_at)
    for p in pipelines:
        # Show a pipeline unless it has no runs and ignore_empty is set,
        # and unless the user asked for a different specific pipeline.
        write_check = (len(p.pipeline_runs) > 0 or not ignore_empty) and \
                      (pipeline_id is None or pipeline_id == p.id)

        if write_check:
            # THIS WHOLE THING IS HERE FOR A REASON!!!!!!
            title = 'PIPELINE NAME: {} PIPELINE ID: {}'.format(
                p.name, utils.format_uuid(p.id))
            utils.declare(title)
            utils.declare('-' * len(title))
            if len(p.pipeline_runs) == 0:
                click.echo('No runs for this pipeline yet!')
            else:
                table = []
                for r in p.pipeline_runs:
                    # NOTE(review): three API calls per run (author +
                    # commit + datasource) — potentially slow for
                    # pipelines with many runs.
                    author = utils.api_call(
                        p_api.get_pipeline_run_user_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_user_get,
                        p.id,
                        r.id)

                    # Resolve datasource
                    ds_commit = utils.api_call(
                        d_api.get_single_commit_api_v1_datasources_commits_commit_id_get,
                        r.datasource_commit_id)
                    ds = utils.api_call(
                        d_api.get_datasource_api_v1_datasources_ds_id_get,
                        ds_commit.datasource_id)

                    table.append({
                        'RUN ID': utils.format_uuid(r.id),
                        'TYPE': r.pipeline_run_type,
                        'CPUs PER WORKER': r.cpus_per_worker,
                        'WORKERS': r.workers,
                        'DATASOURCE': '{}_{}'.format(
                            ds.name,
                            utils.format_uuid(r.datasource_commit_id)),
                        'AUTHOR': author.email,
                        'CREATED AT': utils.format_date(r.start_time),
                    })
                click.echo(tabulate(table, headers='keys', tablefmt='plain'))
            click.echo('\n')
Example #6
0
def commit_datasource(ctx,
                      info,
                      datasource_id,
                      message,
                      schema,
                      orchestration_backend,
                      orchestration_args,
                      processing_backend,
                      processing_args,
                      force):
    """Creates a commit for a datasource.

    Resolves the datasource by (partial) UUID, optionally loads a schema
    from a user-supplied YAML file, creates the commit and marks it as the
    active datasource commit for the current user.
    """
    api = ce_api.DatasourcesApi(api_client(info))

    if not force:
        confirmation('Committing will trigger a pipeline that will create a '
                     'snapshot of your datasources current state. '
                     'This might take a while. '
                     'Are you sure you wish to continue?', abort=True)

    # find closest, this a heavy call for now
    all_ds = api_call(api.get_datasources_api_v1_datasources_get)
    ds_uuid = find_closest_uuid(datasource_id, all_ds)

    schema_dict = dict()
    if schema:
        try:
            with open(schema, 'rt', encoding='utf8') as f:
                # safe_load instead of yaml.load: the schema file is user
                # input and must not construct arbitrary Python objects.
                schema_dict = yaml.safe_load(f)
        except (OSError, yaml.YAMLError):
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit. Keep the best-effort fallback to an empty
            # schema, but only for I/O and parse failures.
            error('Badly formatted YAML!')
            schema_dict = dict()

    commit = api_call(
        api.create_datasource_commit_api_v1_datasources_ds_id_commits_post,
        DatasourceCommitCreate(
            message=message,
            used_schema=schema_dict,
            orchestration_backend=orchestration_backend,
            orchestration_args=orchestration_args,
            processing_backend=processing_backend,
            processing_args=processing_args,
        ),
        ds_id=ds_uuid,
    )
    declare('Commit successful: {}'.format(format_uuid(commit.id)))

    active_commit = '{datasource_id}:{commit_id}'.format(datasource_id=ds_uuid,
                                                         commit_id=commit.id)

    user = info[constants.ACTIVE_USER]
    info[user][constants.ACTIVE_DATASOURCE_COMMIT] = active_commit
    info.save()
    declare('Active datasource commit set to: {}'.format(
        format_uuid(active_commit)))
Example #7
0
    def get_datasources(self, **kwargs) -> List[Datasource]:
        """ Fetch every datasource, optionally filtered by attributes.
        """
        raw = api_utils.api_call(
            func=ce_api.DatasourcesApi(
                self.client).get_datasources_api_v1_datasources_get)
        result = [Datasource(**item.to_dict()) for item in raw]

        if not kwargs:
            return result
        return client_utils.filter_objects(result, **kwargs)
Example #8
0
 def peek_datasource_commit(self,
                            datasource_id: Text,
                            datasource_commit_id: Text,
                            size: int = 10) -> List[Dict[Text, Any]]:
     """Fetch a random data sample from a datasource commit."""
     api = ce_api.DatasourcesApi(self.client)
     endpoint = (
         api.
         get_datasource_commit_data_sample_api_v1_datasources_ds_id_commits_commit_id_data_get)
     return api_utils.api_call(
         endpoint,
         ds_id=datasource_id,
         commit_id=datasource_commit_id,
         sample_size=size)
Example #9
0
    def test_pipeline(self,
                      pipeline_id: Text,
                      datasource_id: Text = None,
                      datasource_commit_id: Text = None,
                      orchestration_backend: Text = None,
                      orchestration_args: Dict = None,
                      processing_backend: Text = None,
                      processing_args: Dict = None,
                      training_backend: Text = None,
                      training_args: Dict = None,
                      serving_backend: Text = None,
                      serving_args: Dict = None) -> PipelineRun:
        """Create a 'test' run of a pipeline on a datasource commit.

        Either datasource_id (to use that datasource's latest commit) or
        datasource_commit_id must be provided; datasource_id wins when both
        are given.

        Returns:
            The created PipelineRun.

        Raises:
            ValueError: if neither datasource_id nor datasource_commit_id
                is given.
        """
        if datasource_id is None is datasource_commit_id is None:
            # BUG FIX: this used to be `assert ValueError(...)`, which
            # asserts a (truthy) exception instance and therefore never
            # fires. Raise explicitly, mirroring infer_pipeline.
            raise ValueError('Please either define a datasource_id '
                             '(to pick the latest commit) or a '
                             'datasource_commit_id to define a source.')

        ds_api = ce_api.DatasourcesApi(self.client)

        if datasource_id is not None:
            commits = api_utils.api_call(
                ds_api.get_commits_api_v1_datasources_ds_id_commits_get,
                datasource_id)

            # Ascending sort by creation time: the latest commit is last.
            commits.sort(key=lambda x: x.created_at)
            c_id = commits[-1].id
        else:
            # The guard above ensures datasource_commit_id is set here.
            c_id = datasource_commit_id

        run_create = PipelineRun.creator(
            pipeline_run_type=PipelineRunTypes.test.name,
            datasource_commit_id=c_id,
            orchestration_backend=orchestration_backend,
            orchestration_args=orchestration_args,
            processing_backend=processing_backend,
            processing_args=processing_args,
            additional_args={
                'training_backend': training_backend,
                'training_args': training_args,
                'serving_backend': serving_backend,
                'serving_args': serving_args
            })

        p_api = ce_api.PipelinesApi(self.client)
        return api_utils.api_call(
            p_api.create_pipeline_run_api_v1_pipelines_pipeline_id_runs_post,
            run_create, pipeline_id)
Example #10
0
    def get_datasource_commits(self, datasource_id: Text,
                               **kwargs) -> List[DatasourceCommit]:
        """Fetch all commits of a datasource, optionally filtered."""
        ds_api = ce_api.DatasourcesApi(self.client)

        raw = api_utils.api_call(
            ds_api.get_commits_api_v1_datasources_ds_id_commits_get,
            datasource_id)
        result = [DatasourceCommit(**item.to_dict()) for item in raw]

        if kwargs:
            return client_utils.filter_objects(result, **kwargs)
        return result
Example #11
0
 def create_datasource(self, name: Text, type: Text, source: Text,
                       provider_id: Text, args: Dict[Text,
                                                     Any]) -> Datasource:
     """ Register a new datasource and return the created object.
     """
     body = Datasource.creator(name=name,
                               type_=type,
                               source=source,
                               provider_id=provider_id,
                               args=args)
     api = ce_api.DatasourcesApi(self.client)
     created = api_utils.api_call(
         func=api.create_datasource_api_v1_datasources_post,
         body=body)
     return Datasource(**created.to_dict())
Example #12
0
def list_datasources(info):
    """List of all the available datasources"""
    user = info[constants.ACTIVE_USER]
    if constants.ACTIVE_DATASOURCE_COMMIT in info[user]:
        active_dc = info[user][constants.ACTIVE_DATASOURCE_COMMIT]
        active_dc = active_dc.split(':')[1]
    else:
        active_dc = None
    api = ce_api.DatasourcesApi(api_client(info))
    ds_list = api_call(api.get_datasources_api_v1_datasources_get)

    declare('You have created {count} different '
            'datasource(s).\n'.format(count=len(ds_list)))
    declare("Use 'cengine datasource commits DATASOURCE_ID' see commits of  "
            "any datasource.\n")

    if ds_list:
        table = []
        for ds in ds_list:
            dcs = [x.id for x in ds.datasource_commits]
            status = 'No Commit'
            latest_created_at = 'No Commit'
            # BUG FIX: `latest` was only assigned inside the branch below,
            # so a datasource without commits raised UnboundLocalError on
            # the first iteration (or silently reused the previous
            # datasource's commit on later ones).
            latest = None
            if len(dcs) != 0:
                # BUG FIX: `min` picked the *oldest* commit even though the
                # columns report the latest one; the latest commit has the
                # greatest created_at.
                latest = max(ds.datasource_commits,
                             key=attrgetter('created_at'))
                latest_created_at = format_date(latest.created_at)

            # NOTE(review): `status` is never updated past 'No Commit' —
            # confirm whether a commit-status lookup was intended here.
            latest_n_bytes = latest.n_bytes if latest else ''
            latest_n_datapoints = latest.n_datapoints if latest else ''
            latest_n_features = latest.n_features if latest else ''

            table.append({'Selection': '*' if active_dc in dcs else '',
                          'ID': format_uuid(ds.id),
                          'Name': ds.name,
                          'Type': ds.type,
                          '# Commits': len(ds.datasource_commits),
                          'Latest Commit Status': status,
                          'Latest Commit Date': latest_created_at,
                          'Latest Commit Bytes': latest_n_bytes,
                          'Latest Commit # Datapoints': latest_n_datapoints,
                          'Latest Commit # Features': latest_n_features
                          })
        click.echo(tabulate(table, headers='keys', tablefmt='presto'))
        click.echo()
Example #13
0
def peek_datasource(info, source_id, sample_size):
    """Randomly sample datasource and print to console."""
    ds_id, c_id = resolve_datasource_commits(info, source_id)
    ds_api = ce_api.DatasourcesApi(api_client(info))

    declare('Randomly generating {} samples from datasource {}:{}'.format(
        sample_size,
        format_uuid(ds_id),
        format_uuid(c_id)
    ))

    sample = api_call(
        ds_api.get_datasource_commit_data_sample_api_v1_datasources_ds_id_commits_commit_id_data_get,
        ds_id=ds_id,
        commit_id=c_id,
        sample_size=sample_size)

    click.echo(tabulate(sample, headers='keys', tablefmt='plain'))
Example #14
0
def list_datasource_commits(info, datasource_id):
    """List all commits of the given datasource."""
    api = ce_api.DatasourcesApi(api_client(info))

    # find closest, this a heavy call for now
    all_ds = api_call(api.get_datasources_api_v1_datasources_get)
    ds_uuid = find_closest_uuid(datasource_id, all_ds)

    ds = api_call(
        api.get_datasource_api_v1_datasources_ds_id_get,
        ds_id=ds_uuid)

    declare('There are {count} different commits for datasource {name}'
            '.\n'.format(count=len(ds.datasource_commits), name=ds.name))

    # Determine the active commit so it can be starred in the table.
    user = info[constants.ACTIVE_USER]
    if constants.ACTIVE_DATASOURCE_COMMIT in info[user]:
        _, c_id = info[user][constants.ACTIVE_DATASOURCE_COMMIT].split(':')
    else:
        c_id = None

    if ds.datasource_commits:
        table = []
        for commit in ds.datasource_commits:
            # One status API call per commit.
            status = api_call(
                api.get_datasource_commit_status_api_v1_datasources_ds_id_commits_commit_id_status_get,
                ds.id,
                commit.id,
            )
            table.append({
                'Selection': '*' if commit.id == c_id else '',
                'ID': format_uuid(commit.id),
                'Created At': format_date(commit.created_at),
                'Status': status,
                'Message': commit.message,
                'Bytes': commit.n_bytes,
                '# Datapoints': commit.n_datapoints,
                '# Features': commit.n_features
            })
        click.echo(tabulate(table, headers='keys', tablefmt='presto'))
        click.echo()
Example #15
0
    def from_datasource(cls,
                        client,
                        datasource_id: str,
                        commit_id: str = None):
        """Build a config from a datasource commit's schema.

        Args:
            client: API client wrapper (its .client attribute is used).
            datasource_id: ID of the datasource to read the schema from.
            commit_id: optional commit ID; defaults to the latest commit.

        Returns:
            A new config instance with its features set from the schema.
        """
        ds_api = ce_api.DatasourcesApi(client.client)

        if commit_id is None:
            commits = api_utils.api_call(
                ds_api.get_commits_api_v1_datasources_ds_id_commits_get,
                datasource_id)
            commits.sort(key=lambda x: x.created_at)
            # BUG FIX: after an ascending sort, commits[0] is the *oldest*
            # commit; fall back to the latest one, consistent with the
            # rest of the codebase (which uses commits[-1]).
            commit_id = commits[-1].id

        schema = api_utils.api_call(
            ds_api.
            get_datasource_commit_schema_api_v1_datasources_ds_id_commits_commit_id_schema_get,
            ds_id=datasource_id,
            commit_id=commit_id)

        config = cls()
        config.features = [f for f in schema]
        return config
Example #16
0
def check_datasource_commit(info):
    """Show the currently active datasource commit, if one is selected."""
    user = info[constants.ACTIVE_USER]

    # Guard clause: bail out early when nothing is selected.
    if constants.ACTIVE_DATASOURCE_COMMIT not in info[user]:
        raise click.ClickException(message=error(
            "You have not selected a datasource to work on.\n"
            "You can either select one by using the argument called "
            "'datasource'\n "
            "Or you can use 'cengine datasource list' to see the "
            "possible options \n and 'cengine datasource set' to "
            "select one.\n"))

    ds_id, c_id = info[user][constants.ACTIVE_DATASOURCE_COMMIT].split(':')

    ds_api = ce_api.DatasourcesApi(api_client(info))
    ds = api_call(ds_api.get_datasource_api_v1_datasources_ds_id_get,
                  ds_id)

    click.echo('Currently, the active datasource is:')

    declare('Datasource Name: {}\n'
            'Datasource ID: {}\n'
            'Commit ID: {}\n'.format(ds.name, format_uuid(ds_id),
                                     format_uuid(c_id)))
Example #17
0
def get_pipeline_status(info, pipeline_id):
    """Get status of started pipelines"""
    utils.notice('Fetching pipeline(s). This might take a few seconds.. \n')
    active_user = info[constants.ACTIVE_USER]
    ws = info[active_user][constants.ACTIVE_WORKSPACE]

    ws_api = ce_api.WorkspacesApi(utils.api_client(info))
    p_api = ce_api.PipelinesApi(utils.api_client(info))
    d_api = ce_api.DatasourcesApi(utils.api_client(info))

    pipelines = utils.api_call(
        ws_api.get_workspaces_pipelines_api_v1_workspaces_workspace_id_pipelines_get,
        ws)

    # Narrow down to a single pipeline if a (possibly partial) ID was given.
    if pipeline_id is not None:
        pipeline_id = utils.find_closest_uuid(pipeline_id, pipelines)

    pipelines.sort(key=lambda x: x.created_at)
    for p in pipelines:
        # Only pipelines that have at least one run (and match the filter).
        write_check = (len(p.pipeline_runs) > 0) and \
                      (pipeline_id is None or pipeline_id == p.id)

        if write_check:
            title = 'PIPELINE NAME: {} PIPELINE ID: {}'.format(
                p.name, utils.format_uuid(p.id))
            utils.declare(title)
            utils.declare('-' * len(title))

            table = []
            for r in p.pipeline_runs:
                run = utils.api_call(
                    p_api.get_pipeline_run_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_get,
                    p.id,
                    r.id)

                # Resolve datasource
                ds_commit = utils.api_call(
                    d_api.get_single_commit_api_v1_datasources_commits_commit_id_get,
                    r.datasource_commit_id)
                ds = utils.api_call(
                    d_api.get_datasource_api_v1_datasources_ds_id_get,
                    ds_commit.datasource_id)

                # Duration: finished runs use their end time; running ones
                # are measured against the current UTC time.
                if run.end_time:
                    td = run.end_time - run.start_time
                else:
                    td = datetime.now(timezone.utc) - run.start_time

                # # Resolve component status
                # stage = utils.get_run_stage(run.pipeline_components)

                table.append({
                    'RUN ID': utils.format_uuid(run.id),
                    'TYPE': run.pipeline_run_type,
                    'STATUS': run.status,
                    # 'STAGE': stage,
                    'DATASOURCE': '{}_{}'.format(
                        ds.name, utils.format_uuid(run.datasource_commit_id)),
                    'DATAPOINTS': '{}'.format(ds_commit.n_datapoints),
                    # 'RUNNING STAGE': stage,
                    'START TIME': utils.format_date(run.start_time),
                    'DURATION': utils.format_timedelta(td),
                })

            click.echo(tabulate(table, headers='keys', tablefmt='plain'))
            click.echo('\n')
Example #18
0
    def infer_pipeline(self,
                       pipeline_id: Text = None,
                       pipeline_run_id: Text = None,
                       datasource_id: Text = None,
                       datasource_commit_id: Text = None,
                       orchestration_backend: Text = None,
                       orchestration_args: Dict = None,
                       processing_backend: Text = None,
                       processing_args: Dict = None) -> PipelineRun:
        """Start an inference run from a trained model and a datasource.

        The model comes from either the latest training run of pipeline_id
        or an explicit pipeline_run_id; the data comes from either the
        latest commit of datasource_id or an explicit datasource_commit_id.

        Returns:
            The created PipelineRun.

        Raises:
            ValueError: if neither pipeline_id nor pipeline_run_id is set,
                if the pipeline has no training runs, or if neither
                datasource_id nor datasource_commit_id is set.
        """
        # Resolve the pipeline run_id
        if pipeline_id is None is pipeline_run_id is None:
            raise ValueError('Please either define a pipeline_id '
                             '(to pick the latest training run) or a '
                             'pipeline_run_id to choose a trained model.')

        p_api = ce_api.PipelinesApi(self.client)
        if pipeline_id is not None:
            runs = api_utils.api_call(
                p_api.get_pipeline_runs_api_v1_pipelines_pipeline_id_runs_get,
                pipeline_id)

            # Latest training run wins: sort ascending, take the last one.
            runs.sort(key=lambda x: x.run_time)
            training_runs = [
                r for r in runs
                if r.pipeline_run_type == PipelineRunTypes.training.name
            ]
            if len(training_runs) == 0:
                raise ValueError('You dont have any training runs with the '
                                 'pipeline {}'.format(pipeline_id))
            r_id = training_runs[-1].id
        elif pipeline_run_id is not None:
            # TODO: If you just have the pipeline_run_id, how do you get the
            #   run without the pipeline_id?
            # TODO: We need to check whether we have a training run here
            r_id = pipeline_run_id
        else:
            # Unreachable: the guard above already covers the both-None case.
            raise LookupError('Hello there!')

        if datasource_id is None is datasource_commit_id is None:
            raise ValueError('Please either define a datasource_id '
                             '(to pick the latest commit) or a '
                             'datasource_commit_id to define a source.')

        ds_api = ce_api.DatasourcesApi(self.client)

        if datasource_id is not None:
            commits = api_utils.api_call(
                ds_api.get_commits_api_v1_datasources_ds_id_commits_get,
                datasource_id)

            # Latest commit wins: sort ascending, take the last one.
            commits.sort(key=lambda x: x.created_at)
            c_id = commits[-1].id

        elif datasource_commit_id is not None:
            c_id = datasource_commit_id
        else:
            # Unreachable: the guard above already covers the both-None case.
            raise LookupError('General Kenobi!')

        run_create = PipelineRun.creator(
            pipeline_run_type=PipelineRunTypes.infer.name,
            datasource_commit_id=c_id,
            orchestration_backend=orchestration_backend,
            orchestration_args=orchestration_args,
            processing_backend=processing_backend,
            processing_args=processing_args,
            additional_args={'run_id': r_id})

        p_api = ce_api.PipelinesApi(self.client)
        return api_utils.api_call(
            p_api.create_pipeline_run_api_v1_pipelines_pipeline_id_runs_post,
            run_create, pipeline_id)
Example #19
0
    def get_pipeline_status(self,
                            workspace_id: Text,
                            pipeline_id: Text = None) -> Dict:
        """Collect run status info for pipelines in a workspace.

        Args:
            workspace_id: workspace whose pipelines are inspected.
            pipeline_id: optional exact pipeline ID to restrict the output.

        Returns:
            Dict mapping pipeline ID to a list of per-run status dicts.
            Pipelines without runs are omitted.
        """
        ws_api = ce_api.WorkspacesApi(self.client)
        p_api = ce_api.PipelinesApi(self.client)
        d_api = ce_api.DatasourcesApi(self.client)

        status_dict = {}

        pipelines = api_utils.api_call(
            ws_api.
            get_workspaces_pipelines_api_v1_workspaces_workspace_id_pipelines_get,
            workspace_id)

        pipelines.sort(key=lambda x: x.created_at)
        for p in pipelines:
            # Only pipelines that have runs (and match the optional filter).
            write_check = (len(p.pipeline_runs) > 0) and \
                          (pipeline_id is None or pipeline_id == p.id)

            if write_check:

                status_dict[p.id] = []
                for r in p.pipeline_runs:
                    run = api_utils.api_call(
                        p_api.
                        get_pipeline_run_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_get,
                        p.id, r.id)

                    # Resolve datasource
                    ds_commit = api_utils.api_call(
                        d_api.
                        get_single_commit_api_v1_datasources_commits_commit_id_get,
                        r.datasource_commit_id)
                    ds = api_utils.api_call(
                        d_api.get_datasource_api_v1_datasources_ds_id_get,
                        ds_commit.datasource_id)

                    # Duration: finished runs use their end time; running
                    # ones are measured against the current UTC time.
                    if run.end_time:
                        td = run.end_time - run.start_time
                    else:
                        td = datetime.now(timezone.utc) - run.start_time

                    status_dict[p.id].append({
                        'RUN ID':
                        run.id,
                        'TYPE':
                        run.pipeline_run_type,
                        'STATUS':
                        run.status,
                        'DATASOURCE':
                        '{}_{}'.format(ds.name, run.datasource_commit_id),
                        'DATAPOINTS':
                        '{}'.format(ds_commit.n_datapoints),
                        'START TIME':
                        print_utils.format_date(run.start_time),
                        'DURATION':
                        print_utils.format_timedelta(td),
                    })

        return status_dict