def resolve_datasource_commits(info, source_id):
    """Resolve a ``<datasource_id>[:<commit_id>]`` spec into concrete IDs.

    Both halves of the spec may be shortened ids; they are resolved via
    ``find_closest_uuid``.  When no commit part is given, the most recently
    created commit of the datasource is used.

    Args:
        info: CLI context/config object used to build the API client.
        source_id: either ``"<datasource>"`` or ``"<datasource>:<commit>"``.

    Returns:
        Tuple ``(ds_id, c_id)`` of resolved datasource and commit ids.

    Raises:
        ValueError: if the spec contains more than one ``:`` separator.
    """
    ds_api = ce_api.DatasourcesApi(api_client(info))

    # Split once up front instead of re-splitting for every branch check.
    parts = source_id.split(':')
    if len(parts) > 2:
        raise ValueError('Unresolvable datasource')

    datasources = api_call(ds_api.get_datasources_api_v1_datasources_get)

    if len(parts) == 2:
        datasource_id, commit_id = parts
        ds_id = find_closest_uuid(datasource_id, datasources)
        commits = api_call(
            ds_api.get_commits_api_v1_datasources_ds_id_commits_get,
            ds_id)
        c_id = find_closest_uuid(commit_id, commits)
    else:
        ds_id = find_closest_uuid(source_id, datasources)
        commits = api_call(
            ds_api.get_commits_api_v1_datasources_ds_id_commits_get,
            ds_id)
        # No explicit commit: default to the latest one by creation time.
        commits.sort(key=lambda x: x.created_at)
        c_id = commits[-1].id

    return ds_id, c_id
def commit_datasource(self,
                      datasource_id: Text,
                      message: Text = None,
                      schema: Dict[Text, Any] = None,
                      orchestration_backend: Text = None,
                      orchestration_args: Dict = None,
                      processing_backend: Text = None,
                      processing_args: Dict = None) -> DatasourceCommit:
    """Create a new commit for the given datasource.

    (Docstring fixed: it previously said "Create a new workspace in DB",
    a copy-paste error — this method commits a datasource.)

    Args:
        datasource_id: id of the datasource to commit.
        message: optional commit message.
        schema: optional schema dict to attach to the commit.
        orchestration_backend / orchestration_args: optional orchestration
            backend configuration, passed through to the API.
        processing_backend / processing_args: optional processing backend
            configuration, passed through to the API.

    Returns:
        The created DatasourceCommit.

    Raises:
        AssertionError: if ``schema`` is given but is not a dict.
    """
    api = ce_api.DatasourcesApi(self.client)

    if schema is None:
        schema = {}
    else:
        assert isinstance(schema, dict), 'The schema should be a dict.'

    commit = api_utils.api_call(
        api.create_datasource_commit_api_v1_datasources_ds_id_commits_post,
        DatasourceCommit.creator(
            message=message,
            schema=schema,
            orchestration_backend=orchestration_backend,
            orchestration_args=orchestration_args,
            processing_backend=processing_backend,
            processing_args=processing_args),
        ds_id=datasource_id,
    )

    return DatasourceCommit(**commit.to_dict())
def create_datasource(info, name, ds_type, source, provider_id, args):
    """Create a datasource"""
    click.echo('Registering datasource {}...'.format(name))

    # Parse pass-through CLI options into a dict of datasource args.
    extra_args = parse_unknown_options(args)

    ds_api = ce_api.DatasourcesApi(api_client(info))
    provider_api = ce_api.ProvidersApi(api_client(info))

    # Resolve a possibly shortened provider id against the user's providers.
    providers = api_call(provider_api.get_loggedin_provider_api_v1_providers_get)
    resolved_provider_id = find_closest_uuid(provider_id, providers)

    payload = DatasourceCreate(
        name=name,
        type=ds_type,
        source=source,
        provider_id=resolved_provider_id,
        args=extra_args,
    )
    ds = api_call(ds_api.create_datasource_api_v1_datasources_post, payload)

    declare('Datasource registered with ID: {}'.format(format_uuid(ds.id)))
def template_pipeline(info, datasource, output_path, no_docs, no_datasource):
    """Copy the configuration of a registered pipeline"""
    # TODO: with the info we can do datasource specific templates later on
    from ce_cli.pretty_yaml import TEMPLATE_CONFIG

    if not no_datasource:
        active_user = info[constants.ACTIVE_USER]

        # Pick the datasource commit to template from: an explicit argument
        # wins, otherwise fall back to the user's active commit.
        if datasource is not None:
            from ce_cli.utils import resolve_datasource_commits
            ds_id, c_id = resolve_datasource_commits(info, datasource)
        elif constants.ACTIVE_DATASOURCE_COMMIT in info[active_user]:
            selected = info[active_user][constants.ACTIVE_DATASOURCE_COMMIT]
            ds_id, c_id = selected.split(':')
        else:
            raise AssertionError('Please either select an active datasource '
                                 'commit to work on or explicitly define it.')

        ds_api = ce_api.DatasourcesApi(utils.api_client(info))
        schema = utils.api_call(
            ds_api.get_datasource_commit_schema_api_v1_datasources_ds_id_commits_commit_id_schema_get,
            ds_id=ds_id,
            commit_id=c_id)

        # Seed the template's feature section with one empty entry per
        # schema feature.
        from ce_standards.standard_experiment import GlobalKeys
        TEMPLATE_CONFIG[GlobalKeys.FEATURES] = {f: {} for f in schema}

    utils.save_config(TEMPLATE_CONFIG, output_path, no_docs)
def list_pipelines(info, pipeline_id, ignore_empty):
    """List of registered pipelines"""
    utils.notice('Fetching pipeline(s). This might take a few seconds... \n')

    # Pipelines are listed per active workspace of the active user.
    active_user = info[constants.ACTIVE_USER]
    ws = info[active_user][constants.ACTIVE_WORKSPACE]

    ws_api = ce_api.WorkspacesApi(utils.api_client(info))
    p_api = ce_api.PipelinesApi(utils.api_client(info))
    d_api = ce_api.DatasourcesApi(utils.api_client(info))

    pipelines = utils.api_call(
        ws_api.get_workspaces_pipelines_api_v1_workspaces_workspace_id_pipelines_get,
        ws)

    # A given pipeline_id may be shortened; resolve it against the list.
    if pipeline_id is not None:
        pipeline_id = utils.find_closest_uuid(pipeline_id, pipelines)

    pipelines.sort(key=lambda x: x.created_at)
    for p in pipelines:
        # Show a pipeline if it has runs (or empties are not ignored) and it
        # matches the optional pipeline_id filter.
        write_check = (len(p.pipeline_runs) > 0 or not ignore_empty) and \
                      (pipeline_id is None or pipeline_id == p.id)

        if write_check:
            # THIS WHOLE THING IS HERE FOR A REASON!!!!!!
            title = 'PIPELINE NAME: {} PIPELINE ID: {}'.format(
                p.name, utils.format_uuid(p.id))
            utils.declare(title)
            utils.declare('-' * len(title))

            if len(p.pipeline_runs) == 0:
                click.echo('No runs for this pipeline yet!')
            else:
                table = []
                for r in p.pipeline_runs:
                    # One extra API round-trip per run to fetch the author.
                    author = utils.api_call(
                        p_api.get_pipeline_run_user_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_user_get,
                        p.id,
                        r.id)

                    # Resolve datasource
                    ds_commit = utils.api_call(
                        d_api.get_single_commit_api_v1_datasources_commits_commit_id_get,
                        r.datasource_commit_id)
                    ds = utils.api_call(
                        d_api.get_datasource_api_v1_datasources_ds_id_get,
                        ds_commit.datasource_id)

                    table.append({
                        'RUN ID': utils.format_uuid(r.id),
                        'TYPE': r.pipeline_run_type,
                        'CPUs PER WORKER': r.cpus_per_worker,
                        'WORKERS': r.workers,
                        'DATASOURCE': '{}_{}'.format(
                            ds.name,
                            utils.format_uuid(r.datasource_commit_id)),
                        'AUTHOR': author.email,
                        'CREATED AT': utils.format_date(r.start_time),
                    })

                click.echo(tabulate(table, headers='keys', tablefmt='plain'))
                click.echo('\n')
def commit_datasource(ctx, info, datasource_id, message, schema,
                      orchestration_backend, orchestration_args,
                      processing_backend, processing_args, force):
    """Creates a commit for a datasource"""
    api = ce_api.DatasourcesApi(api_client(info))

    if not force:
        confirmation('Committing will trigger a pipeline that will create a '
                     'snapshot of your datasources current state. '
                     'This might take a while. '
                     'Are you sure you wish to continue?', abort=True)

    # find closest, this a heavy call for now
    all_ds = api_call(api.get_datasources_api_v1_datasources_get)
    ds_uuid = find_closest_uuid(datasource_id, all_ds)

    if schema:
        try:
            with open(schema, 'rt', encoding='utf8') as f:
                # NOTE(review): yaml.load without an explicit Loader can
                # construct arbitrary Python objects; consider
                # yaml.safe_load if schema files are not fully trusted.
                schema_dict = yaml.load(f)
        except (OSError, yaml.YAMLError):
            # Bug fix: narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit and programming errors) to the
            # failures reading/parsing the file can actually produce.
            # Best-effort fallback to an empty schema is preserved.
            error('Badly formatted YAML!')
            schema_dict = dict()
    else:
        schema_dict = dict()

    commit = api_call(
        api.create_datasource_commit_api_v1_datasources_ds_id_commits_post,
        DatasourceCommitCreate(
            message=message,
            used_schema=schema_dict,
            orchestration_backend=orchestration_backend,
            orchestration_args=orchestration_args,
            processing_backend=processing_backend,
            processing_args=processing_args,
        ),
        ds_id=ds_uuid,
    )
    declare('Commit successful: {}'.format(format_uuid(commit.id)))

    # Make the freshly created commit the user's active one.
    active_commit = '{datasource_id}:{commit_id}'.format(datasource_id=ds_uuid,
                                                         commit_id=commit.id)
    user = info[constants.ACTIVE_USER]
    info[user][constants.ACTIVE_DATASOURCE_COMMIT] = active_commit
    info.save()
    declare('Active datasource commit set to: {}'.format(
        format_uuid(active_commit)))
def get_datasources(self, **kwargs) -> List[Datasource]:
    """Fetch all datasources, optionally filtered by attribute values.

    Keyword arguments are forwarded to ``client_utils.filter_objects``.
    """
    ds_api = ce_api.DatasourcesApi(self.client)
    raw = api_utils.api_call(
        func=ds_api.get_datasources_api_v1_datasources_get)

    results = [Datasource(**item.to_dict()) for item in raw]
    if kwargs:
        return client_utils.filter_objects(results, **kwargs)
    return results
def peek_datasource_commit(self,
                           datasource_id: Text,
                           datasource_commit_id: Text,
                           size: int = 10) -> List[Dict[Text, Any]]:
    """Fetch a sample of ``size`` datapoints from a datasource commit."""
    api = ce_api.DatasourcesApi(self.client)
    sample = api_utils.api_call(
        api.get_datasource_commit_data_sample_api_v1_datasources_ds_id_commits_commit_id_data_get,
        ds_id=datasource_id,
        commit_id=datasource_commit_id,
        sample_size=size)
    return sample
def test_pipeline(self,
                  pipeline_id: Text,
                  datasource_id: Text = None,
                  datasource_commit_id: Text = None,
                  orchestration_backend: Text = None,
                  orchestration_args: Dict = None,
                  processing_backend: Text = None,
                  processing_args: Dict = None,
                  training_backend: Text = None,
                  training_args: Dict = None,
                  serving_backend: Text = None,
                  serving_args: Dict = None) -> PipelineRun:
    """Trigger a test run of a pipeline on a datasource commit.

    Either ``datasource_id`` (resolved to its latest commit) or an explicit
    ``datasource_commit_id`` must be provided; backend settings are passed
    through to the run creation endpoint.

    Returns:
        The created PipelineRun.

    Raises:
        ValueError: if neither datasource_id nor datasource_commit_id is set.
    """
    if datasource_id is None is datasource_commit_id is None:
        # Bug fix: this was `assert ValueError(...)`, which asserts a
        # (truthy) exception *instance* and therefore never fails. It must
        # raise so that an unspecified datasource is actually rejected.
        raise ValueError('Please either define a datasource_id '
                         '(to pick the latest commit) or a '
                         'datasource_commit_id to define a source.')

    ds_api = ce_api.DatasourcesApi(self.client)
    if datasource_id is not None:
        # Resolve the datasource to its most recent commit.
        commits = api_utils.api_call(
            ds_api.get_commits_api_v1_datasources_ds_id_commits_get,
            datasource_id)
        commits.sort(key=lambda x: x.created_at)
        c_id = commits[-1].id
    elif datasource_commit_id is not None:
        c_id = datasource_commit_id
    else:
        # Unreachable: the guard above rejected the both-None case.
        raise LookupError('Hello there!')

    run_create = PipelineRun.creator(
        pipeline_run_type=PipelineRunTypes.test.name,
        datasource_commit_id=c_id,
        orchestration_backend=orchestration_backend,
        orchestration_args=orchestration_args,
        processing_backend=processing_backend,
        processing_args=processing_args,
        additional_args={
            'training_backend': training_backend,
            'training_args': training_args,
            'serving_backend': serving_backend,
            'serving_args': serving_args
        })

    p_api = ce_api.PipelinesApi(self.client)
    return api_utils.api_call(
        p_api.create_pipeline_run_api_v1_pipelines_pipeline_id_runs_post,
        run_create,
        pipeline_id)
def get_datasource_commits(self, datasource_id: Text,
                           **kwargs) -> List[DatasourceCommit]:
    """Return all commits of a datasource, optionally filtered.

    Keyword arguments are forwarded to ``client_utils.filter_objects``.
    """
    ds_api = ce_api.DatasourcesApi(self.client)
    raw_commits = api_utils.api_call(
        ds_api.get_commits_api_v1_datasources_ds_id_commits_get,
        datasource_id)

    results = [DatasourceCommit(**c.to_dict()) for c in raw_commits]
    if kwargs:
        return client_utils.filter_objects(results, **kwargs)
    return results
def create_datasource(self,
                      name: Text,
                      type: Text,
                      source: Text,
                      provider_id: Text,
                      args: Dict[Text, Any]) -> Datasource:
    """Create a new datasource in the DB.

    (Docstring fixed: it previously said "Create a new workspace in DB",
    a copy-paste error — this method creates a datasource.)

    Args:
        name: display name of the datasource.
        type: datasource type identifier.
        source: source location/connection string for the datasource.
        provider_id: id of the provider backing this datasource.
        args: additional datasource arguments.

    Returns:
        The created Datasource.
    """
    # NOTE(review): parameter `type` shadows the builtin, but it is part of
    # the public keyword interface, so it is kept as-is.
    api = ce_api.DatasourcesApi(self.client)
    ds = api_utils.api_call(
        func=api.create_datasource_api_v1_datasources_post,
        body=Datasource.creator(name=name,
                                type_=type,
                                source=source,
                                provider_id=provider_id,
                                args=args))
    return Datasource(**ds.to_dict())
def list_datasources(info):
    """List of all the available datasources"""
    # Determine the user's active commit id (if any) so it can be marked.
    user = info[constants.ACTIVE_USER]
    if constants.ACTIVE_DATASOURCE_COMMIT in info[user]:
        active_dc = info[user][constants.ACTIVE_DATASOURCE_COMMIT]
        active_dc = active_dc.split(':')[1]
    else:
        active_dc = None

    api = ce_api.DatasourcesApi(api_client(info))
    ds_list = api_call(api.get_datasources_api_v1_datasources_get)

    declare('You have created {count} different '
            'datasource(s).\n'.format(count=len(ds_list)))
    declare("Use 'cengine datasource commits DATASOURCE_ID' see commits of "
            "any datasource.\n")

    if ds_list:
        table = []
        for ds in ds_list:
            dcs = [x.id for x in ds.datasource_commits]

            status = 'No Commit'
            latest_created_at = 'No Commit'
            # Bug fix: initialize these here so datasources without commits
            # don't hit a NameError (or silently reuse stale values from the
            # previous loop iteration).
            latest_n_bytes = ''
            latest_n_datapoints = ''
            latest_n_features = ''

            if len(dcs) != 0:
                # Bug fix: use max() — min() selected the *oldest* commit
                # for the "Latest Commit" columns.
                latest = max(ds.datasource_commits,
                             key=attrgetter('created_at'))
                latest_created_at = format_date(latest.created_at)
                latest_n_bytes = latest.n_bytes if latest else ''
                latest_n_datapoints = latest.n_datapoints if latest else ''
                latest_n_features = latest.n_features if latest else ''

            # NOTE(review): `status` is never updated past 'No Commit' even
            # when commits exist — a status lookup appears to be missing
            # here; left unchanged pending confirmation.
            table.append({'Selection': '*' if active_dc in dcs else '',
                          'ID': format_uuid(ds.id),
                          'Name': ds.name,
                          'Type': ds.type,
                          '# Commits': len(ds.datasource_commits),
                          'Latest Commit Status': status,
                          'Latest Commit Date': latest_created_at,
                          'Latest Commit Bytes': latest_n_bytes,
                          'Latest Commit # Datapoints': latest_n_datapoints,
                          'Latest Commit # Features': latest_n_features
                          })
        click.echo(tabulate(table, headers='keys', tablefmt='presto'))
        click.echo()
def peek_datasource(info, source_id, sample_size):
    """Randomly sample datasource and print to console."""
    ds_api = ce_api.DatasourcesApi(api_client(info))

    # source_id may be "<datasource>" or "<datasource>:<commit>".
    ds_id, c_id = resolve_datasource_commits(info, source_id)

    declare('Randomly generating {} samples from datasource {}:{}'.format(
        sample_size, format_uuid(ds_id), format_uuid(c_id)))

    sample = api_call(
        ds_api.get_datasource_commit_data_sample_api_v1_datasources_ds_id_commits_commit_id_data_get,
        ds_id=ds_id,
        commit_id=c_id,
        sample_size=sample_size)

    click.echo(tabulate(sample, headers='keys', tablefmt='plain'))
def list_datasource_commits(info, datasource_id):
    """List of all the available datasources"""
    ds_api = ce_api.DatasourcesApi(api_client(info))

    # find closest, this a heavy call for now
    all_datasources = api_call(ds_api.get_datasources_api_v1_datasources_get)
    resolved_id = find_closest_uuid(datasource_id, all_datasources)

    ds = api_call(
        ds_api.get_datasource_api_v1_datasources_ds_id_get,
        ds_id=resolved_id)

    declare('There are {count} different commits for datasource {name}'
            '.\n'.format(count=len(ds.datasource_commits), name=ds.name))

    # Determine which commit (if any) is currently active so it can be
    # marked in the listing.
    user = info[constants.ACTIVE_USER]
    if constants.ACTIVE_DATASOURCE_COMMIT in info[user]:
        _, active_commit_id = info[user][
            constants.ACTIVE_DATASOURCE_COMMIT].split(':')
    else:
        active_commit_id = None

    if not ds.datasource_commits:
        return

    rows = []
    for commit in ds.datasource_commits:
        # One status lookup per commit.
        status = api_call(
            ds_api.get_datasource_commit_status_api_v1_datasources_ds_id_commits_commit_id_status_get,
            ds.id,
            commit.id,
        )
        rows.append({
            'Selection': '*' if commit.id == active_commit_id else '',
            'ID': format_uuid(commit.id),
            'Created At': format_date(commit.created_at),
            'Status': status,
            'Message': commit.message,
            'Bytes': commit.n_bytes,
            '# Datapoints': commit.n_datapoints,
            '# Features': commit.n_features
        })

    click.echo(tabulate(rows, headers='keys', tablefmt='presto'))
    click.echo()
def from_datasource(cls, client, datasource_id: str, commit_id: str = None):
    """Build a config from the schema of a datasource commit.

    Args:
        client: API client wrapper (its ``.client`` is the raw API client).
        datasource_id: id of the datasource to read the schema from.
        commit_id: optional commit id; when omitted, the latest commit of
            the datasource is used.

    Returns:
        A new config instance whose ``features`` are the schema's features.
    """
    ds_api = ce_api.DatasourcesApi(client.client)

    if commit_id is None:
        commits = api_utils.api_call(
            ds_api.get_commits_api_v1_datasources_ds_id_commits_get,
            datasource_id)
        commits.sort(key=lambda x: x.created_at)
        # Bug fix: pick the LATEST commit. After an ascending sort,
        # commits[0] is the *oldest*; everywhere else in this codebase a
        # missing commit_id resolves to the most recent commit
        # (commits[-1]).
        commit_id = commits[-1].id

    schema = api_utils.api_call(
        ds_api.get_datasource_commit_schema_api_v1_datasources_ds_id_commits_commit_id_schema_get,
        ds_id=datasource_id,
        commit_id=commit_id)

    config = cls()
    config.features = [f for f in schema]
    return config
def check_datasource_commit(info):
    """Show the user's currently active datasource commit, or fail with a hint."""
    user = info[constants.ACTIVE_USER]

    # Guard clause: no active commit selected -> abort with guidance.
    if constants.ACTIVE_DATASOURCE_COMMIT not in info[user]:
        raise click.ClickException(message=error(
            "You have not selected a datasource to work on.\n"
            "You can either select one by using the argument called "
            "'datasource'\n "
            "Or you can use 'cengine datasource list' to see the "
            "possible options \n and 'cengine datasource set' to "
            "select one.\n"))

    active_dsc = info[user][constants.ACTIVE_DATASOURCE_COMMIT]
    ds_id, c_id = active_dsc.split(':')

    ds_api = ce_api.DatasourcesApi(api_client(info))
    ds = api_call(ds_api.get_datasource_api_v1_datasources_ds_id_get, ds_id)

    click.echo('Currently, the active datasource is:')
    declare('Datasource Name: {}\n'
            'Datasource ID: {}\n'
            'Commit ID: {}\n'.format(ds.name,
                                     format_uuid(ds_id),
                                     format_uuid(c_id)))
def get_pipeline_status(info, pipeline_id):
    """Get status of started pipelines"""
    utils.notice('Fetching pipeline(s). This might take a few seconds.. \n')

    # Status is reported per active workspace of the active user.
    active_user = info[constants.ACTIVE_USER]
    ws = info[active_user][constants.ACTIVE_WORKSPACE]

    ws_api = ce_api.WorkspacesApi(utils.api_client(info))
    p_api = ce_api.PipelinesApi(utils.api_client(info))
    d_api = ce_api.DatasourcesApi(utils.api_client(info))

    pipelines = utils.api_call(
        ws_api.get_workspaces_pipelines_api_v1_workspaces_workspace_id_pipelines_get,
        ws)

    # A given pipeline_id may be shortened; resolve it against the list.
    if pipeline_id is not None:
        pipeline_id = utils.find_closest_uuid(pipeline_id, pipelines)

    pipelines.sort(key=lambda x: x.created_at)
    for p in pipelines:
        # Only pipelines with at least one run (and matching the optional
        # pipeline_id filter) are shown.
        write_check = (len(p.pipeline_runs) > 0) and \
                      (pipeline_id is None or pipeline_id == p.id)

        if write_check:
            title = 'PIPELINE NAME: {} PIPELINE ID: {}'.format(
                p.name, utils.format_uuid(p.id))
            utils.declare(title)
            utils.declare('-' * len(title))

            table = []
            for r in p.pipeline_runs:
                # Fetch the full run object (status, timings, components).
                run = utils.api_call(
                    p_api.get_pipeline_run_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_get,
                    p.id,
                    r.id)

                # Resolve datasource
                ds_commit = utils.api_call(
                    d_api.get_single_commit_api_v1_datasources_commits_commit_id_get,
                    r.datasource_commit_id)
                ds = utils.api_call(
                    d_api.get_datasource_api_v1_datasources_ds_id_get,
                    ds_commit.datasource_id)

                # Duration: finished runs use end_time; in-flight runs are
                # measured against the current UTC time.
                if run.end_time:
                    td = run.end_time - run.start_time
                else:
                    td = datetime.now(timezone.utc) - run.start_time

                # # Resolve component status  (disabled stage resolution)
                # stage = utils.get_run_stage(run.pipeline_components)

                table.append({
                    'RUN ID': utils.format_uuid(run.id),
                    'TYPE': run.pipeline_run_type,
                    'STATUS': run.status,
                    # 'STAGE': stage,
                    'DATASOURCE': '{}_{}'.format(
                        ds.name,
                        utils.format_uuid(run.datasource_commit_id)),
                    'DATAPOINTS': '{}'.format(ds_commit.n_datapoints),
                    # 'RUNNING STAGE': stage,
                    'START TIME': utils.format_date(run.start_time),
                    'DURATION': utils.format_timedelta(td),
                })

            click.echo(tabulate(table, headers='keys', tablefmt='plain'))
            click.echo('\n')
def infer_pipeline(self,
                   pipeline_id: Text = None,
                   pipeline_run_id: Text = None,
                   datasource_id: Text = None,
                   datasource_commit_id: Text = None,
                   orchestration_backend: Text = None,
                   orchestration_args: Dict = None,
                   processing_backend: Text = None,
                   processing_args: Dict = None) -> PipelineRun:
    """Start an inference run using a trained model and a datasource commit.

    The model is chosen either via ``pipeline_id`` (latest training run of
    that pipeline) or an explicit ``pipeline_run_id``; the data likewise via
    ``datasource_id`` (latest commit) or ``datasource_commit_id``.

    Raises:
        ValueError: if neither of a pair of selectors is given, or the
            pipeline has no training runs.
    """
    # Resolve the pipeline run_id
    # `a is None is b is None` chains to: a is None AND b is None.
    if pipeline_id is None is pipeline_run_id is None:
        raise ValueError('Please either define a pipeline_id '
                         '(to pick the latest training run) or a '
                         'pipeline_run_id to choose a trained model.')

    p_api = ce_api.PipelinesApi(self.client)
    if pipeline_id is not None:
        runs = api_utils.api_call(
            p_api.get_pipeline_runs_api_v1_pipelines_pipeline_id_runs_get,
            pipeline_id)
        runs.sort(key=lambda x: x.run_time)
        # Only training runs produce a model to infer with.
        training_runs = [
            r for r in runs
            if r.pipeline_run_type == PipelineRunTypes.training.name
        ]
        if len(training_runs) == 0:
            raise ValueError('You dont have any training runs with the '
                             'pipeline {}'.format(pipeline_id))
        # Latest training run (list is sorted ascending by run_time).
        r_id = training_runs[-1].id
    elif pipeline_run_id is not None:
        # TODO: If you just have the pipeline_run_id, how do you get the
        #   run without the pipeline_id?
        # TODO: We need to check whether we have a training run here
        r_id = pipeline_run_id
    else:
        # Unreachable: the guard above rejected the both-None case.
        raise LookupError('Hello there!')

    # Resolve the datasource commit the same way.
    if datasource_id is None is datasource_commit_id is None:
        raise ValueError('Please either define a datasource_id '
                         '(to pick the latest commit) or a '
                         'datasource_commit_id to define a source.')

    ds_api = ce_api.DatasourcesApi(self.client)
    if datasource_id is not None:
        commits = api_utils.api_call(
            ds_api.get_commits_api_v1_datasources_ds_id_commits_get,
            datasource_id)
        commits.sort(key=lambda x: x.created_at)
        # Latest commit by creation time.
        c_id = commits[-1].id
    elif datasource_commit_id is not None:
        c_id = datasource_commit_id
    else:
        # Unreachable: the guard above rejected the both-None case.
        raise LookupError('General Kenobi!')

    run_create = PipelineRun.creator(
        pipeline_run_type=PipelineRunTypes.infer.name,
        datasource_commit_id=c_id,
        orchestration_backend=orchestration_backend,
        orchestration_args=orchestration_args,
        processing_backend=processing_backend,
        processing_args=processing_args,
        additional_args={'run_id': r_id})

    p_api = ce_api.PipelinesApi(self.client)
    # NOTE(review): when only pipeline_run_id is given, pipeline_id is None
    # here, so this call posts the run with a None pipeline id — looks like
    # a latent bug; confirm against the API before relying on that path.
    return api_utils.api_call(
        p_api.create_pipeline_run_api_v1_pipelines_pipeline_id_runs_post,
        run_create,
        pipeline_id)
def get_pipeline_status(self, workspace_id: Text,
                        pipeline_id: Text = None) -> Dict:
    """Collect run status info for pipelines in a workspace.

    Args:
        workspace_id: workspace whose pipelines are inspected.
        pipeline_id: optional exact pipeline id to restrict the report to.

    Returns:
        Dict mapping pipeline id -> list of per-run status dicts (run id,
        type, status, datasource label, datapoint count, start time and
        duration).
    """
    ws_api = ce_api.WorkspacesApi(self.client)
    p_api = ce_api.PipelinesApi(self.client)
    d_api = ce_api.DatasourcesApi(self.client)

    status_dict = {}
    pipelines = api_utils.api_call(
        ws_api.
        get_workspaces_pipelines_api_v1_workspaces_workspace_id_pipelines_get,
        workspace_id)
    pipelines.sort(key=lambda x: x.created_at)

    for p in pipelines:
        # Report only pipelines that have runs and match the optional
        # pipeline_id filter (exact match — no closest-uuid resolution here).
        write_check = (len(p.pipeline_runs) > 0) and \
                      (pipeline_id is None or pipeline_id == p.id)

        if write_check:
            status_dict[p.id] = []
            for r in p.pipeline_runs:
                # Fetch the full run object (status and timings).
                run = api_utils.api_call(
                    p_api.
                    get_pipeline_run_api_v1_pipelines_pipeline_id_runs_pipeline_run_id_get,
                    p.id,
                    r.id)

                # Resolve datasource
                ds_commit = api_utils.api_call(
                    d_api.
                    get_single_commit_api_v1_datasources_commits_commit_id_get,
                    r.datasource_commit_id)
                ds = api_utils.api_call(
                    d_api.get_datasource_api_v1_datasources_ds_id_get,
                    ds_commit.datasource_id)

                # Duration: finished runs use end_time; in-flight runs are
                # measured against the current UTC time.
                if run.end_time:
                    td = run.end_time - run.start_time
                else:
                    td = datetime.now(timezone.utc) - run.start_time

                status_dict[p.id].append({
                    'RUN ID': run.id,
                    'TYPE': run.pipeline_run_type,
                    'STATUS': run.status,
                    'DATASOURCE': '{}_{}'.format(ds.name,
                                                 run.datasource_commit_id),
                    'DATAPOINTS': '{}'.format(ds_commit.n_datapoints),
                    'START TIME': print_utils.format_date(run.start_time),
                    'DURATION': print_utils.format_timedelta(td),
                })

    return status_dict