def respond_file(self, path):
    abs_path = os.path.abspath(path)
    naas_type = mimetypes.guess_type(abs_path)[0]
    display(Markdown("Response Set as File, preview below: "))
    display(
        JSON(
            {"path": abs_path},
            metadata={"naas_api": True, "naas_type": naas_type},
        )
    )
def to_mime_and_metadata(obj) -> (dict, dict):  # noqa: C901
    # Convert an arbitrary object into an IPython MIME bundle plus a
    # metadata dict, trying the richest representation first.
    if isinstance(obj, bytes):
        obj = base64.b64encode(obj).decode("utf-8")
        return {"text/html": to_html(obj)}, {}
    elif isinstance(obj, str) and obj.startswith("http"):
        if re.match(r".*\.(gif|jpg|svg|jpeg|png)$", obj, re.I):
            try:
                return Image(obj, embed=True)._repr_mimebundle_()
            except TypeError:
                pass
        return {"text/html": to_html(obj)}, {}
    elif isinstance(obj, str) and len(obj) < 1024 and os.path.exists(obj):
        if re.match(r".*\.(gif|jpg|svg|jpeg|png)$", obj, re.I):
            try:
                return Image(obj, embed=True)._repr_mimebundle_()
            except TypeError:
                pass
        return {"text/html": to_html(obj)}, {}
    elif hasattr(obj, "_repr_mimebundle_"):
        obj.embed = True
        return obj._repr_mimebundle_()
    elif hasattr(obj, "_repr_json_"):
        obj.embed = True
        return {"application/json": obj._repr_json_()}, {}
    elif hasattr(obj, "_repr_html_"):
        obj.embed = True
        return {"text/html": obj._repr_html_()}, {}
    elif hasattr(obj, "_repr_png_"):
        return {"image/png": obj._repr_png_()}, {}
    elif hasattr(obj, "_repr_jpeg_"):
        return {"image/jpeg": obj._repr_jpeg_()}, {}
    elif hasattr(obj, "_repr_svg_"):
        return {"image/svg+xml": obj._repr_svg_()}, {}

    # Fall back to JSON serialization (with an HTML preview), then plain HTML.
    try:
        if isinstance(obj, str):
            return {
                "text/html": f"<pre>{to_html(obj)}</pre>".replace("\\n", "\n")
            }, {}
        else:
            data, metadata = JSON(data=obj, expanded=True)._repr_json_()
            return (
                {
                    "application/json": data,
                    "text/html": f"<pre>{to_html(obj)}</pre>",
                },
                metadata,
            )
    except (TypeError, JSONDecodeError):
        pass
    try:
        return {"text/html": to_html(obj)}, {}
    except TypeError:
        return {}, {}
def to_mime_and_metadata(obj) -> (dict, dict):  # noqa: C901
    if isinstance(obj, bytes):
        obj = base64.b64encode(obj).decode('utf-8')
        return {'text/html': to_html(obj)}, {}
    elif isinstance(obj, str) and obj.startswith('http'):
        if re.match(r'.*\.(gif|jpg|svg|jpeg|png)$', obj, re.I):
            try:
                return Image(obj, embed=True)._repr_mimebundle_()
            except TypeError:
                pass
        return {'text/html': to_html(obj)}, {}
    elif isinstance(obj, str) and len(obj) < 1024 and os.path.exists(obj):
        if re.match(r'.*\.(gif|jpg|svg|jpeg|png)$', obj, re.I):
            try:
                return Image(obj, embed=True)._repr_mimebundle_()
            except TypeError:
                pass
        return {'text/html': to_html(obj)}, {}
    elif hasattr(obj, '_repr_mimebundle_'):
        obj.embed = True
        return obj._repr_mimebundle_()
    elif hasattr(obj, '_repr_json_'):
        obj.embed = True
        return {'application/json': obj._repr_json_()}, {}
    elif hasattr(obj, '_repr_html_'):
        obj.embed = True
        return {'text/html': obj._repr_html_()}, {}
    elif hasattr(obj, '_repr_png_'):
        return {'image/png': obj._repr_png_()}, {}
    elif hasattr(obj, '_repr_jpeg_'):
        return {'image/jpeg': obj._repr_jpeg_()}, {}
    elif hasattr(obj, '_repr_svg_'):
        return {'image/svg+xml': obj._repr_svg_()}, {}
    try:
        data, metadata = JSON(data=obj, expanded=True)._repr_json_()
        return {
            'application/json': data,
            'text/html': f'<pre>{to_html(obj)}</pre>',
        }, metadata
    except (TypeError, JSONDecodeError):
        pass
    try:
        return {'text/html': to_html(obj)}, {}
    except TypeError:
        return {}, {}
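# Minimal usage sketch for to_mime_and_metadata() above. It assumes the
# module-level helpers the function relies on (to_html, base64, re, os, and
# the IPython.display imports) are available; the sample inputs below are
# illustrative only, not from the original source.
if __name__ == "__main__":
    samples = [
        b"raw bytes",       # bytes -> base64 text -> {"text/html": ...}
        "hello world",      # plain string -> <pre>-wrapped text/html
        {"status": "ok"},   # mapping -> application/json (+ text/html preview)
    ]
    for sample in samples:
        bundle, metadata = to_mime_and_metadata(sample)
        print(type(sample).__name__, "->", sorted(bundle))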
def kgtk(
    arg1: typing.Union[str, pandas.DataFrame],
    arg2: typing.Optional[str] = None,
    df: typing.Optional[pandas.DataFrame] = None,
    auto_display_html: typing.Optional[bool] = None,
    auto_display_json: typing.Optional[bool] = None,
    auto_display_md: typing.Optional[bool] = None,
    unquote_column_names: typing.Optional[bool] = None,
    bash_command: typing.Optional[str] = None,
    kgtk_command: typing.Optional[str] = None,
) -> typing.Optional[pandas.DataFrame]:
    """This function simplifies using KGTK commands in a Jupyter Lab environment.

    Invocation
    ==========

    kgtk("pipeline")

        Execute the command pipeline. The results are printed, displayed, or
        returned as a Pandas DataFrame.

    kgtk(df, "pipeline")

        The `df` in the call is a Pandas DataFrame, which is converted to
        KGTK format and passed to the pipeline as standard input. The results
        are printed, displayed, or returned as a Pandas DataFrame.

    Optional Parameters
    ======== ==========

    df=DF (default None)

        This is an alternate method for specifying an input DataFrame.

    auto_display_html=True/False (default True)

        This parameter controls the processing of HTML output. See below.

    auto_display_json=True/False (default True)

        This parameter controls the processing of JSON output. See below.

    auto_display_md=True/False (default False)

        This parameter controls the processing of Markdown output. See below.

    unquote_column_names=True/False (default True)

        Convert string column names to symbols.

    bash_command=CMD (default 'bash')

        This parameter specifies the name of the shell interpreter. If the
        envar KGTK_BASH_COMMAND is present, it will supply the default value
        for the name of the shell interpreter.

    kgtk_command=CMD (default 'kgtk')

        This parameter specifies the kgtk shell command. If the envar
        KGTK_KGTK_COMMAND is present, it will supply the default value for
        the name of the `kgtk` command. One use for this feature is to
        redefine the `kgtk` command to include `time` as a prefix, and/or to
        include common options.

    Standard Output Processing
    ======== ====== =========

    If the standard output of the pipeline is in HTML format
    (`--output-format HTML` or `kgtk("... /html")`), identified by starting
    with `<!DOCTYPE html>`, the output will be displayed with
    `display(HTML(output))` by default. However, if
    `kgtk(... auto_display_html=False)` or if the envar
    `KGTK_AUTO_DISPLAY_HTML` is set to `false`, then the output will be
    printed.

    If the standard output of the pipeline is in JSON format
    (`--output-format JSON`), identified as starting with `[` or `{`, the
    output will be displayed with `display(JSON(output))` by default.
    However, if `kgtk(... auto_display_json=False)` or if the envar
    `KGTK_AUTO_DISPLAY_JSON` is set to `false`, then the output will be
    printed.

    If the standard output of the pipeline is in Markdown format (typically
    produced by ending the pipeline in `... / md` or `... / table`),
    identified as starting with `|`, the output will be printed by default.
    However, if `auto_display_md=True` is passed in the `kgtk(...)` call, or
    if the envar `KGTK_AUTO_DISPLAY_MD` is set to `true`, then the Markdown
    will be displayed using `display(Markdown(output))`.

    If the standard output of the pipeline begins with "usage:", then it is
    treated as output from `--help` and printed.

    If the standard output starts with anything other than the cases listed
    above, then the output is assumed to be in KGTK format. It is converted
    to a Pandas DataFrame, which is returned to the caller.

    Error Output Processing
    ===== ====== ==========

    If standard output was printed or displayed, then any error output will
    be printed immediately after it.

    If standard output was converted to a DataFrame and returned, and
    subsequently displayed by the IPython shell, then any error output will
    be printed before the DataFrame is displayed.

    Environment Variables
    =========== =========

    This module directly uses the following environment variables:

    KGTK_AUTO_DISPLAY_HTML
    KGTK_AUTO_DISPLAY_JSON
    KGTK_AUTO_DISPLAY_MD
    KGTK_UNQUOTE_COLUMN_NAMES
    KGTK_BASH_COMMAND
    KGTK_KGTK_COMMAND
    """
    # Important prefixes to look for in standard output:
    MD_SIGIL: str = "|"
    JSON_SIGIL: str = "["
    JSONL_MAP_SIGIL: str = "{"
    HTML_SIGIL: str = "<!DOCTYPE html>"
    USAGE_SIGIL: str = "usage:"  # Output from `kgtk --help` or `kgtk command --help`
    GRAPH_CACHE_SIGIL: str = "Graph Cache"  # Output from `kgtk query --show-cache`

    # Set the defaults:
    if auto_display_html is None:
        auto_display_html = os.getenv(
            "KGTK_AUTO_DISPLAY_HTML", "true").lower() in ["true", "yes", "y"]
    if auto_display_json is None:
        auto_display_json = os.getenv(
            "KGTK_AUTO_DISPLAY_JSON", "true").lower() in ["true", "yes", "y"]
    if auto_display_md is None:
        auto_display_md = os.getenv(
            "KGTK_AUTO_DISPLAY_MD", "false").lower() in ["true", "yes", "y"]
    if unquote_column_names is None:
        unquote_column_names = os.getenv(
            "KGTK_UNQUOTE_COLUMN_NAMES", "true").lower() in ["true", "yes", "y"]

    # Why not os.getenv("KGTK_BASH_COMMAND", "bash")? Splitting it up makes
    # mypy happier.
    if bash_command is None:
        bash_command = os.getenv("KGTK_BASH_COMMAND")
    if bash_command is None:
        bash_command = "bash"

    if kgtk_command is None:
        kgtk_command = os.getenv("KGTK_KGTK_COMMAND")
    if kgtk_command is None:
        kgtk_command = "kgtk"

    # Figure out the input DataFrame and pipeline arguments:
    in_df: typing.Optional[pandas.DataFrame] = None
    pipeline: str
    if isinstance(arg1, str):
        if arg2 is not None:
            raise ValueError(
                "kgtk(arg1, arg2): arg2 is not allowed when arg1 is a string")
        pipeline = arg1
    elif isinstance(arg1, pandas.DataFrame):
        if arg2 is None:
            raise ValueError(
                "kgtk(arg1, arg2): arg2 is required when arg1 is a DataFrame")
        in_df = arg1
        pipeline = arg2

    if df is not None:
        if in_df is not None:
            raise ValueError(
                "kgtk(): df= is not allowed when arg1 is a DataFrame")
        in_df = df

    if len(pipeline) == 0:
        raise ValueError("kgtk(...): the pipeline is empty")
    pipeline = kgtk_command + " " + ' '.join(pipeline.splitlines())

    # If we were supplied an input DataFrame, convert it to KGTK format.
    #
    # TODO: The conversion should optionally escape internal `|` characters
    # as `\|`.
    in_tsv: typing.Optional[str] = None
    if in_df is not None:
        in_tsv = in_df.to_csv(
            sep='\t',
            index=False,
            quoting=csv.QUOTE_NONNUMERIC,
            quotechar='"',
            doublequote=False,
            escapechar='\\',
        )
        if unquote_column_names:
            # Pandas will have treated the column names as strings and quoted
            # them. By convention, KGTK column names are symbols. So, we will
            # remove double quotes from the outside of each column name.
            #
            # TODO: Handle the troublesome case of a double quote inside a
            # column name.
            header, body = in_tsv.split('\n', 1)
            column_names = header.split('\t')
            column_names = [
                x[1:-1] if x.startswith('"') else x for x in column_names
            ]
            header = "\t".join(column_names)
            in_tsv = header + "\n" + body

    # Execute the KGTK command pipeline:
    outbuf: StringIO = StringIO()
    errbuf: StringIO = StringIO()
    try:
        sh_bash = sh.Command(bash_command)
        sh_bash("-c", pipeline, _in=in_tsv, _out=outbuf, _err=errbuf)
    except sh.ErrorReturnCode as e:
        # The pipeline returned an error. stderr should have an error message.
        errmsg: str = errbuf.getvalue()
        if len(errmsg) > 0:
            print(errbuf.getvalue())
        else:
            print(str(e))
        return None

    output: str = outbuf.getvalue()

    # Decide what to do based on the start of the output:
    result: typing.Optional[pandas.DataFrame] = None
    if len(output) == 0:
        pass  # No standard output.

    elif output.startswith(MD_SIGIL):
        # Process Markdown output.
        if auto_display_md:
            display(Markdown(output))
        else:
            print(output)

    elif output.startswith(JSON_SIGIL) or output.startswith(JSONL_MAP_SIGIL):
        # Process JSON output.
        if auto_display_json:
            display(JSON(json.loads(output)))
        else:
            print(output)

    elif output[:len(HTML_SIGIL)].casefold() == HTML_SIGIL.casefold():
        # Process HTML output.
        if auto_display_html:
            display(HTML(output))
        else:
            print(output)

    elif output[:len(USAGE_SIGIL)].casefold() == USAGE_SIGIL.casefold():
        # Process --help output.
        print(output)

    elif output[:len(GRAPH_CACHE_SIGIL)].casefold() == GRAPH_CACHE_SIGIL.casefold():
        # Process `kgtk query --show-cache` output.
        print(output)

    else:
        # Assume that anything else is KGTK formatted output. Convert it to a
        # pandas DataFrame and return it.
        #
        # TODO: Test this conversion with all KGTK datatypes.
        # Language-qualified strings are problematic. Check what happens to
        # quantities, date/times, and locations.
        #
        # TODO: Remove the escape character from internal `|` characters?
        # If we do that, should we detect KGTK lists and complain?
        # `\|` -> `|`
        outbuf.seek(0)
        result = pandas.read_csv(
            outbuf,
            sep='\t',
            quotechar='"',
            doublequote=False,
            escapechar='\\',
        )

    outbuf.close()

    # Any error messages? If so, print them at the end.
    errout: str = errbuf.getvalue()
    if len(errout) > 0:
        print(errout)

    return result
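# Usage sketch for kgtk() above. Hedged: the pipeline strings and data are
# hypothetical, and running this requires the `kgtk` CLI on PATH plus the
# `sh`, `pandas`, and IPython dependencies the function relies on.
if __name__ == "__main__":
    # A DataFrame is converted to KGTK TSV and piped to the command's stdin;
    # plain KGTK output comes back as a DataFrame.
    edges = pandas.DataFrame({
        "node1": ["Q42", "Q42"],
        "label": ["P31", "P106"],
        "node2": ["Q5", "Q36180"],
    })
    result = kgtk(edges, "sort -c node1")
    if result is not None:
        print(result.head())

    # Markdown output is printed unless auto_display_md=True (see docstring).
    kgtk(edges, "sort -c node1 / md", auto_display_md=True)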
def job_upgrade(self, py_path):
    parser = argparse.ArgumentParser(prefix_chars=prefix)
    parser.add_argument('--platform', '-pm', type=Platform,
                        help='Working platform')
    parser.add_argument('--name', '-n', type=str, help='Name of script file',
                        default='default.py', nargs='+', action=JoinAction)
    parser.add_argument('--profile', '-p', type=str, help='Name of profile',
                        default='DemoProfile', nargs='+', action=JoinAction)
    parser.add_argument('--old_job_id', type=str,
                        help='ID of old version job', default=None,
                        nargs='+', action=JoinAction)
    parser.add_argument('--validator', '-v', help='Name of class Validator',
                        type=str, nargs='+', action=JoinAction)
    parser.add_argument('--validator_path', '-vp',
                        help='Path to file with class Validator',
                        type=str, nargs='+', action=JoinAction)
    parser.add_argument('--output_path', '-o', type=str,
                        help='Output GCS path', default='',
                        nargs='+', action=JoinAction)
    print("Parameters string = <<<{}>>>".format(py_path))
    args = parser.parse_known_args(py_path.split())
    prf_name = args[0].profile
    prf = Profile.get(prf_name)
    if prf is None:
        raise RuntimeError(
            'Provided parameters profile {} does not exist.'.format(
                prf_name))
    session, job, job_name, output_path = self.build_data_job(args, prf)
    # noinspection PyTypeChecker
    display(
        HTML(
            '<a href="{url}/{job_name}?project={project}&region={region}">{job_name}</a>'
            .format(url=DATAPROC_JOBS_URL, job_name=job_name,
                    project=prf.project, region=prf.region)))
    validator_module = run_path(args[0].validator_path)
    executor = JobUpgradeExecutor(job, session, args[0].old_job_id)
    res = executor.submit_upgrade_job(validator=args[0].validator,
                                      validator_path=validator_module,
                                      run_async=prf.job_async)
    job_tracker[job_name] = res
    # noinspection PyTypeChecker
    display(JSON(res))
    job_reference = [
        '#Use job_{job_name} instance to browse job properties.'.format(
            job_name=job_name),
        "job_{job_name} = job_tracker['{job_name}']".format(
            job_name=job_name)
    ]
    get_ipython().set_next_input('\n'.join(job_reference))
def py_deploy(self, py_path):
    parser = argparse.ArgumentParser(prefix_chars=prefix)
    parser.add_argument('--model', '-n', type=str, help='Name of model',
                        nargs='+', action=JoinAction)
    parser.add_argument('--platform', '-pm', type=Platform,
                        help='Working platform')
    parser.add_argument('--profile', '-p', type=str, help='Name of profile',
                        default='AIDemoProfile', nargs='+', action=JoinAction)
    args = parser.parse_known_args(py_path.split())
    prf_name = args[0].profile
    prf = Profile.get(prf_name)
    if prf is None:
        raise RuntimeError(
            'Provided parameters profile {} does not exist.'.format(
                prf_name))
    cred = prf.use_cloud_engine_credentials
    if args[0].platform == Platform.GCP:
        path_of_model = prf.path_to_saved_model
        args_dct = prf.arguments
        args_dct['pythonVersion'] = prf.python_version
        args_dct['runtimeVersion'] = prf.runtime_version
        args_dct['deploymentUri'] = f"{path_of_model}"
        deployment_artifacts = []
        for a in prf.artifacts:
            if a.startswith("gs://"):
                deployment_artifact = Artifact(file_name=a,
                                               path=path_of_model)
            else:
                fname = os.path.basename(a)
                deployment_artifact = Artifact(file_name=fname,
                                               path=path_of_model)
            deployment_artifacts.append(deployment_artifact)
        m_builder = ModelBuilder()
        m_builder = m_builder.name(args[0].model).files_root(prf.root_path)
        if prf.custom_code is not None:
            m_builder = m_builder.custom_predictor_path(prf.custom_code)
        model = (m_builder.artifacts(deployment_artifacts)
                 .is_tuning(False)
                 .build())
        ai_job_builder = AIJobBuilder()
        ai_job_builder = ai_job_builder.model(model).package_dst(
            prf.package_dst)
        if prf.custom_code is not None:
            ai_job_builder = ai_job_builder.package_src(prf.root_path)
        ai_job = ai_job_builder.deploy_input(args_dct).build()
    job_name = '{}_{}'.format(prf.job_prefix,
                              int(datetime.now().timestamp()))
    project = prf.project if hasattr(prf, "project") else prf.job_prefix
    ai_region = prf.ai_region if hasattr(prf, "ai_region") else prf.region
    session = SessionFactory(platform=args[0].platform).build_session(
        job_bucket=prf.bucket,
        job_region=prf.region,
        cluster=prf.cluster,
        job_project_id=project,
        ml_region=ai_region,
        use_cloud_engine_credentials=cred)
    if args[0].platform == Platform.GCP:
        executor = AIPlatformJobExecutor(session, ai_job, wait_delay=10,
                                         wait_tries=1000)
        if prf.is_new_model:
            response = executor.submit_deploy_model_job(
                prf.version_name, create_new_model=True)
        else:
            response = executor.submit_deploy_model_job(prf.version_name)
        job_tracker[job_name] = executor
        # noinspection PyTypeChecker
        display(
            HTML(
                '<a href="{url}/{path_of_model}?project={project}">'
                'Deploy model path {job_name}</a>'
                .format(url=STORAGE_BROWSER_URL,
                        path_of_model=path_of_model.split('gs://')[1],
                        job_name=job_name,
                        project=prf.project)))
    else:
        script_name = args[0].model
        # TODO: args={}
        executor = SageMakerExecutor(
            session, prf, mode='deploy',
            py_script_name=os.path.join(prf.root_path, script_name),
            args={})
        predictor, response = executor.submit_deploy_model_job()
        job_tracker[job_name] = predictor
    # noinspection PyTypeChecker
    display(JSON(response))
    job_reference = [
        '#Use job_{job_name} instance to browse job properties.'.format(
            job_name=job_name),
        "#job_tracker['{job_name}']".format(job_name=job_name)
    ]
    get_ipython().set_next_input('\n'.join(job_reference))
def py_train(self, py_path):
    parser = argparse.ArgumentParser(prefix_chars=prefix)
    parser.add_argument('--platform', '-pm', type=Platform,
                        help='Working platform')
    parser.add_argument('--name', '-n', type=str,
                        help='Train script module name', default='./',
                        nargs='+', action=JoinAction)
    parser.add_argument('--profile', '-p', type=str, help='Name of profile',
                        default='AIDemoProfile', nargs='+', action=JoinAction)
    parser.add_argument('--output_path', '-o', type=str,
                        help='Output GCS path', default='',
                        nargs='+', action=JoinAction)
    args = parser.parse_known_args(py_path.split())
    script_name = args[0].name
    prf_name = args[0].profile
    prf = Profile.get(prf_name)
    if prf is None:
        raise RuntimeError(
            'Provided parameters profile {} does not exist.'.format(
                prf_name))
    package_src = prf.root_path
    args_dct = ExecMagic.convert(args[1])
    cred = prf.use_cloud_engine_credentials
    project = prf.project if hasattr(prf, "project") else prf.job_prefix
    ai_region = prf.ai_region if hasattr(prf, "ai_region") else prf.region
    session = SessionFactory(platform=args[0].platform).build_session(
        job_bucket=prf.bucket,
        job_region=prf.region,
        cluster=prf.cluster,
        job_project_id=project,
        ml_region=ai_region,
        use_cloud_engine_credentials=cred)
    job_name = '{}_{}'.format(prf.job_prefix,
                              int(datetime.now().timestamp()))
    if args[0].platform == Platform.GCP:
        output_path = '{}/{}'.format(args[0].output_path, job_name)
        args_dct = {**prf.arguments, **args_dct}
        args_dct['--output_path'] = output_path
        arguments = Arguments()
        arguments.set_args(**args_dct)
        training_input = {
            "region": prf.ai_region,
            "scaleTier": prf.scale_tier,
            "jobDir": output_path,
            "pythonModule": '{}.{}'.format(prf.package_name,
                                           script_name.split('.py')[0]),
            "runtimeVersion": prf.runtime_version,
            "pythonVersion": prf.python_version
        }
        m_builder = ModelBuilder()
        model = m_builder.name(job_name).train_arguments(arguments).build()
        ai_job_builder = AIJobBuilder()
        ai_job = (ai_job_builder.model(model)
                  .package_src(package_src)
                  .package_dst('{}/{}'.format(prf.package_dst, job_name))
                  .train_input(training_input)
                  .name(job_name)
                  .job_dir(output_path)
                  .build())
        # noinspection PyTypeChecker
        display(
            HTML(
                '<a href="{url}/{job_name}/charts/cpu?project={project}">{job_name}</a>'
                .format(url=AI_JOBS_URL, job_name=job_name,
                        project=prf.project)))
        executor = AIPlatformJobExecutor(session, ai_job, 10, 1000)
    else:
        for k in args_dct.copy():
            args_dct[re.sub("--", '', k)] = args_dct[k]
            args_dct.pop(k)
        executor = SageMakerExecutor(
            session, prf, mode='train',
            py_script_name=os.path.join(package_src, script_name),
            args=args_dct)
    response = executor.submit_train_job()
    if args[0].platform == Platform.GCP:
        job_tracker[job_name] = executor
        # noinspection PyTypeChecker
        display(
            HTML(
                '<a href="{url}/{output_path}?project={project}">Output Data {job_name}</a>'
                .format(url=STORAGE_BROWSER_URL,
                        output_path=output_path.split('gs://')[1],
                        job_name=job_name,
                        project=prf.project)))
    else:
        job_tracker[job_name] = executor.executor
        display(
            HTML('<a href="{url}">{job_name}</a>'.format(
                url=response['model_data'],
                job_name=response['model_data'])))
    display(JSON(response))
    job_reference = [
        '#Use job_{job_name} instance to browse job properties.'.format(
            job_name=job_name),
        "#job_{job_name} = job_tracker['{job_name}']".format(
            job_name=job_name)
    ]
    get_ipython().set_next_input('\n'.join(job_reference))
def py_data(self, py_path):
    parser = argparse.ArgumentParser(prefix_chars=prefix)
    parser.add_argument('--platform', '-pm', type=Platform,
                        help='Working platform')
    parser.add_argument('--name', '-n', type=str, help='Name of script file',
                        default='default.py', nargs='+', action=JoinAction)
    parser.add_argument('--profile', '-p', type=str, help='Name of profile',
                        default='DemoProfile', nargs='+', action=JoinAction)
    parser.add_argument('--output_path', '-o', type=str,
                        help='Output GCS path', default='',
                        nargs='+', action=JoinAction)
    print("Parameters string = <<<{}>>>".format(py_path))
    args = parser.parse_known_args(py_path.split())
    prf_name = args[0].profile
    prf = Profile.get(prf_name)
    if prf is None:
        raise RuntimeError(
            'Provided parameters profile {} does not exist.'.format(
                prf_name))
    session, job, job_name, output_path = self.build_data_job(args, prf)
    if args[0].platform == Platform.GCP:
        # noinspection PyTypeChecker
        display(
            HTML(
                '<a href="{url}/{job_name}?project={project}&region={region}">{job_name}</a>'
                .format(url=DATAPROC_JOBS_URL, job_name=job_name,
                        project=prf.project, region=prf.region)))
        executor = DataprocExecutor(job, session)
        res = executor.submit_job(run_async=prf.job_async)
    else:
        executor = EmrExecutor(job, session)
        res = executor.submit_job(run_async=prf.job_async)
    job_tracker[job_name] = res
    # noinspection PyTypeChecker
    display(
        HTML(
            '<a href="{url}/{output_path}?{region}">Output Data {job_name}</a>'
            .format(
                url=STORAGE_BROWSER_URL
                if args[0].platform == Platform.GCP else S3_BROWSER_URL,
                output_path=output_path.split('gs://')[1]
                if args[0].platform == Platform.GCP else
                f"{prf.bucket}/emr/{res['placement']['cluster_id']}/steps/{res['placement']['step_id']}/",
                job_name=job_name,
                region=f'project={prf.project}'
                if args[0].platform == Platform.GCP
                else f'region={prf.region}')))
    job_reference = [
        '#Use job_{job_name} instance to browse job properties.'.format(
            job_name=job_name),
        "#job_{job_name} = job_tracker['{job_name}']".format(
            job_name=job_name)
    ]
    display(JSON(res))
    get_ipython().set_next_input('\n'.join(job_reference))
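# Standalone sketch of the parameter-string parsing shared by the job_upgrade,
# py_deploy, py_train, and py_data magics above: parse_known_args() on a
# whitespace-split string captures the declared flags and leaves the rest
# (here, --epochs) for pass-through handling such as ExecMagic.convert().
# The demo parser and sample string are illustrative assumptions, and the
# project's JoinAction (which joins multi-token values) is omitted to keep
# the sketch self-contained.
import argparse

demo_parser = argparse.ArgumentParser(prefix_chars='-')
demo_parser.add_argument('--name', '-n', type=str, default='default.py')
demo_parser.add_argument('--profile', '-p', type=str, default='DemoProfile')

known, extra = demo_parser.parse_known_args(
    "--name process.py --profile DemoProfile --epochs 10".split())
print(known.name, known.profile)  # process.py DemoProfile
print(extra)                      # ['--epochs', '10']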
def respond_json(self, data):
    display(Markdown("Response Set as JSON, preview below: "))
    display(JSON(data, metadata={"naas_api": True}))
def respond_json(self, data):
    display(JSON(data, metadata={"naas_api": True}))
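# Hedged sketch of what the respond_json() helpers above emit: a JSON payload
# whose display metadata carries the naas_api flag, which marks this output as
# the webhook response body in a naas notebook. The payload is illustrative.
from IPython.display import JSON, display

payload = {"status": "ok", "rows": 42}  # hypothetical response body
display(JSON(payload, metadata={"naas_api": True}))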