def write_template(output_path, template_path: str, **kwargs):
    """
    Write an executable output file rendered from a Jinja template.

    The template is read and rendered before the output file is opened, so a
    rendering failure does not leave a truncated or empty output file behind.

    :param output_path: path of the file to generate (made executable, mode 0o755)
    :param template_path: path of the Jinja template to render
    :param kwargs: variables forwarded to the template renderer
    :raises MlVToolException: if a file cannot be read/written or the template
        cannot be rendered
    """
    logging.info(
        f'Write command {output_path} using template {basename(template_path)}'
    )
    try:
        with open(template_path, 'r') as template_fd:
            template_content = template_fd.read()
        # Render first: opening the output in 'w' mode truncates it immediately.
        content = render_string_template(template_content, **kwargs)

        # dirname() is '' for a bare file name and makedirs('') raises.
        output_dir = dirname(output_path)
        if output_dir:
            makedirs(output_dir, exist_ok=True)
        with open(output_path, 'w') as fd:
            fd.write(content)
        chmod(output_path, 0o755)
    except IOError as e:
        raise MlVToolException(
            f'Cannot create executable {output_path} using template {template_path}'
        ) from e
    except UndefinedError as e:
        # Must stay before TemplateError so the more specific message is used.
        raise MlVToolException(
            f'Cannot render {output_path} using template {template_path} due to undefined '
            f'variable: {e}') from e
    except TemplateError as e:
        raise MlVToolException(
            f'Cannot render {output_path} using template {template_path}'
        ) from e
def from_meta(args: List[str], description: str) -> 'DocstringDvcExtra':
    """
    Build a DocstringDvcExtra from a docstring meta entry.

    :param args: meta arguments, expected to hold only the dvc-extra key
    :param description: the extra parameter text, must not be empty
    :raises MlVToolException: on wrong argument count, empty description or
        unexpected key
    """
    well_formed = len(args) == 1 and bool(description)
    if not well_formed:
        raise MlVToolException(
            f'Docstring dvc-extra invalid syntax: {args}:{description}.'
            f'Expected :dvc-extra: {{python_other_param}}')

    key = args[0]
    if key != DocstringDvcExtra.DVC_EXTRA_KEY:
        raise MlVToolException(
            f'Receive bad parameter for {DocstringDvcExtra.DVC_EXTRA_KEY} {key}'
        )

    return DocstringDvcExtra(description)
def from_meta(args: List[str], description: str) -> 'DocstringDvcCommand':
    """
    Build a DocstringDvcCommand from a docstring meta entry.

    :param args: meta arguments, expected to hold only the dvc-cmd key
    :param description: the command text, must not be empty
    :raises MlVToolException: on wrong argument count, empty description or
        unexpected key
    """
    well_formed = len(args) == 1 and bool(description)
    if not well_formed:
        raise MlVToolException(
            f'Docstring dvc-cmd invalid syntax: {args}:{description}.'
            f'Expected :dvc-cmd: {{dvc_command}}')

    key = args[0]
    if key != DocstringDvcCommand.DVC_CMD_KEY:
        raise MlVToolException(
            f'Receive bad parameter for {DocstringDvcCommand.DVC_CMD_KEY} {key}'
        )

    return DocstringDvcCommand(description)
def get_ast(content: str, name: str = 'undefined'):
    """
    Parse python source text and return its ast tree.

    :param content: python source code
    :param name: file name given to the parser and used in error messages
    :raises MlVToolException: if the content cannot be parsed
    """
    try:
        tree = ast.parse(content, filename=name)
    except SyntaxError as e:
        raise MlVToolException(
            f'Invalid python format for file {name}: {e}') from e
    except Exception as e:
        raise MlVToolException(
            f'Cannot extract ast tree{f" {name}" if name else ""}: {e}') from e
    return tree
def from_meta(args: List[str], description: str) -> 'DocstringDvcMetaFile':
    """
    Build a DocstringDvcMetaFile from a docstring meta entry.

    The '.dvc' extension is appended to the file name when it is missing.

    :param args: meta arguments, expected to hold only the dvc-meta-file key
    :param description: the meta file name, must not be empty
    :raises MlVToolException: on wrong argument count, empty description or
        unexpected key
    """
    well_formed = len(args) == 1 and bool(description)
    if not well_formed:
        raise MlVToolException(
            f'Docstring dvc-meta-file invalid syntax: {args}:{description}.'
            f'Expected :dvc-meta-file: {{meta_file_name}}')

    key = args[0]
    if key != DocstringDvcMetaFile.DVC_META_FILE_KEY:
        raise MlVToolException(
            f'Receive bad parameter for {DocstringDvcMetaFile.DVC_META_FILE_KEY} {key}'
        )

    file_name = description
    if not file_name.endswith('.dvc'):
        file_name = f'{file_name}.dvc'
    return DocstringDvcMetaFile(file_name)
def get_dvc_params(docstring: Docstring) -> DvcParams:
    """
    Collect the dvc parameters declared in a parsed docstring.

    Meta entries are dispatched on their first argument: dvc dependencies,
    outputs, persistent outputs, extra parameters, meta file name or whole
    command. Entries without arguments or with an unknown key are ignored.

    :param docstring: parsed docstring providing ``params`` and ``meta``
    :return: the DvcParams built from the collected entries
    :raises MlVToolException: if more than one dvc command is declared, or if a
        dvc command is combined with any other in/out/out-persist/extra entry
    """
    dvc_in = []
    dvc_out = []
    dvc_out_persist = []
    dvc_extra = []
    dvc_cmd = []

    # Declared parameter types, used to validate the "related param" of in/out entries.
    params = {param.arg_name: param.type_name for param in docstring.params}

    dvc_meta = None
    for meta in docstring.meta:
        if not meta.args:
            continue
        key = meta.args[0]
        if key == DocstringDvcIn.DVC_IN_KEY:
            dvc_in.append(
                DocstringDvcIn.from_meta(params, meta.args, meta.description))
        elif key == DocstringDvcOut.DVC_OUT_KEY:
            dvc_out.append(
                DocstringDvcOut.from_meta(params, meta.args, meta.description))
        elif key == DocstringDvcOutPersist.DVC_OUT_PERSIST_KEY:
            dvc_out_persist.append(
                DocstringDvcOutPersist.from_meta(params, meta.args,
                                                 meta.description))
        elif key == DocstringDvcExtra.DVC_EXTRA_KEY:
            dvc_extra.append(
                DocstringDvcExtra.from_meta(meta.args, meta.description))
        elif key == DocstringDvcMetaFile.DVC_META_FILE_KEY:
            # A later meta-file entry replaces an earlier one.
            dvc_meta = DocstringDvcMetaFile.from_meta(meta.args,
                                                      meta.description)
        elif key == DocstringDvcCommand.DVC_CMD_KEY:
            dvc_cmd.append(
                DocstringDvcCommand.from_meta(meta.args, meta.description))

    if len(dvc_cmd) > 1:
        raise MlVToolException(
            f'Only one occurence of {DocstringDvcCommand.DVC_CMD_KEY} is allowed'
        )

    # Persistent outputs are part of the exclusivity rule too, as stated in the
    # error message below.
    if dvc_cmd and (dvc_in or dvc_out or dvc_out_persist or dvc_extra):
        raise MlVToolException(
            f'Dvc command {DocstringDvcCommand.DVC_CMD_KEY} is exclusive with other dvc parameters '
            f'[{DocstringDvcExtra.DVC_EXTRA_KEY}, {DocstringDvcIn.DVC_IN_KEY}, '
            f'{DocstringDvcOut.DVC_OUT_KEY}, {DocstringDvcOutPersist.DVC_OUT_PERSIST_KEY}]'
        )

    return DvcParams(dvc_in, dvc_out, dvc_out_persist, dvc_extra,
                     dvc_cmd[0] if dvc_cmd else '',
                     dvc_meta.file_name if dvc_meta else '')
def run(self, *args, **kwargs):
    """
    Convert a notebook to a python script, then generate its dvc command.

    A configuration file is required; output paths are derived from it.

    :raises MlVToolException: if no configuration file path is available
    """
    builder = ArgumentBuilder(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                              description='Convert Notebook to python script')
    cli_args = (builder
                .add_work_dir_argument()
                .add_conf_path_argument()
                .add_docstring_conf()
                .add_force_argument()
                .add_argument('-n', '--notebook', type=str, required=True,
                              help='The notebook to convert')
                .parse(args))

    self.set_log_level(cli_args)
    conf = self.get_conf(cli_args.working_directory, cli_args.notebook,
                         cli_args.conf_path)
    if not conf.path:
        raise MlVToolException('Configuration file is mandatory')

    # The command line value takes precedence over the configuration file.
    docstring_conf_path = cli_args.docstring_conf or conf.docstring_conf
    docstring_conf = None
    if docstring_conf_path:
        docstring_conf = load_docstring_conf(docstring_conf_path)

    output_script = get_script_output_path(cli_args.notebook, conf)
    out_dvc_cmd = get_dvc_cmd_output_path(output_script, conf)
    self.check_force(cli_args.force, [output_script, out_dvc_cmd])

    export_to_script(cli_args.notebook, output_script, conf)
    gen_dvc_command(output_script, out_dvc_cmd, conf, docstring_conf)
def write_python_script(script_content: str, output_path: str):
    """
    Write Python 3 generated code into an executable file, formatted with yapf.

    :param script_content: the generated python source
    :param output_path: path of the script to write (made executable, mode 0o755)
    :raises MlVToolException: if the content cannot be formatted or the file
        cannot be written
    """
    try:
        # dirname() is '' for a bare file name and makedirs('') raises.
        output_dir = dirname(output_path)
        if output_dir:
            makedirs(output_dir, exist_ok=True)
        formatted_script = FormatCode(script_content,
                                      style_config=f'{{ based_on_style: pep8, '
                                                   f'column_limit: {MAX_LINE_LENGTH} }}')
        with open(output_path, 'w') as fd:
            # Only the first element of the FormatCode result is written out.
            fd.write(formatted_script[0])
        chmod(output_path, 0o755)
    except SyntaxError as e:
        raise MlVToolException(
            f'Cannot write generated Python, content is wrongly formatted: {script_content}'
        ) from e
    except IOError as e:
        raise MlVToolException(
            f'Cannot write generated Python script {output_path}') from e
def run(self, *args, **kwargs):
    """
    Check every notebook of a directory against its associated script.

    Ignored notebooks are skipped. The process exits with status 0 when all
    checks succeed, 1 otherwise.

    :raises MlVToolException: if no configuration file path is available
    """
    builder = ArgumentBuilder(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                              description='Checks all notebooks and scripts consistency.\n'
                                          'Run the up to date checks on all notebooks from the notebook directory. '
                                          'Script names are deduce from the conf.')
    cli_args = (builder
                .add_work_dir_argument()
                .add_conf_path_argument()
                .add_path_argument('-n', '--notebooks-dir', type=str,
                                   help='Notebooks directory')
                .add_argument('-i', '--ignore', action='append',
                              help='Notebook filename to ignore', default=[])
                .parse(args))

    self.set_log_level(cli_args)
    conf = self.get_conf(cli_args.working_directory, cli_args.notebooks_dir,
                         cli_args.conf_path)
    if not conf.path:
        raise MlVToolException('Configuration file is mandatory')

    all_consistent = True
    for notebook in glob.glob(join(cli_args.notebooks_dir, '*.ipynb')):
        if basename(notebook) in cli_args.ignore:
            logging.info(f'Ignore notebook {notebook}')
            continue
        associated_script = get_script_output_path(notebook, conf)
        # Every notebook is checked, even after a first failure.
        if not run_consistency_check(notebook, associated_script, conf):
            all_consistent = False

    sys.exit(0 if all_consistent else 1)
def run(self, *args, **kwargs):
    """
    Generate the dvc command wrapper of a python script.

    The output path comes from --out-dvc-cmd when given, otherwise it is
    derived from the configuration.

    :raises MlVToolException: if neither a configuration file nor
        --out-dvc-cmd is provided
    """
    builder = ArgumentBuilder(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                              description='Generate python script wrappers')
    cli_args = (builder
                .add_work_dir_argument()
                .add_conf_path_argument()
                .add_force_argument()
                .add_docstring_conf()
                .add_path_argument('-i', '--input-script', type=str, required=True,
                                   help='The python input script')
                .add_path_argument('-o', '--out-dvc-cmd', type=str,
                                   help='Path to the generated bash dvc command')
                .parse(args))

    self.set_log_level(cli_args)
    conf = self.get_conf(cli_args.working_directory, cli_args.input_script,
                         cli_args.conf_path)
    # The command line value takes precedence over the configuration file.
    docstring_conf_path = cli_args.docstring_conf or conf.docstring_conf

    if not (conf.path or cli_args.out_dvc_cmd):
        raise MlVToolException(
            'Parameter --out-dvc-cmd is mandatory if no conf provided')

    docstring_conf = None
    if docstring_conf_path:
        docstring_conf = load_docstring_conf(docstring_conf_path)

    out_dvc_cmd = cli_args.out_dvc_cmd or get_dvc_cmd_output_path(
        cli_args.input_script, conf)
    self.check_force(cli_args.force, [out_dvc_cmd])

    gen_dvc_command(cli_args.input_script, out_dvc_cmd, conf, docstring_conf)
def check_force(self, force: bool, outputs: List[str]):
    """
    Refuse to overwrite existing outputs unless forced.

    :param force: when true, no check is performed
    :param outputs: candidate output paths
    :raises MlVToolException: for the first output that already exists
    """
    if not force:
        already_there = next((path for path in outputs if exists(path)), None)
        if already_there is not None:
            raise MlVToolException(f'Output file {already_there} already exists, '
                                   f'use --force option to overwrite it')
def get_dvc_meta(dvc_meta_file: str) -> DvcMeta:
    """
    Load a DVC meta file and return its name, command, dependencies and outputs.

    :param dvc_meta_file: path to the DVC meta file (YAML)
    :return: a DvcMeta built from the file base name and its 'cmd', 'deps' and
        'outs' entries
    :raises MlVToolException: if the file cannot be read or does not have the
        expected structure
    """
    logging.debug(f'Get DVC meta from {dvc_meta_file}')
    try:
        with open(dvc_meta_file, 'r') as fd:
            # safe_load: yaml.load without an explicit Loader can build arbitrary
            # objects and is rejected by recent PyYAML versions.
            raw_data = yaml.safe_load(fd.read())

        deps = [v['path'] for v in raw_data.get('deps', [])]
        outs = [v['path'] for v in raw_data.get('outs', [])]

        meta = DvcMeta(basename(dvc_meta_file), raw_data.get('cmd', ''), deps, outs)
        logging.debug(f'Meta for {dvc_meta_file}: {meta}')
        return meta
    except (yaml.error.YAMLError, AttributeError, KeyError, TypeError) as e:
        # AttributeError: top level is not a mapping (e.g. empty file).
        # KeyError/TypeError: a deps/outs entry is missing 'path' or is not a mapping.
        raise MlVToolException(
            f'Cannot load DVC meta file {dvc_meta_file}. Wrong format') from e
    except IOError as e:
        raise MlVToolException(
            f'Cannot load DVC meta file {dvc_meta_file}') from e
def resolve_docstring(docstring: str, docstring_conf: dict) -> str:
    """
    Render a docstring as a Jinja template.

    :param docstring: docstring text, possibly holding Jinja expressions
    :param docstring_conf: user configuration, exposed to the template as ``conf``
    :raises MlVToolException: if the template cannot be rendered
    """
    try:
        resolved = render_string_template(docstring, conf=docstring_conf)
    except jinja2.exceptions.TemplateError as e:
        raise MlVToolException(
            f'Cannot resolve docstring using Jinja, {e}') from e
    return resolved
def get_git_top_dir(cwd: str) -> str:
    """
    Return the top level directory of the git repository containing ``cwd``.

    :param cwd: directory from which 'git rev-parse --show-toplevel' is run
    :return: the command output, decoded, without surrounding newlines
    :raises MlVToolException: if the git command fails or cannot be started
    """
    try:
        raw_output = subprocess.check_output(
            ['git', 'rev-parse', '--show-toplevel'], cwd=cwd)
    except (subprocess.SubprocessError, OSError) as e:
        # OSError covers a missing git executable or an invalid cwd, which are
        # not reported as SubprocessError.
        message = ('Can not run \'git rev-parse\' command to get top directory. Input files must belong '
                   'to a git repository.')
        logging.fatal(message)
        raise MlVToolException(message) from e
    return raw_output.decode().strip('\n')
def get_ast_from_file(file_path: str):
    """
    Read a python file and return its ast tree.

    :param file_path: path of the python file, also used as the parser file name
    :raises MlVToolException: if the file cannot be read
    """
    try:
        with open(file_path, 'r') as source_fd:
            source = source_fd.read()
            return get_ast(source, file_path)
    except IOError as e:
        raise MlVToolException(
            f'Cannot read file {file_path} for ast tree extraction') from e
def get_dvc_files(dvc_target_file: str) -> List[str]:
    """
    List the '.dvc' files located next to the targeted pipeline step.

    :param dvc_target_file: path of the targeted DVC meta file
    :return: paths matching '*.dvc' in the directory of ``dvc_target_file``
    :raises MlVToolException: if ``dvc_target_file`` does not exist
    """
    if exists(dvc_target_file):
        pattern = join(dirname(dvc_target_file), '*.dvc')
        return glob.glob(pattern)

    raise MlVToolException(
        f'Targeted pipeline metadata step {dvc_target_file} does not exist'
    )
def extract_docstring_from_file(input_path: str, docstring_conf: dict = None) -> DocstringInfo:
    """
    Extract docstring information from the first function found in a script.

    The first function definition yielded while walking the ast tree is used.
    Its docstring is optionally resolved with the user configuration, then
    parsed.

    :param input_path: path of the python script
    :param docstring_conf: optional configuration used to resolve the docstring
    :return: a DocstringInfo holding the method name, the parsed docstring, its
        text and the script path
    :raises MlVToolException: if the script is missing, is not valid python or
        holds no function
    """
    logging.info(f'Extract docstring from "{input_path}".')
    try:
        with open(input_path, 'r') as fd:
            root = ast.parse(fd.read())
    except FileNotFoundError as e:
        raise MlVToolException(
            f'Python input script {input_path} not found.') from e
    except SyntaxError as e:
        raise MlVToolException(
            f'Invalid python script format: {input_path}') from e

    first_function = next(
        (node for node in ast.walk(root) if isinstance(node, ast.FunctionDef)),
        None)
    if first_function is None:
        logging.error(f'Not method found in {input_path}')
        raise MlVToolException(f'Not method found in {input_path}')

    method_name = first_function.name
    docstring_str = ast.get_docstring(first_function)
    if docstring_conf:
        docstring_str = resolve_docstring(docstring_str, docstring_conf)
    docstring = dc_parse(docstring_str)

    logging.debug(
        f'Docstring extracted from method {method_name}: {docstring_str}')
    return DocstringInfo(method_name=method_name,
                         docstring=docstring,
                         repr=docstring_str,
                         file_path=input_path)
def extract_docstring(cell_content: str) -> str:
    """
    Return the module level docstring of a cell content.

    :param cell_content: python source of the cell
    :return: the result of ast.get_docstring on the first Module node found
        while walking the tree, or '' when no Module node is found
    :raises MlVToolException: if a SyntaxError is raised while parsing the cell
    """
    logging.info('Extract docstring from cell content')
    logging.debug(f'Cell content {cell_content}')
    try:
        root = get_ast(cell_content)
    except SyntaxError as e:
        raise MlVToolException(
            f'Invalid python cell format: {cell_content}') from e

    module_node = next(
        (node for node in ast.walk(root) if isinstance(node, ast.Module)),
        None)
    if module_node is None:
        return ''
    return ast.get_docstring(module_node)
def run(self, *args, **kwargs):
    """
    Export a DVC pipeline to a sequential python script.

    :raises MlVToolException: if the output file exists and --force is not set
    """
    builder = ArgumentBuilder(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                              description='Export a DVC pipeline to sequential execution.')
    cli_args = (builder
                .add_force_argument()
                .add_work_dir_argument()
                .add_argument('--dvc', type=str, required=True,
                              help='DVC targeted pipeline metadata step')
                .add_argument('-o', '--output', type=str,
                              help='The Python pipeline script output path',
                              required=True)
                .parse(args))

    self.set_log_level(cli_args)
    work_dir = cli_args.working_directory

    overwrite_allowed = cli_args.force
    if not overwrite_allowed and exists(cli_args.output):
        raise MlVToolException(
            f'Output file {cli_args.output} already exists, use --force option to overwrite it'
        )

    export_pipeline(cli_args.dvc, cli_args.output, work_dir)
def get_converted_script(input_notebook_path: str, conf: MlVToolConf) -> str:
    """
    Convert a notebook to python source text using nbconvert.

    :param input_notebook_path: path of the notebook to convert
    :param conf: configuration providing the keys to ignore
    :return: the generated script content
    :raises MlVToolException: wrapping any error raised by the exporter
    """
    exporter = PythonExporter(get_config(TEMPLATE_PATH))
    jinja_filters = (
        ('filter_trailing_cells', filter_trailing_cells),
        ('get_formatted_cells', get_formatted_cells),
        ('get_data_from_docstring', get_data_from_docstring),
        ('sanitize_method_name', to_method_name),
    )
    for filter_name, filter_func in jinja_filters:
        exporter.register_filter(name=filter_name, jinja_filter=filter_func)

    resources = {'ignore_keys': conf.ignore_keys}
    logging.debug(f'Template info {resources}')
    try:
        converted = exporter.from_filename(input_notebook_path,
                                           resources=resources)
        script_content, _ = converted
    except Exception as e:
        raise MlVToolException(e) from e
    return script_content
def run(self, *args, **kwargs):
    """
    Convert a notebook to a python script.

    The output path comes from --output when given, otherwise it is derived
    from the configuration.

    :raises MlVToolException: if neither a configuration file nor --output is
        provided
    """
    builder = ArgumentBuilder(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                              description='Convert Notebook to python script')
    cli_args = (builder
                .add_work_dir_argument()
                .add_conf_path_argument()
                .add_force_argument()
                .add_path_argument('-n', '--notebook', type=str, required=True,
                                   help='The notebook to convert')
                .add_path_argument('-o', '--output', type=str,
                                   help='The Python script output path')
                .parse(args))

    self.set_log_level(cli_args)
    conf = self.get_conf(cli_args.working_directory, cli_args.notebook,
                         cli_args.conf_path)

    if not (conf.path or cli_args.output):
        raise MlVToolException(
            'Parameter --output is mandatory if no conf provided')

    output = cli_args.output or get_script_output_path(cli_args.notebook, conf)
    self.check_force(cli_args.force, [output])
    export_to_script(cli_args.notebook, output, conf)
def export_to_script(input_notebook_path: str, output_path: str, conf: MlVToolConf):
    """
    Export a notebook to a parameterized Python 3 script using Jinja templates.

    Nothing is written when the conversion yields an empty content.

    :param input_notebook_path: path of the notebook to convert
    :param output_path: path of the python script to generate
    :param conf: configuration providing the keys to ignore
    :raises MlVToolException: wrapping any error raised by the exporter
    """
    logging.info(
        f'Generate Python script {output_path} from Jupyter Notebook {input_notebook_path}'
    )
    logging.debug(f'Global Configuration: {conf}')
    logging.debug(f'Template path {TEMPLATE_PATH}')

    exporter = PythonExporter(get_config(TEMPLATE_PATH))
    jinja_filters = (
        ('filter_trailing_cells', filter_trailing_cells),
        ('get_formatted_cells', get_formatted_cells),
        ('get_data_from_docstring', get_data_from_docstring),
        ('sanitize_method_name', to_method_name),
    )
    for filter_name, filter_func in jinja_filters:
        exporter.register_filter(name=filter_name, jinja_filter=filter_func)

    resources = {'ignore_keys': conf.ignore_keys}
    logging.debug(f'Template info {resources}')
    try:
        converted = exporter.from_filename(input_notebook_path,
                                           resources=resources)
        script_content, _ = converted
    except Exception as e:
        raise MlVToolException(e) from e

    if not script_content:
        logging.warning('Empty notebook provided. Nothing to do.')
        return

    write_python_script(script_content, output_path)
    # Logged one level above WARNING.
    success_level = logging.WARNING + 1
    logging.log(
        success_level,
        f'Python script successfully generated in {abspath(output_path)}')
def meta_checks(params: Dict[str, Optional[str]], args: List[str], description: str, expected_key: str):
    """
    Validate a dvc in/out docstring meta entry.

    :param params: declared parameter names mapped to their type name (or None)
    :param args: meta arguments: the key, optionally followed by a related
        parameter name
    :param description: the file path given for the entry, must not be empty
    :param expected_key: the key ``args[0]`` must be equal to
    :raises MlVToolException: if the entry is empty, has too many arguments,
        has the wrong key, has no path, or references an unknown parameter or
        a parameter whose type is neither None nor 'str'
    """
    arg_count = len(args)
    if arg_count == 0:
        raise MlVToolException('Cannot parse empty DocstringDVC')
    if arg_count > 2:
        raise MlVToolException(
            f'Invalid syntax: {args}. Expected :dvc-[in|out] [related_param]?: {{file_path}}'
        )
    if args[0] != expected_key:
        raise MlVToolException('Receive bad parameter {}'.format(args[0]))
    if not description:
        raise MlVToolException(f'Not path given for {args}')

    related_param = args[1] if arg_count == 2 else None
    if not related_param:
        return

    if related_param not in params:
        raise MlVToolException(
            f'Cannot find related parameter for {related_param} in {args}')
    if params[related_param] not in (None, 'str'):
        raise MlVToolException(
            f'Unsupported type {params[related_param]} for {args}. Discard.'
        )
def get_work_directory(input_path: str) -> str:
    """
    Return the git top level directory of the repository holding ``input_path``.

    :param input_path: path of an existing input file
    :raises MlVToolException: if ``input_path`` does not exist
    """
    if not exists(input_path):
        raise MlVToolException(f'Input file {input_path} does not exist.')
    # dirname() is '' for a bare file name, which is not a valid working
    # directory for the git subprocess; use the current directory instead.
    return get_git_top_dir(dirname(input_path) or '.')
def parse_docstring(docstring_str: str) -> Docstring:
    """
    Parse a docstring text using the reST style.

    :param docstring_str: the docstring text
    :raises MlVToolException: if the docstring cannot be parsed
    """
    try:
        return dc_parse(docstring_str, style=Style.rest)
    except ParseError as e:
        raise MlVToolException(f'Docstring format error. {e}') from e