def validate_metadata(self):
    kale_block_name_regex = r'^[a-z0-9]([-a-z0-9]*[a-z0-9])?$'
    kale_name_msg = ("must consist of lower case alphanumeric characters "
                     "or '-', and must start and end with an alphanumeric"
                     " character.")
    k8s_valid_name_regex = r'^[\.\-a-z0-9]+$'
    k8s_name_msg = ("must consist of lower case alphanumeric characters, "
                    "'-' or '.'")

    # check for required fields
    for required in METADATA_REQUIRED_KEYS:
        if required not in self.pipeline_metadata:
            raise ValueError("Key %s not found. Add this field either on "
                             "the notebook metadata or as an override"
                             % required)

    if not re.match(kale_block_name_regex,
                    self.pipeline_metadata['pipeline_name']):
        raise ValueError("Pipeline name %s" % kale_name_msg)

    volumes = self.pipeline_metadata.get('volumes', [])
    if not isinstance(volumes, list):
        raise ValueError("Volumes must be a valid list of volumes spec")

    for v in volumes:
        if 'name' not in v:
            raise ValueError("Provide a valid name for every volume")
        if not re.match(k8s_valid_name_regex, v['name']):
            raise ValueError(
                "PV/PVC resource name {}".format(k8s_name_msg))
        if ('snapshot' in v and v['snapshot'] and
                (('snapshot_name' not in v) or
                 not re.match(k8s_valid_name_regex, v['snapshot_name']))):
            raise ValueError(
                "Provide a valid snapshot resource name if you want to"
                " snapshot a volume. Snapshot resource name %s"
                % k8s_name_msg)

        # Convert annotations from a list of {'key': k, 'value': v} entries
        # to a {k: v} dictionary, dropping empty keys and values
        annotations = {
            a['key']: a['value']
            for a in v['annotations'] or []
            if a['key'] != '' and a['value'] != ''
        }
        v['annotations'] = annotations
        v['size'] = str(v['size'])

    # The Jupyter Web App assumes the first volume of the notebook
    # is the working directory, so we make sure to make it appear
    # first in the spec.
    volumes = sorted(volumes, reverse=True,
                     key=lambda x: is_workspace_dir(x['mount_point']))
    self.pipeline_metadata['volumes'] = volumes
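
# Illustrative sketch (not part of the original module): the rough shape of
# `self.pipeline_metadata` that validate_metadata() checks. The exact set of
# required keys is defined by METADATA_REQUIRED_KEYS elsewhere in the module;
# 'experiment_name' and 'pipeline_name' are assumptions used here purely for
# illustration.
_example_pipeline_metadata = {
    'experiment_name': 'my-experiment',
    'pipeline_name': 'my-pipeline',     # must match kale_block_name_regex
    'volumes': [{
        'name': 'workspace-volume',     # must match k8s_valid_name_regex
        'mount_point': '/home/jovyan',
        'size': 5,                      # converted to str by the validator
        'annotations': [],              # list of {'key': ..., 'value': ...} dicts
        'snapshot': False,
    }],
}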
def _parse_volumes_metadata(volumes):
    """Parse the volume spec.

    The transformations applied are the following:

      - Convert the annotations from a list of {'key': k, 'value': v}
        dicts to a single {k: v} dictionary
      - Convert the size field to str
      - Sort the volumes so that the workspace volume comes first

    Args:
        volumes: Volume spec

    Returns:
        Updated and validated volume spec
    """
    validated_volumes = copy.deepcopy(volumes)
    for v in validated_volumes:
        for required in VOLUME_REQUIRED_FIELDS:
            if required not in v:
                raise ValueError(
                    "Volume spec: missing {} value".format(required))

        if not re.match(K8S_VALID_NAME_REGEX, v['name']):
            raise ValueError(
                "Volume spec: PV/PVC name {}".format(K8S_NAME_MSG))
        if ('snapshot' in v and v['snapshot'] and
                (('snapshot_name' not in v) or
                 not re.match(K8S_VALID_NAME_REGEX, v['snapshot_name']))):
            raise ValueError("Provide a valid snapshot resource name if you"
                             " want to snapshot a volume. Snapshot resource"
                             " name {}".format(K8S_NAME_MSG))
        if v['type'] not in VOLUME_TYPES:
            raise ValueError("Volume spec: volume type {} not recognized."
                             " Allowed volume types: {}".format(
                                 v['type'], VOLUME_TYPES))

        if not isinstance(v['annotations'], list):
            raise ValueError("Volume spec: annotations must be a list")

        # Convert annotations to a {k: v} dictionary
        try:
            # TODO: Make JupyterLab annotate with {k: v} instead of
            #  {'key': k, 'value': v}
            annotations = {
                a['key']: a['value']
                for a in v['annotations'] or []
                if a['key'] != '' and a['value'] != ''
            }
        except KeyError as e:
            if str(e) in ["'key'", "'value'"]:
                raise ValueError("Volume spec: volume annotations must be a"
                                 " list of {'key': k, 'value': v} dicts")
            else:
                raise e
        v['annotations'] = annotations
        v['size'] = str(v['size'])

    # The Jupyter Web App assumes the first volume of the notebook is the
    # working directory, so we make sure to make it appear first in the spec.
    validated_volumes = sorted(
        validated_volumes,
        reverse=True,
        key=lambda _v: is_workspace_dir(_v['mount_point']))
    return validated_volumes
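
# Illustrative usage sketch (not part of the original module): how the
# transformations above change a single volume entry. It assumes 'pvc' is one
# of the allowed VOLUME_TYPES and that the fields below cover all of
# VOLUME_REQUIRED_FIELDS; both constants live elsewhere in the module.
def _example_parse_volumes():
    raw = [{
        'name': 'data-volume',
        'type': 'pvc',
        'mount_point': '/data',
        'size': 10,                     # becomes the string '10'
        'snapshot': False,
        # becomes the dict {'owner': 'kale'}
        'annotations': [{'key': 'owner', 'value': 'kale'}],
    }]
    # Workspace volumes (per is_workspace_dir) are sorted to the front of
    # the returned list.
    return _parse_volumes_metadata(raw)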
def gen_kfp_code(nb_graph, nb_path, pipeline_parameters, metadata,
                 auto_snapshot):
    """Generate a standalone Python script that deploys a KFP pipeline.

    Takes a NetworkX workflow graph with the following node properties:

      - 'source': the source code of the pipeline step
      - 'ins': the variables to be de-serialized
      - 'outs': the variables to be serialized

    and generates a standalone Python script in KFP DSL that deploys the
    corresponding KFP pipeline.
    """
    # initialize templating environment
    template_env = Environment(loader=PackageLoader('kale', 'templates'))
    template_env.filters['add_suffix'] = lambda s, suffix: s + suffix

    # List of the generated light-weight components' source code
    function_blocks = list()
    # List of names of components
    function_names = list()
    # Dictionary of steps defining the dependency graph
    function_prevs = dict()

    # Include all volumes as pipeline parameters
    volumes = metadata.get('volumes', [])
    for v in volumes:
        # Convert annotations to a {k: v} dictionary and size to a string
        annotations = {a['key']: a['value']
                       for a in v['annotations'] or []
                       if a['key'] != '' and a['value'] != ''}
        v['annotations'] = annotations
        v['size'] = str(v['size'])

        if v['type'] == 'pv':
            # FIXME: How should we handle existing PVs?
            continue
        if v['type'] == 'pvc':
            par_name = f"vol_{v['mount_point'].replace('/', '_').strip('_')}"
            pipeline_parameters[par_name] = ('str', v['name'])
        elif v['type'] == 'new_pvc':
            rok_url = v['annotations'].get("rok/origin")
            if rok_url is not None:
                par_name = f"rok_{v['name'].replace('-', '_')}_url"
                pipeline_parameters[par_name] = ('str', rok_url)
        else:
            raise ValueError(f"Unknown volume type: {v['type']}")

    # The Jupyter Web App assumes the first volume of the notebook is the
    # working directory, so we make sure to make it appear first in the spec.
    volumes = sorted(volumes, reverse=True,
                     key=lambda _v: is_workspace_dir(_v['mount_point']))

    marshal_volume = True
    marshal_path = "/marshal"
    # Check if the workspace directory is under a mounted volume.
    # If so, marshal data into a folder in that volume,
    # otherwise create a new volume and mount it at /marshal
    wd = metadata.get('abs_working_dir', None)
    if wd:
        wd = os.path.realpath(wd)
        # get the volumes for which the working directory is a subpath of
        # the mount point
        vols = list(
            filter(lambda x: wd.startswith(x['mount_point']), volumes))
        # if we found any, then set the marshal directory inside the
        # working directory
        if len(vols) >= 1:
            marshal_volume = False
            marshal_dir = f".{os.path.basename(nb_path)}.kale.marshal.dir"
            marshal_path = os.path.join(wd, marshal_dir)

    pipeline_args_names = list(pipeline_parameters.keys())
    # wrap every parameter in quotes - required by kfp
    pipeline_args = ', '.join(
        [f"{arg}='{pipeline_parameters[arg][1]}'"
         for arg in pipeline_parameters])
    # function_args are the pipeline arguments. Since we don't know precisely
    # in which pipeline steps they are needed, we just pass them to every
    # step. The assumption is that these variables were treated as constants
    # notebook-wise.
    function_args = ', '.join(
        [f"{arg}: {pipeline_parameters[arg][0]}"
         for arg in pipeline_parameters])

    # Order the pipeline topologically to cycle through the DAG
    for block_name in nx.topological_sort(nb_graph):
        # first create the function
        function_template = template_env.get_template('function_template.txt')
        block_data = nb_graph.nodes(data=True)[block_name]

        # check if the block has any ancestors
        predecessors = list(nb_graph.predecessors(block_name))
        args = list()
        if len(predecessors) > 0:
            for a in predecessors:
                args.append(f"{a}_task")
        function_prevs[block_name] = args

        function_blocks.append(function_template.render(
            pipeline_name=metadata['pipeline_name'],
            function_name=block_name,
            function_args=function_args,
            function_blocks=[block_data['source']],
            in_variables=block_data['ins'],
            out_variables=block_data['outs'],
            marshal_path=marshal_path,
            auto_snapshot=auto_snapshot,
            nb_path=nb_path
        ))
        function_names.append(block_name)

    leaf_nodes = [x for x in nb_graph.nodes()
                  if nb_graph.out_degree(x) == 0]
    if auto_snapshot:
        final_auto_snapshot_name = 'final_auto_snapshot'
        function_blocks.append(function_template.render(
            pipeline_name=metadata['pipeline_name'],
            function_name=final_auto_snapshot_name,
            function_args=function_args,
            function_blocks=[],
            in_variables=set(),
            out_variables=set(),
            marshal_path=marshal_path,
            auto_snapshot=auto_snapshot,
            nb_path=nb_path
        ))
        function_names.append(final_auto_snapshot_name)
        function_prevs[final_auto_snapshot_name] = [
            f"{x}_task" for x in leaf_nodes]

    pipeline_template = template_env.get_template('pipeline_template.txt')
    pipeline_code = pipeline_template.render(
        block_functions=function_blocks,
        block_functions_names=function_names,
        block_function_prevs=function_prevs,
        experiment_name=metadata['experiment_name'],
        pipeline_name=metadata['pipeline_name'],
        pipeline_description=metadata.get('pipeline_description', ''),
        pipeline_arguments=pipeline_args,
        pipeline_arguments_names=', '.join(pipeline_args_names),
        docker_base_image=metadata.get('docker_image', ''),
        volumes=volumes,
        leaf_nodes=leaf_nodes,
        working_dir=metadata.get('abs_working_dir', None),
        marshal_volume=marshal_volume,
        marshal_path=marshal_path,
        auto_snapshot=auto_snapshot
    )

    # fix code style using pep8 guidelines
    pipeline_code = autopep8.fix_code(pipeline_code)
    return pipeline_code
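
# Illustrative usage sketch (not part of the original module): a minimal call
# to gen_kfp_code(). It assumes the 'kale' package and its 'templates'
# directory are importable, and that the graph node attributes are named
# 'source', 'ins' and 'outs' as read above. The step names, notebook path and
# parameter values are hypothetical.
def _example_gen_kfp_code():
    import networkx as nx

    # two-step pipeline: 'load' feeds the variable 'data' into 'train'
    graph = nx.DiGraph()
    graph.add_node('load', source="data = [1, 2, 3]",
                   ins=set(), outs={'data'})
    graph.add_node('train', source="model = sum(data)",
                   ins={'data'}, outs={'model'})
    graph.add_edge('load', 'train')

    metadata = {
        'pipeline_name': 'example-pipeline',
        'experiment_name': 'example-experiment',
        'docker_image': '',
        'volumes': [],
        'abs_working_dir': None,
    }
    # returns the generated KFP DSL script as a string
    return gen_kfp_code(nb_graph=graph,
                        nb_path='/path/to/notebook.ipynb',
                        pipeline_parameters={'lr': ('float', 0.1)},
                        metadata=metadata,
                        auto_snapshot=False)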