import copy
import os
import urllib.parse
from hashlib import sha1

# DbfsApi, DbfsPath, RunsApi and DeltaPipelinesService come from the Databricks CLI package;
# get_api_client, LibraryObject, supported_lib_types, base_pipelines_dir and BUFFER_SIZE are
# assumed to be imported or defined elsewhere in this project.


class ApiClient(object):
    def __init__(self, profile=None):
        api_client = get_api_client(profile)
        self.dbfs_client = DbfsApi(api_client)
        self.runs_client = RunsApi(api_client)

    def mkdirs(self, dbfs_path):
        return self.dbfs_client.mkdirs(DbfsPath(dbfs_path))

    def list_files(self, dbfs_path):
        return self.dbfs_client.list_files(DbfsPath(dbfs_path))

    def put_file(self, src_path, dbfs_path, overwrite=True):
        return self.dbfs_client.put_file(src_path, DbfsPath(dbfs_path), overwrite)

    def submit_run(self, json_data):
        return self.runs_client.submit_run(json_data)

    def get_run(self, run_id):
        return self.runs_client.get_run(run_id)
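
# Minimal usage sketch (not part of the original module): assumes a configured CLI profile
# named "DEFAULT"; the DBFS paths, cluster id, and run spec below are hypothetical
# placeholders, and only the wrapper methods defined on ApiClient above are used.
def _example_api_client_usage():
    client = ApiClient(profile='DEFAULT')
    client.mkdirs('dbfs:/tmp/example')
    client.put_file('local_library.jar', 'dbfs:/tmp/example/local_library.jar')
    # Submit a one-off run and fetch its current state through the wrapper.
    run = client.submit_run({
        'run_name': 'example-run',
        'existing_cluster_id': '1234-567890-abcde123',
        'spark_jar_task': {'main_class_name': 'com.example.Main'},
    })
    return client.get_run(run['run_id'])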
class PipelinesApi(object):
    def __init__(self, api_client):
        self.client = DeltaPipelinesService(api_client)
        self.dbfs_client = DbfsApi(api_client)

    def create(self, spec, allow_duplicate_names, headers=None):
        data = self._upload_libraries_and_update_spec(spec)
        data['allow_duplicate_names'] = allow_duplicate_names
        return self.client.client.perform_query('POST', '/pipelines', data=data,
                                                headers=headers)

    def deploy(self, spec, allow_duplicate_names, headers=None):
        data = self._upload_libraries_and_update_spec(spec)
        data['allow_duplicate_names'] = allow_duplicate_names
        pipeline_id = data['id']
        self.client.client.perform_query('PUT', '/pipelines/{}'.format(pipeline_id),
                                         data=data, headers=headers)

    def delete(self, pipeline_id, headers=None):
        self.client.delete(pipeline_id, headers)

    def get(self, pipeline_id, headers=None):
        return self.client.get(pipeline_id, headers)

    def list(self, headers=None):
        def call(page_token=None, max_results=None, order_by=None):
            _data = {}
            if page_token:
                _data["pagination.page_token"] = page_token
            if max_results:
                _data["pagination.max_results"] = max_results
            if order_by:
                _data["pagination.order_by"] = order_by
            return self.client.client.perform_query('GET', '/pipelines', data=_data,
                                                    headers=headers)

        # Page through the results until the response no longer carries a next_page_token.
        response = call()
        pipelines = response.get("statuses", [])
        while "next_page_token" in response.get("pagination", {}):
            response = call(page_token=response["pagination"]["next_page_token"])
            pipelines.extend(response.get("statuses", []))
        return pipelines

    def reset(self, pipeline_id, headers=None):
        self.client.reset(pipeline_id, headers)

    def run(self, pipeline_id, headers=None):
        self.client.run(pipeline_id, headers)

    def stop(self, pipeline_id, headers=None):
        self.client.stop(pipeline_id, headers)

    def _upload_libraries_and_update_spec(self, spec):
        spec = copy.deepcopy(spec)
        lib_objects = LibraryObject.from_json(spec.get('libraries', []))
        local_lib_objects, external_lib_objects = self._identify_local_libraries(lib_objects)
        spec['libraries'] = LibraryObject.to_json(
            external_lib_objects + self._upload_local_libraries(local_lib_objects))
        return spec

    @staticmethod
    def _identify_local_libraries(lib_objects):
        """
        Partitions the given libraries into local libraries and those already present in an
        external store (DBFS, S3, etc.). Local libraries are (currently) jar files with a
        file scheme or no scheme at all; all other libraries must already live in a
        supported external source.

        :param lib_objects: List[LibraryObject]
        :return: Tuple[List[LibraryObject], List[LibraryObject]] (local, external)
        """
        local_lib_objects, external_lib_objects = [], []
        for lib_object in lib_objects:
            if lib_object.lib_type == 'maven':
                external_lib_objects.append(lib_object)
                continue
            parsed_uri = urllib.parse.urlparse(lib_object.path)
            if lib_object.lib_type in supported_lib_types and parsed_uri.scheme == '':
                local_lib_objects.append(lib_object)
            elif lib_object.lib_type in supported_lib_types and \
                    parsed_uri.scheme.lower() == 'file':
                # A file URI must have exactly one or exactly three slashes after "file:".
                if parsed_uri.path.startswith('//') or parsed_uri.netloc != '':
                    raise RuntimeError('invalid file uri scheme, '
                                       'did you mean to use file:/ or file:///')
                local_lib_objects.append(LibraryObject(lib_object.lib_type, parsed_uri.path))
            else:
                external_lib_objects.append(lib_object)
        return local_lib_objects, external_lib_objects

    def _upload_local_libraries(self, local_lib_objects):
        remote_lib_objects = [LibraryObject(llo.lib_type, self._get_hashed_path(llo.path))
                              for llo in local_lib_objects]
        transformed_remote_lib_objects = [LibraryObject(rlo.lib_type, DbfsPath(rlo.path))
                                          for rlo in remote_lib_objects]
        # Only upload files whose content-addressed path is not already present in DBFS.
        upload_files = [llo_tuple for llo_tuple in
                        zip(local_lib_objects, transformed_remote_lib_objects)
                        if not self.dbfs_client.file_exists(llo_tuple[1].path)]
        for llo, rlo in upload_files:
            self.dbfs_client.put_file(llo.path, rlo.path, False)
        return remote_lib_objects

    @staticmethod
    def _get_hashed_path(path):
        """
        Finds the corresponding DBFS file path for the file located at the supplied path by
        computing its SHA-1 hash.

        :param path: Local file path
        :return: Remote path (base_pipelines_dir + file_hash + '.' + file_extension)
        """
        hash_buffer = sha1()
        with open(path, 'rb') as f:
            while True:
                data = f.read(BUFFER_SIZE)
                if not data:
                    break
                hash_buffer.update(data)

        file_hash = hash_buffer.hexdigest()
        # splitext includes the period in the extension, so strip it off.
        extension = os.path.splitext(path)[1][1:]
        if extension == 'whl':
            # Wheels must keep the naming format described in the PEP, so the original wheel
            # name (basename includes the extension) is preserved and placed under a
            # directory named after the content hash.
            wheel_name = os.path.basename(path)
            path = '{}/{}/{}'.format(base_pipelines_dir, file_hash, wheel_name)
        else:
            path = '{}/{}.{}'.format(base_pipelines_dir, file_hash, extension)
        return path
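
# Minimal usage sketch (not part of the original module): the spec fields, library paths,
# and profile name are hypothetical, and the 'pipeline_id' key in the create response is an
# assumption about the Pipelines REST API; only methods defined on PipelinesApi are used.
def _example_pipelines_api_usage():
    pipelines = PipelinesApi(get_api_client('DEFAULT'))
    spec = {
        'name': 'example-pipeline',
        'storage': 'dbfs:/pipelines/example',
        'libraries': [
            {'jar': 'file:/local/path/to/pipeline.jar'},          # local: hashed and uploaded
            {'maven': {'coordinates': 'com.example:lib:1.0.0'}},  # external: passed through
        ],
    }
    created = pipelines.create(spec, allow_duplicate_names=False)
    # deploy() expects the pipeline id inside the spec (see data['id'] above).
    spec['id'] = created['pipeline_id']
    pipelines.deploy(spec, allow_duplicate_names=False)
    return pipelines.get(spec['id'])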