Example 1
from databricks_cli.dbfs.api import DbfsApi
from databricks_cli.dbfs.dbfs_path import DbfsPath
from databricks_cli.runs.api import RunsApi


class ApiClient:
    def __init__(self, profile=None):
        # get_api_client is a helper defined elsewhere that builds an
        # authenticated databricks-cli SDK client for the given CLI profile.
        api_client = get_api_client(profile)
        self.dbfs_client = DbfsApi(api_client)
        self.runs_client = RunsApi(api_client)

    def mkdirs(self, dbfs_path):
        return self.dbfs_client.mkdirs(DbfsPath(dbfs_path))

    def list_files(self, dbfs_path):
        return self.dbfs_client.list_files(DbfsPath(dbfs_path))

    def put_file(self, src_path, dbfs_path, overwrite=True):
        return self.dbfs_client.put_file(src_path, DbfsPath(dbfs_path), overwrite)

    def submit_run(self, json_data):
        return self.runs_client.submit_run(json_data)

    def get_run(self, run_id):
        return self.runs_client.get_run(run_id)
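
A minimal usage sketch of the wrapper above, assuming a configured DEFAULT databricks-cli profile and an existing local script; the cluster settings, DBFS paths and run payload are illustrative placeholders, not part of the snippet.

client = ApiClient(profile='DEFAULT')
client.mkdirs('dbfs:/tmp/example')                      # ensure the target folder exists
client.put_file('job.py', 'dbfs:/tmp/example/job.py')   # upload, overwriting any existing copy

# Submit a one-off run; the payload follows the Runs Submit API shape.
run = client.submit_run({
    'run_name': 'example-run',
    'new_cluster': {'spark_version': '7.3.x-scala2.12',
                    'node_type_id': 'i3.xlarge',
                    'num_workers': 1},
    'spark_python_task': {'python_file': 'dbfs:/tmp/example/job.py'},
})
print(client.get_run(run['run_id']))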
Example 2
import copy
import os
import urllib.parse
from hashlib import sha1

from databricks_cli.dbfs.api import DbfsApi
from databricks_cli.dbfs.dbfs_path import DbfsPath


class PipelinesApi(object):
    def __init__(self, api_client):
        # DeltaPipelinesService, LibraryObject, supported_lib_types, BUFFER_SIZE
        # and base_pipelines_dir are defined in the surrounding pipelines module.
        self.client = DeltaPipelinesService(api_client)
        self.dbfs_client = DbfsApi(api_client)

    def create(self, spec, allow_duplicate_names, headers=None):
        data = self._upload_libraries_and_update_spec(spec)
        data['allow_duplicate_names'] = allow_duplicate_names
        return self.client.client.perform_query('POST',
                                                '/pipelines',
                                                data=data,
                                                headers=headers)

    def deploy(self, spec, allow_duplicate_names, headers=None):
        data = self._upload_libraries_and_update_spec(spec)
        data['allow_duplicate_names'] = allow_duplicate_names
        pipeline_id = data['id']
        self.client.client.perform_query('PUT',
                                         '/pipelines/{}'.format(pipeline_id),
                                         data=data,
                                         headers=headers)

    def delete(self, pipeline_id, headers=None):
        self.client.delete(pipeline_id, headers)

    def get(self, pipeline_id, headers=None):
        return self.client.get(pipeline_id, headers)

    def list(self, headers=None):
        def call(page_token=None, max_results=None, order_by=None):
            _data = {}
            if page_token:
                _data["pagination.page_token"] = page_token
            if max_results:
                _data["pagination.max_results"] = max_results
            if order_by:
                _data["pagination.order_by"] = order_by

            return self.client.client.perform_query('GET',
                                                    '/pipelines',
                                                    data=_data,
                                                    headers=headers)

        response = call()
        pipelines = response.get("statuses", [])

        while "next_page_token" in response.get("pagination", {}):
            response = call(
                page_token=response["pagination"]["next_page_token"])
            pipelines.extend(response.get("statuses", []))
        return pipelines

    def reset(self, pipeline_id, headers=None):
        self.client.reset(pipeline_id, headers)

    def run(self, pipeline_id, headers=None):
        self.client.run(pipeline_id, headers)

    def stop(self, pipeline_id, headers=None):
        self.client.stop(pipeline_id, headers)

    def _upload_libraries_and_update_spec(self, spec):
        spec = copy.deepcopy(spec)
        lib_objects = LibraryObject.from_json(spec.get('libraries', []))
        local_lib_objects, external_lib_objects = self._identify_local_libraries(
            lib_objects)

        spec['libraries'] = LibraryObject.to_json(
            external_lib_objects +
            self._upload_local_libraries(local_lib_objects))
        return spec

    @staticmethod
    def _identify_local_libraries(lib_objects):
        """
        Partitions the given set of libraries into local and those already present in dbfs/s3 etc.
        Local libraries are (currently) jar files with a file scheme or no scheme at all.
        All other libraries should be present in a supported external source.
        :param lib_objects: List[LibraryObject]
        :return: List[List[LibraryObject], List[LibraryObject]] ([Local, External])
        """
        local_lib_objects, external_lib_objects = [], []
        for lib_object in lib_objects:
            if lib_object.lib_type == 'maven':
                external_lib_objects.append(lib_object)
                continue
            parsed_uri = urllib.parse.urlparse(lib_object.path)
            if lib_object.lib_type in supported_lib_types and parsed_uri.scheme == '':
                local_lib_objects.append(lib_object)
            elif (lib_object.lib_type in supported_lib_types
                  and parsed_uri.scheme.lower() == 'file'):
                # A file URI must use exactly one or three slashes (file:/path or
                # file:///path); file://host/path puts a host in netloc and is rejected.
                if parsed_uri.path.startswith('//') or parsed_uri.netloc != '':
                    raise RuntimeError(
                        'invalid file uri scheme, '
                        'did you mean to use file:/ or file:///')
                local_lib_objects.append(
                    LibraryObject(lib_object.lib_type, parsed_uri.path))
            else:
                external_lib_objects.append(lib_object)
        return local_lib_objects, external_lib_objects

    def _upload_local_libraries(self, local_lib_objects):
        remote_lib_objects = [
            LibraryObject(llo.lib_type, self._get_hashed_path(llo.path))
            for llo in local_lib_objects
        ]

        transformed_remote_lib_objects = [
            LibraryObject(rlo.lib_type, DbfsPath(rlo.path))
            for rlo in remote_lib_objects
        ]
        upload_files = [
            llo_tuple for llo_tuple in zip(local_lib_objects,
                                           transformed_remote_lib_objects)
            if not self.dbfs_client.file_exists(llo_tuple[1].path)
        ]

        for llo, rlo in upload_files:
            self.dbfs_client.put_file(llo.path, rlo.path, False)

        return remote_lib_objects

    @staticmethod
    def _get_hashed_path(path):
        """
        Finds the corresponding dbfs file path for the file located at the supplied path by
        calculating its hash using SHA1.
        :param path: Local File Path
        :return: Remote Path (pipeline_base_dir + file_hash (dot) file_extension)
        """
        hash_buffer = sha1()
        with open(path, 'rb') as f:
            while True:
                data = f.read(BUFFER_SIZE)
                if not data:
                    break
                hash_buffer.update(data)

        file_hash = hash_buffer.hexdigest()
        # splitext keeps the leading period, so strip it here
        extension = os.path.splitext(path)[1][1:]
        if extension == 'whl':
            # Wheel file names must follow the format described in PEP 427, so the
            # content hash becomes a directory prefix rather than part of the name.
            # os.path.basename keeps the extension, so wheel_name is the full file name.
            wheel_name = os.path.basename(path)
            path = '{}/{}/{}'.format(base_pipelines_dir, file_hash, wheel_name)
        else:
            path = '{}/{}.{}'.format(base_pipelines_dir, file_hash, extension)
        return path
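
A minimal usage sketch for PipelinesApi, assuming my_api_client is an authenticated SDK client and the pipeline id already exists; the spec contents and library paths are illustrative, and the libraries entries use the same JSON shape that LibraryObject.from_json parses.

api = PipelinesApi(my_api_client)

spec = {
    'id': '1234-abcd',                      # must already exist when calling deploy()
    'name': 'example-pipeline',
    'libraries': [
        {'jar': 'dist/etl_helpers.jar'},    # local: no URI scheme, so it gets uploaded to DBFS
        {'maven': {'coordinates': 'org.example:lib:1.0'}},  # external: passed through as-is
    ],
}

# deploy() uploads the local jar under a content-addressed path, e.g.
#   <base_pipelines_dir>/<sha1-of-file>.jar
# (a wheel would keep its name: <base_pipelines_dir>/<sha1-of-file>/<wheel-file-name>),
# rewrites spec['libraries'] to point at those DBFS paths, and PUTs the
# updated spec to /pipelines/1234-abcd.
api.deploy(spec, allow_duplicate_names=False)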