def run_batch_job(self,
                      job_json,
                      background=True,
                      timeout=constants.SYNC_JOB_MAX_WAIT_TIME):
        """
        Submits a job request using Livy batches endpoint

        Keyword arguments:
            job_json {str} -- Job request payload
            background {bool} -- Flag indicating if the method should wait until the job finishes or return immediately after submitting the request.
            timeout {int} -- Maximum number of seconds to wait for the job to reach a terminal state. Only used when background is False.

        Returns:
             response {dict} -- Dictionary with job id, state and the application id.

        Raises:
            ServiceError -- If the submission request fails, or if the job does not reach a
                            terminal state within the timeout while waiting synchronously.
        """
        job_url = "{}/batches".format(self.url)
        response = RestUtil.request_with_retry().post(
            url=job_url,
            json=job_json,
            headers={"Content-Type": "application/json"},
            auth=self.auth)

        if not response.ok:
            raise ServiceError("Failed to run job. " + response.text)

        job_response = response.json()
        job_id = job_response.get("id")
        state = job_response.get("state")
        app_id = job_response.get("appId")

        # Only an explicit False switches to synchronous mode.
        if background is False:
            terminal_states = (constants.LIVY_JOB_FINISHED_STATE,
                               constants.LIVY_JOB_FAILED_STATE,
                               constants.LIVY_JOB_DEAD_STATE,
                               constants.LIVY_JOB_KILLED_STATE)
            start_time = time.time()
            elapsed_time = 0
            sleep_time = 15
            while state not in terminal_states:
                # The timeout is checked before sleeping, so the total wait can
                # exceed the timeout by up to one polling interval.
                if elapsed_time > int(timeout):
                    raise ServiceError(
                        "Job didn't come to Finished/Failed state in {} seconds. Current state is {}"
                        .format(timeout, state))
                print("{}: Sleeping for {} seconds. Current state {}".format(
                    datetime.datetime.now(), sleep_time, state))
                sleep(sleep_time)
                elapsed_time = time.time() - start_time
                status = self.get_job_status(job_id)
                state = status.get("state")
                app_id = status.get("appId")

        return {"id": job_id, "state": state, "appId": app_id}
    def download_file(self, file_name_with_path):
        """
        Downloads a file from HDFS location identified by the path

        The path is first resolved through _get_actual_download_file_path. The file is then
        opened with op=OPEN without following redirects, and the content is fetched from the
        URL given in the Location header of the 307 response.

        Keyword arguments:
            file_name_with_path {str} -- Name of the file identified with a path

        Returns:
             response -- Default Flask response object with file content and appropriate headers set

        Raises:
            ObjectNotFoundError -- If the open request returns 404.
            ServiceError -- If the open request does not return a 307 redirect, the redirect
                            carries no Location header, or the redirected download fails.
        """
        file_name_with_path = self._get_actual_download_file_path(
            file_name_with_path)

        open_file_url = self.url + file_name_with_path + "?op=OPEN"

        response = RestUtil.request_with_retry().get(open_file_url,
                                                     auth=self.auth,
                                                     allow_redirects=False)
        if response.status_code != 307:
            if response.status_code == 404:
                raise ObjectNotFoundError(
                    "File {} not found.".format(file_name_with_path))
            raise ServiceError(
                "Attempt to open file {0} failed with {1} and {2}.".format(
                    file_name_with_path, response.status_code,
                    response.reason))

        file_download_url = None
        if response.headers is not None:
            file_download_url = response.headers.get("Location")

        if file_download_url is None:
            raise ServiceError(
                "Attempt to open file {0} failed: redirect response has no Location header."
                .format(file_name_with_path))

        res = RestUtil.request_with_retry().get(file_download_url,
                                                auth=self.auth,
                                                stream=True)
        # Check the download response itself, not the earlier 307 redirect.
        if not res.ok:
            raise ServiceError(
                "Attempt to download file {0} failed with {1} and {2}.".
                format(file_name_with_path, res.status_code, res.reason))

        download_response = Response(res.content, headers=dict(res.headers))
        download_response.headers['Content-Type'] = 'application/octet-stream'
        # Only the last path component is offered as the attachment name.
        download_response.headers[
            'Content-Disposition'] = 'attachment;filename="{}"'.format(
                file_name_with_path.split("/")[-1])

        return download_response
    def get_job_status(self, job_id):
        """
        Looks up a batch job through Livy's batches endpoint and reports its status

        Keyword arguments:
            job_id {str} -- Job identifier

        Returns:
             response {dict} -- Dictionary with job id, state and the application id.

        Raises:
            ObjectNotFoundError -- If the endpoint answers with 404.
            ServiceError -- If the endpoint answers with any other non-ok status.
        """
        status_url = "{}/batches/{}".format(self.url, job_id)
        http = RestUtil.request_with_retry()
        reply = http.get(url=status_url, auth=self.auth)

        if reply.ok:
            payload = reply.json()
            # Expose only the fields callers rely on.
            return {
                field: payload.get(field)
                for field in ("id", "state", "appId")
            }

        if reply.status_code == 404:
            raise ObjectNotFoundError(
                "Job with id {} not found.".format(job_id))

        raise ServiceError("Failed to get jobs state. " + reply.text)
    def get_job_logs(self, job_id, size):
        """
        Retrieves the logs of a batch job through Livy's batches logs endpoint

        Keyword arguments:
            job_id {str} -- Job identifier
            size {int} -- Number of log lines to be returned; the size query parameter
                          is only sent when this is a positive number

        Returns:
             response -- Http method response

        Raises:
            ObjectNotFoundError -- If the endpoint answers with 404.
            ServiceError -- If the endpoint answers with any other non-ok status.
        """
        logs_url = "{}/batches/{}/logs".format(self.url, job_id)

        wants_size_limit = size is not None and size > 0
        if wants_size_limit:
            logs_url += "?size={}".format(size)

        http = RestUtil.request_with_retry()
        reply = http.get(url=logs_url, auth=self.auth)

        if reply.ok:
            return reply

        if reply.status_code == 404:
            raise ObjectNotFoundError(
                "Job with id {} not found.".format(job_id))

        raise ServiceError("Failed to get job logs. " + reply.text)
    def delete_file(self, file_name_with_path):
        """
        Removes the file identified by the path from HDFS

        A path without a file extension is deleted with recursive=true.

        Keyword arguments:
            file_name_with_path {str} -- Name of the file identified with a path

        Returns:
             response -- Http method response

        Raises:
            ServiceError -- If the delete request answers with a non-ok status.
        """
        target_url = self.url + file_name_with_path + "?op=DELETE"

        extension = os.path.splitext(file_name_with_path)[1]
        if not extension:
            target_url += "&recursive=true"

        http = RestUtil.request_with_retry()
        reply = http.delete(target_url, auth=self.auth)

        if reply.ok:
            return reply

        raise ServiceError(
            "Attempt to delete file {0} failed with {1} and {2}.".format(
                file_name_with_path, reply.status_code, reply.reason))
    def upload_directory(self, directory_path, archive_directory_data):
        """
        Untars the archive_directory_data provided as input, and uploads all the
        contents of the tar to the directory path specified on HDFS.

        Keyword arguments:
            directory_path {str} -- Target directory path on HDFS, without the leading "/"
            archive_directory_data {bytes} -- Content of a gzip-compressed tar archive

        Returns:
             location {str} -- HDFS file base url followed by the target directory path

        Raises:
            ServiceError -- If writing, extracting or uploading the archive fails, or if
                            an archive member would be extracted outside the working directory.
        """
        logger.log_info("Uploading the directory to HDFS")
        web_hdfs_url = Environment().get_web_hdfs_url()
        hdfs_file_base_url = Environment().get_hdfs_file_base_url()
        session = SwSessionManager().get_session()
        user_name = session.get_username()
        client = InsecureClient(web_hdfs_url, user_name)
        directory_name_with_path = "/" + directory_path
        directory_name = os.path.split(directory_path)[1]
        try:
            with tempfile.TemporaryDirectory() as temp:
                local_dir_path = os.path.join(temp,
                                              directory_name + ".tar.gz")
                with open(local_dir_path, "wb") as dir_archive:
                    dir_archive.write(archive_directory_data)

                extraction_root = os.path.realpath(temp)
                with tarfile.open(local_dir_path, "r:gz") as tar:
                    # The archive comes from the caller: refuse members whose
                    # path would escape the working directory.
                    for member in tar.getmembers():
                        member_target = os.path.realpath(
                            os.path.join(extraction_root, member.name))
                        if os.path.commonpath(
                                [extraction_root,
                                 member_target]) != extraction_root:
                            raise ServiceError(
                                "Archive member {0} would be extracted outside the target directory"
                                .format(member.name))
                    tar.extractall(temp)

                # Drop the archive itself so only its extracted content is uploaded.
                os.remove(local_dir_path)
                response = client.upload(hdfs_path=directory_name_with_path,
                                         local_path=temp)
                logger.log_info(
                    "Successfully uploaded the directory {0} to HDFS".format(
                        response))
            return hdfs_file_base_url + directory_name_with_path

        except Exception as e:
            # Chain the original exception so its traceback is preserved.
            raise ServiceError(
                "Uploading the directory to HDFS failed with the error: {0}".
                format(str(e))) from e
 def download_directory(self, directory_url):
     """
     Downloads a directory from remote HDFS to local, archives it and
     returns the gzip-compressed tar archive of the directory.

     Keyword arguments:
         directory_url {str} -- Absolute url of the directory on HDFS

     Returns:
          response -- Result of send_file for the in-memory archive, offered as
                      an attachment named "<directory name>.tar.gz"

     Raises:
         ServiceError -- If downloading, archiving or sending the directory fails.
     """
     logger.log_info("Downloading the directory {0} ".format(directory_url))
     # Remove the base url from the absolute directory path provided as parameter
     # For example, if the absolute path is hdfs://alpha:9000/configuration/12345/drift,
     # the below statement will return /configuration/12345/drift
     directory_name_with_path = urllib3.util.parse_url(directory_url).path
     directory_name = os.path.split(directory_name_with_path)[1]
     web_hdfs_url = Environment().get_web_hdfs_url()
     session = SwSessionManager().get_session()
     user_name = session.get_username()
     client = InsecureClient(web_hdfs_url, user_name)
     try:
         # The archive is built in its own temporary directory so that it is
         # removed on exit; building it next to the download directory would
         # leave a stray .tar.gz file behind.
         with tempfile.TemporaryDirectory() as temp, \
                 tempfile.TemporaryDirectory() as archive_dir:
             client.download(hdfs_path=directory_name_with_path,
                             local_path=temp,
                             n_threads=5)
             archive_path = shutil.make_archive(
                 os.path.join(archive_dir, "archive"), 'gztar', temp)
             # Load the archive into memory; the file on disk is deleted when
             # the with block exits.
             with open(archive_path, "rb") as output_data:
                 data = io.BytesIO(output_data.read())
         return send_file(data,
                          as_attachment=True,
                          attachment_filename=directory_name + ".tar.gz")
     except Exception as e:
         # Chain the original exception so its traceback is preserved.
         raise ServiceError(
             "Downloading the folder from HDFS failed with the error: {0}".
             format(str(e))) from e
    def delete_directory(self, directory_url):
        """
        Recursively deletes from HDFS the directory identified by the url

        Keyword arguments:
            directory_url {str} -- Url of the directory; only its path component is used

        Raises:
            ServiceError -- If the delete call reports a falsy result or any step fails.
                            Every failure, including the "doesn't exist" error raised
                            inside the try block, is re-wrapped by the handler below.
        """
        hdfs_endpoint = Environment().get_web_hdfs_url()
        current_user = SwSessionManager().get_session().get_username()
        hdfs_client = InsecureClient(hdfs_endpoint, current_user)
        try:
            target_path = urllib3.util.parse_url(directory_url).path
            logger.log_info("Deleting the directory {}".format(target_path))
            if hdfs_client.delete(target_path, recursive=True):
                return
            raise ServiceError(
                "Directory {0} doesn't exist".format(target_path))

        except Exception as err:
            raise ServiceError(
                "Deleting the folder from HDFS failed with the error: {0}".
                format(str(err)))
    def _get_actual_download_file_path(self, file_name_with_path):
        """
        Resolves the given HDFS path to the path of the single file to download

        The path is listed with op=LISTSTATUS. A listing whose only entry has an empty
        pathSuffix resolves to the path itself. A single FILE entry resolves to that entry,
        and a single DIRECTORY entry is resolved recursively.

        Keyword arguments:
            file_name_with_path {str} -- Name of the file or folder identified with a path

        Returns:
             download_file_path {str} -- Resolved file path, or None when the listing carries
                                         no file statuses or the single entry is neither a
                                         FILE nor a DIRECTORY

        Raises:
            ObjectNotFoundError -- If the path does not exist (404) or the listing is empty.
            BadRequestError -- If the path is a directory containing more than one entry.
            ServiceError -- If the listing request fails with any other status.
        """
        list_status_url = self.url + file_name_with_path + "?op=LISTSTATUS"

        response = RestUtil.request_with_retry().get(list_status_url,
                                                     auth=self.auth)

        if not response.ok:
            if response.status_code == 404:
                raise ObjectNotFoundError(
                    "File {} not found.".format(file_name_with_path))
            raise ServiceError(
                "Attempt to open file {0} failed with {1} and {2}.".format(
                    file_name_with_path, response.status_code,
                    response.reason))

        list_status_response = json.loads(response.text)
        if list_status_response is None:
            return None

        files_statuses = list_status_response.get("FileStatuses")
        if files_statuses is None:
            return None

        file_status_list = files_statuses.get("FileStatus")
        if file_status_list is None:
            return None

        if len(file_status_list) > 1:
            raise BadRequestError(
                "Specified path is a directory containing multiple files. Supported only if single part file is inside folder."
            )

        # An empty listing (e.g. an empty directory) has nothing to download;
        # report it explicitly instead of failing with an IndexError below.
        if not file_status_list:
            raise ObjectNotFoundError(
                "No file found at path {}.".format(file_name_with_path))

        file_status = file_status_list[0]
        path_suffix = file_status["pathSuffix"]
        if len(path_suffix) == 0:
            # An empty suffix means the listed entry is the requested path itself.
            return file_name_with_path

        entry_path = file_name_with_path + "/" + path_suffix
        if file_status["type"] == "DIRECTORY":
            return self._get_actual_download_file_path(entry_path)
        if file_status["type"] == "FILE":
            return entry_path

        return None
# Example no. 10
# 0
    def upload_file(self, file_name_with_path, data, overwrite=False):
        """
        Uploads file to a HDFS location identified by the path

        The file is created with op=CREATE without following redirects, and the data is then
        written to the URL given in the Location header of the 307 response. After a
        successful write the file is listed once to verify that it exists.

        Keyword arguments:
            file_name_with_path {str} -- Name of the file identified with a path
            data {bytearray} -- Byte array representation of the file
            overwrite {bool} -- Flag indicating of the file should be overwritten

        Returns:
             response {dict} -- Dictionary denoting the status of the upload operation and the relative location of the file.

        Raises:
            ServiceError -- If the create request does not return a 307 redirect, the redirect
                            carries no Location header, the write fails, or the file is not
                            found after the upload.
        """
        # Temporary fix until the right change is made in the ibm-wos-utils module:
        # upload this specific file only when it is not found in the HDFS location.
        if "main_job.py" in file_name_with_path:
            check_file_status_url = self.url + file_name_with_path + "?op=LISTSTATUS"
            response = RestUtil.request_with_retry().get(check_file_status_url,
                                                         auth=self.auth)
            if response.status_code == 200:
                logger.log_warning("File already exists.. Skipping upload...")
                response = {
                    "status": "finished",
                    "location": file_name_with_path
                }
                return response

        create_file_url = self.url + file_name_with_path + "?op=CREATE"
        if overwrite:
            create_file_url = create_file_url + "&overwrite=true"

        response = RestUtil.request_with_retry().put(create_file_url,
                                                     auth=self.auth,
                                                     allow_redirects=False)
        if response.status_code != 307:
            raise ServiceError(
                "Attempt to create file {0} failed with {1} and {2}.".format(
                    file_name_with_path, response.status_code,
                    response.reason))

        file_write_url = None
        if response.headers is not None:
            file_write_url = response.headers.get("Location")

        if file_write_url is None:
            raise ServiceError(
                "Attempt to create file {0} failed: redirect response has no Location header."
                .format(file_name_with_path))

        response = RestUtil.request_with_retry().put(file_write_url,
                                                     auth=self.auth,
                                                     data=data)

        retry_attempt = 0
        sleep_factor = random.randint(1, 5)
        # If the file upload fails with 404, during multiple parallel requests trying to upload the same file,
        # attempting retry up-to 5 times with a random start sleep time ranging between 1 and 5 seconds
        # and a back-off factor of 1.5. Retrying stops as soon as a write no longer returns 404.
        while response.status_code == 404 and retry_attempt < 5:
            sleep_factor = sleep_factor * 1.5
            time.sleep(sleep_factor)
            retry_attempt += 1
            logger.log_info("Re-attempt {} of file {} upload.".format(
                retry_attempt, file_name_with_path))
            actual_response = RestUtil.request_with_retry().put(
                create_file_url, auth=self.auth, allow_redirects=False)
            file_url = None
            if actual_response.headers is not None:
                file_url = actual_response.headers.get("Location")
            # Without a redirect target there is nothing to write to on this
            # attempt; keep the previous 404 response and try again.
            if file_url is not None:
                response = RestUtil.request_with_retry().put(
                    file_url, auth=self.auth, data=data)

        if not response.ok:
            raise ServiceError(
                "Attempt to write to file {0} failed with {1} and {2}.".
                format(file_name_with_path, response.status_code,
                       response.reason))

        # Verify that the file is actually present after the write.
        check_file_status_url = self.url + file_name_with_path + "?op=LISTSTATUS"
        response = RestUtil.request_with_retry().get(check_file_status_url,
                                                     auth=self.auth)
        if response.status_code == 404:
            raise ServiceError(
                "File {} not found".format(file_name_with_path))

        return {"status": "finished", "location": file_name_with_path}