Example #1

def __init__(self):
    self.url = Environment().get_spark_livy_url()
    self.auth = None
    if Environment().is_kerberos_enabled():
        from requests_kerberos import HTTPKerberosAuth, REQUIRED
        self.auth = HTTPKerberosAuth(mutual_authentication=REQUIRED,
                                     sanitize_mutual_error_response=False)

Example #2

def __init__(self):
    if Environment().get_base_hdfs_location():
        self.file_path_prefix = Environment().get_hdfs_file_base_url() + "/" + \
            Environment().get_base_hdfs_location()
    else:
        self.file_path_prefix = Environment().get_hdfs_file_base_url()
    self.client = LivyClient()
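How the prefix resolves, sketched with invented return values (the real ones come from the deployment's configuration):

# Assume, for illustration only:
#   get_hdfs_file_base_url() -> "hdfs://namenode:9000"
#   get_base_hdfs_location() -> "user/openscale"
# Then:
#   file_path_prefix == "hdfs://namenode:9000/user/openscale"
# With no base location configured, the prefix is just the base URL.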

Example #3

def upload_directory(self, directory_path, archive_directory_data):
    '''Untars the archive_directory_data provided as input,
    and uploads all the contents of the tar to the directory path
    specified on HDFS.
    '''
    logger.log_info("Uploading the directory to HDFS")
    web_hdfs_url = Environment().get_web_hdfs_url()
    hdfs_file_base_url = Environment().get_hdfs_file_base_url()
    session = SwSessionManager().get_session()
    user_name = session.get_username()
    client = InsecureClient(web_hdfs_url, user_name)
    directory_name_with_path = "/" + directory_path
    directory_name = os.path.split(directory_path)[1]
    try:
        with tempfile.TemporaryDirectory() as temp:
            local_dir_path = temp + "/" + directory_name + ".tar.gz"
            with open(local_dir_path, "wb") as dir_archive:
                dir_archive.write(archive_directory_data)
            with tarfile.open(local_dir_path, "r:gz") as tar:
                tar.extractall(temp)
            os.remove(local_dir_path)
            response = client.upload(hdfs_path=directory_name_with_path,
                                     local_path=temp)
            logger.log_info(
                "Successfully uploaded the directory {0} to HDFS".format(
                    response))
        return hdfs_file_base_url + directory_name_with_path
    except Exception as e:
        raise ServiceError(
            "Uploading the directory to HDFS failed with the error: {0}".format(
                str(e)))
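A hypothetical caller (the HdfsUtil class name and file path are placeholders, not from the source): read an archive from disk and push its contents to HDFS.

with open("drift_archive.tar.gz", "rb") as f:
    archive_bytes = f.read()

# upload_directory() extracts the archive locally, then mirrors its
# contents under /configuration/12345/drift on HDFS.
hdfs_url = HdfsUtil().upload_directory("configuration/12345/drift", archive_bytes)
print(hdfs_url)  # e.g. hdfs://namenode:9000/configuration/12345/drift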

Example #4
def get_user_principal(username):
    principals = Environment().get_kerberos_principals()
    if principals is not None:
        return principals.get(username)
    else:
        raise AuthenticationError(
            "No Kerberos principal found for the user {}".format(username))
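A sketch of the expected data shape, assuming get_kerberos_principals() returns a plain username-to-principal dict (values invented):

# Illustrative mapping only; the real values come from the environment.
principals = {"alice": "alice@EXAMPLE.COM", "bob": "bob@EXAMPLE.COM"}
# get_user_principal("alice") would then return "alice@EXAMPLE.COM";
# an unknown user yields None, which authenticate() treats as missing.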

Example #5
def authenticate(self):
    """
    Checks whether the user is valid.
    """
    try:
        auth_header = self.basic_token
        username, password = decode(auth_header)

        user_principal = None
        allowlisted_users = Environment().get_allowlisted_users()
        if allowlisted_users is not None:
            password_from_allowlist = allowlisted_users.get(username)
            if password_from_allowlist is None or password_from_allowlist != password:
                logger.log_error("Invalid user credentials provided")
                raise AuthenticationError("Invalid user credentials")
        else:
            raise AuthenticationError("No allowlisted users found to authenticate against")

        if Environment().is_kerberos_enabled():
            user_principal = self.get_user_principal(username)
            key_tab_path = Environment().get_hdfs_keytab_file_path()
            logger.log_info("Minting a Kerberos ticket for principal {} using keytab {}".format(
                user_principal, key_tab_path))
            if key_tab_path is None or user_principal is None:
                raise AuthenticationError("Keytab file or Kerberos principal missing")
            returncode = KerberosUtil.renew_kinit(key_tab_path, user_principal)
            logger.log_info("kinit return code: " + str(returncode))

        return username, user_principal
    except Exception as e:
        logger.log_exception("Failed while authenticating user", exc_info=True)
        raise AuthenticationError(str(e))
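The decode() helper is not shown in this listing; one plausible shape for it, assuming a standard HTTP Basic auth header (an assumption, not the project's actual implementation):

import base64

def decode(auth_header):
    # "Basic dXNlcjpwYXNz" -> ("user", "pass"); tolerate a bare token too.
    token = auth_header.split(" ", 1)[1] if " " in auth_header else auth_header
    username, _, password = base64.b64decode(token).decode("utf-8").partition(":")
    return username, password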

Example #6
def __init__(self, record_type, deployment_id):
    self.record_type = RecordType(record_type)
    self.env = Environment()
    config = self.env.get_deployment_config(deployment_id)
    self.database = config.get("database")
    self.schema = config.get("schema")
    self.table = config.get(self.record_type.value + "_table")

Example #7

def __update_absolute_hdfs_file_path(file_name_with_path):
    param_dict = {"hdfs": Environment().get_base_hdfs_location()}
    if file_name_with_path is not None:
        if file_name_with_path.startswith("$hdfs"):
            # Resolve the $hdfs placeholder via string.Template substitution
            replaced_file_path = Template(file_name_with_path)
            replaced_file_path = replaced_file_path.substitute(param_dict)
            return replaced_file_path
        elif file_name_with_path.startswith("/"):
            file_name_with_path = Environment().get_base_hdfs_location() + \
                "/" + file_name_with_path[1:]
        else:
            file_name_with_path = Environment().get_base_hdfs_location() + \
                "/" + file_name_with_path

    return file_name_with_path
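The $hdfs substitution relies on string.Template; a self-contained illustration with invented paths:

from string import Template

path = Template("$hdfs/models/drift.tar.gz").substitute({"hdfs": "user/openscale"})
print(path)  # user/openscale/models/drift.tar.gz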

Example #8

def download_directory(self, directory_url):
    '''Downloads a directory from remote HDFS to local storage, archives it
    and returns the gzipped tar of the directory.'''
    logger.log_info("Downloading the directory {0} ".format(directory_url))
    # Strip the base URL from the absolute directory path provided as parameter.
    # For example, if the absolute path is hdfs://alpha:9000/configuration/12345/drift,
    # the statement below returns /configuration/12345/drift
    directory_name_with_path = urllib3.util.parse_url(directory_url).path
    directory_name = os.path.split(directory_name_with_path)[1]
    web_hdfs_url = Environment().get_web_hdfs_url()
    session = SwSessionManager().get_session()
    user_name = session.get_username()
    client = InsecureClient(web_hdfs_url, user_name)
    try:
        with tempfile.TemporaryDirectory() as temp:
            client.download(hdfs_path=directory_name_with_path,
                            local_path=temp,
                            n_threads=5)
            data = io.BytesIO()
            # make_archive() writes <temp>.tar.gz next to, not inside, the
            # temporary directory, so remove it once its bytes are buffered.
            archive_path = shutil.make_archive(temp, 'gztar', temp)
            with open(archive_path, "rb") as output_data:
                data.write(output_data.read())
            os.remove(archive_path)
            data.seek(0)
        return send_file(data,
                         as_attachment=True,
                         attachment_filename=directory_name + ".tar.gz")
    except Exception as e:
        raise ServiceError(
            "Downloading the folder from HDFS failed with the error: {0}".format(
                str(e)))
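A hypothetical Flask route wiring this into an HTTP endpoint (the app, route, and HdfsUtil names are assumptions):

from flask import Flask

app = Flask(__name__)

@app.route("/v1/directories/<path:directory_id>", methods=["GET"])
def get_directory(directory_id):
    directory_url = "hdfs://alpha:9000/configuration/" + directory_id
    # send_file() inside download_directory() produces the attachment response.
    return HdfsUtil().download_directory(directory_url)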

Example #9

def __add_additional_parameters_in_run_request(run_request_json):
    conf = run_request_json.get("conf")
    if conf is None:
        run_request_json["conf"] = {}

    # Add archives to the job payload
    if Environment().get_wos_env_archive_location() is not None:
        archives = run_request_json.get("archives")
        if archives is None:
            archives = []
        archives.append(Environment().get_wos_env_archive_location())
        run_request_json["archives"] = archives
        if Environment().get_wos_env_site_packages_path() is not None:
            site_packages_path = Environment().get_wos_env_site_packages_path()
            run_request_json["conf"]["spark.yarn.appMasterEnv.PYTHONPATH"] = site_packages_path
            run_request_json["conf"]["spark.executorEnv.PYTHONPATH"] = site_packages_path

    # Point Spark at the Kerberos principal and keytab for the session user
    if Environment().is_kerberos_enabled():
        session = SwSessionManager().get_session()
        run_request_json["conf"]["spark.yarn.principal"] = session.get_user_principal()
        run_request_json["conf"]["spark.yarn.keytab"] = \
            Environment().get_spark_yarn_keytab_file_path()

    return run_request_json
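An illustrative before/after of the payload rewrite (all values invented):

# Input:
#   {"file": "job.py"}
# With an archive location configured and Kerberos enabled, the request
# comes back shaped roughly like:
#   {"file": "job.py",
#    "archives": ["hdfs:///wos_env/wos_env.tar.gz"],
#    "conf": {"spark.yarn.appMasterEnv.PYTHONPATH": ".../site-packages",
#             "spark.executorEnv.PYTHONPATH": ".../site-packages",
#             "spark.yarn.principal": "alice@EXAMPLE.COM",
#             "spark.yarn.keytab": "/etc/security/keytabs/alice.keytab"}}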

Example #10

def delete_directory(self, directory_url):
    web_hdfs_url = Environment().get_web_hdfs_url()
    session = SwSessionManager().get_session()
    user_name = session.get_username()
    client = InsecureClient(web_hdfs_url, user_name)
    try:
        directory_name_with_path = urllib3.util.parse_url(directory_url).path
        logger.log_info(
            "Deleting the directory {}".format(directory_name_with_path))
        response = client.delete(directory_name_with_path, recursive=True)
        if not response:
            raise ServiceError("Directory {0} doesn't exist".format(
                directory_name_with_path))
        return
    except Exception as e:
        raise ServiceError(
            "Deleting the folder from HDFS failed with the error: {0}".format(
                str(e)))
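A hypothetical cleanup call (class name assumed); the WebHDFS client returns a falsy value when the path does not exist, which surfaces here as a ServiceError:

HdfsUtil().delete_directory("hdfs://alpha:9000/configuration/12345/drift")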

Example #11
class DataProvider:
    def __init__(self, record_type, deployment_id):
        self.record_type = RecordType(record_type)
        self.env = Environment()
        config = self.env.get_deployment_config(deployment_id)
        self.database = config.get("database")
        self.schema = config.get("schema")
        self.table = config.get(self.record_type.value + "_table")

    def get_records(self, search_filter, column_filter, order_by, limit,
                    offset):
        conn = None
        cursor = None
        try:
            conn = hive.connect(host=self.env.get_db_hostname(),
                                database=self.database)
            logger.log_debug("Created connection")
            cursor = conn.cursor()

            fields, fields_types = self.__get_fields_types(
                cursor, self.table, column_filter)

            query = self.__get_query(column_filter=column_filter,
                                     table=self.table,
                                     search_filter=search_filter,
                                     order_by=order_by,
                                     limit=limit,
                                     offset=offset)
            logger.log_info("Executing query: " + query)
            cursor.execute(query)
            rows = cursor.fetchall()

            values = self.__get_values(rows=rows,
                                       fields=fields,
                                       fields_types=fields_types)

            return {"fields": fields, "values": values}
        except Exception as e:
            logger.log_error(
                "Failed while fetching data from database with error : " +
                str(e))
            raise e
        finally:
            if cursor:
                cursor.close()
            if conn:
                conn.close()

    def __get_values(self, rows, fields, fields_types):
        # Normalize DB-native values into JSON-friendly ones: timestamps
        # become ISO-8601 strings and binary columns become UTF-8 text.
        values = []
        for row in rows:
            value = list(row)
            for i in range(len(value)):
                if fields_types.get(fields[i]) == "timestamp":
                    value[i] = value[i].strftime("%Y-%m-%dT%H:%M:%S.%fZ")
                if fields_types.get(fields[i]) == "binary":
                    value[i] = value[i].decode("utf-8")
            values.append(value)
        return values

    def __get_query(self, column_filter, table, search_filter, order_by, limit,
                    offset):
        # search_filter uses a comma-separated "column:op:value" grammar
        # (e.g. "label:eq:1"); order_by is "column:asc|desc".
        if column_filter:
            query = "SELECT " + column_filter + " FROM " + table
        else:
            query = "SELECT * FROM " + table

        if search_filter:
            filters = search_filter.split(",")
            conditions = []
            for f in filters:
                fs = f.split(":")
                if fs[1] == "eq":
                    conditions.append("{}='{}'".format(fs[0], fs[2]))
                elif fs[1] == "in":
                    values = ",".join(
                        ["'{}'".format(x) for x in fs[2].split(",")])
                    conditions.append("{} IN ({})".format(fs[0], values))

            if conditions:
                query = "{} WHERE {}".format(query, " AND ".join(conditions))

        if order_by:
            fs = order_by.split(":")
            query = "{} ORDER BY {} {}".format(query, fs[0], fs[1].upper())

        limit = min(limit, 100)  # cap the page size at 100 rows
        return "{} LIMIT {} OFFSET {}".format(query, limit, offset)

    def __get_fields_types(self, cursor, table, column_filter):
        # Hive's DESCRIBE returns (column_name, type, comment) rows.
        cursor.execute("describe {}".format(table))
        columns = cursor.fetchall()
        fields_types = {c[0]: c[1] for c in columns}
        fields = [c[0] for c in columns]

        if column_filter:
            fields = column_filter.split(",")

        return fields, fields_types
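
A usage sketch for DataProvider with invented identifiers, showing the column:op:value filter grammar the query builder expects:

provider = DataProvider(record_type="feedback", deployment_id="dep-12345")
page = provider.get_records(search_filter="label:eq:1",
                            column_filter="id,label,prediction",
                            order_by="scoring_timestamp:desc",
                            limit=10,
                            offset=0)
print(page["fields"])
print(len(page["values"]))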