def __init__(self):
    self.url = Environment().get_spark_livy_url()
    self.auth = None
    if Environment().is_kerberos_enabled():
        from requests_kerberos import HTTPKerberosAuth, REQUIRED
        self.auth = HTTPKerberosAuth(mutual_authentication=REQUIRED,
                                     sanitize_mutual_error_response=False)
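# A minimal usage sketch, not part of the service code: assuming the `requests`
# package and a reachable Livy endpoint, the auth object built above is passed
# straight to the HTTP call when submitting a batch. The URL and payload below
# are illustrative placeholders only.
import requests
from requests_kerberos import HTTPKerberosAuth, REQUIRED

livy_url = "http://livy-host:8998"  # stand-in for Environment().get_spark_livy_url()
auth = HTTPKerberosAuth(mutual_authentication=REQUIRED,
                        sanitize_mutual_error_response=False)

# Submit a Livy batch; "file" must point to an application already on HDFS.
response = requests.post(livy_url + "/batches",
                         json={"file": "hdfs:///jobs/sample_job.py"},
                         auth=auth)
response.raise_for_status()
print(response.json())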
def __init__(self):
    if Environment().get_base_hdfs_location():
        self.file_path_prefix = Environment().get_hdfs_file_base_url() + "/" + \
            Environment().get_base_hdfs_location()
    else:
        self.file_path_prefix = Environment().get_hdfs_file_base_url()
    self.client = LivyClient()
def upload_directory(self, directory_path, archive_directory_data):
    '''Untars the archive_directory_data provided as input, and uploads all the
    contents of the tar to the directory path specified on HDFS.
    '''
    logger.log_info("Uploading the directory to HDFS")
    web_hdfs_url = Environment().get_web_hdfs_url()
    hdfs_file_base_url = Environment().get_hdfs_file_base_url()
    session = SwSessionManager().get_session()
    user_name = session.get_username()
    client = InsecureClient(web_hdfs_url, user_name)
    directory_name_with_path = "/" + directory_path
    directory_name = os.path.split(directory_path)[1]
    try:
        with tempfile.TemporaryDirectory() as temp:
            # Write the archive to disk, extract it, then upload the extracted
            # contents to HDFS.
            local_dir_path = temp + "/" + directory_name + ".tar.gz"
            with open(local_dir_path, "wb") as dir_archive:
                dir_archive.write(archive_directory_data)
            with tarfile.open(local_dir_path, "r:gz") as tar:
                tar.extractall(temp)
            os.remove(local_dir_path)
            response = client.upload(hdfs_path=directory_name_with_path,
                                     local_path=temp)
            logger.log_info(
                "Successfully uploaded the directory {0} to HDFS".format(response))
            return hdfs_file_base_url + directory_name_with_path
    except Exception as e:
        raise ServiceError(
            "Uploading the directory to HDFS failed with the error: {0}".format(
                str(e)))
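# A self-contained sketch (stdlib only) of how the archive_directory_data bytes
# consumed above can be produced: pack a local directory into an in-memory
# .tar.gz. The directory name is hypothetical.
import io
import tarfile

def archive_directory(local_dir):
    """Return the contents of local_dir as .tar.gz bytes."""
    buffer = io.BytesIO()
    with tarfile.open(fileobj=buffer, mode="w:gz") as tar:
        tar.add(local_dir, arcname=".")
    return buffer.getvalue()

# These bytes could then be passed as archive_directory_data.
archive_bytes = archive_directory("./my_config_dir")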
def get_user_principal(username):
    principals = Environment().get_kerberos_principals()
    if principals is not None:
        return principals.get(username)
    else:
        raise AuthenticationError(
            "No Kerberos principal found for the user {}".format(username))
def authenticate(self):
    """
    Checks whether the user is valid.
    """
    try:
        auth_header = self.basic_token
        username, password = decode(auth_header)
        user_principal = None
        allowlisted_users = Environment().get_allowlisted_users()
        if allowlisted_users is not None:
            password_from_allowlist = allowlisted_users.get(username)
            if password_from_allowlist is None or password_from_allowlist != password:
                logger.log_error("Invalid user credentials provided")
                raise AuthenticationError("Invalid user credentials")
        else:
            raise AuthenticationError("No allowlisted users found to authenticate against")
        if Environment().is_kerberos_enabled():
            user_principal = self.get_user_principal(username)
            key_tab_path = Environment().get_hdfs_keytab_file_path()
            logger.log_info(
                "Minting a kerberos ticket for principal {} using keytab {}".format(
                    user_principal, key_tab_path))
            if key_tab_path is None or user_principal is None:
                raise AuthenticationError("Keytab file or kerberos principal missing")
            returncode = KerberosUtil.renew_kinit(key_tab_path, user_principal)
            logger.log_info("kinit return code: " + str(returncode))
        return username, user_principal
    except Exception as e:
        logger.log_exception("Failed while authenticating user", exc_info=True)
        raise AuthenticationError(str(e))
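# The decode() helper used above is not shown in this listing; a plausible
# stdlib-only implementation (an assumption, not the service's actual code)
# splits a "Basic <base64>" header into its username/password pair:
import base64

def decode(auth_header):
    """Decode a Basic auth header into (username, password)."""
    token = auth_header.split(" ", 1)[1] if " " in auth_header else auth_header
    decoded = base64.b64decode(token).decode("utf-8")
    username, _, password = decoded.partition(":")
    return username, password

print(decode("Basic " + base64.b64encode(b"alice:secret").decode("utf-8")))
# -> ('alice', 'secret')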
def __init__(self, record_type, deployment_id):
    self.record_type = RecordType(record_type)
    self.env = Environment()
    config = self.env.get_deployment_config(deployment_id)
    self.database = config.get("database")
    self.schema = config.get("schema")
    self.table = config.get(self.record_type.value + "_table")
def __update_absolute_hdfs_file_path(file_name_with_path):
    param_dict = {"hdfs": Environment().get_base_hdfs_location()}
    if file_name_with_path is not None:
        if file_name_with_path.startswith("$hdfs"):
            # Substitute the $hdfs placeholder with the configured base location.
            replaced_file_path = Template(file_name_with_path)
            replaced_file_path = replaced_file_path.substitute(param_dict)
            return replaced_file_path
        elif file_name_with_path.startswith("/"):
            file_name_with_path = Environment().get_base_hdfs_location() + \
                "/" + file_name_with_path[1:]
        else:
            file_name_with_path = Environment().get_base_hdfs_location() + \
                "/" + file_name_with_path
    return file_name_with_path
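# A quick, self-contained illustration of the $hdfs substitution performed
# above; the base location value is a hypothetical stand-in for
# Environment().get_base_hdfs_location().
from string import Template

param_dict = {"hdfs": "/user/openscale"}
print(Template("$hdfs/configuration/12345/drift").substitute(param_dict))
# -> /user/openscale/configuration/12345/drift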
def download_directory(self, directory_url):
    '''Downloads the directory from remote HDFS to local, archives it and
    returns a tar.gz archive of the directory.'''
    logger.log_info("Downloading the directory {0} ".format(directory_url))
    # Remove the base url from the absolute directory path provided as parameter.
    # For example, if the absolute path is hdfs://alpha:9000/configuration/12345/drift,
    # the statement below will return /configuration/12345/drift
    directory_name_with_path = urllib3.util.parse_url(directory_url).path
    directory_name = os.path.split(directory_name_with_path)[1]
    web_hdfs_url = Environment().get_web_hdfs_url()
    session = SwSessionManager().get_session()
    user_name = session.get_username()
    client = InsecureClient(web_hdfs_url, user_name)
    try:
        with tempfile.TemporaryDirectory() as temp:
            client.download(hdfs_path=directory_name_with_path,
                            local_path=temp,
                            n_threads=5)
            tmp_archive = os.path.join(temp)
            data = io.BytesIO()
            with open(shutil.make_archive(tmp_archive, 'gztar', temp), "rb") as output_data:
                data.write(output_data.read())
            data.seek(0)
            return send_file(data,
                             as_attachment=True,
                             attachment_filename=directory_name + ".tar.gz")
    except Exception as e:
        raise ServiceError(
            "Downloading the folder from HDFS failed with the error: {0}".format(
                str(e)))
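# A stdlib-only sketch of the archive-and-buffer pattern used above: archive a
# directory with shutil.make_archive and load the result into an in-memory
# buffer. (The actual route additionally wraps the buffer in Flask's send_file;
# the sample file written here is purely illustrative.)
import io
import os
import shutil
import tempfile

with tempfile.TemporaryDirectory() as temp:
    with open(os.path.join(temp, "sample.txt"), "w") as f:
        f.write("downloaded content")
    # Writes <temp>.tar.gz containing the contents of temp and returns its path.
    archive_path = shutil.make_archive(temp, "gztar", temp)
    data = io.BytesIO()
    with open(archive_path, "rb") as output_data:
        data.write(output_data.read())
    data.seek(0)
    os.remove(archive_path)
    print(len(data.getvalue()), "bytes archived")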
def __add_additional_parameters_in_run_request(run_request_json):
    conf = run_request_json.get("conf")
    if conf is None:
        run_request_json["conf"] = {}
    # Add archives to the job payload
    if Environment().get_wos_env_archive_location() is not None:
        archives = run_request_json.get("archives")
        if archives is None:
            archives = []
        archives.append(Environment().get_wos_env_archive_location())
        run_request_json["archives"] = archives
    if Environment().get_wos_env_site_packages_path() is not None:
        run_request_json["conf"]["spark.yarn.appMasterEnv.PYTHONPATH"] = \
            Environment().get_wos_env_site_packages_path()
        run_request_json["conf"]["spark.executorEnv.PYTHONPATH"] = \
            Environment().get_wos_env_site_packages_path()
    if Environment().is_kerberos_enabled():
        session = SwSessionManager().get_session()
        run_request_json["conf"]["spark.yarn.principal"] = session.get_user_principal()
        run_request_json["conf"]["spark.yarn.keytab"] = \
            Environment().get_spark_yarn_keytab_file_path()
    return run_request_json
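# Hypothetical before/after view of the payload this helper augments. The file
# names, paths and principal below are placeholders, not values taken from the
# service configuration.
run_request_json = {
    "file": "hdfs:///jobs/evaluation_job.py",
    "args": ["--monitoring_run_id", "run-123"],
}
# With kerberos enabled and a wos_env archive configured, the helper would add
# entries along these lines:
# {
#     "archives": ["hdfs:///envs/wos_env.tar.gz"],
#     "conf": {
#         "spark.yarn.appMasterEnv.PYTHONPATH": "wos_env/lib/python3.7/site-packages",
#         "spark.executorEnv.PYTHONPATH": "wos_env/lib/python3.7/site-packages",
#         "spark.yarn.principal": "user@EXAMPLE.COM",
#         "spark.yarn.keytab": "/etc/security/keytabs/user.keytab",
#     },
# }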
def delete_directory(self, directory_url):
    web_hdfs_url = Environment().get_web_hdfs_url()
    session = SwSessionManager().get_session()
    user_name = session.get_username()
    client = InsecureClient(web_hdfs_url, user_name)
    try:
        directory_name_with_path = urllib3.util.parse_url(directory_url).path
        logger.log_info(
            "Deleting the directory {}".format(directory_name_with_path))
        response = client.delete(directory_name_with_path, recursive=True)
        if not response:
            raise ServiceError("Directory {0} doesn't exist".format(
                directory_name_with_path))
        return
    except Exception as e:
        raise ServiceError(
            "Deleting the folder from HDFS failed with the error: {0}".format(
                str(e)))
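# urllib3.util.parse_url (used above and in download_directory) strips the
# scheme and authority, leaving just the HDFS path. Self-contained example:
import urllib3

print(urllib3.util.parse_url("hdfs://alpha:9000/configuration/12345/drift").path)
# -> /configuration/12345/drift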
class DataProvider:

    def __init__(self, record_type, deployment_id):
        self.record_type = RecordType(record_type)
        self.env = Environment()
        config = self.env.get_deployment_config(deployment_id)
        self.database = config.get("database")
        self.schema = config.get("schema")
        self.table = config.get(self.record_type.value + "_table")

    def get_records(self, search_filter, column_filter, order_by, limit, offset):
        conn = None
        cursor = None
        try:
            conn = hive.connect(host=self.env.get_db_hostname(),
                                database=self.database)
            logger.log_debug("Created connection")
            cursor = conn.cursor()
            fields, fields_types = self.__get_fields_types(
                cursor, self.table, column_filter)
            query = self.__get_query(column_filter=column_filter,
                                     table=self.table,
                                     search_filter=search_filter,
                                     order_by=order_by,
                                     limit=limit,
                                     offset=offset)
            logger.log_info("Executing query: " + query)
            cursor.execute(query)
            rows = cursor.fetchall()
            values = self.__get_values(rows=rows,
                                       fields=fields,
                                       fields_types=fields_types)
            return {"fields": fields, "values": values}
        except Exception as e:
            logger.log_error(
                "Failed while fetching data from database with error: " + str(e))
            raise e
        finally:
            if cursor:
                cursor.close()
            if conn:
                conn.close()

    def __get_values(self, rows, fields, fields_types):
        values = []
        for row in rows:
            value = list(row)
            for i in range(len(value)):
                if fields_types.get(fields[i]) == "timestamp":
                    value[i] = value[i].strftime("%Y-%m-%dT%H:%M:%S.%fZ")
                if fields_types.get(fields[i]) == "binary":
                    value[i] = value[i].decode("utf-8")
            values.append(value)
        return values

    def __get_query(self, column_filter, table, search_filter, order_by, limit, offset):
        if column_filter:
            query = "SELECT " + column_filter + " FROM " + table
        else:
            query = "SELECT * FROM " + table
        if search_filter:
            filters = search_filter.split(",")
            conditions = []
            for f in filters:
                fs = f.split(":")
                if fs[1] == "eq":
                    conditions.append("{}='{}'".format(fs[0], fs[2]))
                elif fs[1] == "in":
                    values = ",".join(
                        ["'{}'".format(x) for x in fs[2].split(",")])
                    conditions.append("{} IN ({})".format(fs[0], values))
            if conditions:
                query = "{} WHERE {}".format(query, " AND ".join(conditions))
        if order_by:
            fs = order_by.split(":")
            query = "{} ORDER BY {} {}".format(query, fs[0], fs[1].upper())
        limit = min(limit, 100)
        return "{} LIMIT {} OFFSET {}".format(query, limit, offset)

    def __get_fields_types(self, cursor, table, column_filter):
        cursor.execute("describe {}".format(table))
        columns = cursor.fetchall()
        fields_types = {c[0]: c[1] for c in columns}
        fields = [c[0] for c in columns]
        if column_filter:
            fields = column_filter.split(",")
        return fields, fields_types
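# Reading __get_query, a call such as the following (all names illustrative)
#   provider._DataProvider__get_query(column_filter="scoring_id,prediction",
#                                     table="payload_table",
#                                     search_filter="prediction:eq:Risk",
#                                     order_by="scoring_timestamp:desc",
#                                     limit=10, offset=0)
# should produce:
#   SELECT scoring_id,prediction FROM payload_table WHERE prediction='Risk'
#   ORDER BY scoring_timestamp DESC LIMIT 10 OFFSET 0
# Note that LIMIT is capped at 100 by the min(limit, 100) call.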