def get_meta_table(meta_table: str, aws_env: AWSENV = AWSENV.STG.value, user="******", edd: bool = False) -> Dict[str, Any]:
    """
    Get information of a meta_table

    Args. :
        - meta_table : (str) the name of meta_table
        - aws_env : (str) AWS ENV in 'stg / prd' (default is 'stg')
        - user : (str) the name of user (default is 'reco')
        - edd : (bool) True if On-prem env is on EDD (default is False)

    Returns :
        - Dictionary value of meta_table (id / name / description / schema / items / created_at / updated_at)
    """
    assert type(meta_table) == str
    assert type(aws_env) == str

    secret = get_secrets("mls")
    token = secret.get("user_token").get(user)
    url = secret[f"ab_{'onprem_' if edd else ''}{aws_env}_url"]
    url = f"{url}{MLS_META_API_URL}/{meta_table}"

    response = requests.get(url, headers={"Authorization": f"Basic {{{token}}}"}).json()
    results = response.get("results")
    if not results:
        raise MLSModelError(response.get("error"))
    return results

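# Example usage (illustrative; "user_profile" is a hypothetical meta_table name):
#
#   table_info = get_meta_table("user_profile", aws_env=AWSENV.PRD.value, user="reco")
#   print(table_info["schema"])   # field definitions of the meta_table
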
class Hash:
    access_token = get_access_token()
    url = {
        "hash": get_secrets("lake/hash")["hash_url"],
        "unhash": get_secrets("lake/hash")["unhash_url"],
    }

    @classmethod
    def renew_token(cls):
        cls.access_token = get_access_token()

    @classmethod
    def make_headers(cls):
        return {
            "Authorization": f"Bearer {cls.access_token}",
        }

    @classmethod
    def map_s(cls, values, unhash=False):
        task = hash_task
        if unhash:
            task = unhash_task
        url = cls.url[task.url_key]
        data = {"type": "s", task.input_key: values}
        r = requests.post(url, headers=cls.make_headers(), json=data)
        if r.status_code == 401:
            # The access token may have expired; renew it once and retry.
            cls.renew_token()
            r = requests.post(url, headers=cls.make_headers(), json=data)
        if r.status_code != 200:
            raise Exception(r.content.decode("utf8"))
        return [x[task.output_key] for x in r.json()["response"]]

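# Example usage of Hash.map_s (sketch; the values below are made-up identifiers, not real data):
#
#   hashed = Hash.map_s(["01012345678", "01087654321"])   # hash plain values
#   plain = Hash.map_s(hashed, unhash=True)                # map hashed values back to the originals
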
def get_github_util():
    from skt.github_utils import GithubUtil

    github_token = get_secrets("github/sktaiflow")["token"]
    proxy = get_secrets("proxy")["proxy"]
    proxies = {
        "http": proxy,
        "https": proxy,
    }
    g = GithubUtil(github_token, proxies)
    return g

def publish_relation(source, destination, context=None):
    from datetime import datetime

    msg = {
        "source": source,
        "destination": destination,
        "timestamp": round(datetime.utcnow().timestamp() * 1000),
        "context": context,
    }
    proxies = get_secrets(path="proxies")
    url = get_secrets(path="data_lineage")["url"]
    return requests.post(url, proxies=proxies, json=msg)

def update_meta_table_item(
    meta_table: str,
    item_name: str,
    item_dict: Dict[str, Any],
    aws_env: AWSENV = AWSENV.STG.value,
    user="******",
    edd: bool = False,
) -> None:
    """
    Update a meta_item

    Args. :
        - meta_table : (str) the name of meta_table
        - item_name : (str) the name of meta_item to be updated
        - item_dict : (dict) a dictionary (field-value) of values to upload to or update on the item
        - aws_env : (str) AWS ENV in 'stg / prd' (default is 'stg')
        - user : (str) the name of user (default is 'reco')
        - edd : (bool) True if On-prem env is on EDD (default is False)
    """
    assert type(meta_table) == str
    assert type(item_name) == str
    assert type(item_dict) == dict
    assert type(aws_env) == str

    secret = get_secrets("mls")
    token = secret.get("user_token").get(user)

    # Only fields defined in the meta_table schema are sent; fields missing from item_dict are set to None.
    meta_table_info = get_meta_table(meta_table, aws_env, user, edd)
    values_data = dict()
    for field_name, field_spec in meta_table_info["schema"].items():
        values_data[field_name] = item_dict.get(field_name)

    request_data = dict()
    request_data["name"] = item_name
    request_data["values"] = values_data

    url = secret[f"ab_{'onprem_' if edd else ''}{aws_env}_url"]
    url = f"{url}{MLS_META_API_URL}/{meta_table}/meta_items/{item_name}"

    response = requests.put(url, json=request_data, headers={"Authorization": f"Basic {{{token}}}"}).json()
    results = response.get("results")
    if not results:
        raise MLSModelError(response.get("error"))

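# Example usage (illustrative; the table, item and field names are placeholders):
#
#   update_meta_table_item(
#       meta_table="user_profile",
#       item_name="item_001",
#       item_dict={"age_band": "30s", "favorite_channel": "app"},
#       aws_env=AWSENV.STG.value,
#   )
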
def update_ml_model_meta(
    user: str,
    model_name: str,
    model_version: str,
    model_meta_dict: Dict[str, Any],
    aws_env: AWSENV = AWSENV.STG.value,
    edd: bool = False,
) -> None:
    """
    Update (or create) model_meta

    Args. :
        - user : (str) the name of a MLModel user
        - model_name : (str) the name of MLModel
        - model_version : (str) the version of MLModel
        - model_meta_dict : (dict) the model_meta (key-value) dictionary to create or update
        - aws_env : (str) AWS ENV in 'stg / prd' (default is 'stg')
        - edd : (bool) True if On-prem env is on EDD (default is False)
    """
    assert type(model_name) == str
    assert type(model_version) == str
    assert type(model_meta_dict) == dict
    assert type(aws_env) == str

    url = get_secrets("mls")[f"ab_{'onprem_' if edd else ''}{aws_env}_url"]
    url = f"{url}{MLS_MLMODEL_API_URL}/{model_name}/versions/{model_version}/meta"

    request_data = dict()
    request_data["user"] = user
    request_data["model_meta"] = model_meta_dict

    requests.patch(url, json=request_data).json()

def _bq_table_to_df(dataset, table_name, col_list, partition=None, where=None, spark_session=None):
    import base64

    from skt.vault_utils import get_secrets

    if not spark_session:
        spark_session = get_spark()
    spark_session.conf.set("spark.sql.execution.arrow.enabled", "false")

    key = get_secrets("gcp/sktaic-datahub/dataflow")["config"]
    df = (
        spark_session.read.format("bigquery")
        .option("project", "sktaic-datahub")
        .option("table", f"sktaic-datahub:{dataset}.{table_name}")
        .option("credentials", base64.b64encode(key.encode()).decode())
    )

    if partition:
        # Resolve the partition column (time- or range-partitioned) and push the filter down to BigQuery.
        table = get_bigquery_client().get_table(f"{dataset}.{table_name}")
        if "timePartitioning" in table._properties:
            partition_column_name = table._properties["timePartitioning"]["field"]
            partition_filter = f"{partition_column_name} = '{partition}'"
        elif "rangePartitioning" in table._properties:
            partition_column_name = table._properties["rangePartitioning"]["field"]
            partition_filter = f"{partition_column_name} = {partition}"
        else:
            partition_column_name = None
        if partition_column_name:
            df = df.option("filter", partition_filter)

    df = df.load().select(col_list)
    if where:
        df = df.where(where)
    return df

def get_ml_model_meta(
    user: str, model_name: str, model_version: str, aws_env: AWSENV = AWSENV.STG.value, edd: bool = False
) -> Dict[str, Any]:
    """
    Get the model_meta of a MLModel version

    Args. :
        - user : (str) the name of a MLModel user
        - model_name : (str) the name of MLModel
        - model_version : (str) the version of MLModel
        - aws_env : (str) AWS ENV in 'stg / prd' (default is 'stg')
        - edd : (bool) True if On-prem env is on EDD (default is False)

    Returns :
        - Dictionary value of model_meta
    """
    assert type(user) == str
    assert type(model_name) == str
    assert type(model_version) == str
    assert type(aws_env) == str

    url = get_secrets("mls")[f"ab_{'onprem_' if edd else ''}{aws_env}_url"]
    url = f"{url}{MLS_MLMODEL_API_URL}/{model_name}/versions/{model_version}/meta"

    response = requests.get(url, params={"user": user}).json()
    results = response.get("results")
    if not results:
        raise MLSModelError(
            f"No MLModel for user: {user} / model_name: {model_name} / model_version: {model_version}"
        )
    return results[0].get("model_meta")

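# Example usage (sketch; the model name, version and meta key are placeholders):
#
#   meta = get_ml_model_meta(user="reco", model_name="churn_model", model_version="v1")
#   print(meta.get("auc"))
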
def get_access_token():
    secrets = get_secrets("lake/hash")
    url = secrets["auth_url"]
    client_id = secrets["client_id"]
    client_secret = secrets["client_secret"]
    data = {"grant_type": "client_credentials"}
    res = requests.post(url, auth=(client_id, client_secret), data=data)
    return res.json()["access_token"]

def get_sqlalchemy_engine():
    from sqlalchemy import create_engine

    hiveserver2 = get_secrets(path="ye/hiveserver2")
    host = hiveserver2["ip"]
    port = hiveserver2["port"]
    user = hiveserver2["user"]
    return create_engine(f"hive://{user}@{host}:{port}/tmp")

def _df_to_bq_table(df, dataset, table_name, partition=None, mode="overwrite"):
    import base64

    from skt.vault_utils import get_secrets

    key = get_secrets("gcp/sktaic-datahub/dataflow")["config"]
    table = f"{dataset}.{table_name}${partition}" if partition else f"{dataset}.{table_name}"
    (
        df.write.format("bigquery")
        .option("project", "sktaic-datahub")
        .option("credentials", base64.b64encode(key.encode()).decode())
        .option("table", table)
        .option("temporaryGcsBucket", "temp-seoul-7d")
        .save(mode=mode)
    )

def get_hive_conn():
    from pyhive import hive

    hiveserver2 = get_secrets(path="ye/hiveserver2")
    host = hiveserver2["ip"]
    port = hiveserver2["port"]
    user = hiveserver2["user"]
    conn = hive.connect(host, port=port, username=user)
    return conn

def set_gcp_credentials():
    import os
    import tempfile

    from skt.vault_utils import get_secrets

    key = get_secrets("gcp/sktaic-datahub/dataflow")["config"]
    key_file_name = tempfile.mkstemp()[1]
    with open(key_file_name, "wb") as key_file:
        key_file.write(key.encode())
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_file.name

def slack_send(
    text="This is default text",
    username="******",
    channel="#leavemealone",
    icon_emoji=":large_blue_circle:",
    blocks=None,
    dataframe=False,
):
    import requests

    from skt.vault_utils import get_secrets

    if dataframe:
        from tabulate import tabulate

        text = "```" + tabulate(text, tablefmt="simple", headers="keys") + "```"

    token = get_secrets("slack")["bot_token"]["airflow"]
    proxy = get_secrets("proxy")["proxy"]
    proxies = {
        "http": proxy,
        "https": proxy,
    }
    headers = {
        "Content-Type": "application/json;charset=utf-8",
        "Authorization": f"Bearer {token}",
    }
    json_body = {
        "username": username,
        "channel": channel,
        "text": text,
        "blocks": blocks,
        "icon_emoji": icon_emoji,
    }
    r = requests.post(
        "https://www.slack.com/api/chat.postMessage",
        proxies=proxies,
        headers=headers,
        json=json_body,
    )
    r.raise_for_status()
    if not r.json()["ok"]:
        raise Exception(r.json())

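# Example usage (sketch; the channel name and message are placeholders):
#
#   slack_send(text="Daily batch finished", channel="#data-alerts", username="batch-bot")
#
# With dataframe=True, `text` is expected to be a pandas DataFrame and is posted as a code-block table:
#
#   slack_send(text=report_df, channel="#data-alerts", dataframe=True)
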
def get_table_top_n_columns(n, start_date=None, end_date=None):
    lineage_secrets = get_secrets(DATA_LINEAGE_SECRETS_NAME)
    params = {"top_n": n, "start_date": start_date, "end_date": end_date}
    response = requests.get(lineage_secrets["url_prd"] + "/relationships/queries/top_n/columns", params=params).json()
    return response

def meta_table_to_pandas(meta_table: str, aws_env: AWSENV = AWSENV.STG.value, user="******", edd: bool = False) -> Any:
    """
    Get a meta_table as pandas dataframe

    Args. :
        - meta_table : (str) the name of meta_table
        - aws_env : (str) AWS ENV in 'stg / prd' (default is 'stg')
        - user : (str) the name of user (default is 'reco')
        - edd : (bool) True if On-prem env is on EDD (default is False)

    Returns :
        - A Pandas dataframe type of the item_meta
    """
    assert type(meta_table) == str
    assert type(aws_env) == str

    secret = get_secrets("mls")
    token = secret.get("user_token").get(user)
    url = secret[f"ab_{'onprem_' if edd else ''}{aws_env}_url"]
    url = f"{url}{MLS_META_API_URL}/{meta_table}"

    response = requests.get(url, headers={"Authorization": f"Basic {{{token}}}"}).json()
    if not response.get("results"):
        raise MLSModelError(f"No meta_table '{meta_table}' exists on AWS {aws_env}")

    # Flatten the meta_items into a dataframe: one row per item, "name" plus one column per value field.
    items = response["results"]["items"]
    records = pd.DataFrame.from_records(items)
    key = records["name"]
    values = pd.DataFrame.from_records(records["values"])
    df = pd.concat([key, values], axis=1)
    return df

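# Example usage (illustrative; "user_profile" is a hypothetical meta_table):
#
#   df = meta_table_to_pandas("user_profile", aws_env=AWSENV.STG.value)
#   df.head()   # one row per meta_item, "name" plus one column per value field
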
def pandas_to_meta_table(
    method: str,
    meta_table: str,
    df: pd.DataFrame,
    key: str,
    values: list,
    aws_env: AWSENV = AWSENV.STG.value,
    user="******",
    edd: bool = False,
) -> None:
    """
    Create or Update items of a meta_table from a Pandas Dataframe

    Args. :
        - method : (str) requests method 'create' or 'update'
        - meta_table : (str) MLS meta table name
        - df : (pd.DataFrame) input table
        - key : (str) key column in dataframe
        - values : (list) Dataframe columns for input
        - aws_env : (str) AWS ENV in 'stg / prd' (default is 'stg')
        - user : (str) the name of user (default is 'reco')
        - edd : (bool) True if On-prem env is on EDD (default is False)
    """
    assert type(aws_env) == str
    assert method in ["create", "update"]
    assert type(meta_table) == str
    assert type(df) == pd.core.frame.DataFrame
    assert type(key) == str
    assert type(values) == list

    url = get_secrets("mls")[f"ab_{'onprem_' if edd else ''}{aws_env}_url"]
    url = f"{url}{MLS_META_API_URL}/{meta_table}/meta_items"

    def to_json(x):
        # Build one meta_item payload per dataframe row: name from the key column, values from the value columns.
        insert_dict = {}
        insert_dict["name"] = x[key]
        insert_dict["values"] = {}
        for value in values:
            insert_dict["values"][value] = x[value]
        return insert_dict

    json_series = df.apply(lambda x: to_json(x), axis=1)
    for meta in json_series:
        if method == "create":
            create_meta_table_item(meta_table, meta.get("name"), meta.get("values"), aws_env, user, edd)
        else:
            update_meta_table_item(meta_table, meta.get("name"), meta.get("values"), aws_env, user, edd)

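# Example usage (sketch; the dataframe content and table name are made up):
#
#   df = pd.DataFrame({"svc_id": ["a", "b"], "score": [0.1, 0.9]})
#   pandas_to_meta_table("update", "user_profile", df, key="svc_id", values=["score"])
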
def set_model_name(comm_db, params, user="******", edd: bool = False):
    secret = get_secrets("mls")
    token = secret.get("user_token").get(user)

    if comm_db[-3:] == "dev":  # stg
        url = secret["ab_onprem_stg_url"] if edd else secret["ab_stg_url"]
    else:  # prd
        url = secret["ab_onprem_prd_url"] if edd else secret["ab_prd_url"]
    url = f"{url}{MLS_COMPONENTS_API_URL}"

    requests.post(
        url,
        json=params,
        headers={"Authorization": f"Basic {{{token}}}"},
    )

def get_bigquery_client():
    import os
    import tempfile

    from google.cloud import bigquery

    from skt.vault_utils import get_secrets

    if "GOOGLE_APPLICATION_CREDENTIALS" in os.environ and os.path.isfile(os.environ["GOOGLE_APPLICATION_CREDENTIALS"]):
        return bigquery.Client()

    key = get_secrets("gcp/sktaic-datahub/dataflow")["config"]
    with tempfile.NamedTemporaryFile() as f:
        f.write(key.encode())
        f.seek(0)
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = f.name
        client = bigquery.Client()
    return client

def get_mls_meta_table_client(env="stg", user="******"):
    from sktmls.meta_tables.meta_table import MetaTableClient
    from sktmls import MLSENV

    if env == "prd":
        env = MLSENV.PRD
    else:
        env = MLSENV.STG

    secrets = get_secrets(path="mls")
    if user != "reco":
        user_id = secrets.get(f"{user}_id")
        user_pass = secrets.get(f"{user}_pass")
    else:
        user_id = secrets.get("reco_id")
        user_pass = secrets.get("reco_pass")
    if not user_id or not user_pass:
        raise Exception(f"No ID or Password for the user {user}")

    return MetaTableClient(env=env, username=user_id, password=user_pass)

def get_all_recent_model_path(comm_db, user="******", edd: bool = False):
    secret = get_secrets("mls")
    token = secret.get("user_token").get(user)

    if comm_db[-3:] == "dev":  # stg
        url = secret["ab_onprem_stg_url"] if edd else secret["ab_stg_url"]
    else:  # prd
        url = secret["ab_onprem_prd_url"] if edd else secret["ab_prd_url"]
    url = f"{url}{MLS_COMPONENTS_API_URL}"

    response = requests.get(url, headers={"Authorization": f"Basic {{{token}}}"}).json().get("results")
    results = {component.get("name"): component.get("info") for component in response if component.get("is_latest")}
    return results

def search_queries_by_table_id(table_id, **kwargs):
    limit = kwargs.get("limit", 100)
    fuzziness = kwargs.get("fuzziness", "AUTO")
    operator = kwargs.get("operator", "and")
    offset = kwargs.get("offset", None)
    fields = kwargs.get("fields", None)
    must = kwargs.get("must", None)
    sort = kwargs.get("sort", "desc")
    start_date = kwargs.get("start_date", None)
    end_date = kwargs.get("end_date", None)

    secrets = get_secrets(DATA_CATALOG_SECRETS_NAME)
    es_sort = [{"start_time": sort}]
    params = {
        "inputs": table_id,
        "outputs": table_id,
        "limit": limit,
        "fuzziness": fuzziness,
        "offset": offset,
        "operator": operator,
        "fields": fields,
        "must": must,
        "sort": json.dumps(es_sort),
    }
    if start_date or end_date:
        range_filter = {"range": {"start_time": {}}}
        if start_date:
            range_filter["range"]["start_time"]["gte"] = start_date
        if end_date:
            range_filter["range"]["start_time"]["lt"] = end_date
        params["range_filter"] = json.dumps(range_filter)

    return requests.get(secrets["url_prd"] + "/v1/search/processes", params=params).json()

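# Example usage (sketch; the table id and date range are placeholders):
#
#   result = search_queries_by_table_id("my_db.my_table", start_date="2021-01-01", end_date="2021-02-01", limit=50)
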
def get_user_queries(user_name, start_date=None, end_date=None, **kwargs):
    secrets = get_secrets(DATA_CATALOG_SECRETS_NAME)
    default_order = "asc" if (start_date or end_date) else "desc"
    order = kwargs.get("sort", default_order)
    limit = kwargs.get("limit", 100)
    es_sort = [{"start_time": order}]
    es_limit = min(100, limit)

    params = {
        "user_name": user_name,
        "limit": es_limit,
        "sort": json.dumps(es_sort),
    }
    gte = start_date or (datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    lt = end_date or datetime.datetime.now().strftime("%Y-%m-%d")
    range_filter = {"start_time": {"gte": gte, "lt": lt}}
    params["range_filter"] = json.dumps(range_filter)

    total_queries = []
    response = requests.get(secrets["url_prd"] + "/v1/search/processes", params=params).json()
    total_queries.extend(response["user_name"]["hits"])
    total = response["user_name"]["total"]["value"]

    # Page through the results until every hit is fetched or the requested limit is reached.
    while len(total_queries) < total and len(total_queries) < limit:
        params["offset"] = json.dumps(total_queries[-1]["sort"])
        response = requests.get(secrets["url_prd"] + "/v1/search/processes", params=params).json()
        total_queries.extend(response["user_name"]["hits"])
    return total_queries

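# Example usage (illustrative; the user name and date range are placeholders):
#
#   queries = get_user_queries("some_user", start_date="2021-06-01", end_date="2021-06-08", limit=300)
#   print(len(queries))
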
def search_table_by_name(name, **kwargs):
    secrets = get_secrets(DATA_CATALOG_SECRETS_NAME)
    kwargs["name"] = name
    return requests.get(f"{secrets['url_prd']}/v1/search/tables", params=kwargs).json()

def get_queries(source, limit=100):
    secrets = get_secrets(DATA_CATALOG_SECRETS_NAME)
    return requests.get(f"{secrets['url_prd']}/sources/{source}/processes?limit={limit}").json()

def get_source(source):
    secrets = get_secrets(DATA_CATALOG_SECRETS_NAME)
    return requests.get(f"{secrets['url_prd']}/sources/{source}").json()

def get_spark(scale=0, queue=None):
    import os
    import uuid
    import tempfile

    from pyspark.sql import SparkSession

    from skt.vault_utils import get_secrets

    tmp_uuid = str(uuid.uuid4())
    app_name = f"skt-{os.environ.get('USER', 'default')}-{tmp_uuid}"
    if not queue:
        if "JUPYTERHUB_USER" in os.environ:
            queue = "dmig_eda"
        else:
            queue = "airflow_job"

    os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"

    # Write the GCP service-account key to a temp file so the BigQuery connector can use it.
    key = get_secrets("gcp/sktaic-datahub/dataflow")["config"]
    key_file_name = tempfile.mkstemp()[1]
    with open(key_file_name, "wb") as key_file:
        key_file.write(key.encode())
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_file.name

    builder = (
        SparkSession.builder.config("spark.app.name", app_name)
        .config("spark.rpc.message.maxSize", "1024")
        .config("spark.yarn.queue", queue)
        .config("spark.ui.enabled", "false")
        .config("spark.port.maxRetries", "128")
        .config("spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT", "1")
        .config("spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT", "1")
        .config(
            "spark.jars",
            "gs://external_libs/spark/jars/spark-bigquery-with-dependencies_2.11-0.16.1.jar",
        )
    )
    if scale in [1, 2, 3, 4]:
        # Fixed-size resources scaled by `scale`.
        spark = (
            builder.config("spark.driver.memory", f"{scale*8}g")
            .config("spark.executor.memory", f"{scale*3}g")
            .config("spark.executor.instances", f"{scale*8}")
            .config("spark.driver.maxResultSize", f"{scale*4}g")
            .enableHiveSupport()
            .getOrCreate()
        )
    else:
        # Default resources with dynamic allocation (up to 200 executors).
        spark = (
            builder.config("spark.driver.memory", "6g")
            .config("spark.executor.memory", "8g")
            .config("spark.shuffle.service.enabled", "true")
            .config("spark.dynamicAllocation.enabled", "true")
            .config("spark.dynamicAllocation.maxExecutors", "200")
            .config("spark.driver.maxResultSize", "6g")
            .enableHiveSupport()
            .getOrCreate()
        )
    spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    return spark

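# Example usage (sketch; the queue name is taken from the defaults above):
#
#   spark = get_spark(scale=2, queue="dmig_eda")   # scale=2 -> 16g driver, 6g executors x 16 instances
#   spark.sql("SELECT 1").show()
#   spark.stop()
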
def get_columns(source, table_id):
    secrets = get_secrets(DATA_CATALOG_SECRETS_NAME)
    return requests.get(f"{secrets['url_prd']}/sources/{source}/tables/{table_id}/columns").json()

def get_user_data_access(user_name, start_date=None, end_date=None, timeseries=False, **kwargs):
    secrets = get_secrets(DATA_CATALOG_SECRETS_NAME)
    lineage_secrets = get_secrets(DATA_LINEAGE_SECRETS_NAME)
    default_order = "asc" if (start_date or end_date) else "desc"
    order = kwargs.get("sort", default_order)
    limit = kwargs.get("limit", 1000)
    es_sort = [{"start_time": order}]
    es_limit = min(1000, limit)

    params = {
        "user_name": user_name,
        "sort": json.dumps(es_sort),
        "limit": es_limit,
        "fields": json.dumps(["inputs", "outputs"]),
    }
    gte = start_date or (datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    lt = end_date or datetime.datetime.now().strftime("%Y-%m-%d")
    range_filter = {"start_time": {"gte": gte, "lt": lt}}
    params["range_filter"] = json.dumps(range_filter)

    total_queries = []
    response = requests.get(secrets["url_prd"] + "/v1/search/processes", params=params).json()
    total_queries.extend(response["user_name"]["hits"])
    total = response["user_name"]["total"]["value"]

    # Page through the results until every hit is fetched or the requested limit is reached.
    while len(total_queries) < total and len(total_queries) < limit:
        params["offset"] = json.dumps(total_queries[-1]["sort"])
        response = requests.get(secrets["url_prd"] + "/v1/search/processes", params=params).json()
        total_queries.extend(response["user_name"]["hits"])

    result = []
    table_dict = {}
    column_dict = {}
    for each_query in total_queries:
        query_id = each_query["_id"]
        if timeseries:
            inputs = each_query["_source"].get("inputs", [])
            outputs = each_query["_source"].get("outputs", [])
            response = requests.get(
                lineage_secrets["url_prd"] + f"/relationships/queries/query/{query_id}/columns", params=params
            ).json()
            column_list = list(map(lambda each: each["target"], response))
            result.append(
                {
                    "inputs": inputs,
                    "outputs": outputs,
                    "columns": column_list,
                    "start_time": each_query["sort"][0],
                    "query_id": query_id,
                }
            )
        else:
            inputs = each_query["_source"].get("inputs", []) or []
            outputs = each_query["_source"].get("outputs", []) or []
            for each in inputs:
                if each not in table_dict:
                    table_dict[each] = 1
            for each in outputs:
                if each not in table_dict:
                    table_dict[each] = 1
            response = requests.get(
                lineage_secrets["url_prd"] + f"/relationships/queries/query/{query_id}/columns", params=params
            ).json()
            column_list = list(map(lambda each: each["target"], response))
            for each_column in column_list:
                if each_column not in column_dict:
                    column_dict[each_column] = 1

    if timeseries:
        return result
    return {"tables": list(table_dict.keys()), "columns": list(column_dict.keys())}

def get_resource(resource_name, resource_id):
    secrets = get_secrets(DATA_CATALOG_SECRETS_NAME)
    return requests.get(f"{secrets['url_prd']}/v1/resources/{resource_name}/{resource_id}").json()