def create_report(data: pd.DataFrame) -> pd.DataFrame:
    avg_score = int(
        data["score_label"].sum()
        + randint(-2 * len(data.columns), 2 * len(data.columns))
    )
    log_metric("Column Count", len(data.columns))
    log_metric("Avg Score", avg_score)
    log_dataframe("ready_data", data, with_histograms=True)
    return pd.DataFrame(data=[[avg_score]], columns=["avg_score"])
def monitor_redshift_table(**op_kwarg):
    """Redshift table monitor collects the following metrics:
    - record count
    - duplicate records
    - Null/NaN record counts in each column
    - mean, median, min, max, std of each numeric column
    """
    hook = PostgresHook(REDSHIFT_CONNECTION_ID)
    data = hook.get_pandas_df(SELECT_DATA, parameters=[REDSHIFT_MONITOR_TABLE_LIMIT])

    log_dataframe(
        "{}".format(REDSHIFT_TABLE),
        data,
        with_histograms=True,
        with_stats=True,
        with_schema=True,
    )
    log_metric("record count", data.shape[0])
    log_metric("Duplicate records", data.shape[0] - data.drop_duplicates().shape[0])

    for column in data.columns:
        log_metric("{} null record count".format(column), int(data[column].isna().sum()))
        if issubdtype(data[column].dtype, number):
            log_metric("{} mean".format(column), round(data[column].mean(), 2))
            log_metric("{} median".format(column), data[column].median())
            log_metric("{} min".format(column), data[column].min())
            log_metric("{} max".format(column), data[column].max())
            log_metric("{} std".format(column), round(data[column].std(), 2))
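# A minimal Airflow wiring sketch for the monitor above. The DAG id, schedule and
# start date are illustrative assumptions; REDSHIFT_CONNECTION_ID, SELECT_DATA,
# REDSHIFT_TABLE and REDSHIFT_MONITOR_TABLE_LIMIT are expected to be defined
# alongside the monitor function.
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG(
    dag_id="redshift_table_monitor",        # assumed name
    schedule_interval=timedelta(hours=1),   # assumed schedule
    start_date=datetime(2021, 1, 1),
    catchup=False,
) as dag:
    monitor_task = PythonOperator(
        task_id="monitor_redshift_table",
        python_callable=monitor_redshift_table,
    )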
def stub(stage, test_df=None):
    # type: (str, pd.DataFrame) -> None
    # if random.randint(0, 1):
    #     raise Exception("brrrr")
    log_metric(stage, utcnow())
    log_dataframe("df_" + stage, test_df)
def validate_model_for_customer(
    model: ElasticNet,
    validation_dataset: pd.DataFrame,
    threshold=0.2,
    target_date: datetime.date = None,
) -> Tuple[str, figure.Figure]:
    log_dataframe("validation", validation_dataset)

    # support for py3 parquet
    validation_dataset = validation_dataset.rename(str, axis="columns")
    validation_x = validation_dataset.drop([TARGET_LABEL], axis=1)
    validation_y = validation_dataset[[TARGET_LABEL]]

    prediction = model.predict(validation_x)
    (rmse, mae, r2) = calculate_metrics(
        validation_y, prediction, target_date=target_date, additional_name="_validate"
    )

    fig = create_scatter_plot(validation_y, prediction)
    # if r2 < threshold:
    #     raise Exception(
    #         "Model quality is below threshold. Got R2 equal to %s, expect at least %s"
    #         % (r2, threshold)
    #     )
    return "%s,%s,%s" % (rmse, mae, r2), fig
def prepare_data() -> Tuple[DataFrame, DataFrame]:
    """load dataset from sklearn. split into training and testing sets"""
    raw_data = datasets.load_diabetes()

    # create a pandas DataFrame from sklearn dataset
    df = DataFrame(raw_data["data"], columns=raw_data["feature_names"])
    df["target"] = Series(raw_data["target"])

    # split the data into training and testing sets
    training_data, testing_data = train_test_split(df, test_size=0.25)

    # use DBND logging features to log DataFrames with histograms
    log_dataframe(
        "training data",
        training_data,
        with_histograms=True,
        with_schema=True,
        with_stats=True,
    )
    log_dataframe("testing_data", testing_data)

    # use DBND logging features to log the mean of s1
    log_metric("mean s1", training_data["s1"].mean())

    return training_data, testing_data
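# A minimal sketch of running the step above under DBND tracking: decorating a
# wrapper with dbnd's `task` decorator attaches the log_dataframe/log_metric calls
# to the tracked run (the wrapper name below is an illustrative assumption).
from dbnd import task


@task
def prepare_data_task() -> Tuple[DataFrame, DataFrame]:
    return prepare_data()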
def histogram_test(input_file, app_name, stats):
    execution_time = None
    if stats:
        app_name += "_with_stats"
    app_name += "-" + os.path.basename(input_file)

    spark = SparkSession.builder.appName(app_name).getOrCreate()
    try:
        if input_file.endswith(".csv"):
            df = spark.read.csv(input_file, inferSchema=True, header=True, sep=",")
        elif input_file.endswith(".parquet"):
            df = spark.read.parquet(input_file)
        else:
            print("not supported file type: {}".format(input_file))
            return

        start_time = time.time()
        log_dataframe("df", df, with_histograms=True, with_stats=stats)
        execution_time = time.time() - start_time
    finally:
        spark.stop()

    create_test_report(input_file, app_name, execution_time)
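# `create_test_report` is referenced above but not defined in this snippet; a
# hypothetical minimal version could simply record the run parameters and the
# histogram-logging time as metrics (names below are assumptions):
import os

from dbnd import log_metric


def create_test_report(input_file, app_name, execution_time):
    log_metric("input_file", os.path.basename(input_file))
    log_metric("app_name", app_name)
    log_metric("histogram_logging_time_sec", execution_time)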
def run_create_report(input_path, output_path):
    data = pd.read_csv(input_path)
    log_dataframe(
        "data",
        data,
        path=input_path,
        with_histograms=True,
        operation_type=DbndTargetOperationType.read,
    )
    create_report(data).to_csv(output_path, index=False)
    return output_path
def split_data(
    raw_data: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    raw_data.drop(["id", "1_norm", "10_norm"], axis=1, inplace=True, errors="ignore")
    log_dataframe("raw", raw_data, with_histograms=True)

    train_df, test_df = train_test_split(raw_data)
    test_df, validation_df = train_test_split(test_df, test_size=0.5)
    return train_df, test_df, validation_df
def clean_pii(
    data: pd.DataFrame, pii_columns: List[str], target_date: datetime.date = None
) -> pd.DataFrame:
    # I am not sure about this code, but this might help
    if target_date and target_date >= datetime.date(2020, 7, 12):
        if "10" not in data.columns:
            log_metric("Fixed columns", ["10"])
            data["10"] = 0

    data[pii_columns] = data[pii_columns].apply(lambda x: x.apply(get_hash_for_obj), axis=1)
    log_metric("PII items removed:", len(pii_columns) * data.shape[0])
    log_dataframe("pii_clean", data)
    return data
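# `get_hash_for_obj` is referenced above but not defined in this snippet; a
# hypothetical implementation could replace each PII value with the SHA-256 digest
# of its string form (purely illustrative, not the project's actual helper):
import hashlib


def get_hash_for_obj(obj) -> str:
    return hashlib.sha256(str(obj).encode("utf-8")).hexdigest()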
def run_clean_piis(input_path, output_path, pii_columns, target_date_str=None):
    # guard against the default None so strptime is only called on a real date string
    target_date = (
        datetime.datetime.strptime(target_date_str, "%Y-%m-%d").date()
        if target_date_str
        else None
    )
    data = pd.read_csv(input_path)
    log_dataframe(
        "data",
        data,
        path=input_path,
        with_histograms=True,
        operation_type=DbndTargetOperationType.read,
    )
    clean_pii(data=data, pii_columns=pii_columns, target_date=target_date).to_csv(
        output_path, index=False
    )
    return output_path
def prepare_data(raw_data: DataFrame) -> Tuple[DataFrame, DataFrame, DataFrame]:
    """Split data into train, test and validation"""
    train_df, test_df = train_test_split(raw_data)
    test_df, validation_df = train_test_split(test_df, test_size=0.5)

    sys.stderr.write("Running Prepare Data! You'll see this message in task log \n")
    print("..and this one..\n")
    logger.info("..and this one for sure!")

    log_dataframe("raw", raw_data)
    return train_df, test_df, validation_df
def split_data(
    raw_data: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    columns_to_remove = set(["id", "0_norm", "10_norm"])
    if columns_to_remove.issubset(raw_data.columns):
        raw_data.drop(columns_to_remove, axis=1, inplace=True)

    train_df, test_df = train_test_split(raw_data)
    test_df, validation_df = train_test_split(test_df, test_size=0.5)

    log_dataframe("raw", raw_data)
    log_metric("target.mean", raw_data["target"].mean())
    log_metric("target.std", raw_data["target"].std())
    return train_df, test_df, validation_df
def validate_model(model: ElasticNet, validation_dataset: DataFrame) -> str:
    log_dataframe("validation", validation_dataset)

    validation_x = validation_dataset.drop(["quality"], axis=1)
    validation_y = validation_dataset[["quality"]]

    prediction = model.predict(validation_x)
    rmse = np.sqrt(mean_squared_error(validation_y, prediction))
    mae = mean_absolute_error(validation_y, prediction)
    r2 = r2_score(validation_y, prediction)

    log_metric("rmse", rmse)
    log_metric("mae", mae)
    log_metric("r2", r2)
    return "%s,%s,%s" % (rmse, mae, r2)
def validate_model(model: ElasticNet, validation_dataset: DataFrame) -> str:
    """Calculates metrics of wine prediction model"""
    log_dataframe("validation", validation_dataset)

    # support for py3 parquet
    validation_dataset = validation_dataset.rename(str, axis="columns")
    validation_x = validation_dataset.drop(["quality"], axis=1)
    validation_y = validation_dataset[["quality"]]

    prediction = model.predict(validation_x)
    (rmse, mae, r2) = calculate_metrics(validation_y, prediction)

    log_metric("rmse", rmse)
    log_metric("mae", mae)
    log_metric("r2", r2)
    return "%s,%s,%s" % (rmse, mae, r2)
def word_count_inline(text=parameter.csv[spark.DataFrame], counters=output.txt.data):
    # type: (spark.DataFrame, Target) -> spark.DataFrame
    from operator import add

    from dbnd_spark.spark import get_spark_session

    lines = text.rdd.map(lambda r: r[0])
    counts = lines.flatMap(lambda x: x.split(" ")).map(lambda x: (x, 1)).reduceByKey(add)
    counts.saveAsTextFile(str(counters))

    output = counts.collect()
    for (word, count) in output:
        print("%s: %i" % (word, count))

    counts_df = get_spark_session().createDataFrame(counts)
    log_dataframe("counts_df", counts_df)
    log_metric("test", 1)
    return counts_df
def run(self):
    validation = self.validation_dataset
    self.log_metric("test_size", len(validation))

    actual_model = self.model.read_pickle()
    logger.info("%s", validation.shape)

    validation_x = validation.drop(["quality"], axis=1)
    validation_y = validation[["quality"]]

    prediction = actual_model.predict(validation_x)
    (rmse, mae, r2) = calculate_metrics(validation_y, prediction)

    log_dataframe("validation", validation)
    log_metric("rmse", rmse)
    log_metric("mae", mae)
    log_metric("r2", r2)
    self.model_metrics.write("%s,%s,%s" % (rmse, mae, r2))
def word_count(input_path, output_path):
    spark = SparkSession.builder.appName("PythonWordCount").getOrCreate()
    lines = spark.read.text(input_path)

    check = Check(spark, CheckLevel.Warning, "Review Check")
    # the check should run before the analysis runner because it spins up the Java Gateway server
    check_result = (
        VerificationSuite(spark)
        .onData(lines)
        .addCheck(check.hasSize(lambda x: x >= 3))
        .run()
    )

    # "name" will be used as a prefix in the metric key
    result_key = ResultKey(spark, ResultKey.current_milli_time(), {"name": "words_df"})
    AnalysisRunner(spark).onData(lines).addAnalyzer(
        ApproxCountDistinct("value")
    ).useRepository(DbndMetricsRepository(spark)).saveOrAppendResult(result_key).run()

    log_dataframe("lines", lines)
    lines = lines.rdd.map(lambda r: r[0])
    log_dataframe("lines_rdd", lines)

    counts = lines.flatMap(lambda x: x.split(" ")).map(lambda x: (x, 1)).reduceByKey(add)
    # counts.saveAsTextFile(output_path)
    output = counts.collect()
    log_dataframe("output", output)
    for (word, count) in output:
        print("%s: %i" % (word, count))

    # closing the Spark session causes trouble when the job is submitted on Databricks!
    # spark.close()
    # The Java gateway should be closed; if it is not, the process won't quit.
    spark.sparkContext._gateway.close()
def track_database():
    engine = create_engine(DB_CONNECTION)
    log_metric("query executed", QUERY)

    with engine.connect() as connection:
        result = connection.execute(QUERY).keys()
        header = [row for row in result]
        result = connection.execute(QUERY)
        data = [row for row in result]

    df = pd.DataFrame(data, columns=header)
    log_dataframe(
        "DataFrame",
        df,
        with_histograms=True,
        with_schema=True,
        with_size=True,
        with_stats=True,
        with_preview=True,
    )
    log_metric("row_count", df.shape[0])
    log_metric("column_count", df.shape[1])
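# DB_CONNECTION and QUERY are expected to be defined at module level; the values
# below are purely illustrative assumptions (any SQLAlchemy connection string and
# query would work with create_engine above):
DB_CONNECTION = "postgresql+psycopg2://user:password@localhost:5432/exampledb"
QUERY = "SELECT * FROM example_table LIMIT 1000"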
def validate_model_for_customer(
    model: ElasticNet, validation_dataset: pd.DataFrame, threshold=0.2
) -> Tuple[str, figure.Figure]:
    log_dataframe("validation", validation_dataset)

    # support for py3 parquet
    validation_dataset = validation_dataset.rename(str, axis="columns")
    validation_x = validation_dataset.drop(["target"], axis=1)
    validation_y = validation_dataset[["target"]]

    prediction = model.predict(validation_x)
    (rmse, mae, r2) = calculate_metrics(validation_y, prediction)
    log_metric("rmse", rmse)
    log_metric("mae", mae)
    log_metric("r2", r2)

    fig = _create_scatter_plot(validation_y, prediction)
    if r2 < threshold:
        raise Exception(
            "Model quality is below threshold. Got R2 equal to %s, expect at least %s"
            % (r2, threshold)
        )
    return "%s,%s,%s" % (rmse, mae, r2), fig
def dedup_records(
    data: DataFrame,
    key_columns: list,
    to_pandas: bool,
    with_histograms: bool,
    sampling_type: str,
    sampling_fraction: float,
) -> Tuple[DataFrame, tuple]:
    data = data.dropDuplicates(key_columns)

    if sampling_type is not None:
        if sampling_type == "random":
            data = data.sample(False, sampling_fraction)
        if sampling_type == "first":
            data = data.limit(int(data.count() * sampling_fraction))

    inputs_shape = (data.count(), len(data.columns))
    if to_pandas:
        log_dataframe("data", data.toPandas(), with_histograms=with_histograms)
    else:
        log_dataframe("data", data, with_histograms=with_histograms)
    return data, inputs_shape
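# A minimal usage sketch for the dedup step above; the Spark app name, input path
# and key column are illustrative assumptions:
from dbnd import log_metric
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("dedup_example").getOrCreate()
raw = spark.read.csv("data/records.csv", header=True, inferSchema=True)
deduped, deduped_shape = dedup_records(
    raw,
    key_columns=["id"],
    to_pandas=False,
    with_histograms=True,
    sampling_type="random",
    sampling_fraction=0.1,
)
log_metric("deduped shape", deduped_shape)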
def word_count(input_path, output_path):
    spark = SparkSession.builder.appName("PythonWordCount").getOrCreate()

    lines = spark.read.text(input_path)
    log_dataframe("lines", lines)

    lines = lines.rdd.map(lambda r: r[0])
    log_dataframe("lines_rdd", lines)

    counts = lines.flatMap(lambda x: x.split(" ")).map(lambda x: (x, 1)).reduceByKey(add)
    counts.saveAsTextFile(output_path)

    output = counts.collect()
    log_dataframe("output", output)
    for (word, count) in output:
        print("%s: %i" % (word, count))
def preprocess(
    raw_data: pd.DataFrame = sagemaker_data_repo.amazon_reviews_raw_data,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Preprocesses data based on business logic

    - Reads the delimited file passed as s3_url and preprocesses the data by
      filtering the long tail in the customer ratings data, i.e. keeps customers
      who have rated 5 or more videos and videos rated by 10 or more customers
    - The preprocessed data is then written to the output
    """
    # limit dataframe to customer_id, product_id, and star_rating
    # `product_title` will be useful for validating recommendations
    df = raw_data[["customer_id", "product_id", "star_rating", "product_title"]]

    # clean out the long tail because most people haven't seen most videos,
    # and people rate fewer videos than they actually watch
    customers = df["customer_id"].value_counts()
    products = df["product_id"].value_counts()

    # based on data exploration, only about 5% of customers have rated 5 or
    # more videos, and only 25% of videos have been rated by 10 or more customers
    customers = customers[customers >= 5]
    products = products[products >= 10]

    log_dataframe("Original data shape", df)
    reduced_df = df.merge(pd.DataFrame({"customer_id": customers.index})).merge(
        pd.DataFrame({"product_id": products.index})
    )
    log_dataframe("Shape after removing long tail", reduced_df)
    reduced_df = reduced_df.drop_duplicates(["customer_id", "product_id"])
    log_dataframe("Shape after removing duplicates", reduced_df)

    # recreate customer and product lists since there are customers with
    # more than 5 reviews, but all of their reviews are on products with
    # less than 5 reviews (and vice versa)
    customers = reduced_df["customer_id"].value_counts()
    products = reduced_df["product_id"].value_counts()

    # sequentially index each user and item to hold the sparse format where
    # the indices indicate the row and column in our ratings matrix
    customer_index = pd.DataFrame(
        {"customer_id": customers.index, "customer": np.arange(customers.shape[0])}
    )
    product_index = pd.DataFrame(
        {"product_id": products.index, "product": np.arange(products.shape[0])}
    )
    reduced_df = reduced_df.merge(customer_index).merge(product_index)

    nb_customer = reduced_df["customer"].max() + 1
    nb_products = reduced_df["product"].max() + 1
    feature_dim = nb_customer + nb_products
    log_metric("features(customer,product,total)", (nb_customer, nb_products, feature_dim))
    # print(nb_customer, nb_products, feature_dim)

    product_df = reduced_df[["customer", "product", "star_rating"]]

    # split into train, validation and test data sets
    train_df, validate_df, test_df = np.split(
        product_df.sample(frac=1),
        [int(0.6 * len(product_df)), int(0.8 * len(product_df))],
    )

    log_metric("# of rows train", train_df.shape[0])
    log_metric("# of rows test", test_df.shape[0])
    log_metric("# of rows validation", validate_df.shape[0])

    # select the columns required for training the model, excluding
    # "customer_id", "product_id" and "product_title" to keep the files small
    cols = ["customer", "product", "star_rating"]
    train_df = train_df[cols]
    validate_df = validate_df[cols]
    test_df = test_df[cols]
    return train_df, test_df, validate_df
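# A possible follow-up sketch (output locations are assumptions): persist the
# returned splits so they can be uploaded as SageMaker train/validation/test
# channels; note the return order above is (train, test, validate).
train_df, test_df, validate_df = preprocess()
train_df.to_csv("train/train.csv", index=False, header=False)
validate_df.to_csv("validation/validation.csv", index=False, header=False)
test_df.to_csv("test/test.csv", index=False, header=False)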
# from __future__ import print_function
import sys

from operator import add

from pyspark.sql import SparkSession

from dbnd import log_dataframe, log_metric

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: wordcount <file> <output>")
        sys.exit(-1)

    spark = SparkSession.builder.appName("PythonWordCount").getOrCreate()

    lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])
    counts = lines.flatMap(lambda x: x.split(" ")).map(lambda x: (x, 1)).reduceByKey(add)
    counts.saveAsTextFile(sys.argv[2])

    output = counts.collect()
    log_dataframe("output", output)
    for (word, count) in output:
        print("%s: %i" % (word, count))

    log_metric("output_len", len(output))
def filter_partner(data, partner):
    # type: (DataFrame, int) -> DataFrame
    partner_data = data[data["partner"] == partner]
    log_dataframe("partner_%s" % partner, partner_data)
    return partner_data
def monitor_S3_bucket(**context):
    '''
    This S3 monitor takes a naive approach and is not suitable for large buckets.

    The S3 monitor will log metrics for the target bucket, collecting the following:
    - Total bucket size (GB)
    - Largest key name
    - Largest key size (MB)
    - Pandas DataFrame with the following metrics on each object inside the bucket:
        - key
        - size (MB)
        - last modified timestamp
    '''
    MB = 1048576
    GB = 1073741824
    bucket_name = context["bucket_name"]

    s3_hook = S3Hook(aws_conn_id=AWS_CONN_ID)
    bucket = s3_hook.get_bucket(bucket_name)

    bucket_info = {
        "{}-key".format(bucket_name): [],
        "{}-size(MB)".format(bucket_name): [],
        "{}-last_modified".format(bucket_name): [],
    }
    bucket_size = 0

    # WARNING: bucket.objects.all() returns objects recursively and can affect performance on large buckets
    for s3_object in bucket.objects.all():
        bucket_size += s3_object.size
        bucket_info["{}-key".format(bucket_name)].append(s3_object.key)
        bucket_info["{}-size(MB)".format(bucket_name)].append(s3_object.size / MB)
        bucket_info["{}-last_modified".format(bucket_name)].append(s3_object.last_modified)

    bucket_info_df = pd.DataFrame(bucket_info)
    num_objects = bucket_info_df.shape[0]

    largest_key_size = max(bucket_info["{}-size(MB)".format(bucket_name)])
    largest_key_size_idx = bucket_info["{}-size(MB)".format(bucket_name)].index(largest_key_size)
    largest_key_name = bucket_info["{}-key".format(bucket_name)][largest_key_size_idx]

    log_metric("{}-largest_key_size_(MB)".format(bucket_name), largest_key_size)
    log_metric("{}-largest_key_name".format(bucket_name), largest_key_name)
    log_dataframe(
        "{}-full_bucket_information".format(bucket_name),
        bucket_info_df,
        with_histograms=True,
        with_stats=True,
        with_schema=True,
        path="s3://{}".format(bucket_name),
    )
    log_metric("{}-total_bucket_size(GB)".format(bucket_name), bucket_size / GB)
    log_metric("{}-number_of_objects".format(bucket_name), num_objects)

    key_metrics = {
        "{}-largest_key_size(MB)".format(bucket_name): largest_key_size,
        "{}-largest_key_name".format(bucket_name): largest_key_name,
        "{}-total_bucket_size".format(bucket_name): bucket_size / GB,
        "{}-number_of_objects".format(bucket_name): num_objects,
    }
    return key_metrics
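# A hypothetical Airflow wiring for the S3 monitor above; the bucket name, DAG id
# and schedule are illustrative assumptions. `bucket_name` reaches the callable
# through op_kwargs, matching the context["bucket_name"] lookup above.
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG(
    dag_id="s3_bucket_monitor",
    schedule_interval="@daily",
    start_date=datetime(2021, 1, 1),
    catchup=False,
) as dag:
    monitor_bucket = PythonOperator(
        task_id="monitor_S3_bucket",
        python_callable=monitor_S3_bucket,
        op_kwargs={"bucket_name": "my-example-bucket"},
    )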
def filter_by_id(data, partner_id):
    # type: (DataFrame, int) -> DataFrame
    partner_data = data[data["partner"] == partner_id]
    log_dataframe("partner_%s" % partner_id, partner_data)
    return partner_data
def join_data(raw_data: List[pd.DataFrame]) -> pd.DataFrame:
    result = raw_data.pop(0)
    for d in raw_data:
        result = result.merge(d, on="id")
    # log_dataframe expects a name for the logged dataframe as its first argument
    log_dataframe("joined_data", result)
    return result
def dedup_records(data: DataFrame, key_columns=["name"]) -> DataFrame:
    data = data.dropDuplicates(key_columns)
    log_dataframe("data", data, with_histograms=True)
    return data
def calculate_features(
    data: pd.DataFrame,
    selected_features: List[str] = None,
    data_path: PathStr = None,
) -> pd.DataFrame:
    log_dataframe("data_path", data, with_histograms=True, path=data_path)
    data = data[selected_features]
    return data