def stub(stage, test_df=None):
    # type: (str, pd.DataFrame) -> None
    # if random.randint(0, 1):
    #     raise Exception("brrrr")
    log_metric(stage, utcnow())
    log_dataframe("df_" + stage, test_df)
def train_model_spark(
    test_set: parameter(log_histograms=True)[spark.DataFrame],
    training_set: spark.DataFrame,
    alpha: float = 1.0,
    l1_ratio: float = 0.5,
    saved_model=model_output_parameter,
) -> str:
    transform = VectorAssembler(inputCols=SELECTED_FEATURES, outputCol="features")
    lr = LogisticRegression(
        featuresCol="features",
        labelCol=LABEL_COLUMN,
        # Spark's regParam is the regularization strength and
        # elasticNetParam is the L1/L2 mixing ratio
        regParam=alpha,
        elasticNetParam=l1_ratio,
        family="multinomial",
        maxIter=1,
    )
    ppl = Pipeline(stages=[transform, lr])

    # Fit the pipeline to the training documents.
    model = ppl.fit(training_set)

    prediction = model.transform(test_set)
    evaluation = prediction.withColumn(
        "label", prediction["score_label"].cast(DoubleType())
    ).select(["label", "prediction"])
    evaluation.show()

    metrics = RegressionMetrics(evaluation.rdd)
    log_metric("r2", metrics.r2)
    log_metric("alpha", alpha)

    path = str(saved_model)
    model.write().save(path)
    return path
def bash_script(
    script=None,
    check_retcode=0,
    cwd=None,
    env=None,
    dbnd_env=True,
    output_encoding="utf-8",
    popen_kwargs=None,
):
    # type: (str, Optional[int], str, Dict[str, str], bool, str, Dict[str, Any]) -> int
    # we need a working folder to create the bash script in
    task_run = try_get_current_task_run()
    if task_run:
        script_dir = str(task_run.local_task_run_root)
    else:
        # no task run in scope; fall back to a temporary working folder
        import tempfile

        script_dir = tempfile.mkdtemp(prefix="dbnd_bash_")

    bash_script_path = os.path.join(script_dir, "bash_cmd.sh")
    with open(bash_script_path, "wb") as bs:
        bs.write(bytes(script, "utf-8"))
    log_metric("bash_script", bash_script_path)

    logger.info("Bash script location: %s", bash_script_path)
    args = ["bash", bash_script_path]
    return bash_cmd.func(
        args=args,
        check_retcode=check_retcode,
        cwd=cwd or script_dir,
        env=env,
        dbnd_env=dbnd_env,
        output_encoding=output_encoding,
        popen_kwargs=popen_kwargs,
    )
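# Hedged usage sketch: run a throwaway script through bash_script. The script
# body here is hypothetical, and an active dbnd run context is assumed so the
# "bash_script" metric is tracked; the return value is the shell return code.
retcode = bash_script(script="echo 'hello from dbnd'")
assert retcode == 0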
def convert_sparse_matrix(df, nb_rows, nb_customer, nb_products):
    # dataframe to array
    df_val = df.values

    # determine the feature size
    nb_cols = nb_customer + nb_products
    log_metric("# of rows", nb_rows)
    log_metric("# of cols", nb_cols)

    # extract customers and ratings
    df_X = df_val[:, 0:2]

    # features are one-hot encoded in a sparse matrix
    X = lil_matrix((nb_rows, nb_cols)).astype("float32")
    # shift product ids so they index the second half of the feature space
    df_X[:, 1] = nb_customer + df_X[:, 1]
    coords = df_X[:, 0:2].astype(int)  # sparse-matrix indices must be integers
    X[np.arange(nb_rows), coords[:, 0]] = 1
    X[np.arange(nb_rows), coords[:, 1]] = 1

    # create labels from the ratings
    Y = df_val[:, 2].astype("float32")

    # validate size and shape
    logger.info("Shape of X: %s", str(X.shape))
    logger.info("Shape of Y: %s", str(Y.shape))
    assert X.shape == (nb_rows, nb_cols)
    assert Y.shape == (nb_rows,)

    return X, Y
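# A tiny worked example (hypothetical data, pandas imported as pd; columns are
# expected in [customer, product, rating] order with integer ids):
toy = pd.DataFrame({"customer": [0, 1], "product": [0, 1], "rating": [5, 3]})
X, Y = convert_sparse_matrix(toy, nb_rows=2, nb_customer=2, nb_products=2)
# X is a 2x4 one-hot lil_matrix (one customer slot plus one shifted product
# slot set per row); Y is array([5., 3.], dtype=float32)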
def compare_metrics(**context):
    """
    Compare metrics passed through XCom. This task logs the following metrics:
        - largest bucket by number of objects
        - largest bucket by memory size
    """
    bucket_names = context["bucket_names"]
    task_ids = ["{}_monitor".format(bucket_name) for bucket_name in bucket_names]
    log_metric("task_ids", task_ids)

    # extract the metric dictionaries from XCom and join them by bucket name
    aggregated_metrics = {"buckets": bucket_names}
    for task_id in task_ids:
        task_metric = context["ti"].xcom_pull(task_ids=task_id)
        for task_metric_name, task_metric_value in task_metric.items():
            metric_name = task_metric_name.split("-")[-1]
            if metric_name in aggregated_metrics:
                aggregated_metrics[metric_name].append(task_metric_value)
            else:
                aggregated_metrics[metric_name] = [task_metric_value]

    # log interesting metrics
    largest_num_objs = max(aggregated_metrics["number_of_objects"])
    largest_bucket_by_obj = aggregated_metrics["buckets"][
        aggregated_metrics["number_of_objects"].index(largest_num_objs)
    ]
    log_metric("largest_obj_count", largest_num_objs)
    log_metric("largest_bucket_by_obj_count", largest_bucket_by_obj)

    largest_bucket_mem = max(aggregated_metrics["total_bucket_size"])
    largest_bucket_by_mem = aggregated_metrics["buckets"][
        aggregated_metrics["total_bucket_size"].index(largest_bucket_mem)
    ]
    log_metric("largest_bucket_by_memory", largest_bucket_by_mem)
    log_metric("largest_memory_consumed", largest_bucket_mem)
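# For reference, a hypothetical payload that a "<bucket>_monitor" task would
# xcom_push, matching the "<prefix>-<metric_name>" key format parsed above:
example_payload = {
    "my-bucket-number_of_objects": 120,
    "my-bucket-total_bucket_size": 5_242_880,  # bytes
}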
def prepare_data() -> Tuple[DataFrame, DataFrame]:
    """Load the diabetes dataset from sklearn and split it into training and testing sets."""
    raw_data = datasets.load_diabetes()

    # create a pandas DataFrame from the sklearn dataset
    df = DataFrame(raw_data["data"], columns=raw_data["feature_names"])
    df["target"] = Series(raw_data["target"])

    # split the data into training and testing sets
    training_data, testing_data = train_test_split(df, test_size=0.25)

    # use DBND logging features to log DataFrames with histograms
    log_dataframe(
        "training_data",
        training_data,
        with_histograms=True,
        with_schema=True,
        with_stats=True,
    )
    log_dataframe("testing_data", testing_data)

    # use DBND logging features to log the mean of s1
    log_metric("mean s1", training_data["s1"].mean())

    return training_data, testing_data
from contextlib import contextmanager


@contextmanager
def target_timeit_log(target, operation):
    from dbnd import log_metric

    log_level = get_target_logging_level()

    path = target
    if hasattr(target, "target"):
        path = target.target
    if hasattr(target, "path"):
        path = target.path
    name = target.source.name if target.source else path

    start_time = time.time()
    try:
        yield
    finally:
        end_time = time.time()
        delta_ms = (end_time - start_time) * 1000
        logger.log(
            level=log_level,
            msg="Total {} time for target {} is {} milliseconds".format(
                operation, path, delta_ms
            ),
        )
        log_metric("marshalling_{}".format(name), delta_ms, source="system")
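# Hedged usage sketch: `my_target` is a hypothetical object exposing .path and
# .source, which is all target_timeit_log looks at; the timed body can be any
# marshalling work.
with target_timeit_log(my_target, "read"):
    data = my_target.load()  # hypothetical call being timed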
def train_model_spark(
    test_set: spark.DataFrame,
    training_set: spark.DataFrame,
    alpha: float = 1.0,
    l1_ratio: float = 0.5,
    saved_model=parameter.output.folder_data.with_flag(None)[PathStr],
) -> str:
    transform = VectorAssembler(inputCols=["0", "1", "2"], outputCol="features")
    lr = LogisticRegression(
        featuresCol="features",
        labelCol="target",
        # Spark's regParam is the regularization strength and
        # elasticNetParam is the L1/L2 mixing ratio
        regParam=alpha,
        elasticNetParam=l1_ratio,
        family="multinomial",
        maxIter=1,
    )
    ppl = Pipeline(stages=[transform, lr])

    # Fit the pipeline to the training documents.
    model = ppl.fit(training_set)

    prediction = model.transform(test_set)
    evaluation = prediction.withColumn(
        "label", prediction["target"].cast(DoubleType())
    ).select(["label", "prediction"])
    evaluation.show()

    metrics = RegressionMetrics(evaluation.rdd)
    log_metric("r2", metrics.r2)
    log_metric("alpha", alpha)

    model.write().save(str(saved_model))
    return "ok"
def unit_imputation(
    raw_data: DataFrame, columns_to_impute=["10"], value=0
) -> DataFrame:
    counter = int(raw_data.describe().first().phone)
    noise = randint(-counter, counter)
    log_metric("Replaced NaNs", counter + noise)

    return raw_data.na.fill(value, columns_to_impute)
def create_report(data: pd.DataFrame) -> pd.DataFrame:
    avg_score = int(
        data["score_label"].sum()
        + randint(-2 * len(data.columns), 2 * len(data.columns))
    )
    log_metric("Column Count", len(data.columns))
    log_metric("Avg Score", avg_score)
    log_dataframe("ready_data", data, with_histograms=True)

    return pd.DataFrame(data=[[avg_score]], columns=["avg_score"])
def split_data_spark(
    raw_data: spark.DataFrame,
) -> Tuple[spark.DataFrame, spark.DataFrame, spark.DataFrame]:
    columns_to_remove = set(["id", "0_norm", "10_norm"])
    if columns_to_remove.issubset(list(raw_data.schema.names)):
        raw_data = raw_data.drop(*columns_to_remove)

    (train, test) = raw_data.randomSplit([0.8, 0.2])
    # split the held-out part, not raw_data, so validation does not overlap train
    (test, validation) = test.randomSplit([0.5, 0.5])

    target_stats = raw_data.describe(["target"])
    log_metric(
        "target.mean",
        target_stats.filter(target_stats["summary"] == "mean")
        .collect()[0]
        .asDict()["target"],
    )
    log_metric(
        "target.std",
        target_stats.filter(target_stats["summary"] == "stddev")
        .collect()[0]
        .asDict()["target"],
    )

    return train, test, validation
def unit_imputations(raw_data: DataFrame, value: int) -> DataFrame:
    counter = int(raw_data.describe().first().phone)
    noise = random.randint(-counter, counter)
    log_metric("Replaced NaNs", counter + noise)

    return raw_data.na.fill(value)
def log_operator_result(task_run, result, operator, track_xcom):
    _log_result(task_run, result)

    # after airflow runs the operator it xcom_pushes the result, so we log it
    if track_xcom and operator.do_xcom_push and result is not None:
        from airflow.models import XCOM_RETURN_KEY

        log_metric(key=XCOM_RETURN_KEY, value=result)
def word_count(text, factor=1):
    # type: (DataList[str], int) -> int
    log_metric("input", len(text))
    logger.info("Factor: %s", factor)

    result = Counter()
    for line in text:
        result.update(line.split() * factor)
    return sum(result.values())
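# Quick sanity check (hypothetical input): 7 words in total, doubled by factor=2.
lines = ["to be or not to be", "be"]
assert word_count(lines) == 7
assert word_count(lines, factor=2) == 14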
def generate_report():
    spark = SparkSession.builder.appName("benchmark-pyspark").getOrCreate()
    df = spark.read.csv(
        "./datasets/backblaze-data-01gb", inferSchema=True, header=True, sep=","
    )
    df.select("serial_number").show(10)
    log_metric("records_count", 1)
def t_f_a(t_input, t_param, t_default="d1"):
    # type: (DataList[str], str, str) -> DataList[str]
    # adds dressing
    assert t_default == "d1"
    assert t_param == "d2"

    log_metric("t_input", len(t_input))
    logger.info("Got string: %s", t_input)
    return t_input[:2]
def train_model(training_data: DataFrame) -> LinearRegression:
    """Train a linear regression model."""
    model = LinearRegression()

    # train a linear regression model
    model.fit(training_data.drop("target", axis=1), training_data["target"])

    # use DBND's log_metric to capture crucial details about the regression model
    log_metric("model intercept", model.intercept_)  # logging a numeric
    log_metric("coefficients", model.coef_)  # logging an np array

    return model
def create_report(data: DataFrame) -> DataFrame:
    log_metric("Column Count", len(data.columns))
    log_metric(
        "Avg Score",
        int(
            data.agg({"score": "sum"}).collect()[0][0]
            + randint(-2 * len(data.columns), 2 * len(data.columns))
        ),
    )
    return data
def fake_task_inside_dag():
    log_metric("Testing", "Metric")
    run = try_get_databand_run()
    assert run is not None, "Task should run in databand run, check airflow tracking!"
    root_task = run.root_task

    # validate regular subdag properties
    assert run.job_name == "%s.%s" % (PARENT_DAG, CHILD_DAG)
    # this test was problematic because the airflow_inplace task is named
    # after the script that ran it
    assert root_task.task_name

    return "Regular test"
def fake_task_inside_dag():
    log_metric("Testing", "Metric")
    run = try_get_databand_run()
    assert run is not None, "Task should run in databand run, check airflow tracking!"
    root_task = run.root_task

    # validate regular subdag properties
    assert run.job_name == "%s.%s.fake_task_inside_dag" % (PARENT_DAG, CHILD_DAG)
    assert root_task.task_name == "fake_task_inside_dag__execute"

    return "Regular test"
def task_pass_through_default(data, dt, expect_pass_through):
    # type: (pd.DataFrame, datetime.datetime, bool) -> str
    # print needed to test that log is sent
    print("hello task_pass_through_default")
    if expect_pass_through:
        assert isinstance(data, str)
        assert isinstance(dt, str)
    else:
        assert isinstance(data, pd.DataFrame)
        assert isinstance(dt, datetime.datetime)
    log_metric("data", data)
    return str(data)
def dbnd_status():
    report = DoctorStatusReportBuilder("Databand Status")
    report.log("env.DBND_HOME", os.environ.get("DBND_HOME"))

    dc = try_get_databand_context()
    report.log("DatabandContext", dc)
    if dc:
        report.log("initialized", dc)

    # check that metric logging works
    log_metric("metric_check", "OK")
    log_metric("metric_random_value", random.random())

    return report.get_status_str()
def prepare_data(data, output_file):
    spark = SparkSession.builder.appName("PythonWordCount").getOrCreate()

    lines = spark.read.text(data).rdd.map(lambda r: r[0])
    counts = (
        lines.flatMap(lambda x: x.split(" ")).map(lambda x: (x, 1)).reduceByKey(add)
    )
    counts.saveAsTextFile(output_file)

    output = counts.collect()
    for (word, count) in output:
        print("%s: %i" % (word, count))
    log_metric("counts", len(output))

    spark.sparkContext.stop()
def test_model(model: LinearRegression, testing_data: DataFrame) -> str:
    """Test the model; output mean squared error and r2 score."""
    testing_x = testing_data.drop("target", axis=1)
    testing_y = testing_data["target"]
    predictions = model.predict(testing_x)
    mse = mean_squared_error(testing_y, predictions)
    r2_score = model.score(testing_x, testing_y)

    # use DBND log_metric to capture important model details:
    log_metric("mean squared error", mse)
    log_metric("r2 score", r2_score)

    return f"MSE: {mse}, R2: {r2_score}"
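# A minimal end-to-end sketch wiring the sklearn diabetes tasks above
# (prepare_data, train_model, test_model) as plain function calls; under dbnd
# tracking these would normally run as a pipeline:
training_data, testing_data = prepare_data()
model = train_model(training_data)
print(test_model(model, testing_data))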
def clean_pii(
    data: pd.DataFrame,
    pii_columns: List[str],
    target_date: datetime.date = None,
) -> pd.DataFrame:
    # I am not sure about this code, but this might help
    if target_date and target_date >= datetime.date(2020, 7, 12):
        if "10" not in data.columns:
            log_metric("Fixed columns", ["10"])
            data["10"] = 0

    data[pii_columns] = data[pii_columns].apply(
        lambda x: x.apply(get_hash_for_obj), axis=1
    )
    log_metric("PII items removed", len(pii_columns) * data.shape[0])
    log_dataframe("pii_clean", data)
    return data
def train_model(
    test_set: DataFrame,
    training_set: DataFrame,
    alpha: float = 0.5,
    l1_ratio: float = 0.5,
) -> ElasticNet:
    """Train the wine-quality prediction model."""
    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    lr.fit(training_set.drop(["quality"], axis=1), training_set[["quality"]])
    prediction = lr.predict(test_set.drop(["quality"], axis=1))

    (rmse, mae, r2) = calculate_metrics(test_set[["quality"]], prediction)

    log_metric("alpha", alpha)
    log_metric("rmse", rmse)
    log_metric("mae", mae)
    log_metric("r2", r2)

    logging.info(
        "Elasticnet model (alpha=%f, l1_ratio=%f): rmse = %f, mae = %f, r2 = %f",
        alpha,
        l1_ratio,
        rmse,
        mae,
        r2,
    )
    return lr
def run(self):
    """
    Generates bogus data and writes it into the :py:meth:`~.Streams.output` target.
    """
    logger.warning("Hey, this is streams task!")
    with self.output().open("w") as output:
        for _ in range(1000):
            output.write(
                "{} {} {}\n".format(
                    random.randint(0, 999),
                    get_random_name(),
                    random.randint(0, 999),
                )
            )
    log_metric("lines", 1000)
def run(self):
    from dbnd import log_metric

    logger.warning("Hey, this is top artists task!")
    top_10 = nlargest(10, self._input_iterator())
    log_metric(key="Top10Artists", value=str(top_10))

    with self.output().open("w") as out_file:
        for streams, artist in top_10:
            out_line = "\t".join(
                [
                    str(self.date_interval.date_a),
                    str(self.date_interval.date_b),
                    artist,
                    str(streams),
                ]
            )
            out_file.write(out_line + "\n")
def split_data(
    raw_data: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    columns_to_remove = set(["id", "0_norm", "10_norm"])
    if columns_to_remove.issubset(raw_data.columns):
        raw_data.drop(columns=list(columns_to_remove), inplace=True)

    train_df, test_df = train_test_split(raw_data)
    test_df, validation_df = train_test_split(test_df, test_size=0.5)

    log_dataframe("raw", raw_data)
    log_metric("target.mean", raw_data["target"].mean())
    log_metric("target.std", raw_data["target"].std())

    return train_df, test_df, validation_df
def prepare(
    train: pd.DataFrame,
    test: pd.DataFrame,
    validation: pd.DataFrame,
    train_out=output[Target],
    test_out=output.folder_data.with_flag(None)[Target],
    validation_out=output[Target],
):
    """Prepare data for training with SageMaker algorithms.

    Reads the preprocessed data and converts it to the protobuf format expected
    by SageMaker training algorithms.

    Args:
        train - dataframe with train data
        test - dataframe with test data
        validation - dataframe with validation data
        train_out - output path for train data in protobuf format
        test_out - output path for test data in protobuf format
        validation_out - output path for validation data in protobuf format

    Returns:
        "OK" once all outputs are written
    """
    all_df = pd.concat([train, validation, test])
    nb_customer = np.unique(all_df["customer"].values).shape[0]
    nb_products = np.unique(all_df["product"].values).shape[0]
    feature_dim = nb_customer + nb_products
    log_metric(
        "customers x products x feature_dim", (nb_customer, nb_products, feature_dim)
    )

    train_X, train_Y = convert_sparse_matrix(
        train, train.shape[0], nb_customer, nb_products
    )
    validate_X, validate_Y = convert_sparse_matrix(
        validation, validation.shape[0], nb_customer, nb_products
    )
    test_X, test_Y = convert_sparse_matrix(
        test, test.shape[0], nb_customer, nb_products
    )

    save_as_protobuf(train_X, train_Y, train_out)
    save_as_protobuf(validate_X, validate_Y, validation_out)

    test_x_chunks = chunk(test_X, 10000)
    test_y_chunks = chunk(test_Y, 10000)
    N = len(test_x_chunks)
    for i in range(N):
        save_as_protobuf(test_x_chunks[i], test_y_chunks[i], test_out.partition())

    return "OK"
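# `chunk` is not defined in these snippets; a minimal sketch of the assumed
# behavior, splitting a matrix into fixed-size row blocks:
def chunk(matrix, rows_per_chunk):
    return [
        matrix[i : i + rows_per_chunk]
        for i in range(0, matrix.shape[0], rows_per_chunk)
    ]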