def upload_resource_status_end(self, exit_code, *resources):
    # Mark each resource's status as finished, skipping any that already have an end time
    timestamp = util.now_timestamp_rfc_3339()
    for resource in resources:
        status = self.get_resource_status(resource)
        if status.get("end") is not None:
            continue
        status["end"] = timestamp
        status["exit_code"] = exit_code
        key = self.resource_status_key(resource)
        aws.upload_json_to_s3(status, key, self.bucket)
def drop_null_and_write(ingest_df, ctx, spark):
    # Drop rows containing null values, write the cleaned dataset, and record its size
    full_dataset_size = ingest_df.count()
    logger.info("Dropping any rows that contain null values")
    ingest_df = ingest_df.dropna()

    written_count = write_raw_dataset(ingest_df, ctx, spark)
    metadata = {"dataset_size": written_count}
    aws.upload_json_to_s3(metadata, ctx.raw_dataset["metadata_key"], ctx.bucket)

    logger.info(
        "{} rows read, {} rows dropped, {} rows ingested".format(
            full_dataset_size, full_dataset_size - written_count, written_count
        )
    )
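
# drop_null_and_write above counts the ingested rows, removes any row containing a null
# with DataFrame.dropna(), and logs the difference. A tiny standalone illustration of
# that counting pattern follows; the local SparkSession and toy rows are assumptions for
# illustration only, not part of this codebase.
def _dropna_count_sketch():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[*]").appName("dropna-sketch").getOrCreate()
    df = spark.createDataFrame([(1, "a"), (2, None), (3, "c")], ["id", "value"])
    full_size = df.count()  # 3 rows read
    cleaned = df.dropna()  # the row with a null "value" is dropped
    print(full_size, full_size - cleaned.count(), cleaned.count())  # 3 read, 1 dropped, 2 kept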
def upload_resource_status_start(self, *resources):
    # Record a start timestamp for each resource's workload status in S3
    timestamp = util.now_timestamp_rfc_3339()
    for resource in resources:
        key = self.resource_status_key(resource)
        status = {
            "resource_id": resource["id"],
            "resource_type": resource["resource_type"],
            "workload_id": resource["workload_id"],
            "app_name": self.app["name"],
            "start": timestamp,
        }
        aws.upload_json_to_s3(status, key, self.bucket)
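
# The status helpers above serialize small JSON documents straight to S3 via
# aws.upload_json_to_s3, which is defined elsewhere in this codebase. As a rough
# illustration only, a minimal boto3-based equivalent might look like the hypothetical
# sketch below (the boto3 client usage here is an assumption, not the project's actual
# implementation).
def _upload_json_to_s3_sketch(obj, key, bucket):
    import json

    import boto3

    s3 = boto3.client("s3")
    s3.put_object(Bucket=bucket, Key=key, Body=json.dumps(obj).encode("utf-8"))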
def ingest_raw_dataset(spark, ctx, cols_to_validate, should_ingest):
    if should_ingest:
        cols_to_validate = list(ctx.rf_id_map.keys())

    # Nothing to validate: just read the existing raw dataset
    if len(cols_to_validate) == 0:
        logger.info("Reading {} data (version: {})".format(ctx.app["name"], ctx.dataset_version))
        return spark_util.read_raw_dataset(ctx, spark)

    col_resources_to_validate = [ctx.rf_id_map[f] for f in cols_to_validate]
    ctx.upload_resource_status_start(*col_resources_to_validate)

    try:
        if should_ingest:
            data_config = ctx.environment["data"]
            logger.info("Ingesting")
            logger.info("Ingesting {} data from {}".format(ctx.app["name"], data_config["path"]))
            ingest_df = spark_util.ingest(ctx, spark)
            full_dataset_size = ingest_df.count()

            if data_config.get("drop_null"):
                logger.info("Dropping any rows that contain null values")
                ingest_df = ingest_df.dropna()

            if ctx.environment.get("limit"):
                ingest_df = limit_dataset(full_dataset_size, ingest_df, ctx.environment["limit"])

            written_count = write_raw_dataset(ingest_df, ctx, spark)
            metadata = {"dataset_size": written_count}
            aws.upload_json_to_s3(metadata, ctx.raw_dataset["metadata_key"], ctx.bucket)

            if written_count != full_dataset_size:
                logger.info(
                    "{} rows read, {} rows dropped, {} rows ingested".format(
                        full_dataset_size, full_dataset_size - written_count, written_count
                    )
                )
            else:
                logger.info("{} rows ingested".format(written_count))

        logger.info("Reading {} data (version: {})".format(ctx.app["name"], ctx.dataset_version))
        raw_df = spark_util.read_raw_dataset(ctx, spark)
        validate_dataset(ctx, raw_df, cols_to_validate)
    except:
        ctx.upload_resource_status_failed(*col_resources_to_validate)
        raise

    ctx.upload_resource_status_success(*col_resources_to_validate)
    logger.info("First {} samples:".format(3))
    show_df(raw_df, ctx, 3)

    return raw_df
def ingest_raw_dataset(spark, ctx, cols_to_validate, should_ingest):
    if should_ingest:
        cols_to_validate = list(ctx.rf_id_map.keys())

    # Nothing to validate: just read the existing raw dataset
    if len(cols_to_validate) == 0:
        logger.info("Reading {} data (version: {})".format(ctx.app["name"], ctx.dataset_version))
        return spark_util.read_raw_dataset(ctx, spark)

    col_resources_to_validate = [ctx.rf_id_map[f] for f in cols_to_validate]
    ctx.upload_resource_status_start(*col_resources_to_validate)

    try:
        if should_ingest:
            logger.info("Ingesting")
            logger.info(
                "Ingesting {} data from {}".format(
                    ctx.app["name"], ctx.environment["data"]["path"]
                )
            )
            ingest_df = spark_util.ingest(ctx, spark)

            if ctx.environment["data"].get("drop_null"):
                drop_null_and_write(ingest_df, ctx, spark)
            else:
                written_count = write_raw_dataset(ingest_df, ctx, spark)
                metadata = {"dataset_size": written_count}
                aws.upload_json_to_s3(metadata, ctx.raw_dataset["metadata_key"], ctx.bucket)
                logger.info("{} rows ingested".format(written_count))

        logger.info("Reading {} data (version: {})".format(ctx.app["name"], ctx.dataset_version))
        raw_df = spark_util.read_raw_dataset(ctx, spark)
        validate_dataset(ctx, raw_df, cols_to_validate)
    except:
        ctx.upload_resource_status_failed(*col_resources_to_validate)
        raise

    ctx.upload_resource_status_success(*col_resources_to_validate)
    logger.info("First {} samples:".format(3))
    show_df(raw_df, ctx, 3)

    return raw_df
def write_training_data(model_name, df, ctx):
    model = ctx.models[model_name]
    training_dataset = model["dataset"]

    # Keep only the columns the model needs: features, target, and training features
    feature_names = model["features"] + [model["target"]] + model["training_features"]
    df = df.select(*feature_names)

    metadata = {"dataset_size": df.count()}
    aws.upload_json_to_s3(metadata, training_dataset["metadata_key"], ctx.bucket)

    # Split into train/eval partitions and write each as TFRecords to S3
    train_ratio = model["data_partition_ratio"]["training"]
    eval_ratio = model["data_partition_ratio"]["evaluation"]
    [train_df, eval_df] = df.randomSplit([train_ratio, eval_ratio])

    train_df.write.mode("overwrite").format("tfrecords").option("recordType", "Example").save(
        aws.s3a_path(ctx.bucket, training_dataset["train_key"])
    )
    eval_df.write.mode("overwrite").format("tfrecords").option("recordType", "Example").save(
        aws.s3a_path(ctx.bucket, training_dataset["eval_key"])
    )

    return df
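
# write_training_data above relies on DataFrame.randomSplit to carve the selected columns
# into training and evaluation partitions before writing each split (the "tfrecords"
# format used above comes from the spark-tensorflow connector). A self-contained sketch
# of the split pattern follows; the SparkSession, column names, 0.8/0.2 ratios, and seed
# are made-up assumptions for illustration, not values from this codebase.
def _random_split_sketch():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[*]").appName("split-sketch").getOrCreate()
    df = spark.createDataFrame([(i, i % 2) for i in range(100)], ["feature", "target"])
    train_df, eval_df = df.randomSplit([0.8, 0.2], seed=42)  # roughly 80/20 row split
    print(train_df.count(), eval_df.count())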