def write_raw_dataset(df, ctx, spark):
    logger.info("Caching {} data (version: {})".format(ctx.app["name"], ctx.dataset_version))
    # Attach an accumulator so the row count is collected while the write job runs
    acc, df = spark_util.accumulate_count(df, spark)
    df.write.mode("overwrite").parquet(aws.s3a_path(ctx.bucket, ctx.raw_dataset["key"]))
    return acc.value
def write_training_data(model_name, df, ctx):
    model = ctx.models[model_name]
    training_dataset = model["dataset"]
    feature_names = model["features"] + [model["target"]] + model["training_features"]

    df = df.select(*feature_names)

    # Record the dataset size alongside the training data
    metadata = {"dataset_size": df.count()}
    aws.upload_json_to_s3(metadata, training_dataset["metadata_key"], ctx.bucket)

    # Split into training and evaluation sets according to the configured ratios
    train_ratio = model["data_partition_ratio"]["training"]
    eval_ratio = model["data_partition_ratio"]["evaluation"]
    [train_df, eval_df] = df.randomSplit([train_ratio, eval_ratio])

    # Write both splits as TFRecords of tf.Example protos
    train_df.write.mode("overwrite").format("tfrecords").option("recordType", "Example").save(
        aws.s3a_path(ctx.bucket, training_dataset["train_key"])
    )
    eval_df.write.mode("overwrite").format("tfrecords").option("recordType", "Example").save(
        aws.s3a_path(ctx.bucket, training_dataset["eval_key"])
    )

    return df
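# Sketch of the model config this function expects. The key names
# ("dataset", "features", "target", "training_features", "data_partition_ratio")
# come from the code above; the example values are illustrative assumptions only:
#
#   ctx.models["my_model"] = {
#       "dataset": {
#           "metadata_key": "datasets/my_model/metadata.json",
#           "train_key": "datasets/my_model/train",
#           "eval_key": "datasets/my_model/eval",
#       },
#       "features": ["age", "income"],
#       "target": "label",
#       "training_features": ["sample_weight"],
#       "data_partition_ratio": {"training": 0.8, "evaluation": 0.2},
#   }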
def write_raw_dataset(df, ctx):
    df.write.mode("overwrite").parquet(aws.s3a_path(ctx.bucket, ctx.raw_dataset_key))
def read_raw_dataset(ctx, spark):
    return spark.read.parquet(aws.s3a_path(ctx.bucket, ctx.raw_dataset_key))
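# Typical round trip (sketch): cache the ingested DataFrame once, then read the
# cached Parquet back in later jobs instead of re-ingesting the raw data:
#
#   write_raw_dataset(raw_df, ctx)
#   df = read_raw_dataset(ctx, spark)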