def main(
    date,
    input_bucket,
    input_prefix,
    output_bucket,
    output_prefix,
    output_version,
    lag_days,
):
    """
    Aggregate by (client_id, experiment_id, day).

    Note that the target day will actually be `lag-days` days before the
    supplied date. In other words, if you pass in 2017-01-20 and set
    `lag-days` to 5, the aggregation will be processed for day 2017-01-15
    (the resulting data will cover submission dates including the activity
    day itself plus 5 days of lag for a total of 6 days).
    """
    spark = SparkSession.builder.appName("experiments_daily").getOrCreate()

    parquet_path = format_spark_path(input_bucket, input_prefix)
    frame = load_experiments_summary(spark, parquet_path)
    day_frame, start_date = extract_submission_window_for_activity_day(
        frame, date, lag_days)
    # Aggregate only the submissions that fall inside the activity-day window.
    searches_frame = extract_search_counts(day_frame)
    results = to_experiment_profile_day_aggregates(searches_frame)

    spark.conf.set(
        "mapreduce.fileoutputcommitter.marksuccessfuljobs", "false"
    )  # Don't write _SUCCESS files, which interfere w/ReDash discovery

    output_base_path = "{}/v{}/activity_date_s3={}".format(
        format_spark_path(output_bucket, output_prefix),
        output_version,
        start_date.strftime("%Y-%m-%d"),
    )
    results.write.mode("overwrite").parquet(output_base_path)
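# `format_spark_path` is used throughout this section but is not defined in
# it. A minimal sketch, assuming it only needs to join a bucket name and key
# prefix into an s3:// URI that Spark can read:
def format_spark_path(bucket, prefix):
    """Build an s3:// path from a bucket and key prefix (assumed layout)."""
    return "s3://{}/{}".format(bucket, prefix)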
def main(input_bucket, input_prefix, output_bucket, output_prefix):
    s3client = boto3.client('s3', 'us-west-2')
    transferer = S3Transfer(s3client)
    last_rollup_basename = get_last_rollup(transferer)
    if last_rollup_basename:
        since, carryover = parse_last_rollup(last_rollup_basename)
        logging.info("Generating counts since {}".format(since))
    else:
        since, carryover = None, []
        logging.info("Generating counts since beginning")

    spark = (
        SparkSession
        .builder
        .appName("maudau")
        .getOrCreate()
    )
    path = U.format_spark_path(input_bucket, input_prefix)
    logging.info("Loading main_summary from {}".format(path))
    main_summary = spark.read.option("mergeSchema", "true").parquet(path)
    updates = generate_counts(main_summary, since)
    logging.info("Generated counts for {} days".format(len(updates)))

    results = carryover + updates
    output_basename = write_locally(results)
    publish_to_s3(s3client, output_bucket, output_prefix, output_basename)
    if not DEVELOPMENT:
        logging.info("Published to S3; done.")
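# The rollup helpers above (`get_last_rollup`, `parse_last_rollup`,
# `write_locally`, `publish_to_s3`) are defined elsewhere. A hypothetical
# sketch of the upload step only, assuming the locally written rollup file
# sits in the working directory and should land directly under the prefix:
def publish_to_s3(s3client, bucket, prefix, basename):
    """Upload the locally written rollup file to its S3 destination."""
    transferer = S3Transfer(s3client)
    key = "/".join([prefix, basename])
    transferer.upload_file(basename, bucket, key)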
def main(date, input_bucket, input_prefix, output_bucket, output_prefix,
         output_version, sample_id, lag_days):
    """
    Aggregate by (client_id, day) for a given day.

    Note that the target day will actually be `lag-days` days before the
    supplied date. In other words, if you pass in 2017-01-20 and set
    `lag-days` to 5, the aggregation will be processed for day 2017-01-15
    (the resulting data will cover submission dates including the activity
    day itself plus 5 days of lag for a total of 6 days).
    """
    spark = (SparkSession.builder.appName("clients_daily").getOrCreate())
    # Per https://issues.apache.org/jira/browse/PARQUET-142,
    # don't write _SUCCESS files, which interfere w/ReDash discovery
    spark.conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs",
                   "false")

    main_summary = load_main_summary(spark, input_bucket, input_prefix)
    day_frame, start_date = extract_submission_window_for_activity_day(
        main_summary, date, lag_days)
    if sample_id:
        day_frame = day_frame.where("sample_id = '{}'".format(sample_id))
    with_searches = extract_search_counts(day_frame)
    results = to_profile_day_aggregates(with_searches)

    partition_count = get_partition_count_for_writing(bool(sample_id))
    output_base_path = "{}/v{}/".format(
        format_spark_path(output_bucket, output_prefix), output_version)
    write_one_activity_day(
        results, start_date, output_base_path, partition_count)
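# The docstrings above describe the lag-days window, but the helper that
# computes it is not shown in this section. A hypothetical sketch of
# `extract_submission_window_for_activity_day`, assuming `date` is a
# YYYYMMDD string, `submission_date_s3` is a YYYYMMDD string column, and
# `datetime` is imported as `DT` (as in `write_by_activity_day` below):
def extract_submission_window_for_activity_day(frame, date, lag_days):
    """Keep submissions from the activity day plus `lag_days` days of lag."""
    end_date = DT.datetime.strptime(date, "%Y%m%d").date()
    activity_day = end_date - DT.timedelta(days=int(lag_days))
    window = frame.where(
        (frame.submission_date_s3 >= activity_day.strftime("%Y%m%d")) &
        (frame.submission_date_s3 <= end_date.strftime("%Y%m%d")))
    return window, activity_day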
def write_by_activity_day(results, day_pointer, output_bucket, output_prefix,
                          partition_count):
    """Write one activity_date_s3 partition per day until the month ends."""
    month = day_pointer.month
    prefix_template = os.path.join(output_prefix, 'activity_date_s3={}')
    while day_pointer.month == month:
        isoday = day_pointer.isoformat()
        prefix = prefix_template.format(isoday)
        output_path = format_spark_path(output_bucket, prefix)
        data_for_date = results.where(results.activity_date == isoday)
        data_for_date.coalesce(partition_count).write.parquet(output_path)
        day_pointer += DT.timedelta(1)
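# `write_one_activity_day`, called by the clients_daily main above, is not
# defined in this section. A minimal sketch by analogy with
# `write_by_activity_day`, assuming the same activity_date_s3 partition
# layout under `output_base_path` (which already ends with a slash):
def write_one_activity_day(results, date, output_base_path, partition_count):
    """Write the aggregates for a single activity day to its partition."""
    isoday = date.isoformat()
    output_path = "{}activity_date_s3={}".format(output_base_path, isoday)
    data_for_date = results.where(results.activity_date == isoday)
    data_for_date.coalesce(partition_count).write.parquet(output_path)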
def main(local, submission_date_s3, input_bucket, input_prefix,
         output_bucket, output_prefix):
    # print argument information
    for k, v in locals().items():
        print("{}: {}".format(k, v))

    print("Python version: {}".format(sys.version_info))
    spark = SparkSession.builder.getOrCreate()
    print("Spark version: {}".format(spark.version))

    # run a basic count over a sample of `main_summary` from 2 days ago
    if not local:
        ds_nodash = submission_date_s3
        input_path = format_spark_path(input_bucket, input_prefix)
        output_path = format_spark_path(output_bucket, output_prefix)
        print(
            "Reading data for {ds_nodash} from {input_path} and writing "
            "to {output_path}".format(
                ds_nodash=ds_nodash,
                input_path=input_path,
                output_path=output_path))

        path = "{}/submission_date_s3={}/sample_id={}".format(
            input_path, ds_nodash, 1)
        subset = spark.read.parquet(path)
        print("Saw {} documents".format(subset.count()))

        summary = (subset
                   .select("memory_mb", "cpu_cores", "subsession_length")
                   .describe())
        summary.show()
        summary.write.parquet(
            output_path + "/submission_date_s3={}/".format(ds_nodash),
            mode="overwrite")

    stop_session_safely(spark)
    print("Done!")
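# `stop_session_safely` is imported from shared utilities and not shown here.
# A hypothetical sketch, assuming its only job is to avoid tearing down a
# session that the hosting platform manages (the check below is an
# assumption, not the utility's actual implementation):
def stop_session_safely(spark):
    """Stop the SparkSession unless the hosting platform manages it."""
    managed = spark.conf.get(
        "spark.databricks.clusterUsageTags.clusterName", None) is not None
    if not managed:
        spark.stop()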
def load_main_summary(spark, input_bucket, input_prefix):
    main_summary_path = format_spark_path(input_bucket, input_prefix)
    return (spark
            .read
            .option("mergeSchema", "true")
            .parquet(main_summary_path))