def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    df = connect_to_raw_database(
        args.rawdatapath, args.rawdatapath + "/*", latestfirst=False)

    # Apply quality cuts
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns.
    # Partitioned data doesn't preserve type information (cast as int...)
    df_partitionedby = df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))

    # Append new rows in the tmp science database
    countquery = df_partitionedby\
        .writeStream\
        .outputMode("append")\
        .format("parquet")\
        .option("checkpointLocation", args.checkpointpath_sci_tmp)\
        .option("path", args.scitmpdatapath)\
        .partitionBy("year", "month", "day")\
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
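# --- Illustrative sketch (not part of the original script) ---
# The streaming variant above keeps a handle on the query (countquery). For
# interactive debugging one can poll that handle before stopping it: `status`,
# `lastProgress` and `stop` are part of the public
# pyspark.sql.streaming.StreamingQuery API. `wait_and_report` is a
# hypothetical helper, shown only to illustrate the pattern.
def wait_and_report(query, seconds):
    """Sleep, print the latest progress of a streaming query, then stop it."""
    import time
    time.sleep(seconds)
    print(query.status)        # {'message': ..., 'isDataAvailable': ..., ...}
    print(query.lastProgress)  # dict with input/processed rows per trigger
    query.stop()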
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="raw2science_{}".format(args.night), shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    input_raw = 'ztf_alerts/raw/year={}/month={}/day={}'.format(
        year, month, day)

    # basepath
    output_science = 'ztf_alerts/science_reprocessed'

    df = spark.read.format('parquet').load(input_raw)

    # Apply level one filters
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns.
    # Partitioned data doesn't preserve type information (cast as int...)
    df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .write\
        .mode("append")\
        .partitionBy("year", "month", "day")\
        .parquet(output_science)
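# --- Illustrative sketch (not part of the original script) ---
# Reading a single leaf directory (year=/month=/day=) drops the Hive-style
# partition columns, which is why the script above re-creates them from the
# `timestamp` column before writing. An alternative is Spark's standard
# `basePath` option for file sources, which keeps the partition columns at
# read time. `read_raw_night` is a hypothetical helper; the default basepath
# is the one hard-coded above, and whether this fits the pipeline is an
# assumption.
def read_raw_night(spark, night, basepath='ztf_alerts/raw'):
    """Load one night of raw alerts while keeping year/month/day columns."""
    leaf = '{}/year={}/month={}/day={}'.format(
        basepath, night[:4], night[4:6], night[6:8])
    # With basePath set, Spark infers year/month/day from the directory names
    # (typically as integers), hence the explicit re-cast in the script when
    # string-typed partition values are required.
    return spark.read.option("basePath", basepath).parquet(leaf)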
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="raw2science_{}".format(args.night), shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # data path
    rawdatapath = args.online_data_prefix + '/raw'
    scitmpdatapath = args.online_data_prefix + '/science'
    checkpointpath_sci_tmp = args.online_data_prefix + '/science_checkpoint'

    df = connect_to_raw_database(
        rawdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        rawdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        latestfirst=False)

    # Apply quality cuts
    logger.info("Applying quality cuts")
    df = df\
        .filter(df['candidate.nbad'] == 0)\
        .filter(df['candidate.rb'] >= 0.55)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns if needed.
    if 'timestamp' not in df.columns:
        df = df\
            .withColumn("timestamp", jd_to_datetime(df['candidate.jd']))
    if "year" not in df.columns:
        df = df\
            .withColumn("year", F.date_format("timestamp", "yyyy"))
    if "month" not in df.columns:
        df = df\
            .withColumn("month", F.date_format("timestamp", "MM"))
    if "day" not in df.columns:
        df = df\
            .withColumn("day", F.date_format("timestamp", "dd"))

    # Append new rows in the tmp science database
    countquery = df\
        .writeStream\
        .outputMode("append")\
        .format("parquet")\
        .option("checkpointLocation", checkpointpath_sci_tmp)\
        .option("path", scitmpdatapath)\
        .partitionBy("year", "month", "day")\
        .trigger(processingTime='{} seconds'.format(args.tinterval))\
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
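# --- Illustrative sketch (not part of the original script) ---
# `jd_to_datetime` above turns the alert Julian date into a Spark timestamp so
# that year/month/day partitions can be derived from it. The real
# implementation is provided by fink_broker; the pandas UDF below is only a
# sketch of the idea, assuming astropy is available, and is named differently
# to make clear it is not the original function.
import pandas as pd
from astropy.time import Time
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import TimestampType

@pandas_udf(TimestampType())
def jd_to_datetime_sketch(jd: pd.Series) -> pd.Series:
    """Convert a batch of Julian dates into (UTC) datetimes."""
    return pd.Series(Time(jd.values, format='jd').to_datetime())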
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="raw2science_{}".format(args.night), shuffle_partitions=None)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    # data path
    input_raw = args.agg_data_prefix + '/raw/year={}/month={}/day={}'.format(
        year, month, day)

    # basepath
    output_science = args.agg_data_prefix + '/science'

    df = spark.read.format('parquet').load(input_raw)
    npart = df.rdd.getNumPartitions()

    # Apply level one filters
    logger.info("Applying quality cuts")
    df = df.filter(df['candidate.nbad'] == 0).filter(
        df['candidate.rb'] >= 0.55)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add tracklet information
    df_trck = spark.read.format('parquet').load(input_raw)
    df_trck = df_trck.filter(df_trck['candidate.nbad'] == 0).filter(
        df_trck['candidate.rb'] >= 0.55)
    df_trck = add_tracklet_information(df_trck)

    # join back information to the initial dataframe
    df = df\
        .join(
            F.broadcast(df_trck.select(['candid', 'tracklet'])),
            on='candid',
            how='outer'
        )

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns if needed.
    if 'timestamp' not in df.columns:
        df = df\
            .withColumn("timestamp", jd_to_datetime(df['candidate.jd']))
    if "year" not in df.columns:
        df = df\
            .withColumn("year", F.date_format("timestamp", "yyyy"))
    if "month" not in df.columns:
        df = df\
            .withColumn("month", F.date_format("timestamp", "MM"))
    if "day" not in df.columns:
        df = df\
            .withColumn("day", F.date_format("timestamp", "dd"))

    # Keep the output file count aligned with the input partitioning
    df.coalesce(npart).write\
        .mode("append")\
        .partitionBy("year", "month", "day")\
        .parquet(output_science)
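# --- Illustrative sketch (not part of the original script) ---
# Each variant of main() above is meant to run as a standalone script (e.g.
# via spark-submit). The usual module guard is assumed to close the file; it
# is shown here only for completeness and is not copied from the source.
if __name__ == "__main__":
    main()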