def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    init_sparksession(
        name="monitoringStream", shuffle_partitions=2, log_level="ERROR")

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_with_kafka(
        servers=args.servers, topic=args.topic,
        startingoffsets=args.startingoffsets, failondataloss=False)

    # Trigger the streaming computation,
    # by defining the sink (console here) and starting it
    countquery = df \
        .writeStream \
        .queryName("qraw")\
        .format("console")\
        .outputMode("update") \
        .start()

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(countquery, 2, colnames, args.finkwebpath)

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        print("Exiting the monitoring service normally...")
    else:
        countquery.awaitTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Grab the running Spark Session,
    # otherwise create it.
    spark = init_sparksession(name="readingScienceDB", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    with open(args.science_db_catalog) as f:
        catalog = json.load(f)
    catalog_dic = json.loads(catalog)

    df = spark.read.option("catalog", catalog)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .load()

    print("Number of entries in {}: ".format(
        catalog_dic["table"]["name"]), df.count())

    print("Number of distinct objects in {}: ".format(
        catalog_dic["table"]["name"]),
        df.select('objectId').distinct().count())
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    df = connect_to_raw_database(
        args.rawdatapath, args.rawdatapath + "/*", latestfirst=False)

    # Apply quality cuts
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns.
    # Partitioned data doesn't preserve type information (cast as int...)
    df_partitionedby = df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))

    # Append new rows in the tmp science database
    countquery = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_sci_tmp) \
        .option("path", args.scitmpdatapath)\
        .partitionBy("year", "month", "day") \
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="raw2science_{}".format(args.night), shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    input_raw = 'ztf_alerts/raw/year={}/month={}/day={}'.format(
        year, month, day)

    # basepath
    output_science = 'ztf_alerts/science_reprocessed'

    df = spark.read.format('parquet').load(input_raw)

    # Apply level one filters
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns.
    # Partitioned data doesn't preserve type information (cast as int...)
    df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="save_schema_{}".format(args.night), shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]
    print('Processing {}/{}/{}'.format(year, month, day))

    input_science = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)
    df = load_parquet_files(input_science)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index('timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index('cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index('cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index('cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index('prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    df_kafka = df.selectExpr(cnames)

    path_for_avro = 'new_schema_{}.avro'.format(time())
    df_kafka.limit(1).write.format("avro").save(path_for_avro)

    # retrieve data on local disk
    subprocess.run(["hdfs", "dfs", '-get', path_for_avro])

    # Read the avro schema from .avro file
    avro_file = glob.glob(path_for_avro + "/part*")[0]
    avro_schema = readschemafromavrofile(avro_file)

    # Write the schema to a file for decoding Kafka messages
    with open('schemas/{}'.format(path_for_avro.replace('.avro', '.avsc')), 'w') as f:
        json.dump(avro_schema, f, indent=2)
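# Usage sketch (not part of the job above): the .avsc file written by the
# previous function can be used to decode a single Avro-serialized Kafka
# message with fastavro, which is already a dependency elsewhere in this
# repository. The schema path and `message_value` below are placeholders,
# and this assumes messages are schemaless Avro records (no container file).
#
#     import io
#     import json
#     import fastavro
#
#     with open('schemas/new_schema_1234567890.avsc') as f:
#         schema = json.load(f)
#
#     def decode_alert(message_value: bytes) -> dict:
#         """Decode one schemaless Avro record using the saved schema."""
#         return fastavro.schemaless_reader(io.BytesIO(message_value), schema)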
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribution_test", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Topic to read from
    topic = args.distribution_topic
    broker_list = args.distribution_servers

    # Read from the Kafka topic
    df_kafka = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", broker_list) \
        .option("kafka.security.protocol", "SASL_PLAINTEXT")\
        .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
        .option("subscribe", topic) \
        .load()

    # Decode df_kafka into a Spark DataFrame with StructType column
    df = decode_kafka_df(df_kafka, args.distribution_schema)

    # Print received stream to the console
    df = df.select("struct.*")

    print("\nReading Fink OutStream\n")
    debug_query = df.writeStream\
        .format("console")\
        .trigger(processingTime='2 seconds')\
        .start()

    # Keep the Streaming running for some time
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        debug_query.stop()
        logger.info("Exiting distribution_test service normally...")
    else:
        debug_query.awaitTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="checkstream", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_to_kafka(
        servers=args.servers, topic=args.topic,
        startingoffsets=args.startingoffsets_stream, failondataloss=False)

    # Trigger the streaming computation,
    # by defining the sink (console here) and starting it
    countquery = df \
        .writeStream \
        .queryName("qraw")\
        .format("console")\
        .outputMode("update") \
        .start()

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(
        countquery, 2, colnames, args.finkwebpath, "live_raw.csv", "live")

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the checkstream service normally...")
    else:
        countquery.awaitTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Get or create a Spark Session
    spark = init_sparksession(
        name="distribution_test", shuffle_partitions=2, log_level="ERROR")

    # Topic to read from
    topic = "distribution_test"

    # Read from the Kafka topic
    df_kafka = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9093") \
        .option("subscribe", topic) \
        .load()

    # Decode df_kafka into a Spark DataFrame with StructType column
    df = decode_kafka_df(df_kafka, args.distribution_schema)

    # Print to console
    cols = ["objectId", "candid", "candidate_ra", "candidate_dec", "simbadType"]
    cols = ["struct." + c for c in cols]
    df = df.select(cols)

    print("\nReading Fink OutStream\n")
    debug_query = df.writeStream\
        .format("console")\
        .trigger(processingTime='2 seconds')\
        .start()

    # Keep the Streaming running for some time
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        debug_query.stop()
    else:
        debug_query.awaitTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="mergeAndClean_{}".format(args.night))

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    input_raw = '{}/year={}/month={}/day={}'.format(
        args.rawdatapath, year, month, day)
    input_science = '{}/year={}/month={}/day={}'.format(
        args.scitmpdatapath, year, month, day)

    # basepath
    output_raw = 'ztf_alerts/raw'
    output_science = 'ztf_alerts/science'

    print('Raw data processing....')

    df_raw = spark.read.format('parquet').load(input_raw)
    print('Num partitions before: ', df_raw.rdd.getNumPartitions())
    print('Num partitions after : ', numPart(df_raw))

    df_raw.withColumn('timestamp', jd_to_datetime(df_raw['candidate.jd']))\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .coalesce(numPart(df_raw))\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_raw)

    print('Science data processing....')

    df_science = spark.read.format('parquet').load(input_science)
    print('Num partitions before: ', df_science.rdd.getNumPartitions())
    print('Num partitions after : ', numPart(df_science))

    df_science.withColumn('timestamp', jd_to_datetime(df_science['candidate.jd']))\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .coalesce(numPart(df_science))\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)

    # Remove temporary alert folder - beware you'll never get it back!
    if args.fs == 'hdfs':
        subprocess.run(["hdfs", "dfs", '-rm', '-rf', args.datapath])
    elif args.fs == 'local':
        subprocess.run(['rm', '-rf', args.datapath])
    else:
        print('Filesystem not understood. FS_KIND must be hdfs or local.')
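# Note: `numPart` is used by the mergeAndClean jobs in this file but is not
# defined in these snippets; it presumably estimates how many output files to
# write per night. A minimal, assumption-laden sketch of such a helper is
# given below; the 50,000 rows-per-file target is an arbitrary placeholder,
# not the broker's actual heuristic.
def numPart(df, rows_per_file=50000):
    """Rough number of output partitions for a DataFrame, based on row count."""
    return max(1, int(df.count() / rows_per_file) + 1)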
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="raw2science_{}".format(args.night), shuffle_partitions=None)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    # data path
    input_raw = args.agg_data_prefix + '/raw/year={}/month={}/day={}'.format(
        year, month, day)

    # basepath
    output_science = args.agg_data_prefix + '/science'

    df = spark.read.format('parquet').load(input_raw)
    npart = df.rdd.getNumPartitions()

    # Apply level one filters
    logger.info(qualitycuts)
    df = df.filter(df['candidate.nbad'] == 0).filter(
        df['candidate.rb'] >= 0.55)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add tracklet information
    df_trck = spark.read.format('parquet').load(input_raw)
    df_trck = df_trck.filter(df_trck['candidate.nbad'] == 0).filter(
        df_trck['candidate.rb'] >= 0.55)
    df_trck = add_tracklet_information(df_trck)

    # join back information to the initial dataframe
    df = df\
        .join(
            F.broadcast(df_trck.select(['candid', 'tracklet'])),
            on='candid',
            how='outer'
        )

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns if needed.
    if 'timestamp' not in df.columns:
        df = df\
            .withColumn("timestamp", jd_to_datetime(df['candidate.jd']))

    if "year" not in df.columns:
        df = df\
            .withColumn("year", F.date_format("timestamp", "yyyy"))
    if "month" not in df.columns:
        df = df\
            .withColumn("month", F.date_format("timestamp", "MM"))
    if "day" not in df.columns:
        df = df\
            .withColumn("day", F.date_format("timestamp", "dd"))

    df.coalesce(npart).write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="TNS_report_{}".format(args.night), shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix,
        args.night[:4],
        args.night[4:6],
        args.night[6:8])
    df = load_parquet_files(path)

    with open('{}/tns_marker.txt'.format(args.tns_folder)) as f:
        tns_marker = f.read().replace('\n', '')

    if not args.tns_sandbox:
        print("WARNING: submitting to real (not sandbox) TNS website")

    if args.tns_sandbox:
        url_tns_api = "https://sandbox.wis-tns.org/api"
        with open('{}/sandbox-tns_api.key'.format(args.tns_folder)) as f:
            # remove line break...
            key = f.read().replace('\n', '')
    else:
        url_tns_api = "https://www.wis-tns.org/api"
        with open('{}/tns_api.key'.format(args.tns_folder)) as f:
            # remove line break...
            key = f.read().replace('\n', '')

    cols = [
        'cdsxmatch', 'roid', 'mulens',
        'snn_snia_vs_nonia', 'snn_sn_vs_all', 'rf_snia_vs_nonia',
        'candidate.ndethist', 'candidate.drb', 'candidate.classtar',
        'candidate.jd', 'candidate.jdstarthist', 'rf_kn_vs_nonkn', 'tracklet'
    ]
    df = df.withColumn('class', extract_fink_classification(*cols))

    pdf = df\
        .filter(df['class'] == 'Early SN candidate')\
        .filter(df['candidate.ndethist'] <= 20)\
        .toPandas()

    pdf_unique = pdf.groupby('objectId')[pdf.columns].min()
    print("{} new alerts".format(len(pdf)))
    print("{} new sources".format(len(pdf_unique)))
    pdf = pdf_unique

    ids = []
    report = {"at_report": {}}
    check_tns = False
    for index, row in enumerate(pdf.iterrows()):
        alert = row[1]
        past_ids = read_past_ids(args.tns_folder)
        if alert['objectId'] in past_ids.values:
            print('{} already sent!'.format(alert['objectId']))
            continue
        if check_tns:
            groupid = retrieve_groupid(key, tns_marker, alert['objectId'])
            if groupid > 0:
                print("{} already reported by {}".format(
                    alert['objectId'], groupid))
        else:
            print('New report for object {}'.format(alert['objectId']))
            photometry, non_detection = extract_discovery_photometry(alert)
            report['at_report']["{}".format(index)] = build_report(
                alert, photometry, non_detection)
            ids.append(alert['objectId'])
    print('new objects: ', ids)

    if len(ids) != 0:
        json_report = save_logs_and_return_json_report(
            name='{}{}{}'.format(
                args.night[:4], args.night[4:6], args.night[6:8]),
            folder=args.tns_folder,
            ids=ids,
            report=report)
        r = send_json_report(key, url_tns_api, json_report, tns_marker)
        print(r.json())

        # post to slack
        slacktxt = ' \n '.join(
            ['https://fink-portal/{}'.format(i) for i in ids])
        slacktxt = '{} \n '.format(args.night) + slacktxt
        r = requests.post(
            os.environ['TNSWEBHOOK'],
            json={'text': slacktxt, "username": "******"},
            headers={'Content-Type': 'application/json'})
        print(r.status_code)
    else:
        slacktxt = '{} \n No new sources'.format(args.night)
        r = requests.post(
            os.environ['TNSWEBHOOK'],
            json={'text': slacktxt, "username": "******"},
            headers={'Content-Type': 'application/json'})
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    init_sparksession(
        name="archivingStream", shuffle_partitions=2, log_level="ERROR")

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_with_kafka(
        servers=args.servers, topic=args.topic,
        startingoffsets=args.startingoffsets, failondataloss=False)

    # Get Schema of alerts
    _, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, data)
    df_decoded = df.select([
        "timestamp",
        "topic",
        from_avro(df["value"], alert_schema_json).alias("decoded")
    ])

    # Partition the data hourly
    df_partitionedby = df_decoded\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))\
        .withColumn("hour", date_format("timestamp", "HH"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath) \
        .option("path", args.outputpath)\
        .partitionBy("topic", "year", "month", "day", "hour")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
        ui_refresh = args.tinterval
    else:
        countquery = countquery_tmp.start()
        # Update the UI every 2 seconds to place less load on the browser.
        ui_refresh = 2

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(countquery, ui_refresh, colnames, args.finkwebpath)

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        print("Exiting the archiving service normally...")
    else:
        countquery.awaitTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribute", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the TMP science database
    df = connect_to_raw_database(
        args.scitmpdatapath, args.scitmpdatapath + "/*", latestfirst=False)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index(
        'timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index(
        'cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index(
        'cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index(
        'cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index(
        'prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    broker_list = args.distribution_servers
    for userfilter in userfilters:
        # The topic name is the filter name
        topicname = userfilter.split('.')[-1]

        # Apply user-defined filter
        df_tmp = apply_user_defined_filter(df, userfilter)

        # Wrap alert data
        df_tmp = df_tmp.selectExpr(cnames)

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_tmp, '')

        # Ensure that the topic(s) exist on the Kafka Server
        disquery = df_kafka\
            .writeStream\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topicname)\
            .option("checkpointLocation", args.checkpointpath_kafka + topicname)\
            .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        disquery.stop()
        logger.info("Exiting the distribute service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    init_sparksession(
        name="classifyStream", shuffle_partitions=2, log_level="ERROR")

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_with_kafka(
        servers=args.servers, topic=args.topic,
        startingoffsets=args.startingoffsets, failondataloss=False)

    # Get Schema of alerts
    _, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, data)
    df_decoded = df.select([
        "timestamp",
        from_avro(df["value"], alert_schema_json).alias("decoded")
    ])

    # Select only (timestamp, id, ra, dec)
    df_expanded = df_decoded.select([
        df_decoded["timestamp"],
        df_decoded["decoded.objectId"],
        df_decoded["decoded.candidate.ra"],
        df_decoded["decoded.candidate.dec"]
    ])

    # for each micro-batch, perform a cross-match with an external catalog,
    # and return the types of the objects (Star, AGN, Unknown, etc.)
    df_type = df_expanded.withColumn(
        "type",
        cross_match_alerts_per_batch(col("objectId"), col("ra"), col("dec")))

    # Group data by type and count members
    df_group = df_type.groupBy("type").count()

    # Update the DataFrame every tinterval seconds
    countquery_tmp = df_group\
        .writeStream\
        .outputMode("complete") \
        .foreachBatch(write_to_csv)

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
        ui_refresh = args.tinterval
    else:
        countquery = countquery_tmp.start()
        # Update the UI every 2 seconds to place less load on the browser.
        ui_refresh = 2

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(countquery, ui_refresh, colnames, args.finkwebpath)

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        print("Exiting the classify service normally...")
    else:
        countquery.awaitTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="statistics_{}".format(args.night),
        shuffle_partitions=2
    )

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Statistics for {}/{}/{}'.format(year, month, day))

    input_raw = '{}/raw/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)
    input_science = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)

    df_raw = spark.read.format('parquet').load(input_raw)
    df_sci = spark.read.format('parquet').load(input_science)

    df_sci = df_sci.cache()

    # Number of alerts
    n_raw_alert = df_raw.count()
    n_sci_alert = df_sci.count()

    out_dic = {}
    out_dic['raw'] = n_raw_alert
    out_dic['sci'] = n_sci_alert

    # matches with SIMBAD
    n_simbad = df_sci.select('cdsxmatch')\
        .filter(df_sci['cdsxmatch'] != 'Unknown')\
        .count()

    out_dic['simbad_tot'] = n_simbad

    # Alerts with a close-by candidate host-galaxy
    list_simbad_galaxies = [
        "galaxy", "Galaxy", "EmG", "Seyfert", "Seyfert_1", "Seyfert_2",
        "BlueCompG", "StarburstG", "LSB_G", "HII_G", "High_z_G",
        "GinPair", "GinGroup", "BClG", "GinCl", "PartofG",
    ]
    n_simbad_gal = df_sci.select('cdsxmatch')\
        .filter(df_sci['cdsxmatch'].isin(list_simbad_galaxies))\
        .count()

    out_dic['simbad_gal'] = n_simbad_gal

    df_class = df_sci.withColumn(
        'class',
        extract_fink_classification(
            df_sci['cdsxmatch'],
            df_sci['roid'],
            df_sci['mulens'],
            df_sci['snn_snia_vs_nonia'],
            df_sci['snn_sn_vs_all'],
            df_sci['rf_snia_vs_nonia'],
            df_sci['candidate.ndethist'],
            df_sci['candidate.drb'],
            df_sci['candidate.classtar'],
            df_sci['candidate.jd'],
            df_sci['candidate.jdstarthist'],
            df_sci['rf_kn_vs_nonkn'],
            df_sci['tracklet']
        )
    )

    out_class = df_class.groupBy('class').count().collect()
    out_class_ = [o.asDict() for o in out_class]
    out_class_ = [list(o.values()) for o in out_class_]
    for kv in out_class_:
        out_dic[kv[0]] = kv[1]

    # Number of fields
    n_field = df_sci.select('candidate.field').distinct().count()

    out_dic['fields'] = n_field

    # number of measurements per band
    n_g = df_sci.select('candidate.fid').filter('fid == 1').count()
    n_r = df_sci.select('candidate.fid').filter('fid == 2').count()

    out_dic['n_g'] = n_g
    out_dic['n_r'] = n_r

    # Number of exposures
    n_exp = df_sci.select('candidate.jd').distinct().count()

    out_dic['exposures'] = n_exp

    out_dic['night'] = 'ztf_{}'.format(args.night)

    # make a Spark DataFrame
    pdf = pd.DataFrame([out_dic])
    df_hbase = spark.createDataFrame(pdf)

    # rowkey is the night YYYYMMDD
    index_row_key_name = 'night'

    # Columns to use
    cols_basic = [
        'raw', 'sci', 'night', 'n_g', 'n_r', 'exposures', 'fields'
    ]

    cols_class_ = np.transpose(out_class_)[0]
    cols_class = np.concatenate((cols_class_, ['simbad_tot', 'simbad_gal']))

    # column families
    cf = {i: 'basic' for i in df_hbase.select(*cols_basic).columns}
    cf.update({i: 'class' for i in df_hbase.select(*cols_class).columns})

    # construct the time catalog
    hbcatalog_index = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema,
        'statistics_class',
        rowkeyname=index_row_key_name,
        cf=cf
    )

    # Push index table
    df_hbase.write\
        .options(catalog=hbcatalog_index, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()

    # Construct the schema row - inplace replacement
    schema_row_key_name = 'schema_version'
    df_hbase = df_hbase.withColumnRenamed(
        index_row_key_name,
        schema_row_key_name
    )

    df_hbase_schema = construct_schema_row(
        df_hbase,
        rowkeyname=schema_row_key_name,
        version='schema_{}_{}'.format(fbvsn, fsvsn))

    # construct the hbase catalog for the schema
    hbcatalog_index_schema = construct_hbase_catalog_from_flatten_schema(
        df_hbase_schema.schema,
        'statistics_class',
        rowkeyname=schema_row_key_name,
        cf=cf)

    # Push the data using the shc connector
    df_hbase_schema.write\
        .options(catalog=hbcatalog_index_schema, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Get or create a Spark Session
    spark = init_sparksession(
        name="distribution", shuffle_partitions=2, log_level="ERROR")

    # Read the catalog file generated by raw2science
    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog) as f:
        catalog = json.load(f)

    # Define variables
    min_timestamp = 100  # set a default
    t_end = 1577836799  # some default value

    # get distribution offset
    min_timestamp = get_distribution_offset(
        args.checkpointpath_dist, args.startingOffset_dist)

    # Run distribution for (args.exit_after) seconds
    if args.exit_after is not None:
        t_end = time.time() + args.exit_after
        exit_after = True
    else:
        exit_after = False

    # Start the distribution service
    while(not exit_after or time.time() < t_end):
        """Keep scanning the HBase for new records in a loop
        """
        # Scan the HBase till current time
        max_timestamp = int(round(time.time() * 1000))  # time in ms

        # Read Hbase within timestamp range
        df = spark.read\
            .option("catalog", catalog)\
            .option("minStamp", min_timestamp)\
            .option("maxStamp", max_timestamp)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .load()

        # Filter out records that have been distributed
        df = df.filter("status!='distributed'")

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df, args.distribution_schema)

        # Publish Kafka topic(s) (Ensure that the topic(s) exist on the Kafka Server)
        df_kafka\
            .write\
            .format("kafka")\
            .option("kafka.bootstrap.servers", "localhost:9093")\
            .option("topic", "distribution_test")\
            .save()

        # Update the status column in Hbase
        update_status_in_hbase(
            df, args.science_db_name, "objectId",
            args.checkpointpath_dist, max_timestamp)

        # update min_timestamp for next iteration
        min_timestamp = max_timestamp

        # Wait for some time before another loop
        time.sleep(1)
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribute", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Read the catalog file generated by raw2science
    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog) as f:
        catalog = json.load(f)

    # Define variables
    min_timestamp = 100  # set a default
    t_end = 1577836799  # some default value

    # get distribution offset
    min_timestamp = get_distribution_offset(
        args.checkpointpath_dist, args.startingOffset_dist)

    # Get topic name to publish on
    topic = args.distribution_topic
    broker_list = args.distribution_servers

    # Run distribution for (args.exit_after) seconds
    if args.exit_after is not None:
        t_end = time.time() + args.exit_after
        exit_after = True
    else:
        exit_after = False

    # Start the distribution service
    while(not exit_after or time.time() < t_end):
        """Keep scanning the HBase for new records in a loop
        """
        # Scan the HBase till current time
        max_timestamp = int(round(time.time() * 1000))  # time in ms

        # Read Hbase within timestamp range
        df = spark.read\
            .option("catalog", catalog)\
            .option("minStamp", min_timestamp)\
            .option("maxStamp", max_timestamp)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .load()

        # Keep records that haven't been distributed
        df = df.filter("status!='distributed'")

        # Send out slack alerts
        api_token = get_api_token()
        if api_token:
            slack_cols = [
                "objectId", "candidate_ra", "candidate_dec",
                "cross_match_alerts_per_batch"]
            send_slack_alerts(df.select(slack_cols), args.slack_channels)

        # Apply additional filters (user defined xml)
        if args.distribution_rules_xml:
            df = filter_df_using_xml(df, args.distribution_rules_xml)

        # create a nested dataframe similar to the original ztf dataframe
        df_nested = group_df_into_struct(df, "candidate", "objectId")
        df_nested = group_df_into_struct(df_nested, "prv_candidates", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutTemplate", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutScience", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutDifference", "objectId")

        # Apply level two filters
        df_nested = apply_user_defined_filters(df_nested, filter_leveltwo_names)

        # Persist df to memory to materialize changes
        df_nested.persist()

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_nested, args.distribution_schema)

        # Ensure that the topic(s) exist on the Kafka Server
        df_kafka\
            .write\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topic)\
            .save()

        # Update the status in Hbase and commit checkpoint to file
        update_status_in_hbase(
            df, args.science_db_name, "objectId",
            args.checkpointpath_dist, max_timestamp)

        # update min_timestamp for next iteration
        min_timestamp = max_timestamp

        # free the memory
        df_nested.unpersist()

        # Wait for some time before another loop
        time.sleep(1)
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="index_archival_{}_{}".format(args.index_table, args.night),
        shuffle_partitions=2
    )

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix,
        args.night[:4],
        args.night[4:6],
        args.night[6:8]
    )
    df = load_parquet_files(path)

    # construct the index view
    index_row_key_name = args.index_table
    columns = index_row_key_name.split('_')
    names = [col(i) for i in columns]
    index_name = '.' + columns[0]

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Load column names to use in the science portal
    cols_i, cols_d, cols_b = load_science_portal_column_names()

    # Assign each column to a specific column family
    cf = assign_column_family_names(df, cols_i, cols_d, cols_b)

    # Restrict the input DataFrame to the subset of wanted columns.
    if 'upper' in args.index_table:
        df = df.select(
            'objectId',
            'prv_candidates.jd',
            'prv_candidates.fid',
            'prv_candidates.magpsf',
            'prv_candidates.sigmapsf',
            'prv_candidates.diffmaglim'
        )
    else:
        df = df.select(cols_i + cols_d + cols_b)

    # Create and attach the rowkey
    df, _ = attach_rowkey(df)

    common_cols = [
        'objectId', 'candid', 'publisher', 'rcid', 'chipsf', 'distnr',
        'ra', 'dec', 'jd', 'fid', 'nid', 'field', 'xpos', 'ypos', 'rb',
        'ssdistnr', 'ssmagnr', 'ssnamenr', 'jdstarthist', 'jdendhist', 'tooflag',
        'sgscore1', 'distpsnr1', 'neargaia', 'maggaia', 'nmtchps', 'diffmaglim',
        'magpsf', 'sigmapsf', 'magnr', 'sigmagnr', 'magzpsci', 'isdiffpos',
        'cdsxmatch', 'roid', 'mulens',
        'snn_snia_vs_nonia', 'snn_sn_vs_all', 'rf_snia_vs_nonia',
        'classtar', 'drb', 'ndethist', 'rf_kn_vs_nonkn', 'tracklet'
    ]

    if columns[0].startswith('pixel'):
        nside = int(columns[0].split('pixel')[1])

        df_index = df.withColumn(
            columns[0],
            ang2pix(df['ra'], df['dec'], lit(nside))
        ).select(
            [concat_ws('_', *names).alias(index_row_key_name)] + ['objectId']
        )
    elif columns[0] == 'class':
        df_index = df.withColumn(
            'class',
            extract_fink_classification(
                df['cdsxmatch'], df['roid'], df['mulens'],
                df['snn_snia_vs_nonia'], df['snn_sn_vs_all'],
                df['rf_snia_vs_nonia'], df['ndethist'], df['drb'],
                df['classtar'], df['jd'], df['jdstarthist'],
                df['rf_kn_vs_nonkn'], df['tracklet']
            )
        ).select(
            [concat_ws('_', *names).alias(index_row_key_name)] + common_cols
        )
    elif columns[0] == 'ssnamenr':
        # Flag only objects with likely counterpart in MPC
        df_index = df\
            .filter(df['roid'] == 3)\
            .select(
                [concat_ws('_', *names).alias(index_row_key_name)] + common_cols
            )
    elif columns[0] == 'tracklet':
        # For data < 2021-08-10, no tracklet means ''
        # For data >= 2021-08-10, no tracklet means 'null'
        df_index = df\
            .filter(df['tracklet'] != 'null')\
            .filter(df['tracklet'] != '')\
            .select(
                [concat_ws('_', *names).alias(index_row_key_name)] + common_cols
            )
    elif columns[0] == 'upper':
        # This case is the same as the main table
        # but we keep only upper limit measurements.
        index_row_key_name = 'objectId_jd'

        # explode
        df_ex = df.withColumn(
            "tmp",
            arrays_zip("magpsf", "sigmapsf", "diffmaglim", "jd", "fid")
        ).withColumn("tmp", explode("tmp")).select(
            concat_ws('_', 'objectId', 'tmp.jd').alias(index_row_key_name),
            "objectId",
            col("tmp.jd"),
            col("tmp.fid"),
            col("tmp.magpsf"),
            col("tmp.sigmapsf"),
            col("tmp.diffmaglim")
        )

        # take only upper limits
        df_index = df_ex.filter(~df_ex['magpsf'].isNotNull())

        # drop NaN columns
        df_index = df_index.drop(*['magpsf', 'sigmapsf'])
    elif columns[0] == 'uppervalid':
        # This case is the same as the main table
        # but we keep only valid measurements from the history.
        index_row_key_name = 'objectId_jd'

        # explode
        df_ex = df.withColumn(
            "tmp",
            arrays_zip("magpsf", "sigmapsf", "diffmaglim", "jd", "fid")
        ).withColumn("tmp", explode("tmp")).select(
            concat_ws('_', 'objectId', 'tmp.jd').alias(index_row_key_name),
            "objectId",
            col("tmp.jd"),
            col("tmp.fid"),
            col("tmp.magpsf"),
            col("tmp.sigmapsf"),
            col("tmp.diffmaglim")
        )

        # take only valid measurements from the history
        df_index = df_ex.filter(df_ex['magpsf'].isNotNull())
    elif columns[0] == 'tns':
        with open('{}/tns_marker.txt'.format(args.tns_folder)) as f:
            tns_marker = f.read().replace('\n', '')

        pdf_tns = download_catalog(os.environ['TNS_API_KEY'], tns_marker)

        # Filter TNS confirmed data
        f1 = ~pdf_tns['type'].isna()
        pdf_tns_filt = pdf_tns[f1]

        pdf_tns_filt_b = spark.sparkContext.broadcast(pdf_tns_filt)

        @pandas_udf(StringType(), PandasUDFType.SCALAR)
        def crossmatch_with_tns(objectid, ra, dec):
            # TNS
            pdf = pdf_tns_filt_b.value
            ra2, dec2, type2 = pdf['ra'], pdf['declination'], pdf['type']

            # create catalogs
            catalog_ztf = SkyCoord(
                ra=np.array(ra, dtype=np.float) * u.degree,
                dec=np.array(dec, dtype=np.float) * u.degree
            )
            catalog_tns = SkyCoord(
                ra=np.array(ra2, dtype=np.float) * u.degree,
                dec=np.array(dec2, dtype=np.float) * u.degree
            )

            # cross-match
            idx, d2d, d3d = catalog_tns.match_to_catalog_sky(catalog_ztf)

            sub_pdf = pd.DataFrame({
                'objectId': objectid.values[idx],
                'ra': ra.values[idx],
                'dec': dec.values[idx],
            })

            # cross-match
            idx2, d2d2, d3d2 = catalog_ztf.match_to_catalog_sky(catalog_tns)

            # set separation length
            sep_constraint2 = d2d2.degree < 1.5 / 3600

            sub_pdf['TNS'] = [''] * len(sub_pdf)
            sub_pdf['TNS'][idx2[sep_constraint2]] = type2.values[idx2[sep_constraint2]]

            to_return = objectid.apply(
                lambda x: '' if x not in sub_pdf['objectId'].values
                else sub_pdf['TNS'][sub_pdf['objectId'] == x].values[0]
            )

            return to_return

        df = df.withColumn(
            'tns',
            crossmatch_with_tns(df['objectId'], df['ra'], df['dec'])
        ).select(
            [concat_ws('_', *names).alias(index_row_key_name)] + common_cols + ['tns']
        ).cache()
        df_index = df.filter(df['tns'] != '').drop('tns')

        # trigger the cache - note the cache might be a killer for LSST...
        n = df_index.count()
        print('TNS objects: {}'.format(n))
    else:
        df_index = df.select(
            [concat_ws('_', *names).alias(index_row_key_name)] + common_cols
        )

    # construct the time catalog
    hbcatalog_index = construct_hbase_catalog_from_flatten_schema(
        df_index.schema,
        args.science_db_name + index_name,
        rowkeyname=index_row_key_name,
        cf=cf
    )

    # Push index table
    df_index.write\
        .options(catalog=hbcatalog_index, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()

    # Construct the schema row - inplace replacement
    schema_row_key_name = 'schema_version'
    df_index = df_index.withColumnRenamed(
        index_row_key_name,
        schema_row_key_name
    )
    df_index_schema = construct_schema_row(
        df_index,
        rowkeyname=schema_row_key_name,
        version='schema_{}_{}'.format(fbvsn, fsvsn))

    # construct the hbase catalog for the schema
    hbcatalog_index_schema = construct_hbase_catalog_from_flatten_schema(
        df_index_schema.schema,
        args.science_db_name + index_name,
        rowkeyname=schema_row_key_name,
        cf=cf)

    # Push the data using the shc connector
    df_index_schema.write\
        .options(catalog=hbcatalog_index_schema, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()
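# Illustration of the pixel-based rowkey built in the `pixel` branch above.
# This assumes the `ang2pix` UDF wraps healpy's ang2pix (that is not shown in
# this snippet); the nside, coordinates and objectId below are placeholders.
#
#     import healpy as hp
#
#     nside = 128                                            # e.g. index table 'pixel128_jd'
#     pixel = hp.ang2pix(nside, 10.5, -20.2, lonlat=True)    # ra, dec in degrees
#     rowkey = '{}_{}'.format(pixel, 2459000.5)              # pixel + jd, joined with '_'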
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Not very satisfactory... The problem is that latestfirst = False is
    # required to create a new HBase table (i.e. all the time in the CI).
    # If you have a better idea, let me know!
    if "travis" in args.science_db_name:
        latestfirst = False
    else:
        latestfirst = True

    df = connect_to_raw_database(
        args.rawdatapath, args.rawdatapath + "/*", latestfirst)

    # Apply level one filters
    logger.info(filter_levelone_names)
    df = apply_user_defined_filters(df, filter_levelone_names)

    # Apply level one processors
    logger.info(processor_levelone_names)
    df = apply_user_defined_processors(df, processor_levelone_names)

    # Select alert data + timestamp + added value from processors
    new_colnames = ["decoded.*", "cast(timestamp as string) as timestamp"]
    for i in processor_levelone_names:
        new_colnames.append(i)

    df = df.selectExpr(new_colnames)

    df_hbase = flattenstruct(df, "candidate")
    df_hbase = flattenstruct(df_hbase, "cutoutScience")
    df_hbase = flattenstruct(df_hbase, "cutoutTemplate")
    df_hbase = flattenstruct(df_hbase, "cutoutDifference")
    df_hbase = explodearrayofstruct(df_hbase, "prv_candidates")

    # Create a status column for distribution
    df_hbase = df_hbase.withColumn("status", lit("dbUpdate"))

    # Save the catalog on disk for later usage
    catalog = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema, args.science_db_name, "objectId")

    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog, 'w') as json_file:
        json.dump(catalog, json_file)

    def write_to_hbase_and_monitor(
            df: DataFrame, epochid: int, hbcatalog: str):
        """Write data into HBase.

        The purpose of this function is to write data to HBase using
        Structured Streaming tools such as foreachBatch.

        Parameters
        ----------
        df : DataFrame
            Input micro-batch DataFrame.
        epochid : int
            ID of the micro-batch
        hbcatalog : str
            HBase catalog describing the data
        """
        # If the table does not exist, one needs to specify
        # the number of zones to use (must be greater than 3).
        # TODO: remove this hardcoded parameter.
        df.write\
            .options(catalog=hbcatalog, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

    # Query to push data into HBase
    countquery = df_hbase\
        .writeStream\
        .outputMode("append")\
        .option("checkpointLocation", args.checkpointpath_sci)\
        .foreachBatch(lambda x, y: write_to_hbase_and_monitor(x, y, catalog))\
        .start()

    # Query to group objects by type according to SIMBAD
    # Do it every 30 seconds
    groupedquery_started = False
    if "cross_match_alerts_per_batch" in processor_levelone_names:
        df_group = df.groupBy("cross_match_alerts_per_batch").count()
        groupquery = df_group\
            .writeStream\
            .outputMode("complete") \
            .foreachBatch(write_to_csv)\
            .trigger(processingTime='30 seconds')\
            .start()
        groupedquery_started = True

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        if groupedquery_started:
            groupquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    df = connect_to_raw_database(
        args.rawdatapath, args.rawdatapath + "/*", latestfirst=False)

    # Apply level one filters
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply level one processor: cdsxmatch
    logger.info("New processor: cdsxmatch")
    colnames = [df['objectId'], df['candidate.ra'], df['candidate.dec']]
    df = df.withColumn(cdsxmatch.__name__, cdsxmatch(*colnames))

    # Apply level one processor: rfscore
    logger.info("New processor: rfscore")

    # Required alert columns
    what = [
        'jd', 'fid', 'magpsf', 'sigmapsf',
        'magnr', 'sigmagnr', 'magzpsci', 'isdiffpos'
    ]

    # Use for creating temp name
    prefix = 'c'
    what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    for colname in what:
        df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + classification.
    # Note we can omit the model_path argument, and in that case the
    # default model `data/models/default-model.obj` will be used.
    rfscore_args = [F.col(i) for i in what_prefix]
    df = df.withColumn(rfscore.__name__, rfscore(*rfscore_args))

    # Apply level one processor: microlensing
    logger.info("New processor: microlensing")

    # Retrieve schema
    schema = load_mulens_schema_twobands()

    # Create standard UDF
    mulens_udf = F.udf(mulens, schema)

    # Required alert columns - already computed for SN
    what_prefix_mulens = [
        'cfid', 'cmagpsf', 'csigmapsf',
        'cmagnr', 'csigmagnr', 'cmagzpsci', 'cisdiffpos'
    ]

    mulens_args = [F.col(i) for i in what_prefix_mulens]
    df = df.withColumn('mulens', mulens_udf(*mulens_args))

    # Drop temp columns
    df = df.drop(*what_prefix)

    # Partition the data hourly
    df_partitionedby = df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .withColumn("hour", F.date_format("timestamp", "HH"))

    # Append new rows in the tmp science database
    countquery = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_sci_tmp) \
        .option("path", args.scitmpdatapath)\
        .partitionBy("year", "month", "day", "hour") \
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_to_kafka(
        servers=args.servers, topic=args.topic,
        startingoffsets=args.startingoffsets_stream, failondataloss=False)

    # Get Schema of alerts
    _, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, data)
    df_decoded = df.select([
        "timestamp",
        "topic",
        from_avro(df["value"], alert_schema_json).alias("decoded")
    ])

    # Flatten the data columns to match the incoming alert data schema
    cnames = df_decoded.columns
    cnames[cnames.index('decoded')] = 'decoded.*'
    df_decoded = df_decoded.selectExpr(cnames)

    # Partition the data hourly
    df_partitionedby = df_decoded\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))\
        .withColumn("hour", date_format("timestamp", "HH"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_raw) \
        .option("path", args.rawdatapath)\
        .partitionBy("topic", "year", "month", "day", "hour")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
    else:
        countquery = countquery_tmp.start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the stream2raw service normally...")
    else:
        countquery.awaitTermination()
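# Read-back sketch for the hive-style partitioned parquet written above.
# The load path and night are placeholders; assuming default partition column
# type inference, filtering on year/month/day/hour prunes the directories
# Spark has to scan rather than reading the whole archive.
#
#     from pyspark.sql import SparkSession
#
#     spark = SparkSession.builder.getOrCreate()
#     df = spark.read.format("parquet").load("/path/to/rawdatapath")  # placeholder
#     one_hour = df.filter("year = 2020 AND month = 1 AND day = 1 AND hour = 3")
#     print(one_hour.count())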
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="raw2science_{}".format(args.night), shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # data path
    rawdatapath = args.online_data_prefix + '/raw'
    scitmpdatapath = args.online_data_prefix + '/science'
    checkpointpath_sci_tmp = args.online_data_prefix + '/science_checkpoint'

    df = connect_to_raw_database(
        rawdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        rawdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        latestfirst=False)

    # Apply quality cuts
    logger.info("Applying quality cuts")
    df = df\
        .filter(df['candidate.nbad'] == 0)\
        .filter(df['candidate.rb'] >= 0.55)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns if needed.
    if 'timestamp' not in df.columns:
        df = df\
            .withColumn("timestamp", jd_to_datetime(df['candidate.jd']))

    if "year" not in df.columns:
        df = df\
            .withColumn("year", F.date_format("timestamp", "yyyy"))
    if "month" not in df.columns:
        df = df\
            .withColumn("month", F.date_format("timestamp", "MM"))
    if "day" not in df.columns:
        df = df\
            .withColumn("day", F.date_format("timestamp", "dd"))

    # Append new rows in the tmp science database
    countquery = df\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", checkpointpath_sci_tmp) \
        .option("path", scitmpdatapath)\
        .partitionBy("year", "month", "day") \
        .trigger(processingTime='{} seconds'.format(args.tinterval)) \
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="science_archival_{}".format(args.night), shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix,
        args.night[:4],
        args.night[4:6],
        args.night[6:8])
    df = load_parquet_files(path)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Load column names to use in the science portal
    cols_i, cols_d, cols_b = load_science_portal_column_names()

    # Assign each column to a specific column family
    cf = assign_column_family_names(df, cols_i, cols_d, cols_b)

    # Restrict the input DataFrame to the subset of wanted columns.
    df = df.select(cols_i + cols_d + cols_b)

    # Create and attach the rowkey
    df, row_key_name = attach_rowkey(df)

    # construct the hbase catalog
    hbcatalog = construct_hbase_catalog_from_flatten_schema(
        df.schema, args.science_db_name, rowkeyname=row_key_name, cf=cf)

    # Save the catalog on disk (local)
    with open(args.science_db_catalog, 'w') as json_file:
        json.dump(hbcatalog, json_file)

    if args.save_science_db_catalog_only:
        # Print for visual inspection
        print(hbcatalog)
    else:
        # Push the data using the shc connector
        df.write\
            .options(catalog=hbcatalog, newtable=50)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

        # Construct the schema row - inplace replacement
        schema_row_key_name = 'schema_version'
        df = df.withColumnRenamed(row_key_name, schema_row_key_name)

        df_schema = construct_schema_row(
            df,
            rowkeyname=schema_row_key_name,
            version='schema_{}_{}'.format(fbvsn, fsvsn))

        # construct the hbase catalog for the schema
        hbcatalog_schema = construct_hbase_catalog_from_flatten_schema(
            df_schema.schema,
            args.science_db_name,
            rowkeyname=schema_row_key_name,
            cf=cf)

        # Save the catalog on disk (local)
        catname = args.science_db_catalog.replace('.json', '_schema_row.json')
        with open(catname, 'w') as json_file:
            json.dump(hbcatalog_schema, json_file)

        # Push the data using the shc connector
        df_schema.write\
            .options(catalog=hbcatalog_schema, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="mergeAndClean_{}".format(args.night))

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    input_raw = '{}/raw/year={}/month={}/day={}'.format(
        args.online_data_prefix, year, month, day)
    input_science = '{}/science/year={}/month={}/day={}'.format(
        args.online_data_prefix, year, month, day)

    # basepath
    output_raw = '{}/raw/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)
    output_science = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)

    print('Raw data processing....')

    df_raw = spark.read.format('parquet').load(input_raw)
    print('Num partitions before: ', df_raw.rdd.getNumPartitions())
    print('Num partitions after : ', numPart(df_raw))

    df_raw.withColumn('timestamp', jd_to_datetime(df_raw['candidate.jd']))\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .coalesce(numPart(df_raw))\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_raw)

    print('Science data processing....')

    df_science = spark.read.format('parquet').load(input_science)
    npart_after = int(numPart(df_science))
    print('Num partitions before: ', df_science.rdd.getNumPartitions())
    print('Num partitions after : ', npart_after)

    # Add tracklet information before merging
    df_trck = add_tracklet_information(df_science)

    # join back information to the initial dataframe
    df_science = df_science\
        .join(
            F.broadcast(df_trck.select(['candid', 'tracklet'])),
            on='candid',
            how='outer'
        )

    df_science.withColumn('timestamp', jd_to_datetime(df_science['candidate.jd']))\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .coalesce(npart_after)\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    kerberos = 'public2.alerts.ztf' in args.servers
    df = connect_to_kafka(
        servers=args.servers, topic=args.topic,
        startingoffsets=args.startingoffsets_stream,
        failondataloss=False, kerberos=kerberos)

    # Get Schema of alerts
    alert_schema, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, data)
    if '134.158.' in args.servers or 'localhost' in args.servers:
        # using custom from_avro (not available for Spark 2.4.x)
        # it will be available from Spark 3.0 though
        df_decoded = df.select(
            [from_avro(df["value"], alert_schema_json).alias("decoded")])
    elif 'public2.alerts.ztf' in args.servers:
        # Decode on-the-fly using fastavro
        f = udf(lambda x: fastavro.reader(io.BytesIO(x)).next(), alert_schema)
        df_decoded = df.select([f(df['value']).alias("decoded")])
    else:
        msg = "Data source {} is not known - a decoder must be set".format(
            args.servers)
        logger.warn(msg)
        spark.stop()

    # Flatten the data columns to match the incoming alert data schema
    cnames = df_decoded.columns
    cnames[cnames.index('decoded')] = 'decoded.*'
    df_decoded = df_decoded.selectExpr(cnames)

    # Partition the data hourly
    df_partitionedby = df_decoded\
        .withColumn("timestamp", jd_to_datetime(df_decoded['candidate.jd']))\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_raw) \
        .option("path", args.rawdatapath)\
        .partitionBy("year", "month", "day")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
    else:
        countquery = countquery_tmp.start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the stream2raw service normally...")
    else:
        countquery.awaitTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="distribute_{}".format(args.night), shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # data path
    scitmpdatapath = args.online_data_prefix + '/science'
    checkpointpath_kafka = args.online_data_prefix + '/kafka_checkpoint'

    # Connect to the TMP science database
    df = connect_to_raw_database(
        scitmpdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        scitmpdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        latestfirst=False)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index(
        'timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index(
        'cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index(
        'cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index(
        'cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index(
        'prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    # Retrieve time-series information
    to_expand = [
        'jd', 'fid', 'magpsf', 'sigmapsf',
        'magnr', 'sigmagnr', 'magzpsci', 'isdiffpos'
    ]

    # Append temp columns with historical + current measurements
    prefix = 'c'
    for colname in to_expand:
        df = concat_col(df, colname, prefix=prefix)

    # quick fix for https://github.com/astrolabsoftware/fink-broker/issues/457
    for colname in to_expand:
        df = df.withColumnRenamed('c' + colname, 'c' + colname + 'c')

    broker_list = args.distribution_servers
    for userfilter in userfilters:
        # The topic name is the filter name
        topicname = args.substream_prefix + userfilter.split('.')[-1] + '_ztf'

        # Apply user-defined filter
        df_tmp = apply_user_defined_filter(df, userfilter)

        # Wrap alert data
        df_tmp = df_tmp.selectExpr(cnames)

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_tmp, '')

        # Ensure that the topic(s) exist on the Kafka Server
        disquery = df_kafka\
            .writeStream\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topicname)\
            .option("checkpointLocation", checkpointpath_kafka + topicname)\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        disquery.stop()
        logger.info("Exiting the distribute service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Grab the running Spark Session,
    # otherwise create it.
    spark = init_sparksession(
        name="buildSciDB", shuffle_partitions=2, log_level="ERROR")

    # FIXME!
    if "travis" in args.science_db_name:
        latestfirst = False
    else:
        latestfirst = True

    df = connect_to_raw_database(
        args.rawdatapath, args.rawdatapath + "/*", latestfirst)

    # Apply filters and keep only good alerts
    df_filt = df.withColumn(
        "toKeep",
        keep_alert_based_on(
            col("decoded.candidate.nbad"),
            col("decoded.candidate.rb"),
            col("decoded.candidate.magdiff")
        )
    ).filter("toKeep == true")

    # for good alerts, perform a cross-match with SIMBAD,
    # and return the types of the objects (Star, AGN, Unknown, etc.)
    df_type = df_filt.withColumn(
        "simbadType",
        cross_match_alerts_per_batch(
            col("decoded.objectId"),
            col("decoded.candidate.ra"),
            col("decoded.candidate.dec")
        )
    ).selectExpr(
        "decoded.*", "cast(timestamp as string) as timestamp", "simbadType")

    df_hbase = flattenstruct(df_type, "candidate")
    df_hbase = flattenstruct(df_hbase, "cutoutScience")
    df_hbase = flattenstruct(df_hbase, "cutoutTemplate")
    df_hbase = flattenstruct(df_hbase, "cutoutDifference")
    df_hbase = explodearrayofstruct(df_hbase, "prv_candidates")

    # Create a status column for distribution
    df_hbase = df_hbase.withColumn("status", lit("dbUpdate"))

    # Save the catalog on disk for later usage
    catalog = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema, args.science_db_name, "objectId")

    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog, 'w') as json_file:
        json.dump(catalog, json_file)

    def write_to_hbase_and_monitor(
            df: DataFrame, epochid: int, hbcatalog: str):
        """Write data into HBase.

        The purpose of this function is to write data to HBase using
        Structured Streaming tools such as foreachBatch.

        Parameters
        ----------
        df : DataFrame
            Input micro-batch DataFrame.
        epochid : int
            ID of the micro-batch
        hbcatalog : str
            HBase catalog describing the data
        """
        # If the table does not exist, one needs to specify
        # the number of zones to use (must be greater than 3).
        # TODO: remove this hardcoded parameter.
        df.write\
            .options(catalog=hbcatalog, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

    # Query to push data into HBase
    countquery = df_hbase\
        .writeStream\
        .outputMode("append")\
        .option("checkpointLocation", args.checkpointpath_sci)\
        .foreachBatch(lambda x, y: write_to_hbase_and_monitor(x, y, catalog))\
        .start()

    # Query to group objects by type according to SIMBAD
    # Do it every 30 seconds
    df_group = df_type.groupBy("simbadType").count()
    groupquery = df_group\
        .writeStream\
        .outputMode("complete") \
        .foreachBatch(write_to_csv)\
        .trigger(processingTime='30 seconds')\
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        groupquery.stop()
        print("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()