def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    init_sparksession(name="monitoringStream",
                      shuffle_partitions=2,
                      log_level="ERROR")

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_with_kafka(servers=args.servers,
                            topic=args.topic,
                            startingoffsets=args.startingoffsets,
                            failondataloss=False)

    # Trigger the streaming computation,
    # by defining the sink (console here) and starting it
    countquery = df \
        .writeStream \
        .queryName("qraw")\
        .format("console")\
        .outputMode("update") \
        .start()

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(countquery, 2, colnames, args.finkwebpath)

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        print("Exiting the monitoring service normally...")
    else:
        countquery.awaitTermination()
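
monitor_progress_webui is a fink-broker helper that is not shown in these snippets. As a rough illustration only, the core of such a monitor could poll StreamingQuery.lastProgress and append the selected metrics to a CSV for the web UI; the real helper returns immediately (presumably running in a background thread), so the blocking loop below is just a sketch:

import csv
import os
import time

def monitor_progress_sketch(query, tinterval, colnames, webpath):
    """Poll a Spark StreamingQuery and append selected progress metrics to a CSV."""
    csvfile = os.path.join(webpath, "live.csv")
    while query.isActive:
        progress = query.lastProgress  # dict, or None before the first micro-batch
        if progress is not None:
            write_header = not os.path.exists(csvfile)
            with open(csvfile, "a", newline="") as f:
                writer = csv.writer(f)
                if write_header:
                    writer.writerow(colnames)
                writer.writerow([progress.get(c) for c in colnames])
        time.sleep(tinterval)
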
Example #2
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Grab the running Spark Session,
    # otherwise create it.
    spark = init_sparksession(name="readingScienceDB", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    with open(args.science_db_catalog) as f:
        catalog = json.load(f)

    catalog_dic = json.loads(catalog)

    df = spark.read.option("catalog", catalog)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .load()

    print("Number of entries in {}: ".format(catalog_dic["table"]["name"]),
          df.count())

    print(
        "Number of distinct objects in {}: ".format(
            catalog_dic["table"]["name"]),
        df.select('objectId').distinct().count())
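
The catalog passed to the shc (Spark-HBase connector) reader above is generated by raw2science (see Example #19 below) and stored as a JSON string. For reference, an shc catalog has roughly the following shape; the table name, column names and column families here are purely illustrative:

import json

# Illustrative shc catalog: maps DataFrame columns to HBase column families / qualifiers
catalog_example = json.dumps({
    "table": {"namespace": "default", "name": "test_science_db"},
    "rowkey": "key",
    "columns": {
        "objectId": {"cf": "rowkey", "col": "key", "type": "string"},
        "candidate_ra": {"cf": "d", "col": "candidate_ra", "type": "double"},
        "candidate_dec": {"cf": "d", "col": "candidate_dec", "type": "double"}
    }
})
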
Example #3
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    df = connect_to_raw_database(args.rawdatapath,
                                 args.rawdatapath + "/*",
                                 latestfirst=False)

    # Apply quality cuts
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns.
    # Partitioned data doesn't preserve type information (cast as int...)
    df_partitionedby = df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))

    # Append new rows in the tmp science database
    countquery = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_sci_tmp) \
        .option("path", args.scitmpdatapath)\
        .partitionBy("year", "month", "day") \
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
Example #4
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="raw2science_{}".format(args.night),
        shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    input_raw = 'ztf_alerts/raw/year={}/month={}/day={}'.format(
        year, month, day)

    # basepath
    output_science = 'ztf_alerts/science_reprocessed'

    df = spark.read.format('parquet').load(input_raw)

    # Apply level one filters
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns.
    # Partitioned data doesn't preserve type information (cast as int...)
    df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)
Example #5
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="save_schema_{}".format(args.night), shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    input_science = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)
    df = load_parquet_files(input_science)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index('timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index('cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index('cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index('cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index('prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    df_kafka = df.selectExpr(cnames)

    path_for_avro = 'new_schema_{}.avro'.format(time())
    df_kafka.limit(1).write.format("avro").save(path_for_avro)

    # retrieve data on local disk
    subprocess.run(["hdfs", "dfs", '-get', path_for_avro])

    # Read the Avro schema from the .avro file
    avro_file = glob.glob(path_for_avro + "/part*")[0]
    avro_schema = readschemafromavrofile(avro_file)

    # Write the schema to a file for decoding Kafka messages
    with open('schemas/{}'.format(path_for_avro.replace('.avro', '.avsc')), 'w') as f:
        json.dump(avro_schema, f, indent=2)
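
readschemafromavrofile is another fink helper not reproduced here. A minimal equivalent using fastavro, assuming only the writer schema embedded in the Avro container header is needed (the actual helper may be implemented differently):

from fastavro import reader

def read_schema_from_avro_sketch(avro_file):
    """Return the writer schema embedded in an Avro container file, as a dict."""
    with open(avro_file, 'rb') as fo:
        return reader(fo).writer_schema
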
Example #6
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribution_test", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Topic to read from
    topic = args.distribution_topic
    broker_list = args.distribution_servers

    # Read from the Kafka topic
    df_kafka = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", broker_list) \
        .option("kafka.security.protocol", "SASL_PLAINTEXT")\
        .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
        .option("subscribe", topic) \
        .load()

    # Decode df_kafka into a Spark DataFrame with StructType column
    df = decode_kafka_df(df_kafka, args.distribution_schema)

    # Print received stream to the console
    df = df.select("struct.*")

    print("\nReading Fink OutStream\n")
    debug_query = df.writeStream\
        .format("console")\
        .trigger(processingTime='2 seconds')\
        .start()

    # Keep the Streaming running for some time
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        debug_query.stop()
        logger.info("Exiting distribution_test service normally...")
    else:
        debug_query.awaitTermination()
Example #7
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="checkstream", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_to_kafka(servers=args.servers,
                          topic=args.topic,
                          startingoffsets=args.startingoffsets_stream,
                          failondataloss=False)

    # Trigger the streaming computation,
    # by defining the sink (console here) and starting it
    countquery = df \
        .writeStream \
        .queryName("qraw")\
        .format("console")\
        .outputMode("update") \
        .start()

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(countquery, 2, colnames, args.finkwebpath,
                           "live_raw.csv", "live")

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the checkstream service normally...")
    else:
        countquery.awaitTermination()
Example #8
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Get or create a Spark Session
    spark = init_sparksession(
        name="distribution_test", shuffle_partitions=2, log_level="ERROR")

    # Topic to read from
    topic = "distribution_test"

    # Read from the Kafka topic
    df_kafka = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9093") \
        .option("subscribe", topic) \
        .load()

    # Decode df_kafka into a Spark DataFrame with StructType column
    df = decode_kafka_df(df_kafka, args.distribution_schema)

    # Print to console
    cols = ["objectId", "candid", "candidate_ra", "candidate_dec", "simbadType"]
    cols = ["struct." + c for c in cols]
    df = df.select(cols)

    print("\nReading Fink OutStream\n")
    debug_query = df.writeStream\
                    .format("console")\
                    .trigger(processingTime='2 seconds')\
                    .start()

    # Keep the Streaming running for some time
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        debug_query.stop()
    else:
        debug_query.awaitTermination()
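
decode_kafka_df is defined in fink-broker and not shown here. A plausible sketch, assuming the Kafka value field carries Avro-serialized alerts and args.distribution_schema points to a JSON (.avsc) schema file, which is consistent with the struct.* selections made above (illustrative only, using Spark's built-in Avro functions):

from pyspark.sql.avro.functions import from_avro

def decode_kafka_df_sketch(df_kafka, schema_path):
    """Deserialize the Avro-encoded 'value' column of a Kafka DataFrame into a struct column."""
    with open(schema_path) as f:
        avro_schema_json = f.read()
    return df_kafka.select(
        from_avro(df_kafka["value"], avro_schema_json).alias("struct")
    )
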
Example #9
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="mergeAndClean_{}".format(args.night))

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    input_raw = '{}/year={}/month={}/day={}'.format(args.rawdatapath, year,
                                                    month, day)
    input_science = '{}/year={}/month={}/day={}'.format(
        args.scitmpdatapath, year, month, day)

    # basepath
    output_raw = 'ztf_alerts/raw'
    output_science = 'ztf_alerts/science'

    print('Raw data processing....')
    df_raw = spark.read.format('parquet').load(input_raw)
    print('Num partitions before: ', df_raw.rdd.getNumPartitions())
    print('Num partitions after : ', numPart(df_raw))

    df_raw.withColumn('timestamp', jd_to_datetime(df_raw['candidate.jd']))\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .coalesce(numPart(df_raw))\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_raw)

    print('Science data processing....')

    df_science = spark.read.format('parquet').load(input_science)
    print('Num partitions before: ', df_science.rdd.getNumPartitions())
    print('Num partitions after : ', numPart(df_science))

    df_science.withColumn('timestamp', jd_to_datetime(df_science['candidate.jd']))\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .coalesce(numPart(df_science))\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)

    # Remove temporary alert folder - beware you'll never get it back!
    if args.fs == 'hdfs':
        subprocess.run(["hdfs", "dfs", '-rm', '-rf', args.datapath])
    elif args.fs == 'local':
        subprocess.run(['rm', '-rf', args.datapath])
    else:
        print('Filesystem not understood. FS_KIND must be hdfs or local.')
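
numPart and jd_to_datetime used above are defined elsewhere in fink-broker. As an illustration of the latter, a minimal scalar pandas UDF converting Julian Date to a timestamp could look like this, assuming JD is expressed in days and using the fact that JD 2440587.5 corresponds to 1970-01-01 UTC (the actual implementation may rely on astropy instead):

import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import TimestampType

@pandas_udf(TimestampType(), PandasUDFType.SCALAR)
def jd_to_datetime_sketch(jd):
    """Convert Julian Date (in days) into UTC timestamps."""
    # JD 2440587.5 is the Unix epoch (1970-01-01T00:00:00 UTC)
    return pd.to_datetime((jd - 2440587.5) * 86400.0, unit='s')
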
Example #10
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science_{}".format(args.night),
                              shuffle_partitions=None)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    # data path
    input_raw = args.agg_data_prefix + '/raw/year={}/month={}/day={}'.format(
        year, month, day)

    # basepath
    output_science = args.agg_data_prefix + '/science'

    df = spark.read.format('parquet').load(input_raw)
    npart = df.rdd.getNumPartitions()

    # Apply level one filters
    logger.info(qualitycuts)
    df = df.filter(df['candidate.nbad'] == 0).filter(
        df['candidate.rb'] >= 0.55)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add tracklet information
    df_trck = spark.read.format('parquet').load(input_raw)
    df_trck = df_trck.filter(df_trck['candidate.nbad'] == 0).filter(
        df_trck['candidate.rb'] >= 0.55)
    df_trck = add_tracklet_information(df_trck)

    # join back information to the initial dataframe
    df = df\
        .join(
            F.broadcast(df_trck.select(['candid', 'tracklet'])),
            on='candid',
            how='outer'
        )

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns if needed.
    if 'timestamp' not in df.columns:
        df = df\
            .withColumn("timestamp", jd_to_datetime(df['candidate.jd']))

    if "year" not in df.columns:
        df = df\
            .withColumn("year", F.date_format("timestamp", "yyyy"))

    if "month" not in df.columns:
        df = df\
            .withColumn("month", F.date_format("timestamp", "MM"))

    if "day" not in df.columns:
        df = df\
            .withColumn("day", F.date_format("timestamp", "dd"))

    df.coalesce(npart).write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)
Example #11
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="TNS_report_{}".format(args.night),
                              shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(args.agg_data_prefix,
                                                       args.night[:4],
                                                       args.night[4:6],
                                                       args.night[6:8])
    df = load_parquet_files(path)

    with open('{}/tns_marker.txt'.format(args.tns_folder)) as f:
        tns_marker = f.read().replace('\n', '')

    if not args.tns_sandbox:
        print("WARNING: submitting to real (not sandbox) TNS website")

    if args.tns_sandbox:
        url_tns_api = "https://sandbox.wis-tns.org/api"
        with open('{}/sandbox-tns_api.key'.format(args.tns_folder)) as f:
            # remove line break...
            key = f.read().replace('\n', '')
    else:
        url_tns_api = "https://www.wis-tns.org/api"
        with open('{}/tns_api.key'.format(args.tns_folder)) as f:
            # remove line break...
            key = f.read().replace('\n', '')

    cols = [
        'cdsxmatch', 'roid', 'mulens', 'snn_snia_vs_nonia', 'snn_sn_vs_all',
        'rf_snia_vs_nonia', 'candidate.ndethist', 'candidate.drb',
        'candidate.classtar', 'candidate.jd', 'candidate.jdstarthist',
        'rf_kn_vs_nonkn', 'tracklet'
    ]
    df = df.withColumn('class', extract_fink_classification(*cols))

    pdf = df\
        .filter(df['class'] == 'Early SN candidate')\
        .filter(df['candidate.ndethist'] <= 20)\
        .toPandas()

    pdf_unique = pdf.groupby('objectId')[pdf.columns].min()
    print("{} new alerts".format(len(pdf)))
    print("{} new sources".format(len(pdf_unique)))
    pdf = pdf_unique

    ids = []
    report = {"at_report": {}}
    check_tns = False
    for index, row in enumerate(pdf.iterrows()):
        alert = row[1]
        past_ids = read_past_ids(args.tns_folder)
        if alert['objectId'] in past_ids.values:
            print('{} already sent!'.format(alert['objectId']))
            continue
        if check_tns:
            groupid = retrieve_groupid(key, tns_marker, alert['objectId'])
            if groupid > 0:
                print("{} already reported by {}".format(
                    alert['objectId'], groupid))
            else:
                print('New report for object {}'.format(alert['objectId']))
        photometry, non_detection = extract_discovery_photometry(alert)
        report['at_report']["{}".format(index)] = build_report(
            alert, photometry, non_detection)
        ids.append(alert['objectId'])
    print('new objects: ', ids)

    if len(ids) != 0:
        json_report = save_logs_and_return_json_report(name='{}{}{}'.format(
            args.night[:4], args.night[4:6], args.night[6:8]),
                                                       folder=args.tns_folder,
                                                       ids=ids,
                                                       report=report)
        r = send_json_report(key, url_tns_api, json_report, tns_marker)
        print(r.json())

        # post to slack
        slacktxt = ' \n '.join(
            ['https://fink-portal/{}'.format(i) for i in ids])
        slacktxt = '{} \n '.format(args.night) + slacktxt
        r = requests.post(os.environ['TNSWEBHOOK'],
                          json={
                              'text': slacktxt,
                              "username": "******"
                          },
                          headers={'Content-Type': 'application/json'})
        print(r.status_code)
    else:
        slacktxt = '{} \n No new sources'.format(args.night)
        r = requests.post(os.environ['TNSWEBHOOK'],
                          json={
                              'text': slacktxt,
                              "username": "******"
                          },
                          headers={'Content-Type': 'application/json'})
Example #12
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    init_sparksession(name="archivingStream",
                      shuffle_partitions=2,
                      log_level="ERROR")

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_with_kafka(servers=args.servers,
                            topic=args.topic,
                            startingoffsets=args.startingoffsets,
                            failondataloss=False)

    # Get Schema of alerts
    _, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, topic, data)
    df_decoded = df.select([
        "timestamp", "topic",
        from_avro(df["value"], alert_schema_json).alias("decoded")
    ])

    # Partition the data hourly
    df_partitionedby = df_decoded\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))\
        .withColumn("hour", date_format("timestamp", "HH"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath) \
        .option("path", args.outputpath)\
        .partitionBy("topic", "year", "month", "day", "hour")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
        ui_refresh = args.tinterval
    else:
        countquery = countquery_tmp.start()
        # Update the UI every 2 seconds to place less load on the browser.
        ui_refresh = 2

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(countquery, ui_refresh, colnames, args.finkwebpath)

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        print("Exiting the archiving service normally...")
    else:
        countquery.awaitTermination()
Example #13
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribute", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the TMP science database
    df = connect_to_raw_database(args.scitmpdatapath,
                                 args.scitmpdatapath + "/*",
                                 latestfirst=False)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index(
        'timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index(
        'cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index(
        'cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index(
        'cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index(
        'prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    broker_list = args.distribution_servers
    for userfilter in userfilters:
        # The topic name is the filter name
        topicname = userfilter.split('.')[-1]

        # Apply user-defined filter
        df_tmp = apply_user_defined_filter(df, userfilter)

        # Wrap alert data
        df_tmp = df_tmp.selectExpr(cnames)

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_tmp, '')

        # Ensure that the topic(s) exist on the Kafka Server
        disquery = df_kafka\
            .writeStream\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topicname)\
            .option("checkpointLocation", args.checkpointpath_kafka + topicname)\
            .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        disquery.stop()
        logger.info("Exiting the distribute service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
Example #14
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    init_sparksession(name="classifyStream",
                      shuffle_partitions=2,
                      log_level="ERROR")

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_with_kafka(servers=args.servers,
                            topic=args.topic,
                            startingoffsets=args.startingoffsets,
                            failondataloss=False)

    # Get Schema of alerts
    _, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, data)
    df_decoded = df.select([
        "timestamp",
        from_avro(df["value"], alert_schema_json).alias("decoded")
    ])

    # Select only (timestamp, id, ra, dec)
    df_expanded = df_decoded.select([
        df_decoded["timestamp"], df_decoded["decoded.objectId"],
        df_decoded["decoded.candidate.ra"], df_decoded["decoded.candidate.dec"]
    ])

    # for each micro-batch, perform a cross-match with an external catalog,
    # and return the types of the objects (Star, AGN, Unknown, etc.)
    df_type = df_expanded.withColumn(
        "type",
        cross_match_alerts_per_batch(col("objectId"), col("ra"), col("dec")))

    # Group data by type and count members
    df_group = df_type.groupBy("type").count()

    # Update the DataFrame every tinterval seconds
    countquery_tmp = df_group\
        .writeStream\
        .outputMode("complete") \
        .foreachBatch(write_to_csv)

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
        ui_refresh = args.tinterval
    else:
        countquery = countquery_tmp.start()
        # Update the UI every 2 seconds to place less load on the browser.
        ui_refresh = 2

    # Monitor the progress of the stream, and save data for the webUI
    colnames = ["inputRowsPerSecond", "processedRowsPerSecond", "timestamp"]
    monitor_progress_webui(countquery, ui_refresh, colnames, args.finkwebpath)

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        print("Exiting the classify service normally...")
    else:
        countquery.awaitTermination()
Example #15
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="statistics_{}".format(args.night),
        shuffle_partitions=2
    )

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Statistics for {}/{}/{}'.format(year, month, day))

    input_raw = '{}/raw/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)
    input_science = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)

    df_raw = spark.read.format('parquet').load(input_raw)
    df_sci = spark.read.format('parquet').load(input_science)

    df_sci = df_sci.cache()

    # Number of alerts
    n_raw_alert = df_raw.count()
    n_sci_alert = df_sci.count()

    out_dic = {}
    out_dic['raw'] = n_raw_alert
    out_dic['sci'] = n_sci_alert

    # matches with SIMBAD
    n_simbad = df_sci.select('cdsxmatch')\
        .filter(df_sci['cdsxmatch'] != 'Unknown')\
        .count()

    out_dic['simbad_tot'] = n_simbad

    # Alerts with a close-by candidate host-galaxy
    list_simbad_galaxies = [
        "galaxy",
        "Galaxy",
        "EmG",
        "Seyfert",
        "Seyfert_1",
        "Seyfert_2",
        "BlueCompG",
        "StarburstG",
        "LSB_G",
        "HII_G",
        "High_z_G",
        "GinPair",
        "GinGroup",
        "BClG",
        "GinCl",
        "PartofG",
    ]

    n_simbad_gal = df_sci.select('cdsxmatch')\
        .filter(df_sci['cdsxmatch'].isin(list_simbad_galaxies))\
        .count()

    out_dic['simbad_gal'] = n_simbad_gal

    df_class = df_sci.withColumn(
        'class',
        extract_fink_classification(
            df_sci['cdsxmatch'],
            df_sci['roid'],
            df_sci['mulens'],
            df_sci['snn_snia_vs_nonia'],
            df_sci['snn_sn_vs_all'],
            df_sci['rf_snia_vs_nonia'],
            df_sci['candidate.ndethist'],
            df_sci['candidate.drb'],
            df_sci['candidate.classtar'],
            df_sci['candidate.jd'],
            df_sci['candidate.jdstarthist'],
            df_sci['rf_kn_vs_nonkn'],
            df_sci['tracklet']
        )
    )

    out_class = df_class.groupBy('class').count().collect()
    out_class_ = [o.asDict() for o in out_class]
    out_class_ = [list(o.values()) for o in out_class_]
    for kv in out_class_:
        out_dic[kv[0]] = kv[1]

    # Number of fields
    n_field = df_sci.select('candidate.field').distinct().count()

    out_dic['fields'] = n_field

    # number of measurements per band
    n_g = df_sci.select('candidate.fid').filter('fid == 1').count()
    n_r = df_sci.select('candidate.fid').filter('fid == 2').count()

    out_dic['n_g'] = n_g
    out_dic['n_r'] = n_r

    # Number of exposures
    n_exp = df_sci.select('candidate.jd').distinct().count()

    out_dic['exposures'] = n_exp

    out_dic['night'] = 'ztf_{}'.format(args.night)

    # make a Spark DataFrame
    pdf = pd.DataFrame([out_dic])
    df_hbase = spark.createDataFrame(pdf)

    # rowkey is the night YYYYMMDD
    index_row_key_name = 'night'

    # Columns to use
    cols_basic = [
        'raw',
        'sci',
        'night',
        'n_g',
        'n_r',
        'exposures',
        'fields'
    ]

    cols_class_ = np.transpose(out_class_)[0]
    cols_class = np.concatenate((cols_class_, ['simbad_tot', 'simbad_gal']))

    # column families
    cf = {i: 'basic' for i in df_hbase.select(*cols_basic).columns}
    cf.update({i: 'class' for i in df_hbase.select(*cols_class).columns})

    # construct the time catalog
    hbcatalog_index = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema,
        'statistics_class',
        rowkeyname=index_row_key_name,
        cf=cf
    )

    # Push index table
    df_hbase.write\
        .options(catalog=hbcatalog_index, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()

    # Construct the schema row - inplace replacement
    schema_row_key_name = 'schema_version'
    df_hbase = df_hbase.withColumnRenamed(
        index_row_key_name,
        schema_row_key_name
    )

    df_hbase_schema = construct_schema_row(
        df_hbase,
        rowkeyname=schema_row_key_name,
        version='schema_{}_{}'.format(fbvsn, fsvsn))

    # construct the hbase catalog for the schema
    hbcatalog_index_schema = construct_hbase_catalog_from_flatten_schema(
        df_hbase_schema.schema,
        'statistics_class',
        rowkeyname=schema_row_key_name,
        cf=cf)

    # Push the data using the shc connector
    df_hbase_schema.write\
        .options(catalog=hbcatalog_index_schema, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()
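
A small readability note on the class counts above: each element returned by collect() is a Spark Row, so the same dictionary update can be written by field name instead of going through asDict() and positional values:

for row in out_class:
    out_dic[row['class']] = row['count']

(out_class_ is still used afterwards to derive the list of class column names, so the intermediate list is not entirely redundant.)
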
Example #16
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Get or create a Spark Session
    spark = init_sparksession(
        name="distribution", shuffle_partitions=2, log_level="ERROR")

    # Read the catalog file generated by raw2science
    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog) as f:
        catalog = json.load(f)

    # Define variables
    min_timestamp = 100     # set a default
    t_end = 1577836799      # some default value

    # get distribution offset
    min_timestamp = get_distribution_offset(
                        args.checkpointpath_dist, args.startingOffset_dist)

    # Run distribution for (args.exit_after) seconds
    if args.exit_after is not None:
        t_end = time.time() + args.exit_after
        exit_after = True
    else:
        exit_after = False

    # Start the distribution service
    while not exit_after or time.time() < t_end:
        # Keep scanning the HBase for new records in a loop
        # Scan the HBase till current time
        max_timestamp = int(round(time.time() * 1000)) # time in ms

        # Read Hbase within timestamp range
        df = spark.read\
                  .option("catalog", catalog)\
                  .option("minStamp", min_timestamp)\
                  .option("maxStamp", max_timestamp)\
                  .format("org.apache.spark.sql.execution.datasources.hbase")\
                  .load()

        # Filter out records that have been distributed
        df = df.filter("status!='distributed'")

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df, args.distribution_schema)

        # Publish Kafka topic(s) (Ensure that the topic(s) exist on the Kafka Server)
        df_kafka\
            .write\
            .format("kafka")\
            .option("kafka.bootstrap.servers", "localhost:9093")\
            .option("topic", "distribution_test")\
            .save()

        # Update the status column in Hbase
        update_status_in_hbase(df, args.science_db_name, "objectId",
                args.checkpointpath_dist, max_timestamp)

        # update min_timestamp for next iteration
        min_timestamp = max_timestamp

        # Wait for some time before another loop
        time.sleep(1)
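
get_kafka_df, used here and in several other examples to prepare alerts for the Kafka sink, is a fink-broker helper. A minimal sketch, assuming it packs all columns into a single Avro-serialized value column with Spark's built-in to_avro; the schema argument of the real function is ignored in this sketch:

from pyspark.sql.avro.functions import to_avro
from pyspark.sql.functions import struct

def get_kafka_df_sketch(df, schema_path=''):
    """Pack all columns into one Avro-encoded 'value' column, ready for the Kafka sink."""
    return df.select(
        to_avro(struct([df[c] for c in df.columns])).alias("value")
    )
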
Example #17
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribute", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Read the catalog file generated by raw2science
    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog) as f:
        catalog = json.load(f)

    # Define variables
    min_timestamp = 100     # set a default
    t_end = 1577836799      # some default value

    # get distribution offset
    min_timestamp = get_distribution_offset(
        args.checkpointpath_dist, args.startingOffset_dist)

    # Get topic name to publish on
    topic = args.distribution_topic
    broker_list = args.distribution_servers

    # Run distribution for (args.exit_after) seconds
    if args.exit_after is not None:
        t_end = time.time() + args.exit_after
        exit_after = True
    else:
        exit_after = False

    # Start the distribution service
    while not exit_after or time.time() < t_end:
        # Keep scanning the HBase for new records in a loop
        # Scan the HBase till current time
        max_timestamp = int(round(time.time() * 1000))  # time in ms

        # Read Hbase within timestamp range
        df = spark.read\
            .option("catalog", catalog)\
            .option("minStamp", min_timestamp)\
            .option("maxStamp", max_timestamp)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .load()

        # Keep records that haven't been distributed
        df = df.filter("status!='distributed'")

        # Send out slack alerts
        api_token = get_api_token()
        if api_token:
            slack_cols = [
                "objectId", "candidate_ra",
                "candidate_dec", "cross_match_alerts_per_batch"]
            send_slack_alerts(df.select(slack_cols), args.slack_channels)

        # Apply additional filters (user defined xml)
        if args.distribution_rules_xml:
            df = filter_df_using_xml(df, args.distribution_rules_xml)

        # create a nested dataframe similar to the original ztf dataframe
        df_nested = group_df_into_struct(df, "candidate", "objectId")
        df_nested = group_df_into_struct(df_nested, "prv_candidates", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutTemplate", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutScience", "objectId")
        df_nested = group_df_into_struct(df_nested, "cutoutDifference", "objectId")

        # Apply level two filters
        df_nested = apply_user_defined_filters(df_nested, filter_leveltwo_names)

        # Persist df to memory to materialize changes
        df_nested.persist()

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_nested, args.distribution_schema)

        # Ensure that the topic(s) exist on the Kafka Server
        df_kafka\
            .write\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topic)\
            .save()

        # Update the status in Hbase and commit checkpoint to file
        update_status_in_hbase(
            df, args.science_db_name, "objectId",
            args.checkpointpath_dist, max_timestamp)

        # update min_timestamp for next iteration
        min_timestamp = max_timestamp

        # free the memory
        df_nested.unpersist()

        # Wait for some time before another loop
        time.sleep(1)
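
group_df_into_struct appears to be the inverse of the flattening done in raw2science (see flattenstruct in Example #19 below): it re-nests flattened candidate_* columns into a struct column. A rough sketch under that assumption, with the key argument of the real helper ignored:

from pyspark.sql.functions import col, struct

def group_df_into_struct_sketch(df, struct_name, key):
    """Re-nest flattened <struct_name>_<field> columns into a single struct column."""
    prefix = struct_name + '_'
    flat_cols = [c for c in df.columns if c.startswith(prefix)]
    other_cols = [col(c) for c in df.columns if not c.startswith(prefix)]
    nested = struct(*[col(c).alias(c[len(prefix):]) for c in flat_cols]).alias(struct_name)
    return df.select(other_cols + [nested])
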
Example #18
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="index_archival_{}_{}".format(args.index_table, args.night),
        shuffle_partitions=2
    )

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix,
        args.night[:4],
        args.night[4:6],
        args.night[6:8]
    )
    df = load_parquet_files(path)

    # construct the index view
    index_row_key_name = args.index_table
    columns = index_row_key_name.split('_')
    names = [col(i) for i in columns]
    index_name = '.' + columns[0]

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Load column names to use in the science portal
    cols_i, cols_d, cols_b = load_science_portal_column_names()

    # Assign each column to a specific column family
    cf = assign_column_family_names(df, cols_i, cols_d, cols_b)

    # Restrict the input DataFrame to the subset of wanted columns.
    if 'upper' in args.index_table:
        df = df.select(
            'objectId',
            'prv_candidates.jd',
            'prv_candidates.fid',
            'prv_candidates.magpsf',
            'prv_candidates.sigmapsf',
            'prv_candidates.diffmaglim'
        )
    else:
        df = df.select(cols_i + cols_d + cols_b)

    # Create and attach the rowkey
    df, _ = attach_rowkey(df)

    common_cols = [
        'objectId', 'candid', 'publisher', 'rcid', 'chipsf', 'distnr',
        'ra', 'dec', 'jd', 'fid', 'nid', 'field', 'xpos', 'ypos', 'rb',
        'ssdistnr', 'ssmagnr', 'ssnamenr', 'jdstarthist', 'jdendhist', 'tooflag',
        'sgscore1', 'distpsnr1', 'neargaia', 'maggaia', 'nmtchps', 'diffmaglim',
        'magpsf', 'sigmapsf', 'magnr', 'sigmagnr', 'magzpsci', 'isdiffpos',
        'cdsxmatch',
        'roid',
        'mulens',
        'snn_snia_vs_nonia', 'snn_sn_vs_all', 'rf_snia_vs_nonia',
        'classtar', 'drb', 'ndethist', 'rf_kn_vs_nonkn', 'tracklet'
    ]

    if columns[0].startswith('pixel'):
        nside = int(columns[0].split('pixel')[1])

        df_index = df.withColumn(
            columns[0],
            ang2pix(
                df['ra'],
                df['dec'],
                lit(nside)
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + ['objectId']
        )
    elif columns[0] == 'class':
        df_index = df.withColumn(
            'class',
            extract_fink_classification(
                df['cdsxmatch'],
                df['roid'],
                df['mulens'],
                df['snn_snia_vs_nonia'],
                df['snn_sn_vs_all'],
                df['rf_snia_vs_nonia'],
                df['ndethist'],
                df['drb'],
                df['classtar'],
                df['jd'],
                df['jdstarthist'],
                df['rf_kn_vs_nonkn'],
                df['tracklet']
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols
        )
    elif columns[0] == 'ssnamenr':
        # Flag only objects with likely counterpart in MPC
        df_index = df\
            .filter(df['roid'] == 3)\
            .select(
                [
                    concat_ws('_', *names).alias(index_row_key_name)
                ] + common_cols
            )
    elif columns[0] == 'tracklet':
        # For data < 2021-08-10, no tracklet means ''
        # For data >= 2021-08-10, no tracklet means 'null'
        df_index = df\
            .filter(df['tracklet'] != 'null')\
            .filter(df['tracklet'] != '')\
            .select(
                [
                    concat_ws('_', *names).alias(index_row_key_name)
                ] + common_cols
            )
    elif columns[0] == 'upper':
        # This case is the same as the main table
        # but we keep only upper limit measurements.
        index_row_key_name = 'objectId_jd'
        # explode
        df_ex = df.withColumn(
            "tmp",
            arrays_zip("magpsf", "sigmapsf", "diffmaglim", "jd", "fid")
        ).withColumn("tmp", explode("tmp")).select(
            concat_ws('_', 'objectId', 'tmp.jd').alias(index_row_key_name),
            "objectId",
            col("tmp.jd"),
            col("tmp.fid"),
            col("tmp.magpsf"),
            col("tmp.sigmapsf"),
            col("tmp.diffmaglim")
        )

        # take only upper limits
        df_index = df_ex.filter(~df_ex['magpsf'].isNotNull())
        # drop NaN columns
        df_index = df_index.drop(*['magpsf', 'sigmapsf'])
    elif columns[0] == 'uppervalid':
        # This case is the same as the main table
        # but we keep only valid measurements from the history.
        index_row_key_name = 'objectId_jd'
        # explode
        df_ex = df.withColumn(
            "tmp",
            arrays_zip("magpsf", "sigmapsf", "diffmaglim", "jd", "fid")
        ).withColumn("tmp", explode("tmp")).select(
            concat_ws('_', 'objectId', 'tmp.jd').alias(index_row_key_name),
            "objectId",
            col("tmp.jd"),
            col("tmp.fid"),
            col("tmp.magpsf"),
            col("tmp.sigmapsf"),
            col("tmp.diffmaglim")
        )

        # take only valid measurements from the history
        df_index = df_ex.filter(df_ex['magpsf'].isNotNull())
    elif columns[0] == 'tns':
        with open('{}/tns_marker.txt'.format(args.tns_folder)) as f:
            tns_marker = f.read().replace('\n', '')

        pdf_tns = download_catalog(os.environ['TNS_API_KEY'], tns_marker)

        # Filter TNS confirmed data
        f1 = ~pdf_tns['type'].isna()
        pdf_tns_filt = pdf_tns[f1]

        pdf_tns_filt_b = spark.sparkContext.broadcast(pdf_tns_filt)

        @pandas_udf(StringType(), PandasUDFType.SCALAR)
        def crossmatch_with_tns(objectid, ra, dec):
            # TNS
            pdf = pdf_tns_filt_b.value
            ra2, dec2, type2 = pdf['ra'], pdf['declination'], pdf['type']

            # create catalogs
            catalog_ztf = SkyCoord(
                ra=np.array(ra, dtype=np.float) * u.degree,
                dec=np.array(dec, dtype=np.float) * u.degree
            )
            catalog_tns = SkyCoord(
                ra=np.array(ra2, dtype=np.float) * u.degree,
                dec=np.array(dec2, dtype=np.float) * u.degree
            )

            # cross-match
            idx, d2d, d3d = catalog_tns.match_to_catalog_sky(catalog_ztf)

            sub_pdf = pd.DataFrame({
                'objectId': objectid.values[idx],
                'ra': ra.values[idx],
                'dec': dec.values[idx],
            })

            # cross-match
            idx2, d2d2, d3d2 = catalog_ztf.match_to_catalog_sky(catalog_tns)

            # set separation length
            sep_constraint2 = d2d2.degree < 1.5 / 3600

            sub_pdf['TNS'] = [''] * len(sub_pdf)
            sub_pdf['TNS'][idx2[sep_constraint2]] = type2.values[idx2[sep_constraint2]]

            to_return = objectid.apply(
                lambda x: '' if x not in sub_pdf['objectId'].values
                else sub_pdf['TNS'][sub_pdf['objectId'] == x].values[0]
            )

            return to_return

        df = df.withColumn(
            'tns',
            crossmatch_with_tns(
                df['objectId'],
                df['ra'],
                df['dec']
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols + ['tns']
        ).cache()
        df_index = df.filter(df['tns'] != '').drop('tns')
        # trigger the cache - note the cache might be a killer for LSST...
        n = df_index.count()
        print('TNS objects: {}'.format(n))
    else:
        df_index = df.select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols
        )

    # construct the time catalog
    hbcatalog_index = construct_hbase_catalog_from_flatten_schema(
        df_index.schema,
        args.science_db_name + index_name,
        rowkeyname=index_row_key_name,
        cf=cf
    )

    # Push index table
    df_index.write\
        .options(catalog=hbcatalog_index, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()

    # Construct the schema row - inplace replacement
    schema_row_key_name = 'schema_version'
    df_index = df_index.withColumnRenamed(
        index_row_key_name,
        schema_row_key_name
    )

    df_index_schema = construct_schema_row(
        df_index,
        rowkeyname=schema_row_key_name,
        version='schema_{}_{}'.format(fbvsn, fsvsn))

    # construct the hbase catalog for the schema
    hbcatalog_index_schema = construct_hbase_catalog_from_flatten_schema(
        df_index_schema.schema,
        args.science_db_name + index_name,
        rowkeyname=schema_row_key_name,
        cf=cf)

    # Push the data using the shc connector
    df_index_schema.write\
        .options(catalog=hbcatalog_index_schema, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()
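
The ang2pix UDF used for the pixel index tables is provided by fink. A minimal sketch using healpy, assuming nside is passed in as a literal column (lit(nside) above) and that ra/dec are in degrees; illustrative only:

import healpy as hp
import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import LongType

@pandas_udf(LongType(), PandasUDFType.SCALAR)
def ang2pix_sketch(ra, dec, nside):
    """Return the HEALPix pixel index of each (ra, dec) pair, given in degrees."""
    return pd.Series(
        hp.ang2pix(int(nside.values[0]), ra.values, dec.values, lonlat=True)
    )
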
Example #19
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Not very satisfactory... The problem is that latesfirst = false is
    # required to create new HBase table (i.e. all the time in the CI).
    # If you have a better idea, let me know!
    if "travis" in args.science_db_name:
        latesfirst = False
    else:
        latesfirst = True

    df = connect_to_raw_database(args.rawdatapath, args.rawdatapath + "/*",
                                 latesfirst)

    # Apply level one filters
    logger.info(filter_levelone_names)
    df = apply_user_defined_filters(df, filter_levelone_names)

    # Apply level one processors
    logger.info(processor_levelone_names)
    df = apply_user_defined_processors(df, processor_levelone_names)

    # Select alert data + timestamp + added value from processors
    new_colnames = ["decoded.*", "cast(timestamp as string) as timestamp"]
    for i in processor_levelone_names:
        new_colnames.append(i)

    df = df.selectExpr(new_colnames)

    df_hbase = flattenstruct(df, "candidate")
    df_hbase = flattenstruct(df_hbase, "cutoutScience")
    df_hbase = flattenstruct(df_hbase, "cutoutTemplate")
    df_hbase = flattenstruct(df_hbase, "cutoutDifference")
    df_hbase = explodearrayofstruct(df_hbase, "prv_candidates")

    # Create a status column for distribution
    df_hbase = df_hbase.withColumn("status", lit("dbUpdate"))

    # Save the catalog on disk for later usage
    catalog = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema, args.science_db_name, "objectId")

    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog, 'w') as json_file:
        json.dump(catalog, json_file)

    def write_to_hbase_and_monitor(df: DataFrame, epochid: int,
                                   hbcatalog: str):
        """Write data into HBase.

        The purpose of this function is to write data to HBase using
        Structured Streaming tools such as foreachBatch.

        Parameters
        ----------
        df : DataFrame
            Input micro-batch DataFrame.
        epochid : int
            ID of the micro-batch
        hbcatalog : str
            HBase catalog describing the data

        """
        # If the table does not exist, one needs to specify
        # the number of zones to use (must be greater than 3).
        # TODO: remove this hardcoded parameter.
        df.write\
            .options(catalog=hbcatalog, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

    # Query to push data into HBase
    countquery = df_hbase\
        .writeStream\
        .outputMode("append")\
        .option("checkpointLocation", args.checkpointpath_sci)\
        .foreachBatch(lambda x, y: write_to_hbase_and_monitor(x, y, catalog))\
        .start()

    # Query to group objects by type according to SIMBAD
    # Do it every 30 seconds
    groupedquery_started = False
    if "cross_match_alerts_per_batch" in processor_levelone_names:
        df_group = df.groupBy("cross_match_alerts_per_batch").count()
        groupquery = df_group\
            .writeStream\
            .outputMode("complete") \
            .foreachBatch(write_to_csv)\
            .trigger(processingTime='30 seconds')\
            .start()
        groupedquery_started = True

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        if groupedquery_started:
            groupquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
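
flattenstruct and explodearrayofstruct are fink helpers. Consistent with the flat column names used elsewhere in these examples (candidate_ra, candidate_dec, ...), a sketch of the former could be written as follows; this is an assumption, not the actual implementation:

from pyspark.sql.functions import col

def flattenstruct_sketch(df, columnname):
    """Replace a struct column by top-level columns named <columnname>_<field>."""
    fields = df.select(columnname + ".*").columns
    flattened = [
        col("{}.{}".format(columnname, f)).alias("{}_{}".format(columnname, f))
        for f in fields
    ]
    others = [col(c) for c in df.columns if c != columnname]
    return df.select(others + flattened)
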
Example #20
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science", shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    df = connect_to_raw_database(args.rawdatapath,
                                 args.rawdatapath + "/*",
                                 latestfirst=False)

    # Apply level one filters
    logger.info(qualitycuts)
    df = apply_user_defined_filter(df, qualitycuts)

    # Apply level one processor: cdsxmatch
    logger.info("New processor: cdsxmatch")
    colnames = [df['objectId'], df['candidate.ra'], df['candidate.dec']]
    df = df.withColumn(cdsxmatch.__name__, cdsxmatch(*colnames))

    # Apply level one processor: rfscore
    logger.info("New processor: rfscore")
    # Required alert columns
    what = [
        'jd', 'fid', 'magpsf', 'sigmapsf', 'magnr', 'sigmagnr', 'magzpsci',
        'isdiffpos'
    ]

    # Use for creating temp name
    prefix = 'c'
    what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    for colname in what:
        df = concat_col(df, colname, prefix=prefix)

    # Perform the fit + classification.
    # Note we can omit the model_path argument, and in that case the
    # default model `data/models/default-model.obj` will be used.
    rfscore_args = [F.col(i) for i in what_prefix]
    df = df.withColumn(rfscore.__name__, rfscore(*rfscore_args))

    # Apply level one processor: microlensing
    logger.info("New processor: microlensing")

    # Retrieve schema
    schema = load_mulens_schema_twobands()

    # Create standard UDF
    mulens_udf = F.udf(mulens, schema)

    # Required alert columns - already computed for SN
    what_prefix_mulens = [
        'cfid', 'cmagpsf', 'csigmapsf', 'cmagnr', 'csigmagnr', 'cmagzpsci',
        'cisdiffpos'
    ]

    mulens_args = [F.col(i) for i in what_prefix_mulens]
    df = df.withColumn('mulens', mulens_udf(*mulens_args))

    # Drop temp columns
    df = df.drop(*what_prefix)

    # Partition the data hourly
    df_partitionedby = df\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .withColumn("hour", F.date_format("timestamp", "HH"))

    # Append new rows in the tmp science database
    countquery = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_sci_tmp) \
        .option("path", args.scitmpdatapath)\
        .partitionBy("year", "month", "day", "hour") \
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
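
concat_col comes from the fink utilities: the idea is to build a per-alert light curve by concatenating the historical values stored in prv_candidates with the current candidate value. A minimal sketch under that assumption, with the column layout of the ZTF alert schema:

from pyspark.sql import functions as F

def concat_col_sketch(df, colname, prefix='c'):
    """Add <prefix><colname>: array of historical values followed by the current one."""
    return df.withColumn(
        prefix + colname,
        F.concat(
            df['prv_candidates.{}'.format(colname)],
            F.array(df['candidate.{}'.format(colname)])
        )
    )
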
Example #21
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    df = connect_to_kafka(servers=args.servers,
                          topic=args.topic,
                          startingoffsets=args.startingoffsets_stream,
                          failondataloss=False)

    # Get Schema of alerts
    _, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, data)
    df_decoded = df.select([
        "timestamp", "topic",
        from_avro(df["value"], alert_schema_json).alias("decoded")
    ])

    # Flatten the data columns to match the incoming alert data schema
    cnames = df_decoded.columns
    cnames[cnames.index('decoded')] = 'decoded.*'
    df_decoded = df_decoded.selectExpr(cnames)

    # Partition the data hourly
    df_partitionedby = df_decoded\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))\
        .withColumn("hour", date_format("timestamp", "HH"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_raw) \
        .option("path", args.rawdatapath)\
        .partitionBy("topic", "year", "month", "day", "hour")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
    else:
        countquery = countquery_tmp.start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the stream2raw service normally...")
    else:
        countquery.awaitTermination()
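
# Hedged monitoring snippet (assumption, not part of the original job): the
# StreamingQuery handle exposes `status` and `lastProgress`, which can be
# polled while the stream runs, e.g. to log ingestion rates.
import json
import time

def poll_progress(query, nsteps=3, wait=5):
    """Print the last reported progress of a running StreamingQuery."""
    for _ in range(nsteps):
        time.sleep(wait)
        if query.lastProgress is not None:
            print(json.dumps(query.lastProgress, indent=2))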
Example #22
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="raw2science_{}".format(args.night),
                              shuffle_partitions=2)

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # data path
    rawdatapath = args.online_data_prefix + '/raw'
    scitmpdatapath = args.online_data_prefix + '/science'
    checkpointpath_sci_tmp = args.online_data_prefix + '/science_checkpoint'

    df = connect_to_raw_database(
        rawdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        rawdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        latestfirst=False)

    # Apply quality cuts
    logger.info("Applying quality cuts")
    df = df\
        .filter(df['candidate.nbad'] == 0)\
        .filter(df['candidate.rb'] >= 0.55)

    # Apply science modules
    df = apply_science_modules(df, logger)

    # Add library versions
    df = df.withColumn('fink_broker_version', F.lit(fbvsn))\
        .withColumn('fink_science_version', F.lit(fsvsn))

    # Switch publisher
    df = df.withColumn('publisher', F.lit('Fink'))

    # re-create partitioning columns if needed.
    if 'timestamp' not in df.columns:
        df = df\
            .withColumn("timestamp", jd_to_datetime(df['candidate.jd']))

    if "year" not in df.columns:
        df = df\
            .withColumn("year", F.date_format("timestamp", "yyyy"))

    if "month" not in df.columns:
        df = df\
            .withColumn("month", F.date_format("timestamp", "MM"))

    if "day" not in df.columns:
        df = df\
            .withColumn("day", F.date_format("timestamp", "dd"))

    # Append new rows in the tmp science database
    countquery = df\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", checkpointpath_sci_tmp) \
        .option("path", scitmpdatapath)\
        .partitionBy("year", "month", "day") \
        .trigger(processingTime='{} seconds'.format(args.tinterval)) \
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
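
# Possible sketch of a `jd_to_datetime`-like converter used above, assuming
# `candidate.jd` is a double holding a Julian date. Written with Spark 3.x
# pandas UDFs; the actual fink-broker implementation may differ.
import pandas as pd
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import TimestampType

@pandas_udf(TimestampType())
def jd_to_datetime_sketch(jd: pd.Series) -> pd.Series:
    # JD 2440587.5 corresponds to the Unix epoch (1970-01-01T00:00:00 UTC)
    return pd.to_datetime((jd - 2440587.5) * 86400.0, unit='s')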
Example #23
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="science_archival_{}".format(args.night),
                              shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(args.agg_data_prefix,
                                                       args.night[:4],
                                                       args.night[4:6],
                                                       args.night[6:8])
    df = load_parquet_files(path)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Load column names to use in the science portal
    cols_i, cols_d, cols_b = load_science_portal_column_names()

    # Assign each column to a specific column family
    cf = assign_column_family_names(df, cols_i, cols_d, cols_b)

    # Restrict the input DataFrame to the subset of wanted columns.
    df = df.select(cols_i + cols_d + cols_b)

    # Create and attach the rowkey
    df, row_key_name = attach_rowkey(df)

    # construct the hbase catalog
    hbcatalog = construct_hbase_catalog_from_flatten_schema(
        df.schema, args.science_db_name, rowkeyname=row_key_name, cf=cf)

    # Save the catalog on disk (local)
    with open(args.science_db_catalog, 'w') as json_file:
        json.dump(hbcatalog, json_file)

    if args.save_science_db_catalog_only:
        # Print for visual inspection
        print(hbcatalog)
    else:
        # Push the data using the shc connector
        df.write\
            .options(catalog=hbcatalog, newtable=50)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

        # Construct the schema row - inplace replacement
        schema_row_key_name = 'schema_version'
        df = df.withColumnRenamed(row_key_name, schema_row_key_name)

        df_schema = construct_schema_row(df,
                                         rowkeyname=schema_row_key_name,
                                         version='schema_{}_{}'.format(
                                             fbvsn, fsvsn))

        # construct the hbase catalog for the schema
        hbcatalog_schema = construct_hbase_catalog_from_flatten_schema(
            df_schema.schema,
            args.science_db_name,
            rowkeyname=schema_row_key_name,
            cf=cf)

        # Save the catalog on disk (local)
        catname = args.science_db_catalog.replace('.json', '_schema_row.json')
        with open(catname, 'w') as json_file:
            json.dump(hbcatalog_schema, json_file)

        # Push the data using the shc connector
        df_schema.write\
            .options(catalog=hbcatalog_schema, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()
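
# Illustrative only: the shc connector consumes a JSON "catalog" that maps
# DataFrame columns to HBase column families. The table name, row key and
# columns below are placeholders, not the actual Fink science table layout.
import json

example_hbase_catalog = json.dumps({
    "table": {"namespace": "default", "name": "example_science_db"},
    "rowkey": "objectId_jd",
    "columns": {
        "objectId_jd": {
            "cf": "rowkey", "col": "objectId_jd", "type": "string"},
        "cdsxmatch": {"cf": "d", "col": "cdsxmatch", "type": "string"},
        "candidate_magpsf": {
            "cf": "d", "col": "candidate_magpsf", "type": "double"}
    }
})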
Example #24
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="mergeAndClean_{}".format(args.night))

    # Logger to print useful debug statements
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    year = args.night[:4]
    month = args.night[4:6]
    day = args.night[6:8]

    print('Processing {}/{}/{}'.format(year, month, day))

    input_raw = '{}/raw/year={}/month={}/day={}'.format(
        args.online_data_prefix, year, month, day)
    input_science = '{}/science/year={}/month={}/day={}'.format(
        args.online_data_prefix, year, month, day)

    # basepath
    output_raw = '{}/raw/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)
    output_science = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix, year, month, day)

    print('Raw data processing....')
    df_raw = spark.read.format('parquet').load(input_raw)
    print('Num partitions before: ', df_raw.rdd.getNumPartitions())
    print('Num partitions after : ', numPart(df_raw))

    df_raw.withColumn('timestamp', jd_to_datetime(df_raw['candidate.jd']))\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .coalesce(numPart(df_raw))\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_raw)

    print('Science data processing....')

    df_science = spark.read.format('parquet').load(input_science)
    npart_after = int(numPart(df_science))
    print('Num partitions before: ', df_science.rdd.getNumPartitions())
    print('Num partitions after : ', npart_after)

    # Add tracklet information before merging
    df_trck = add_tracklet_information(df_science)

    # join back information to the initial dataframe
    df_science = df_science\
        .join(
            F.broadcast(df_trck.select(['candid', 'tracklet'])),
            on='candid',
            how='outer'
        )

    df_science.withColumn('timestamp', jd_to_datetime(df_science['candidate.jd']))\
        .withColumn("year", F.date_format("timestamp", "yyyy"))\
        .withColumn("month", F.date_format("timestamp", "MM"))\
        .withColumn("day", F.date_format("timestamp", "dd"))\
        .coalesce(npart_after)\
        .write\
        .mode("append") \
        .partitionBy("year", "month", "day")\
        .parquet(output_science)
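
# Hedged verification sketch (not in the original job): after the nightly
# merge, the aggregated parquet can be read back and compared with the
# online data to check that no alerts were dropped.
def check_merge(spark, input_path, output_path):
    """Print alert counts for the online input and the aggregated output."""
    n_in = spark.read.parquet(input_path).count()
    n_out = spark.read.parquet(output_path).count()
    print('online input: {} alerts, aggregated output: {} alerts'.format(
        n_in, n_out))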
Example #25
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="stream2raw", shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Create a streaming dataframe pointing to a Kafka stream
    kerberos = 'public2.alerts.ztf' in args.servers
    df = connect_to_kafka(servers=args.servers,
                          topic=args.topic,
                          startingoffsets=args.startingoffsets_stream,
                          failondataloss=False,
                          kerberos=kerberos)

    # Get Schema of alerts
    alert_schema, _, alert_schema_json = get_schemas_from_avro(args.schema)

    # Decode the Avro data, and keep only (timestamp, data)
    if '134.158.' in args.servers or 'localhost' in args.servers:
        # use the custom from_avro: the built-in version is not
        # available in Spark 2.4.x, and only ships with Spark 3.0+
        df_decoded = df.select(
            [from_avro(df["value"], alert_schema_json).alias("decoded")])
    elif 'public2.alerts.ztf' in args.servers:
        # Decode on-the-fly using fastavro
        f = udf(lambda x: next(fastavro.reader(io.BytesIO(x))), alert_schema)
        df_decoded = df.select([f(df['value']).alias("decoded")])
    else:
        msg = "Data source {} is not known - a decoder must be set".format(
            args.servers)
        logger.warn(msg)
        spark.stop()
        # stop here: with no decoder, df_decoded is not defined below
        return

    # Flatten the data columns to match the incoming alert data schema
    cnames = df_decoded.columns
    cnames[cnames.index('decoded')] = 'decoded.*'
    df_decoded = df_decoded.selectExpr(cnames)

    # Partition the data hourly
    df_partitionedby = df_decoded\
        .withColumn("timestamp", jd_to_datetime(df_decoded['candidate.jd']))\
        .withColumn("year", date_format("timestamp", "yyyy"))\
        .withColumn("month", date_format("timestamp", "MM"))\
        .withColumn("day", date_format("timestamp", "dd"))

    # Append new rows every `tinterval` seconds
    countquery_tmp = df_partitionedby\
        .writeStream\
        .outputMode("append") \
        .format("parquet") \
        .option("checkpointLocation", args.checkpointpath_raw) \
        .option("path", args.rawdatapath)\
        .partitionBy("year", "month", "day")

    # Fixed interval micro-batches or ASAP
    if args.tinterval > 0:
        countquery = countquery_tmp\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()
    else:
        countquery = countquery_tmp.start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        logger.info("Exiting the stream2raw service normally...")
    else:
        countquery.awaitTermination()
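
# Standalone sketch of the fastavro decoding used above, assuming `raw` holds
# the bytes of a single Kafka message value and `alert_schema` the parsed
# Avro schema. Illustrative only.
import io
import fastavro

def decode_one_alert(raw: bytes, alert_schema: dict) -> dict:
    """Return the first record found in an Avro-encoded payload."""
    return next(fastavro.reader(io.BytesIO(raw), reader_schema=alert_schema))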
Example #26
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(name="distribute_{}".format(args.night),
                              shuffle_partitions=2)

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # data path
    scitmpdatapath = args.online_data_prefix + '/science'
    checkpointpath_kafka = args.online_data_prefix + '/kafka_checkpoint'

    # Connect to the TMP science database
    df = connect_to_raw_database(
        scitmpdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        scitmpdatapath + "/year={}/month={}/day={}".format(
            args.night[0:4], args.night[4:6], args.night[6:8]),
        latestfirst=False)

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Cast fields to ease the distribution
    cnames = df.columns
    cnames[cnames.index(
        'timestamp')] = 'cast(timestamp as string) as timestamp'
    cnames[cnames.index(
        'cutoutScience')] = 'struct(cutoutScience.*) as cutoutScience'
    cnames[cnames.index(
        'cutoutTemplate')] = 'struct(cutoutTemplate.*) as cutoutTemplate'
    cnames[cnames.index(
        'cutoutDifference')] = 'struct(cutoutDifference.*) as cutoutDifference'
    cnames[cnames.index(
        'prv_candidates')] = 'explode(array(prv_candidates)) as prv_candidates'
    cnames[cnames.index('candidate')] = 'struct(candidate.*) as candidate'

    # Retrieve time-series information
    to_expand = [
        'jd', 'fid', 'magpsf', 'sigmapsf', 'magnr', 'sigmagnr', 'magzpsci',
        'isdiffpos'
    ]

    # Append temp columns with historical + current measurements
    prefix = 'c'
    for colname in to_expand:
        df = concat_col(df, colname, prefix=prefix)

    # quick fix for https://github.com/astrolabsoftware/fink-broker/issues/457
    for colname in to_expand:
        df = df.withColumnRenamed('c' + colname, 'c' + colname + 'c')

    broker_list = args.distribution_servers
    for userfilter in userfilters:
        # The topic name is the filter name
        topicname = args.substream_prefix + userfilter.split('.')[-1] + '_ztf'

        # Apply user-defined filter
        df_tmp = apply_user_defined_filter(df, userfilter)

        # Wrap alert data
        df_tmp = df_tmp.selectExpr(cnames)

        # Get the DataFrame for publishing to Kafka (avro serialized)
        df_kafka = get_kafka_df(df_tmp, '')

        # Ensure that the topic(s) exist on the Kafka server
        disquery = df_kafka\
            .writeStream\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker_list)\
            .option("kafka.security.protocol", "SASL_PLAINTEXT")\
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")\
            .option("topic", topicname)\
            .option("checkpointLocation", checkpointpath_kafka + topicname)\
            .trigger(processingTime='{} seconds'.format(args.tinterval)) \
            .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        disquery.stop()
        logger.info("Exiting the distribute service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
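
# Sketch of what a `get_kafka_df`-style helper may do: the Kafka sink only
# accepts `key`/`value` (string or binary) columns, so the alert content has
# to be packed into a single `value` column. JSON is used here for
# readability; the actual helper serializes the alerts to Avro.
import pyspark.sql.functions as F

def get_kafka_df_sketch(df, key=''):
    """Pack all columns of `df` into a Kafka-ready (key, value) DataFrame."""
    return df.select(
        F.lit(key).alias('key'),
        F.to_json(F.struct([df[c] for c in df.columns])).alias('value')
    )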
Example #27
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Grab the running Spark Session,
    # otherwise create it.
    spark = init_sparksession(
        name="buildSciDB", shuffle_partitions=2, log_level="ERROR")

    # FIXME!
    if "travis" in args.science_db_name:
        latesfirst = False
    else:
        latesfirst = True

    df = connect_to_raw_database(
        args.rawdatapath, args.rawdatapath + "/*", latesfirst)

    # Apply filters and keep only good alerts
    df_filt = df.withColumn(
        "toKeep",
        keep_alert_based_on(
            col("decoded.candidate.nbad"),
            col("decoded.candidate.rb"),
            col("decoded.candidate.magdiff")
        )
    ).filter("toKeep == true")

    # for good alerts, perform a cross-match with SIMBAD,
    # and return the types of the objects (Star, AGN, Unknown, etc.)
    df_type = df_filt.withColumn(
        "simbadType",
        cross_match_alerts_per_batch(
            col("decoded.objectId"),
            col("decoded.candidate.ra"),
            col("decoded.candidate.dec")
        )
    ).selectExpr(
        "decoded.*", "cast(timestamp as string) as timestamp", "simbadType")

    df_hbase = flattenstruct(df_type, "candidate")
    df_hbase = flattenstruct(df_hbase, "cutoutScience")
    df_hbase = flattenstruct(df_hbase, "cutoutTemplate")
    df_hbase = flattenstruct(df_hbase, "cutoutDifference")
    df_hbase = explodearrayofstruct(df_hbase, "prv_candidates")

    # Create a status column for distribution
    df_hbase = df_hbase.withColumn("status", lit("dbUpdate"))

    # Save the catalog on disk for later usage
    catalog = construct_hbase_catalog_from_flatten_schema(
        df_hbase.schema, args.science_db_name, "objectId")

    science_db_catalog = args.science_db_catalog
    with open(science_db_catalog, 'w') as json_file:
        json.dump(catalog, json_file)

    def write_to_hbase_and_monitor(
            df: DataFrame, epochid: int, hbcatalog: str):
        """Write data into HBase.

        The purpose of this function is to write data to HBase using
        Structured Streaming tools such as foreachBatch.

        Parameters
        ----------
        df : DataFrame
            Input micro-batch DataFrame.
        epochid : int
            ID of the micro-batch
        hbcatalog : str
            HBase catalog describing the data

        """
        # If the table does not exist, one needs to specify
        # the number of zones to use (must be greater than 3).
        # TODO: remove this hardcoded parameter.
        df.write\
            .options(catalog=hbcatalog, newtable=5)\
            .format("org.apache.spark.sql.execution.datasources.hbase")\
            .save()

    # Query to push data into HBase
    countquery = df_hbase\
        .writeStream\
        .outputMode("append")\
        .option("checkpointLocation", args.checkpointpath_sci)\
        .foreachBatch(lambda x, y: write_to_hbase_and_monitor(x, y, catalog))\
        .start()

    # Query to group objects by type according to SIMBAD
    # Do it every 30 seconds
    df_group = df_type.groupBy("simbadType").count()
    groupquery = df_group\
        .writeStream\
        .outputMode("complete") \
        .foreachBatch(write_to_csv)\
        .trigger(processingTime='30 seconds')\
        .start()

    # Keep the Streaming running until something or someone ends it!
    if args.exit_after is not None:
        time.sleep(args.exit_after)
        countquery.stop()
        groupquery.stop()
        print("Exiting the raw2science service normally...")
    else:
        # Wait for the end of queries
        spark.streams.awaitAnyTermination()
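
# `write_to_csv` is referenced above but not defined in this snippet; a
# plausible foreachBatch sink would dump the per-type counts of each
# micro-batch to disk. This is an assumption, not the original implementation.
def write_to_csv_sketch(batch_df, batch_id, path='simbadtype_counts.csv'):
    """Write the (simbadType, count) rows of a micro-batch to a CSV file."""
    batch_df.toPandas().to_csv(path, index=False)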