StructField("year", StringType(), True),
        StructField("month", StringType(), True)
    ])

    raw_crime_df = spark.readStream \
        .option("header", "false") \
        .option("maxFilesPerTrigger", 2) \
        .schema(schema) \
        .csv(data_path)

    raw_crime_df.createOrReplaceTempView("CrimeData")

    print("Is the stream ready?", raw_crime_df.isStreaming)

    category_df = spark.sql(
        "SELECT major_category, value FROM CrimeData WHERE year = '2016'")

    crime_per_cat_df = category_df.groupBy("major_category")\
        .agg(_sum("value").alias("convictions"))\
        .orderBy(desc("convictions"))

    query = crime_per_cat_df.writeStream\
        .outputMode("complete")\
        .format("console")\
        .option("truncate", "false")\
        .option("numRows", 30)\
        .start()

    query.awaitTermination()

# spark-submit --packages "org.apache.hadoop:hadoop-aws:2.7.4" com/dsm/files/sql_demo.py
def main():
    """
    TODO: Create html page

    Access time filter logic:
        - If "last_access_ts" is less than 3 months ago, then set "months_old" as 3,
        - If "last_access_ts" is less than 6 monthsa ago, then set "months_old" as 6,
        - If "last_access_ts" is less than 12 months ago, then set "months_old" as 12

    The result includes only the datasets whose last access time are 12, 6 or 3 months ago.
    """
    spark = get_spark_session()
    (df_contents_f_to_b, df_contents_b_to_d, df_replicas, df_dids_files,
     df_replicas_j_dids, df_files_complete) = prepare_spark_dataframes(spark)

    # ===============================================================================
    # Continue with joins
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    # -------------------------------------------------------------------------------

    # --- STEP-10 / Tests to check dataframes are okay ---:
    #         df_block_file_rse.select("file").distinct().count() =  is 29921156
    #         df_block_file_rse.filter(col("file").isNull()).count() = 0
    #         df_block_file_rse.filter(col("block").isNull()).count() = 57892
    #         This means we cannot extract the block names of 57892 files from the CONTENTS table ..
    #         .. which provides F:D and D:C mapping (file, dataset, container in Rucio terms)
    #         df_block_file_rse.filter(col("rse_id").isNull()).count() = 0
    #         df_block_file_rse.filter(col("fsize").isNull()).count() = 0
    #         We are all good, just drop null block names.

    # STEP-10: Left join df_files_complete and df_contents_f_to_b to get block names of files.
    #   - There are some files that we cannot extract their block names from CONTENTS table
    #   - So filter them out.
    df_block_file_rse = df_files_complete \
        .join(df_contents_f_to_b, ["file"], how="left") \
        .select(['block', 'file', 'rse_id', 'accessed_at', 'fsize', ]) \
        .filter(col("block").isNotNull()) \
        .cache()

    # --- STEP-11 / Tests to check dataframes are okay ---:
    #         df_all.filter(col("dataset").isNull()).count() = 280821

    # STEP-11: Left join df_block_file_rse and df_contents_b_to_d to get dataset names of blocks&files.
    #   - There are some blocks that we cannot extract their dataset names from CONTENTS table.
    #   - So filter them out.
    df_all = df_block_file_rse \
        .join(df_contents_b_to_d, ["block"], how="left") \
        .select(['dataset', 'block', 'file', 'rse_id', 'accessed_at', 'fsize']) \
        .filter(col("dataset").isNotNull()) \
        .cache()

    # STEP-12: Group by "dataset" and "rses" to calculate:
    #       - dataset_size_in_rse: total size of dataset in a RSE by summing up dataset's all files in that RSE.
    #       - `last_access_time_of_dataset_per_rse`: last access time of dataset in a RSE ...
    #           ... by getting max of file `accessed_at` field of dataset's all files in that RSE.
    #       - `#files_null_access_time_per_rse`: number of files which have a NULL `accessed_at` field ...
    #           ... in each dataset in a RSE. ...
    #           ... This is important to know, so we can filter out datasets that have any file with a NULL accessed_at.
    #       - `#files_per_rse`: number of files of the dataset in that RSE
    #       - `#files_unique_per_rse`: unique count of dataset files in that RSE
    #       Final result will be like: one dataset can be in multiple RSEs and presumably ...
    #           ... it may have different sizes, since a dataset may have lost one of its blocks or files in a RSE.
    df_final_dataset_rse = df_all \
        .groupby(["dataset", "rse_id"]) \
        .agg(_sum(col("fsize")).alias("dataset_size_in_rse"),
             _max(col("accessed_at")).alias("last_access_time_of_dataset_per_rse"),
             _sum(when(col("accessed_at").isNull(), 1).otherwise(0)).alias("#files_null_access_time_per_rse"),
             _count(lit(1)).alias("#files_per_rse"),
             countDistinct(col("file")).alias("#files_unique_per_rse"),
             ) \
        .cache()

    # STEP-13: Get thresholds. They are unix timestamps which are 3, 6 and 12 months ago from today.
    ts_thresholds = get_ts_thresholds()
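    # `get_ts_thresholds` is defined elsewhere; a minimal sketch of what it might return
    # (hypothetical, assuming unix-second thresholds keyed by month count, ~30 days per month):
    #   from datetime import datetime, timedelta
    #   ts_thresholds = {m: int((datetime.utcnow() - timedelta(days=30 * m)).timestamp())
    #                    for m in (3, 6, 12)}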

    # STEP-14:
    #   Filter for calculating last_accessed_at_least_{12|6|3}_months_ago columns.
    #       - To produce correct results, the "last_access_time_of_dataset_per_rse" field should not be null,
    #           which means all of the dataset's files have their accessed_at fields filled.
    #       - And "#files_null_access_time_per_rse"==0 means that there should not be ...
    #           any file with a NULL "accessed_at" field.
    # Group by dataset to get final result from all RSEs' datasets.
    #   - max_dataset_size(TB): max size of dataset in all RSEs that contain this dataset
    #   - min_dataset_size(TB): min size of dataset in all RSEs that contain this dataset
    #   - avg_dataset_size(TB): avg size of dataset in all RSEs that contain this dataset
    #   - last_access_time_of_dataset: last access time of dataset in all RSEs
    df = df_final_dataset_rse \
        .filter(col("last_access_time_of_dataset_per_rse").isNotNull() &
                (col("#files_null_access_time_per_rse") == 0)
                ) \
        .groupby(["dataset"]) \
        .agg(_round(_max(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("max_dataset_size(TB)"),
             _round(_min(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("min_dataset_size(TB)"),
             _round(_avg(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("avg_dataset_size(TB)"),
             _sum(col("#files_null_access_time_per_rse")).alias("#files_null_access_time_per_dataset"),
             _max(col("last_access_time_of_dataset_per_rse")).alias("last_access_time_of_dataset"),
             ) \
        .withColumn('last_access_more_than_12_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[12], 1).otherwise(0)
                    ) \
        .withColumn('last_access_more_than_6_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[6], 1).otherwise(0)
                    ) \
        .withColumn('last_access_more_than_3_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[3], 1).otherwise(0)
                    ) \
        .filter((col('last_access_more_than_12_months_ago') == 1) |
                (col('last_access_more_than_6_months_ago') == 1) |
                (col('last_access_more_than_3_months_ago') == 1)
                ) \
        .cache()

    # STEP-15: Find datasets which have only null accessed_at fields in their files
    df_all_null_accessed_at = df_final_dataset_rse \
        .filter(col("last_access_time_of_dataset_per_rse").isNull()) \
        .groupby(["dataset"]) \
        .agg(_round(_max(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("max_dataset_size(TB)"),
             _round(_min(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("min_dataset_size(TB)"),
             _round(_avg(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("avg_dataset_size(TB)"),
             _sum(col("#files_null_access_time_per_rse")).alias("#files_null_access_time_per_dataset"),
             _max(col("last_access_time_of_dataset_per_rse")).alias("last_access_time_of_dataset"),
             ) \
        .cache()

    # Totals for non-null data: datasets not read for more than 3, 6 or 12 months; this union equals the "more than 3 months" set.
    df.select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()

    # For 12 months
    df.filter(col("last_access_more_than_12_months_ago") == 1).select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df.filter(col("last_access_more_than_12_months_ago") == 1).count())

    # For 6 months
    df.filter(col("last_access_more_than_6_months_ago") == 1).select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df.filter(col("last_access_more_than_6_months_ago") == 1).count())

    # For 3 months
    df.filter(col("last_access_more_than_3_months_ago") == 1).select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df.filter(col("last_access_more_than_3_months_ago") == 1).count())

    # For all null accessed_at(all files) datasets
    df_all_null_accessed_at.select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df_all_null_accessed_at.count())

    return df, df_all_null_accessed_at
def create_main_df(spark, hdfs_paths, base_eos_dir):
    # UTC timestamp of start hour of spark job
    ts_current_hour = int(datetime.utcnow().replace(
        minute=0, second=0, microsecond=0, tzinfo=timezone.utc).timestamp() *
                          1000)
    # -----------------------------------------------------------------------------------------------------------------
    #                -- ==================  Prepare main Spark dataframes  ===========================

    # Get RSES id, name, type, tier, country, kind from RSES table dump
    df_rses = spark.read.format("com.databricks.spark.avro").load(hdfs_paths['RSES']) \
        .filter(col('DELETED_AT').isNull()) \
        .withColumn('replica_rse_id', lower(_hex(col('ID')))) \
        .withColumnRenamed('RSE', 'rse') \
        .withColumnRenamed('RSE_TYPE', 'rse_type') \
        .withColumn('rse_tier', _split(col('rse'), '_').getItem(0)) \
        .withColumn('rse_country', _split(col('rse'), '_').getItem(1)) \
        .withColumn('rse_kind',
                    when(col("rse").endswith('Temp'), 'temp')
                    .when(col("rse").endswith('Test'), 'test')
                    .otherwise('prod')
                    ) \
        .select(['replica_rse_id', 'rse', 'rse_type', 'rse_tier', 'rse_country', 'rse_kind'])

    # Rucio Dataset(D) refers to dbs block, so we used DBS terminology from the beginning
    df_contents_f_to_b = spark.read.format("com.databricks.spark.avro").load(hdfs_paths['CONTENTS']) \
        .filter(col("SCOPE") == "cms") \
        .filter(col("DID_TYPE") == "D") \
        .filter(col("CHILD_TYPE") == "F") \
        .withColumnRenamed("NAME", "block") \
        .withColumnRenamed("CHILD_NAME", "file") \
        .select(["block", "file"])

    # Rucio Dataset(D) refers to dbs block; Rucio Container(C) refers to dbs dataset.
    # We used DBS terminology from the beginning
    df_contents_b_to_d = spark.read.format("com.databricks.spark.avro").load(hdfs_paths['CONTENTS']) \
        .filter(col("SCOPE") == "cms") \
        .filter(col("DID_TYPE") == "C") \
        .filter(col("CHILD_TYPE") == "D") \
        .withColumnRenamed("NAME", "dataset") \
        .withColumnRenamed("CHILD_NAME", "block") \
        .select(["dataset", "block"])

    # Get file to dataset map
    df_contents_ds_files = df_contents_f_to_b.join(df_contents_b_to_d, ["block"], how="left") \
        .filter(col('file').isNotNull()) \
        .filter(col('dataset').isNotNull()) \
        .withColumnRenamed('dataset', 'contents_dataset') \
        .withColumn('is_d_name_from_rucio', lit(BOOL_STR[True])) \
        .select(["contents_dataset", "file", "is_d_name_from_rucio"])

    dbs_files = spark.read.format('avro').load(hdfs_paths['FILES']) \
        .withColumnRenamed('LOGICAL_FILE_NAME', 'file') \
        .withColumnRenamed('DATASET_ID', 'dbs_file_ds_id') \
        .withColumnRenamed('FILE_SIZE', 'dbs_file_size') \
        .select(['file', 'dbs_file_ds_id', 'dbs_file_size'])

    dbs_datasets = spark.read.format('avro').load(hdfs_paths['DATASETS'])

    df_dbs_ds_files = dbs_files.join(dbs_datasets.select(['DATASET_ID', 'DATASET']),
                                     dbs_files.dbs_file_ds_id == dbs_datasets.DATASET_ID, how='left') \
        .filter(col('file').isNotNull()) \
        .filter(col('DATASET').isNotNull()) \
        .withColumnRenamed('dbs_file_ds_id', 'dbs_dataset_id') \
        .withColumnRenamed('DATASET', 'dbs_dataset') \
        .withColumn('is_d_name_from_dbs', lit(BOOL_STR[True])) \
        .select(['file', 'dbs_dataset', 'is_d_name_from_dbs'])

    # Prepare replicas
    df_replicas = spark.read.format('avro').load(hdfs_paths['REPLICAS']) \
        .filter(col("SCOPE") == "cms") \
        .withColumn('replica_rse_id', lower(_hex(col('RSE_ID')))) \
        .withColumn('replica_file_size', col('BYTES').cast(LongType())) \
        .withColumnRenamed('NAME', 'file') \
        .withColumnRenamed('ACCESSED_AT', 'replica_accessed_at') \
        .withColumnRenamed('CREATED_AT', 'replica_created_at') \
        .withColumnRenamed('LOCK_CNT', 'lock_cnt') \
        .withColumnRenamed('STATE', 'state') \
        .select(['file', 'replica_rse_id', 'replica_file_size',
                 'replica_accessed_at', 'replica_created_at', 'lock_cnt'])

    # Create enriched file df which adds dbs file size to replicas files. The left join keeps only Replicas files.
    df_files_enriched_with_dbs = df_replicas \
        .join(dbs_files.select(['file', 'dbs_file_size']), ['file'], how='left') \
        .withColumn('joint_file_size',
                    when(col('replica_file_size').isNotNull(), col('replica_file_size'))
                    .when(col('dbs_file_size').isNotNull(), col('dbs_file_size'))
                    ) \
        .select(['file', 'replica_rse_id', 'replica_accessed_at', 'replica_created_at', 'lock_cnt',
                 'replica_file_size', 'dbs_file_size', 'joint_file_size'])

    # -----------------------------------------------------------------------------------------------------------------
    #            -- ==================  only Rucio: Replicas and Contents  ======================= --

    df_only_from_rucio = df_replicas \
        .join(df_contents_ds_files, ['file'], how='left') \
        .select(['contents_dataset', 'file', 'replica_rse_id', 'replica_file_size',
                 'replica_accessed_at', 'replica_created_at', 'is_d_name_from_rucio', 'lock_cnt'])

    # Use them in outer join
    # _max(col('replica_accessed_at')).alias('rucio_last_accessed_at'),
    # _max(col('replica_created_at')).alias('rucio_last_created_at'),

    df_only_from_rucio = df_only_from_rucio \
        .groupby(['replica_rse_id', 'contents_dataset']) \
        .agg(_sum(col('replica_file_size')).alias('rucio_size'),
             _count(lit(1)).alias('rucio_n_files'),
             _sum(
                 when(col('replica_accessed_at').isNull(), 0)
                     .otherwise(1)
             ).alias('rucio_n_accessed_files'),
             _first(col("is_d_name_from_rucio")).alias("is_d_name_from_rucio"),
             _sum(col('lock_cnt')).alias('rucio_locked_files')
             ) \
        .withColumn('rucio_is_d_locked',
                    when(col('rucio_locked_files') > 0, IS_DATASET_LOCKED[True])
                    .otherwise(IS_DATASET_LOCKED[False])
                    ) \
        .select(['contents_dataset', 'replica_rse_id', 'rucio_size', 'rucio_n_files', 'rucio_n_accessed_files',
                 'is_d_name_from_rucio', 'rucio_locked_files', 'rucio_is_d_locked', ])

    # -----------------------------------------------------------------------------------------------------------------
    #             -- =================  only DBS: Replicas, Files, Datasets  ====================== --

    # Only files present in Replicas are processed; select only DBS-related fields
    df_only_from_dbs = df_files_enriched_with_dbs \
        .select(['file', 'replica_rse_id', 'dbs_file_size', 'replica_accessed_at', 'lock_cnt']) \
        .join(df_dbs_ds_files, ['file'], how='left') \
        .filter(col('dbs_dataset').isNotNull()) \
        .select(['file', 'dbs_dataset', 'replica_rse_id', 'dbs_file_size', 'replica_accessed_at',
                 'is_d_name_from_dbs', 'lock_cnt'])

    df_only_from_dbs = df_only_from_dbs \
        .groupby(['replica_rse_id', 'dbs_dataset']) \
        .agg(_sum(col('dbs_file_size')).alias('dbs_size'),
             _count(lit(1)).alias('dbs_n_files'),
             _sum(
                 when(col('replica_accessed_at').isNull(), 0)
                     .otherwise(1)
             ).alias('dbs_n_accessed_files'),
             _first(col("is_d_name_from_dbs")).alias("is_d_name_from_dbs"),
             _sum(col('lock_cnt')).alias('dbs_locked_files')
             ) \
        .withColumn('dbs_is_d_locked',
                    when(col('dbs_locked_files') > 0, IS_DATASET_LOCKED[True])
                    .otherwise(IS_DATASET_LOCKED[False])
                    ) \
        .select(['dbs_dataset', 'replica_rse_id', 'dbs_size', 'dbs_n_files', 'dbs_n_accessed_files',
                 'is_d_name_from_dbs', 'dbs_locked_files', 'dbs_is_d_locked'])

    # Full outer join of Rucio and DBS to get all dataset-file maps
    df_dataset_file_map_enr = df_contents_ds_files.join(df_dbs_ds_files,
                                                        ['file'],
                                                        how='full')

    # -----------------------------------------------------------------------------------------------------------------
    #               -- ======  check files do not have dataset name  ============ --

    # Count Replicas files that have no dataset name in Contents, in DBS, or in both
    x = df_replicas.join(df_dataset_file_map_enr, ['file'], how='left') \
        .select(['contents_dataset', 'dbs_dataset', 'file'])

    y_contents = x.filter(col('contents_dataset').isNull())
    z_dbs = x.filter(col('dbs_dataset').isNull())
    t_both = x.filter(
        col('contents_dataset').isNull() & col('dbs_dataset').isNull())
    stats_dict = {
        "Replicas files do not have dataset name in Contents":
        y_contents.select('file').distinct().count(),
        "Replicas files do not have dataset name in DBS":
        z_dbs.select('file').distinct().count(),
        "Replicas files do not have dataset name neither in Contents nor DBS":
        t_both.select('file').distinct().count()
    }
    write_stats_to_eos(base_eos_dir, stats_dict)
    del x, y_contents, z_dbs, t_both
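    # `write_stats_to_eos` is defined elsewhere; a minimal sketch (hypothetical, assuming it
    # dumps the stats dict as JSON into a file under the given EOS directory):
    #   import json, os
    #   def write_stats_to_eos(base_dir, stats):
    #       os.makedirs(base_dir, exist_ok=True)
    #       with open(os.path.join(base_dir, "stats.json"), "w") as fd:
    #           json.dump(stats, fd)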

    # -----------------------------------------------------------------------------------------------------------------
    #              -- ======  joint Rucio and DBS: Replicas, Contents, Files, Datasets  ============ --

    # Main aim is to get all datasets of files
    df_dataset_file_map_enr = df_dataset_file_map_enr \
        .withColumn("dataset",
                    when(col("contents_dataset").isNotNull(), col("contents_dataset"))
                    .when(col("dbs_dataset").isNotNull(), col("dbs_dataset"))
                    ) \
        .withColumn("is_ds_from_rucio", when(col("is_d_name_from_rucio").isNotNull(), 1).otherwise(0)) \
        .withColumn("is_ds_from_dbs", when(col("is_d_name_from_dbs").isNotNull(), 1).otherwise(0)) \
        .select(['dataset', 'file', 'is_ds_from_dbs', 'is_ds_from_rucio'])

    df_joint_ds_files = df_files_enriched_with_dbs \
        .select(['file', 'replica_rse_id', 'replica_accessed_at', 'replica_created_at',
                 'joint_file_size', 'lock_cnt']) \
        .join(df_dataset_file_map_enr, ['file'], how='left') \
        .filter(col('dataset').isNotNull()) \
        .select(['dataset', 'file', 'is_ds_from_dbs', 'is_ds_from_rucio',
                 'replica_rse_id', 'replica_accessed_at', 'replica_created_at', 'joint_file_size', 'lock_cnt'])

    df_joint_main = df_joint_ds_files \
        .groupby(['replica_rse_id', 'dataset']) \
        .agg(_sum(col('joint_file_size')).alias('joint_size'),
             _max(col('replica_accessed_at')).alias('joint_last_accessed_at'),
             _max(col('replica_created_at')).alias('joint_last_created_at'),
             _sum(col('is_ds_from_dbs')).alias('joint_dbs_n_files'),
             _sum(col('is_ds_from_rucio')).alias('joint_rucio_n_files'),
             _count(lit(1)).alias('joint_n_files'),
             _sum(
                 when(col('replica_accessed_at').isNull(), 0).otherwise(1)
             ).alias('joint_n_accessed_files'),
             _sum(col('lock_cnt')).alias('joint_locked_files')
             ) \
        .withColumn('all_f_in_dbs',
                    when((col('joint_dbs_n_files') == 0) | (col('joint_dbs_n_files').isNull()),
                         IS_ALL_DATASET_FILES_EXISTS['n'])
                    .when(col('joint_dbs_n_files') == col('joint_n_files'), IS_ALL_DATASET_FILES_EXISTS['a'])
                    .when(col('joint_dbs_n_files') > 0, IS_ALL_DATASET_FILES_EXISTS['p'])
                    ) \
        .withColumn('all_f_in_rucio',
                    when((col('joint_rucio_n_files') == 0) | (col('joint_rucio_n_files').isNull()),
                         IS_ALL_DATASET_FILES_EXISTS['n'])
                    .when(col('joint_rucio_n_files') == col('joint_n_files'), IS_ALL_DATASET_FILES_EXISTS['a'])
                    .when(col('joint_rucio_n_files') > 0, IS_ALL_DATASET_FILES_EXISTS['p'])
                    ) \
        .withColumn('joint_is_d_locked',
                    when(col('joint_locked_files') > 0, IS_DATASET_LOCKED[True])
                    .otherwise(IS_DATASET_LOCKED[False])
                    ) \
        .withColumnRenamed("replica_rse_id", "rse_id") \
        .select(['dataset',
                 'rse_id',
                 'joint_size',
                 'joint_last_accessed_at',
                 'joint_last_created_at',
                 'joint_dbs_n_files',
                 'joint_rucio_n_files',
                 'joint_n_files',
                 'joint_n_accessed_files',
                 'all_f_in_dbs',
                 'all_f_in_rucio',
                 'joint_locked_files',
                 'joint_is_d_locked'
                 ])
    # -----------------------------------------------------------------------------------------------------------------
    #          -- ============  Dataset enrichment with Dataset tags  ============ --

    # Enrich dbs dataset with names from id properties of other tables
    dbs_data_tiers = spark.read.format('avro').load(hdfs_paths['DATA_TIERS'])
    dbs_physics_group = spark.read.format('avro').load(
        hdfs_paths['PHYSICS_GROUPS'])
    dbs_acquisition_era = spark.read.format('avro').load(
        hdfs_paths['ACQUISITION_ERAS'])
    dbs_dataset_access_type = spark.read.format('avro').load(
        hdfs_paths['DATASET_ACCESS_TYPES'])

    dbs_datasets_enr = dbs_datasets \
        .join(dbs_data_tiers, ['data_tier_id'], how='left') \
        .join(dbs_physics_group, ['physics_group_id'], how='left') \
        .join(dbs_acquisition_era, ['acquisition_era_id'], how='left') \
        .join(dbs_dataset_access_type, ['dataset_access_type_id'], how='left') \
        .select(['dataset', 'dataset_id', 'is_dataset_valid', 'primary_ds_id', 'processed_ds_id', 'prep_id',
                 'data_tier_id', 'data_tier_name',
                 'physics_group_id', 'physics_group_name',
                 'acquisition_era_id', 'acquisition_era_name',
                 'dataset_access_type_id', 'dataset_access_type'])

    # -----------------------------------------------------------------------------------------------------------------
    #                       -- ============  Main: join all  ============ --

    cond_with_only_rucio = [
        df_joint_main.dataset == df_only_from_rucio.contents_dataset,
        df_joint_main.rse_id == df_only_from_rucio.replica_rse_id
    ]

    cond_with_only_dbs = [
        df_joint_main.dataset == df_only_from_dbs.dbs_dataset,
        df_joint_main.rse_id == df_only_from_dbs.replica_rse_id
    ]

    # Left joins: since df_joint_main comes from a full outer join, it should have all datasets of both Rucio and DBS
    df_main = df_joint_main.join(df_only_from_rucio,
                                 cond_with_only_rucio,
                                 how='left').drop('replica_rse_id')
    df_main = df_main.join(df_only_from_dbs, cond_with_only_dbs,
                           how='left').drop('replica_rse_id')

    df_main = df_main \
        .withColumn('rucio_has_ds_name',
                    when(col('is_d_name_from_rucio').isNotNull(), col('is_d_name_from_rucio'))
                    .otherwise(BOOL_STR[False])) \
        .withColumn('dbs_has_ds_name',
                    when(col('is_d_name_from_dbs').isNotNull(), col('is_d_name_from_dbs'))
                    .otherwise(BOOL_STR[False]))

    # Remove unneeded columns by selecting specific ones
    df_main = df_main.select([
        'dataset', 'rse_id', 'joint_size', 'joint_last_accessed_at',
        'joint_last_created_at', 'joint_dbs_n_files', 'joint_rucio_n_files',
        'joint_n_files', 'joint_n_accessed_files', 'all_f_in_dbs',
        'all_f_in_rucio', 'rucio_size', 'rucio_n_files',
        'rucio_n_accessed_files', 'rucio_has_ds_name', 'dbs_size',
        'dbs_n_files', 'dbs_n_accessed_files', 'dbs_has_ds_name',
        'rucio_locked_files', 'rucio_is_d_locked', 'dbs_locked_files',
        'dbs_is_d_locked', 'joint_locked_files', 'joint_is_d_locked'
    ])

    # Add DBS dataset enrichments to the main df
    df_main = df_main.join(dbs_datasets_enr, ['dataset'], how='left')

    # Add RSES name, type, tier, country, kind to dataset
    df_main = df_main \
        .join(df_rses, df_main.rse_id == df_rses.replica_rse_id, how='left') \
        .drop('rse_id', 'replica_rse_id')

    # UTC timestamp of start hour of the spark job
    df_main = df_main.withColumn('tstamp_hour', lit(ts_current_hour))

    # Fill null values of string type columns. Null values are hard to handle in ES queries.
    df_main = df_main.fillna(value=NULL_STR_TYPE_COLUMN_VALUE,
                             subset=STR_TYPE_COLUMNS)
    return df_main
Example 4
# print(df.count())

# Spark india trade
keys = []
values = []
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
df = spark.read.csv(path="DataSources/india-trade-data/2018-2010_import.csv",
                    header="true")
df = df.select(df.country,
               df.value.cast('float').alias('value')).where(
                   df.value.isNotNull())
df = df.groupBy("country").agg(_sum("value").alias("sum_val"))
df = df.select(df.country,
               df.sum_val.cast('int').alias('total')).orderBy('total',
                                                              ascending=False)
rows = df.limit(10).collect()
for r in rows:
    keys.append(r[0])
    values.append(r[1])
explode = []
for i in range(len(values)):
    explode.append(0)
explode[0] = 0.1  # only "explode" the largest slice
fig1, ax1 = plt.subplots()
ax1.pie(values,
        explode=explode,
        labels=keys,
Example 5
combined_df = lines.withColumn("tx_id",splittedClms.getItem(0).cast("integer")) \
    .withColumn("product_id",splittedClms.getItem(1).cast("integer")) \
    .withColumn("qty",splittedClms.getItem(2).cast("integer")) \
    .withColumn("amt",splittedClms.getItem(3).cast("integer")) \
    .withColumn("day_dt",splittedClms.getItem(4).cast("string"))

streamingDf = combined_df.select("tx_id", "product_id", "qty", "amt", "day_dt",
                                 "timestamp")

joinedDf = streamingDf.join(productDf, "product_id").select(
    "tx_id", "product_id", "name", "qty", "amt",
    "timestamp")  # inner equi-join with a static DF

aggDf = joinedDf.groupBy(window(joinedDf.timestamp, "2 minutes", "1 minutes"),
                         joinedDf.product_id, joinedDf.name).agg(
                             _sum("qty"),
                             _sum("amt")).sort(joinedDf.product_id)
# TODO: .sort(joinedDf.columns[1])

# What are "2 minutes" and "1 minutes" here? Answer: the window length and the slide interval (refresh frequency, every 1 minute)
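# Illustration of the sliding window above (window length 2 minutes, slide 1 minute):
# an event with timestamp 10:03:20 falls into two overlapping windows,
# [10:02:00, 10:04:00) and [10:03:00, 10:05:00), so it contributes to both aggregates.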

query = aggDf \
    .writeStream \
    .outputMode("complete") \
    .format("console") \
    .option("truncate", "false").start()

# 'complete' means all rows of the input since the start, even if they have not changed

query.awaitTermination()
    # calculate min and max of order_date in order to calculate recency
    max_order_date, min_order_date = init_flat_data \
        .select( _max(col('order_date')), _min(col('order_date'))) \
        .take(1)[0]

    # calculate recency/frequency and monetary
    calculate_diff_day = udf(lambda x: (max_order_date - x).days,
                             IntegerType())
    rfm_table = init_flat_data \
        .withColumn('recency', calculate_diff_day('order_date')) \
        .groupby(['company_id', 'company_name', 'country']) \
        .agg(
            _mean(col('recency')).alias('recency'),
            _count(col('order_id')).alias('frequency'),
            _sum(col('NBI')).alias('monetary')
        )

    # calculate quantiles for each variable
    quantiles = rfm_table.approxQuantile(['recency', 'frequency', 'monetary'],
                                         [0.20, 0.4, 0.6, 0.8], 0)
    r_quantile = quantiles[0]
    f_quantile = quantiles[1]
    m_quantile = quantiles[2]

    # calculate score of each variable
    def_r_score = udf(
        lambda x: 5 if x < r_quantile[0] else 4 if x < r_quantile[1] else 3
        if x < r_quantile[2] else 2 if x < r_quantile[3] else 1, IntegerType())
    def_f_score = udf(
        lambda x: 1 if x < f_quantile[0] else 2 if x < f_quantile[1] else 3
Example 7
engine = create_engine(get_engine_uri("mysql", "pymysql"), pool_size=50)
save_videos_table(videos_df.rdd.collect(), engine)

# Process trending videos
#TODO: Replace with an env variable
if len(df.take(10)) < 10: # insufficient videos
   print("Too few videos, exiting...")
   exit(0)

trending_df = sql_reader.options(**sql_config).option("dbtable", "trending").load()
trending_df = trending_df \
   .sort("timestamp", ascending=False) \
   .limit(10) \
   .groupBy("video_id") \
   .agg(_sum("views").alias("c_views"))

ts = current_timestamp()
trending_df = df.join(trending_df, on='video_id', how='left_outer').na.fill(0, "c_views")
trending_df = trending_df \
   .withColumn("sum", col("views") + col("c_views")) \
   .drop("views") \
   .drop("c_views") \
   .drop("id") \
   .withColumnRenamed("sum", "views") \
   .withColumn("timestamp", ts) \
   .sort("views", ascending=False) \
   .limit(10)

trending_df.write \
   .mode("append") \
# Process data based on max date (where dates greater than the record date are missing)
df4_max = df3.filter(col("_diff_max") > 0)
df5_max = df4_max.withColumn("_next_dates", get_next_dates_udf(date_column, "_diff_max"))

# Process data based on min date (where dates less than the record date are missing)
df4_min = df3.filter(col("_diff_min") < 0)
df5_min = df4_min.withColumn("_next_dates", get_prev_dates_udf(date_column, "_diff_min"))
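# `get_next_dates_udf` / `get_prev_dates_udf` are defined elsewhere; a plausible sketch
# (hypothetical, assuming the date column holds date values and the diff columns hold day counts):
#   from datetime import timedelta
#   from pyspark.sql.functions import udf
#   from pyspark.sql.types import ArrayType, DateType
#   get_next_dates_udf = udf(lambda d, diff: [d + timedelta(days=i) for i in range(1, diff + 1)],
#                            ArrayType(DateType()))
#   get_prev_dates_udf = udf(lambda d, diff: [d - timedelta(days=i) for i in range(1, abs(diff) + 1)],
#                            ArrayType(DateType()))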

#  Combine dataframes for all missing data
df5 = df5_max.union(df5_min)

# Add dummy value
df6 = df5.withColumn(fill_column, lit(0))

# Explode dates
df7 = df6.withColumn(date_column, explode("_next_dates"))

# Drop columns that were added for processing
df8 = df7.drop("max_dt", "min_dt","_diff_max", "_diff_min", "_next_dates")

# Drop duplicates
df9 = df8.dropDuplicates()

# Combine with base dataframe
df10 = df9.union(df1)
df10 = df10.dropDuplicates()

# Aggregate to get rid of rows from the exploded data that are already present in the base dataframe
df11 = df10.groupBy("dt","product","channel").agg(_sum("quantity"))

df11.sort("dt").show(10)
plt.title("Daily cases")
plt.xlabel("Date")
plt.ylabel("Count")
plt.rcParams["figure.figsize"] = (30, 5)

#Show cumulative confirmed cases by day
cases_group_by_date.toPandas().plot(x="date", y="cumulativeConfirmed")
plt.title("Daily Cumulative Confirmed Cases")
plt.xlabel("Date")
plt.ylabel("Count")
plt.rcParams["figure.figsize"] = (30, 5)

#Show dailyConfirmed cases sum group by dayOfWeek
cases_group_by_day_of_week = cases_group_by_date\
                        .groupby(['dayOfWeek'])\
                        .agg(_sum('dailyConfirmed'))\
                        .orderBy('dayOfWeek', ascending=True)
cases_group_by_day_of_week.show()

cases_group_by_day_of_week.toPandas().plot(kind='bar')
plt.rcParams["figure.figsize"] = (5, 5)

#Show cases grouped by category: imported, local cases not residing in dorms, local cases residing in dorms
case_categories_columns = ["Case Type", "Count"]
case_categories = [
    ("Imported Cases",
     cases_group_by_date.groupBy().sum('dailyImported').collect()[0][0]),
    ("Local Transmission Cases", cases_group_by_date.groupBy().sum(
        'localCaseNotResidingInDorms').collect()[0][0]),
    ("Local Case Reside Dorms", cases_group_by_date.groupBy().sum(
        'localCaseResidingInDorms').collect()[0][0])
Example 10
def process_and_get_pd_dfs(spark, start_date, end_date):
    schema = _get_schema()
    raw_df = (
        spark.read.option('basePath', DEFAULT_HDFS_FOLDER).json(
            get_candidate_files(start_date,
                                end_date,
                                spark,
                                base=DEFAULT_HDFS_FOLDER),
            schema=schema,
        ).select('data.*').filter(
            col("RecordTime").between(f"{start_date.timestamp() * 1000}",
                                      f"{end_date.timestamp() * 1000}")).
        filter(
            (col('Site') == 'T3_US_ANL') |  # ANL
            (col('Site') == 'T3_US_NERSC') |  # NERSC
            (col('Site') == 'T3_US_OSG') |  # OSG
            (col('Site') == 'T3_US_PSC') |  # PSC
            (col('Site') == 'T3_US_SDSC') |  # SDSC
            (col('Site') == 'T3_US_TACC') |  # TACC
            ((col('Site').endswith('_ES_PIC_BSC'))
             & (col('MachineAttrCMSSubSiteName0') == 'PIC-BSC')) |  # BSC
            ((col('Site') == 'T1_IT_CNAF')
             & (col('MachineAttrCMSSubSiteName0') == 'CNAF-CINECA'))
            |  # CINECA
            ((col('Site') == 'T1_DE_KIT')
             & (col('MachineAttrCMSSubSiteName0') == 'KIT-HOREKA')) |  # HOREKA
            ((col('Site') == 'T2_DE_RWTH')
             & (col('MachineAttrCMSSubSiteName0') == 'RWTH-HPC'))  # RWTH
        ).filter(col('Status').isin([
            'Running',
            'Completed'
        ])).withColumn('date', from_unixtime(
            (col('RecordTime') /
             1000))).withColumn(
                 'site_name',
                 when(col('Site') == 'T3_US_ANL',
                      lit("ANL")).when(
                          col('Site') == 'T3_US_NERSC', lit("NERSC")).when(
                              col('Site') == 'T3_US_OSG', lit("OSG")).when(
                                  col('Site') == 'T3_US_PSC', lit("PSC")).when(
                                      col('Site') == 'T3_US_SDSC',
                                      lit("SDSC")).when(
                                          col('Site') == 'T3_US_TACC',
                                          lit("TACC")).
                 when(
                     col('Site').endswith('_ES_PIC_BSC'), lit("BSC")).when(
                         col('MachineAttrCMSSubSiteName0') == 'CNAF-CINECA',
                         lit("CINECA")).when(
                             col('MachineAttrCMSSubSiteName0') == 'KIT-HOREKA',
                             lit("HOREKA")).
                 when(
                     col('MachineAttrCMSSubSiteName0') == 'RWTH-HPC',
                     lit("RWTH"))).withColumn(
                         "RequestCpus",
                         when(
                             col("RequestCpus").isNotNull(),
                             col("RequestCpus")).otherwise(lit(1)),
                     ).withColumn('dayofmonth', _dayofmonth(
                         col('date'))).withColumn(
                             'month',
                             concat_ws(
                                 '-', _year(col('date')),
                                 format_string('%02d', _month(
                                     col('date'))))  # 2-digit month, default 1
                         ).drop(
                             'Site',
                             'MachineAttrCMSSubSiteName0').withColumnRenamed(
                                 'site_name', 'site'))

    # There should be only Completed status for a GlobalJobId
    df_core_hr = raw_df.filter(col('Status') == 'Completed') \
        .drop_duplicates(["GlobalJobId"])

    df_core_hr_daily = df_core_hr.groupby(['site', 'month', 'dayofmonth']) \
        .agg(_round(_sum("CoreHr")).alias("sum CoreHr"))

    df_core_hr_monthly = df_core_hr.groupby(['site', 'month']) \
        .agg(_round(_sum("CoreHr")).alias("sum CoreHr"))

    sec_12_min = 60 * 12
    time_window_12m = from_unixtime(
        unix_timestamp('date') - unix_timestamp('date') % sec_12_min)

    # 1st group-by includes GlobalJobId to get the running cores of each GlobalJobId without duplicates in each 12-minute window
    # 2nd group-by gets the sum of RequestCpus in each 12-minute window
    # 3rd group-by gets the avg of RequestCpus (over the 12-minute windows) for each site for each day
    df_running_cores_daily = raw_df \
        .withColumn('12m_window', time_window_12m) \
        .groupby(['site', 'month', 'dayofmonth', '12m_window', 'GlobalJobId']) \
        .agg(_max(col('RequestCpus')).alias('running_cores_of_single_job_in_12m')) \
        .groupby(['site', 'month', 'dayofmonth', '12m_window']) \
        .agg(_sum(col('running_cores_of_single_job_in_12m')).alias('running_cores_12m_sum')) \
        .groupby(['site', 'month', 'dayofmonth']) \
        .agg(_round(_avg(col('running_cores_12m_sum'))).alias('running_cores_avg_over_12m_sum'))
    return df_core_hr_daily.toPandas(), df_running_cores_daily.toPandas(
    ), df_core_hr_monthly.toPandas()
Example 11
purchaseDenseRank = dense_rank().over(windowSpec)
purchaseRank = rank().over(windowSpec)
dfWithDate.where("CustomerId IS NOT NULL").orderBy("CustomerId")\
  .select(
    col("CustomerId"),
    col("date"),
    col("Quantity"),
    purchaseRank.alias("quantityRank"),
    purchaseDenseRank.alias("quantityDenseRank"),
    maxPurchaseQuantity.alias("maxPurchaseQuantity")).show(30)

#rollup
dfNoNull = dfWithDate.na.drop()
dfNoNull.createOrReplaceTempView("dfNoNull")
dfNoNull.show()
rolledUpDF = dfNoNull.rollup("Date", "Country").agg(_sum("Quantity"))\
   .selectExpr("Date", "Country", "`sum(Quantity)` as total_quantity").orderBy("Date")
rolledUpDF.show()
rolledUpDF.where("Country IS NULL").show()
rolledUpDF.where("Date IS NULL").show()

#cubes
dfNoNull.cube("Date", "Country").agg(_sum(col("Quantity")))\
  .select("Date", "Country", "sum(Quantity)").orderBy("Date").show()
###################



## rdd and dataframes
a = spark.range(10).rdd
b = spark.range(10).toDF("id").rdd.map(lambda row: row[0])
Example 12
        from_json(col("value").cast("string"), stock_schema).alias("value")
    )

    trade_df = value_df.select("value.*") \
        .withColumn("CreatedTime", to_timestamp(col("CreatedTime"), "yyyy-MM-dd HH:mm:ss"))\
        .withColumn("Buy", expr("case when Type == 'BUY' then Amount else 0 end")) \
        .withColumn("Sell", expr("case when Type == 'SELL' then Amount else 0 end"))

    # trade_df.printSchema()

    # The watermark sets the expiry time for late-arriving data
    window_agg_df = trade_df \
        .withWatermark("CreatedTime", "30 minute") \
        .groupBy(
            window(col("CreatedTime"), "15 minute")) \
        .agg(_sum("Buy").alias("TotalBuy"),
             _sum("Sell").alias("TotalSell"))

    # window_agg_df.printSchema()
    output_df = window_agg_df.select("window.start", "window.end", "TotalBuy", "TotalSell")
    """
    # It will be used when we want to perform batch processing
    running_total_window = Window.orderBy("end") \
        .rowsBetween(Window.unboundedPreceding, Window.currentRow)

    final_output_df = output_df \
        .withColumn("RTotalBuy", _sum("TotalBuy").over(running_total_window)) \
        .withColumn("RTotalSell", _sum("TotalSell").over(running_total_window)) \
        .withColumn("NetValue", expr("RTotalBuy - RTotalSell"))

    final_output_df.show(truncate=False)
Example 13
def generate_cpu_eff_site(
    start_date=None,
    end_date=None,
    cms_type="production",
    output_folder="./www/cpu_eff",
    last_n_days=30,
    cpu_eff_outlier=0,
):
    """
    """
    _yesterday = datetime.combine(date.today() - timedelta(days=1),
                                  datetime.min.time())
    if not (start_date or end_date):
        # defaults to the last `last_n_days` days ending yesterday.
        # Default: (today-31 days to today-1 day)
        end_date = _yesterday
        start_date = end_date - timedelta(days=last_n_days)
    elif not start_date:
        start_date = end_date - timedelta(days=last_n_days)
    elif not end_date:
        end_date = min(start_date + timedelta(days=last_n_days), _yesterday)
    if start_date > end_date:
        raise ValueError(
            f"start date ({start_date}) should be earlier than end date({end_date})"
        )
    group_type_map = {
        "production": ["Workflow",
                       "WMAgent_RequestName"],  # Order is important
        "analysis": ["Workflow"],
        "test": ["Workflow"],
        "folding@home": ["Workflow"],
    }
    # Should be a list, used also in dataframe merge conditions.
    group_by_col = group_type_map[cms_type]
    spark = get_spark_session()
    schema = _get_schema()
    raw_df = (spark.read.option("basePath", _DEFAULT_HDFS_FOLDER).json(
        get_candidate_files(start_date,
                            end_date,
                            spark,
                            base=_DEFAULT_HDFS_FOLDER),
        schema=schema,
    ).select("data.*").filter(f"""Status='Completed'
          AND JobFailed=0
          AND RecordTime >= {start_date.timestamp() * 1000}
          AND RecordTime < {end_date.timestamp() * 1000}
          AND Type =  '{cms_type}'
          AND CpuEffOutlier = '{cpu_eff_outlier}'
          """).drop_duplicates(["GlobalJobId"]))
    raw_df = (raw_df.withColumn(
        "RequestCpus",
        when(col("RequestCpus").isNotNull(),
             col("RequestCpus")).otherwise(lit(1)),
    ).withColumn("CoreTime",
                 col("WallClockHr") * col("RequestCpus")).withColumn(
                     "Wasted_cputimehr",
                     ((col("RequestCpus") * col("WallClockHr")) -
                      col("CpuTimeHr")))).cache()

    grouped_tiers = raw_df.groupby("Tier", "Type", "CpuEffOutlier").agg(
        (100 * _sum("CpuTimeHr") / _sum("CoreTime")).alias("tier_cpueff"),
        _sum("RequestCpus").alias("tier_cpus"),
        _sum("CpuTimeHr").alias("tier_cputimehr"),
        _sum("WallClockHr").alias("tier_wallclockhr"),
    ).toPandas()
    grouped_wf = raw_df.groupby(*group_by_col, "Type").agg(
        (100 * _sum("CpuTimeHr") / _sum("CoreTime")).alias("wf_cpueff"),
        _sum("RequestCpus").alias("wf_cpus"),
        _sum("CpuTimeHr").alias("wf_cputimehr"),
        _sum("WallClockHr").alias("wf_wallclockhr"),
        _sum("Wasted_cputimehr").alias("wf_wasted_cputimehr"),
    )
    grouped_wf_t1_t2 = raw_df.filter("""Tier='T1' OR Tier='T2'""").groupby(
        *group_by_col, "Type").agg(
            (100 * _sum("CpuTimeHr") /
             _sum("CoreTime")).alias("wf_cpueff_t1_t2"),
            _sum("CpuTimeHr").alias("wf_cputimehr_t1_t2"),
            _sum("WallClockHr").alias("wf_wallclockhr_t1_t2"),
            _sum("Wasted_cputimehr").alias("wf_wasted_cputimehr_t1_t2"),
        )
    grouped_site_wf = raw_df.groupby(*group_by_col, "Site").agg(
        (100 * _sum("CpuTimeHr") / _sum("CoreTime")).alias("wf_site_cpueff"),
        _sum("RequestCpus").alias("wf_cpus"),
        _sum("CpuTimeHr").alias("wf_site_cputimehr"),
        _sum("WallClockHr").alias("wf_site_wallclockhr"),
        _sum("Wasted_cputimehr").alias("wf_site_wasted_cputimehr"),
        first("ScheddName").alias("schedd"),
        first("WMAgent_JobID").alias("wmagent_jobid"),
    )

    select_expr = f"""wf_wallclockhr > 100"""
    selected_df = grouped_wf.where(select_expr)
    selected_pd = selected_df.toPandas()
    grouped_wf_t1_t2 = grouped_wf_t1_t2.toPandas()
    grouped_wf_t1_t2.drop(['Type'], axis=1, inplace=True)

    # Merge grouped_wf and grouped_wf_t1_t2 to see cpueff, cputimehr and wallclockhr values of (T1-T2 sites only)
    selected_pd = pd.merge(selected_pd,
                           grouped_wf_t1_t2,
                           how='left',
                           left_on=group_by_col,
                           right_on=group_by_col)

    workflow_column = selected_pd["Workflow"].copy()
    filter_column = (workflow_column if group_by_col[-1] == "Workflow" else
                     selected_pd[group_by_col[-1]].copy())
    main_page = _generate_main_page(selected_pd, grouped_tiers, start_date,
                                    end_date, cms_type, workflow_column,
                                    filter_column, cpu_eff_outlier)
    os.makedirs(output_folder, exist_ok=True)
    with open(f"{output_folder}/CPU_Efficiency_Table.html", "w") as ofile:
        ofile.write(main_page)
    # We are only interested in the selected workflows.
    site_wf = grouped_site_wf.where(
        col(filter_column.name).isin(filter_column.to_list())).toPandas()
    if cms_type == "production":
        site_klinks = site_kibana_links()
        site_wf["log"] = (
            "<a href='https://cms-unified.web.cern.ch/cms-unified/logmapping/"
            + site_wf["WMAgent_RequestName"] + "/" + site_wf["schedd"] + "_" +
            site_wf["wmagent_jobid"] + ".tar.gz'>logs</a>")
        site_wf.drop(columns="schedd")
        site_wf["@Kibana"] = (site_klinks[0].format(
            START_DAY=(start_date + timedelta(seconds=time.altzone)
                       ).strftime('%Y-%m-%dT%H:%M:%S.000Z'),
            END_DAY=(end_date + timedelta(seconds=time.altzone)
                     ).strftime('%Y-%m-%dT%H:%M:%S.000Z')) +
                              str(cpu_eff_outlier) + site_klinks[1] +
                              site_wf["WMAgent_RequestName"] + site_klinks[2] +
                              site_wf["Workflow"] + site_klinks[3] +
                              site_wf["Site"] + site_klinks[4])
    site_wf = site_wf.set_index([*group_by_col, "Site"]).sort_index()
    # Create one file per workflow, so we don't have one big file that collapses the browser.
    _folder = f"{output_folder}/wfbysite"
    os.makedirs(_folder, exist_ok=True)
    num_levels = len(group_by_col)
    for workflow, df in site_wf.groupby(filter_column.name):
        sublevels = ""
        if num_levels > 1:
            df_ni = df.reset_index()
            sublevels = ("/".join(
                df_ni[group_by_col[0:-1]].drop_duplicates().values[0].tolist())
                         + "/")
            os.makedirs(f"{_folder}/{sublevels}", exist_ok=True)
        df.droplevel(list(range(num_levels))).to_html(
            f"{_folder}/{sublevels}CPU_Efficiency_bySite_{workflow}.html",
            escape=False,
        )
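
# A minimal usage sketch of the function above (hypothetical; the CMS type, output folder and
# date range below are assumptions, not values taken from the original source):
#   if __name__ == "__main__":
#       generate_cpu_eff_site(cms_type="production",
#                             output_folder="./www/cpu_eff",
#                             last_n_days=30,
#                             cpu_eff_outlier=0)
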
def get_df_sub_not_read_since(df_dataset_file_rse_ts_size,
                              filtered_rses_id_name_map, min_tb_limit,
                              n_months_filter):
    """Get dataframe of datasets that are not read since N months for sub details htmls

    Group by 'dataset' and 'rse_id' of get_df_dataset_file_rse_ts_size

    Filters:
        - If a dataset contains EVEN a single file with null accessed_at, filter it out

    Access time filter logic:
        - If 'last_access_time_of_dataset_in_rse' is older than 'n_months_filter' months, ...
          ... set the 'is_not_read_since_{n_months_filter}_months' column to True

    Columns:
        - 'dataset_size_in_rse_tb'
                Total size of a Dataset in an RSE.
                Produced by summing up datasets' all files in that RSE.
        - 'last_access_time_of_dataset_in_rse'
                Last access time of a Dataset in an RSE.
                Produced by getting max `accessed_at`(represents single file's access time) of a dataset in an RSE.
        - '#files_with_null_access_time_of_dataset_in_rse'
                Number of files count, which have NULL `accessed_at` values, of a Dataset in an RSE.
                This is important to know to filter out if there is any NULL `accessed_at` value of a Dataset.
        - '#files_of_dataset_in_rse'
                Number of files count of a Dataset in an RSE
        - '#distinct_files_of_dataset_in_rse'
                Number of unique files count of dataset in an RSE

    df_main_datasets_and_rses: RSE name, dataset and their size and access time calculations
    """
    # Name of the new boolean column that marks whether a dataset-rse_id couple has not been read for at least n_months_filter months
    bool_column_is_not_read_since_n_months = 'is_not_read_since_{}_months'.format(
        str(n_months_filter))

    # Get reverted dict to get RSE names from id
    reverted_filtered_rses_id_name_map = get_reverted_rses_id_name_map(
        filtered_rses_id_name_map)
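    # `get_n_months_ago_epoch_msec` and `get_reverted_rses_id_name_map` are defined elsewhere;
    # minimal sketches (hypothetical, assuming epoch milliseconds and a plain dict inversion):
    #   from datetime import datetime, timedelta
    #   def get_n_months_ago_epoch_msec(n_months):
    #       return int((datetime.utcnow() - timedelta(days=30 * n_months)).timestamp() * 1000)
    #   def get_reverted_rses_id_name_map(rses_id_name_map):
    #       return {v: k for k, v in rses_id_name_map.items()}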

    return df_dataset_file_rse_ts_size \
        .groupby(['rse_id', 'dataset']) \
        .agg(_round(_sum(col('f_size')) / TB_DENOMINATOR, 5).alias('dataset_size_in_rse_tb'),
             _max(col('accessed_at')).alias('last_access_time_of_dataset_in_rse'),
             _sum(
                 when(col('accessed_at').isNull(), 0).otherwise(1)
             ).alias('#_accessed_files_of_dataset_in_rse'),
             _count(lit(1)).alias('#_files_of_dataset_in_rse'),
             ) \
        .withColumn(bool_column_is_not_read_since_n_months,
                    when(
                        col('last_access_time_of_dataset_in_rse') < get_n_months_ago_epoch_msec(n_months_filter),
                        True).otherwise(False)
                    ) \
        .filter(col('last_access_time_of_dataset_in_rse').isNotNull()) \
        .filter(col(bool_column_is_not_read_since_n_months)) \
        .filter(col('dataset_size_in_rse_tb') > min_tb_limit) \
        .replace(reverted_filtered_rses_id_name_map, subset=['rse_id']) \
        .withColumnRenamed('rse_id', 'RSE name') \
        .select(['RSE name',
                 'dataset',
                 'dataset_size_in_rse_tb',
                 'last_access_time_of_dataset_in_rse',
                 '#_files_of_dataset_in_rse',
                 '#_accessed_files_of_dataset_in_rse',
                 ]) \
        .cache()
Example 15
def prepare_vxworks_data(
        file_tree_dataframe: DataFrame,
        vxworks_safety_features_dataframe: DataFrame) -> DataFrame:
    # Upcoming
    #
    # interrupt_stack_protection will need to consider whether or not the overflow / underflow
    # values are set to something non-zero in order for the protection to be valid
    # Currently that metadata doesn't exist
    #
    # user_stack_protection will need to check whether the overflow / underflow values
    # are non-zero, as well as the global_stack fill. For now, that metadata is either shaky
    # or does not exist.
    #
    # kernel_stack_protection will need to be considered. Currently this metadata
    # doesn't really exist, so the output will be rough.
    #
    # in write_protection, the vector table protection is _not_ a cross-platform feature,
    # and as such, computing statistics on it would be complex; we're going to avoid that for now
    vxworks_features_counted = vxworks_safety_features_dataframe.select(
        col('file_hash'),
        when(
            col('password_protection') == True, 1
        ).otherwise(0).alias('password_protection_count'),
        when(
            col('interrupt_stack_protection.guard_zones') == True, 1
        ).otherwise(0).alias('interrupt_stack_protection_count'),
        # If they have at least one of these two, we're going to give
        # them good boy points.
        when(
            (col('write_protection.user_text') == False) & \
            (col('write_protection.kernel_text') == False), 0
            # col('write_protection.virtual_mem_text') == True, 1
        ).otherwise(1).alias('write_protection_count_preliminary'),
        col('write_protection.virtual_mem_text').alias('virtual_mem_text'),
        when(
            col('kernel_stack_protection.guard_overflow_size_exec').isNotNull() & \
            col('kernel_stack_protection.guard_underflow_size_exec').isNotNull() & \
            col('kernel_stack_protection.guard_overflow_size_exception').isNotNull(), 1
        ).otherwise(None).alias('kernel_stack_protection_count'),
        when(
            (col('user_task_stack_protection.no_exec') == True) & \
            (col('user_task_stack_protection.guard_zones') == True), 1
        ).otherwise(0).alias('user_task_stack_protection_count'),
        when(
            col('file_hash').isNotNull(), 1
        ).otherwise(0).alias('row_count')
    ).withColumn(
        # Nested filter: If user text protection and kernel text protection are
        # disabled but virtual mem text is enabled, then this facet is secure
        # Otherwise, we should call it insecure
        'write_protection_count',
        when(
            (col('write_protection_count_preliminary') == 0) & \
            (col('virtual_mem_text') == True), 1
        ).otherwise(
            when(
                col('write_protection_count_preliminary') == 1, 1
            ).otherwise(0)
        )
    ).select(
        'file_hash',
        'row_count',
        'write_protection_count',
        'password_protection_count',
        'kernel_stack_protection_count',
        'interrupt_stack_protection_count',
        'user_task_stack_protection_count',
    ).withColumn(
        'has_one_security_feature',
        when(
            (col('write_protection_count') != 0) | \
            (col('password_protection_count') != 0) | \
            (col('kernel_stack_protection_count') != 0) | \
            (col('interrupt_stack_protection_count') != 0) | \
            (col('user_task_stack_protection_count') != 0), 1
        ).otherwise(0)
    )

    vxworks_safety_stats_df = vxworks_features_counted.join(
        file_tree_dataframe, 'file_hash').groupby('firmware_hash').agg(
            _sum('row_count').cast('int').alias('total_vxworks_count'),
            _sum('write_protection_count').cast('int').alias(
                'write_protection'),
            _sum('password_protection_count').cast('int').alias(
                'password_protection'),
            _sum('has_one_security_feature').cast('int').alias(
                'count_with_security_features'),
            _sum('kernel_stack_protection_count').cast('int').alias(
                'kernel_stack_protection'),
            _sum('interrupt_stack_protection_count').cast('int').alias(
                'interrupt_stack_protection'),
            _sum('user_task_stack_protection_count').cast('int').alias(
                'user_task_stack_protection'),
        ).select('firmware_hash', 'write_protection', 'password_protection',
                 'total_vxworks_count', 'kernel_stack_protection',
                 'interrupt_stack_protection', 'user_task_stack_protection',
                 'count_with_security_features')
    return vxworks_safety_stats_df
    """ repartition based on 'LNAME' and 'Address' and generate spark_partiion_id
    then run mapPartitions() function and create in-partition idx
    """
    df1 = df.repartition(N, 'LNAME', 'Address') \
            .rdd.mapPartitionsWithIndex(func) \
            .toDF()
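    # `N` and `func` are defined elsewhere; a plausible sketch of the mapPartitionsWithIndex
    # function that tags each row with its partition_id and an in-partition idx per distinct
    # (LNAME, Address) pair (hypothetical):
    #   from pyspark.sql import Row
    #   def func(partition_id, rows):
    #       seen = {}
    #       for row in rows:
    #           key = (row['LNAME'], row['Address'])
    #           seen.setdefault(key, len(seen) + 1)
    #           yield Row(**row.asDict(), partition_id=partition_id, idx=seen[key])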

    # get number of unique rows (based on Address+LNAME) which is max_idx
    # and then grab the running SUM of this rcnt
    # the new df should be small and just cache it
    w1 = Window.partitionBy().orderBy('partition_id').rowsBetween(
        Window.unboundedPreceding, -1)

    df2 = df1.groupby('partition_id') \
             .agg((_max('idx')).alias('cnt')) \
             .withColumn('rcnt', coalesce(_sum('cnt').over(w1),lit(0))) \
             .cache()
    df2.show()
    #+------------+---+----+
    #|partition_id|cnt|rcnt|
    #+------------+---+----+
    #|           0|  3|   0|
    #|           1|  1|   3|
    #|           2|  1|   4|
    #|           4|  1|   5|
    #+------------+---+----+
    """join df1 with df2 and create id = idx + rcnt"""
    df_new = df1.join(df2,
                      on=['partition_id']).withColumn('id',
                                                      col('idx') + col('rcnt'))
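    # 'id' is now sequential and unique across partitions: the in-partition position 'idx'
    # plus 'rcnt', the running count of rows in all preceding partitions.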
Example No. 17
def prepare_vxworks_features_per_binary(
        file_tree_dataframe: DataFrame,
        vxworks_safety_features_dataframe: DataFrame) -> DataFrame:
    return vxworks_safety_features_dataframe.select(
        col('file_hash'),
        when(
            col('password_protection') == True, 1
        ).otherwise(0).alias('password_protection_count'),
        when(
            col('interrupt_stack_protection.guard_zones') == True, 1
        ).otherwise(0).alias('interrupt_stack_protection_count'),
        # If they have at least one of these two, we're going to give
        # them good boy points.
        when(
            (col('write_protection.user_text') == False) & \
            (col('write_protection.kernel_text') == False), 0
            # col('write_protection.virtual_mem_text') == True, 1
        ).otherwise(1).alias('write_protection_count_preliminary'),
        col('write_protection.virtual_mem_text').alias('virtual_mem_text'),
        when(
            col('kernel_stack_protection.guard_overflow_size_exec').isNotNull() & \
            col('kernel_stack_protection.guard_underflow_size_exec').isNotNull() & \
            col('kernel_stack_protection.guard_overflow_size_exception').isNotNull(), 1
        ).otherwise(None).alias('kernel_stack_protection_count'),
        when(
            (col('user_task_stack_protection.no_exec') == True) & \
            (col('user_task_stack_protection.guard_zones') == True), 1
        ).otherwise(0).alias('user_task_stack_protection_count')
    ).withColumn(
        # Nested filter: if user text protection and kernel text protection are
        # disabled but virtual mem text is enabled, then this facet is secure.
        # Otherwise, we should call it insecure.
        'write_protection_count',
        when(
            (col('write_protection_count_preliminary') == 0) & \
            (col('virtual_mem_text') == True), 1
        ).otherwise(
            when(
                col('write_protection_count_preliminary') == 1, 1
            ).otherwise(0)
        )
    ).withColumn(
        'ratio',
        (
            col('write_protection_count') + \
            col('password_protection_count') + \
            when(
                col('kernel_stack_protection_count').isNotNull(), 1
            ).otherwise(0) + \
            col('interrupt_stack_protection_count') + \
            col('user_task_stack_protection_count')
        ) / 5
    ).select(
        'file_hash',
        'ratio'
    ).join(
        file_tree_dataframe,
        'file_hash'
    ).groupBy(
        'firmware_hash'
    ).agg(
        _sum(
            'ratio'
        ).alias(
            'vxworks_ratio_total'
        ),
        count(
            'file_hash'
        ).alias('vxworks_binary_count')
    )
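# A minimal usage sketch (assuming the inputs carry the columns referenced above:
# 'file_hash'/'firmware_hash' in the file tree and the nested *_protection structs in the
# safety-features frame; the variable names here are placeholders):
# per_binary_df = prepare_vxworks_features_per_binary(file_tree_df, vxworks_features_df)
# per_binary_df.show()  # firmware_hash | vxworks_ratio_total | vxworks_binary_count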
    "userId", "buyId").pivot("price").sum("price")

# In[48]:

buyclicks_pivotPrice.orderBy("userId").show(15)

# In[49]:

# ORDERED table by USER, SESSION, TIME, ITEM
buyclicks_raw.orderBy("userId", "userSessionId", "timestamp", "buyId").show(15)

# In[50]:

# TOTAL SPENT BY USER
buyclicks_total_by_user = buyclicks_raw.groupBy("userId").agg(
    _sum("price").alias("totalspent")).orderBy("totalspent", ascending=False)

# In[51]:

buyclicks_total_by_user.show(5)

# In[ ]:

# ## AD CLICKS

# In[52]:

adclicks_raw.count()

# In[53]:
Example No. 19
def prepare_crypto_material_data(
        file_tree_dataframe: DataFrame,
        crypto_material_dataframe: DataFrame) -> DataFrame:
    # yapf: disable
    vulnerable_crypto_materials_stats_dataframe = crypto_material_dataframe.select(
        when(
            col('material_type') == 'SshRsaPrivateKeyBlock', 1
        ).otherwise(0).alias('has_ssh_rsa_private_key'),
        when(
            col('material_type') == 'SshRsaPublicKeyBlock', 1
        ).otherwise(0).alias('has_ssh_rsa_public_key'),
        when(
            col('material_type') == 'PgpPrivateKeyBlock', 1
        ).otherwise(0).alias('has_pgp_private_key'),
        when(
            col('material_type') == 'Pkcs8PrivateKey', 1
        ).otherwise(0).alias('has_pkcs8_private_key'),
        when(
            col('material_type') == 'Pkcs12Certificate', 1
        ).otherwise(0).alias('has_pkcs12_certificate'),
        when(
            col('material_type') == 'SSLPrivateKey', 1
        ).otherwise(0).alias('has_ssl_private_key'),
        'file_hash'
    )

    vulnerable_crypto_materials_counts = vulnerable_crypto_materials_stats_dataframe.join(
        file_tree_dataframe,
        'file_hash'
    ).groupBy(
        'firmware_hash'
    ).agg(
        _sum('has_ssh_rsa_private_key').cast('int').alias('ssh_rsa_private_key_count'),
        _sum('has_ssh_rsa_public_key').cast('int').alias('ssh_rsa_public_key_count'),
        _sum('has_pgp_private_key').cast('int').alias('pgp_private_key_count'),
        _sum('has_pkcs8_private_key').cast('int').alias('pkcs8_private_key_count'),
        _sum('has_pkcs12_certificate').cast('int').alias('pkcs12_certificate_count'),
        _sum('has_ssl_private_key').cast('int').alias('ssl_private_key_count'),
    ).select(
        'firmware_hash',
        'ssh_rsa_private_key_count',
        'ssh_rsa_public_key_count',
        'pgp_private_key_count',
        'pkcs8_private_key_count',
        'pkcs12_certificate_count',
        'ssl_private_key_count'
    )
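    # Roughly the same per-firmware counts could be produced with a pivot -- a sketch,
    # assuming 'material_type' only ever takes the six values handled above (pivot would
    # otherwise add a column per unseen value):
    # crypto_material_dataframe.join(file_tree_dataframe, 'file_hash') \
    #     .groupBy('firmware_hash').pivot('material_type').count()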

    # Checking col('file_full_path').contains('ssh_host') before the full regex lets Spark filter most of the
    # rows out without having to uncompress the data from Tungsten to apply the regex, which results in a significant
    # speedup.

    host_and_authorized_key_counts = file_tree_dataframe.select(
        'file_hash',
        'firmware_hash',
        'file_full_path'
    ).distinct().select(
        'firmware_hash',
        when(
            (col('file_full_path').contains('ssh_host') & col('file_full_path').rlike('.*ssh_host.*key')), 1
        ).otherwise(0).alias('has_host_key'),
        when(
            col('file_full_path').endswith('authorized_keys') | col('file_full_path').endswith('authorized_keys2'), 1
        ).otherwise(0).alias('has_authorized_key'),
    ).groupBy(
        'firmware_hash'
    ).agg(
        _sum('has_host_key').cast('int').alias('host_keys_count'),
        _sum('has_authorized_key').cast('int').alias('authorized_keys_count')
    ).select(
        'firmware_hash',
        'host_keys_count',
        'authorized_keys_count'
    )

    crypto_materials_stats_df = vulnerable_crypto_materials_counts.join(
        host_and_authorized_key_counts,
        'firmware_hash'
    )
    # yapf: enable
    return crypto_materials_stats_df
Example No. 20
    def _demographics_transform(self):
        """
        Class method to transform and aggregate demographics data, grouping by state and calculating gender and race
        ratios for each state.
        Returns:
                [dict] - object with source-name: SparkDF key-value pairs
        """
        df = self.data_dict.get('demographics', None)
        if df is not None:
            data = df \
                .groupBy(
                    col("state_code").alias("state_code"),
                    col("state")
                ).agg(
                    _sum("total_population").alias("total_population"),
                    _sum("male_population").alias("male_population"),
                    _sum("female_population").alias("female_population"),
                    _sum("American Indian and Alaska Native").alias("american_indian_and_alaska_native"),
                    _sum("Asian").alias("asian"),
                    _sum("Black or African-American").alias("black_or_african_american"),
                    _sum("Hispanic or Latino").alias("hispanic_or_latino"),
                    _sum("White").alias("white")
                ) \
                .withColumn(
                    "male_population_ratio",
                    round(
                        (col("male_population") / col("total_population")), 2
                    )
                ) \
                .withColumn(
                    "female_population_ratio",
                    round(
                        (col("female_population") / col("total_population")), 2
                    )
                ) \
                .withColumn(
                    "american_indian_and_alaska_native_ratio",
                    round(
                        (col("american_indian_and_alaska_native") / col("total_population")), 2
                    )
                ) \
                .withColumn(
                    "asian_ratio",
                    round(
                        (col("asian") / col("total_population")), 2
                    )
                ) \
                .withColumn(
                    "black_or_african_american_ratio",
                    round(
                        (col("black_or_african_american") / col("total_population")), 2
                    )
                ) \
                .withColumn(
                    "hispanic_or_latino_ratio",
                    round(
                        (col("hispanic_or_latino") / col("total_population")), 2
                    )
                ) \
                .withColumn(
                    "white_ratio",
                    round(
                        (col("white") / col("total_population")), 2
                    )
                )
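            # The seven ratio columns above all follow the same pattern; an equivalent,
            # more compact construction would loop over the population columns (sketch):
            # for c in ["male_population", "female_population",
            #           "american_indian_and_alaska_native", "asian",
            #           "black_or_african_american", "hispanic_or_latino", "white"]:
            #     data = data.withColumn(c + "_ratio",
            #                            round(col(c) / col("total_population"), 2))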

            return dict(demographics=data)
        else:
            message = 'No dataset named "demographics" found in cleaned data dict.'
            logger.error(message)
            raise ValueError(message)
Example No. 21
def prepare_code_analysis_data(file_tree_dataframe: DataFrame, code_analysis_python_dataframe: DataFrame) -> DataFrame:
    return code_analysis_python_dataframe.groupBy(
        'file_hash'
    ).agg(
        _sum(
            when(
                (col('issue_severity') == 'HIGH') & (col('issue_confidence') == 'HIGH'), 1
            ).otherwise(0)
        ).alias('hs_hc'),
        _sum(
            when(
                (col('issue_severity') == 'HIGH') & (col('issue_confidence') == 'MEDIUM'), 1
            ).otherwise(0)
        ).alias('hs_mc'),
        _sum(
            when(
                (col('issue_severity') == 'HIGH') & (col('issue_confidence') == 'LOW'), 1
            ).otherwise(0)
        ).alias('hs_lc'),
        _sum(
            when(
                (col('issue_severity') == 'MEDIUM') & (col('issue_confidence') == 'HIGH'), 1
            ).otherwise(0)
        ).alias('ms_hc'),
        _sum(
            when(
                (col('issue_severity') == 'MEDIUM') & (col('issue_confidence') == 'MEDIUM'), 1
            ).otherwise(0)
        ).alias('ms_mc'),
        _sum(
            when(
                (col('issue_severity') == 'MEDIUM') & (col('issue_confidence') == 'LOW'), 1
            ).otherwise(0)
        ).alias('ms_lc'),
        _sum(
            when(
                (col('issue_severity') == 'LOW') & (col('issue_confidence') == 'HIGH'), 1
            ).otherwise(0)
        ).alias('ls_hc'),
        _sum(
            when(
                (col('issue_severity') == 'LOW') & (col('issue_confidence') == 'MEDIUM'), 1
            ).otherwise(0)
        ).alias('ls_mc'),
        _sum(
            when(
                (col('issue_severity') == 'LOW') & (col('issue_confidence') == 'LOW'), 1
            ).otherwise(0)
        ).alias('ls_lc'),
    ).withColumn(
        'weighted_file_risk',
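        # Risk weights per (severity, confidence) bucket, as encoded in the expression below:
        #                      HIGH conf.  MEDIUM conf.  LOW conf.
        #   HIGH severity         10           5            2
        #   MEDIUM severity        5           2.5          1
        #   LOW severity           2           1            0.4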
        (col('hs_hc') * 10) +
        (col('hs_mc') * 5) +
        (col('hs_lc') * 2) +
        (col('ms_hc') * 5) +
        (col('ms_mc') * 2.5) +
        (col('ms_lc') * 1) +
        (col('ls_hc') * 2) +
        (col('ls_mc') * 1) +
        (col('ls_lc') * 0.4)
    ).join(
        file_tree_dataframe,
        'file_hash'
    ).groupBy(
        'firmware_hash'
    ).agg(
        _sum(
            'weighted_file_risk'
        ).cast('float').alias('total_weighted_file_risk'),
        _sum(
            'hs_hc'
        ).cast('int').alias('high_severity_high_confidence'),
        _sum(
            'hs_mc'
        ).cast('int').alias('high_severity_medium_confidence'),
        _sum(
            'hs_lc'
        ).cast('int').alias('high_severity_low_confidence'),
        _sum(
            'ms_hc'
        ).cast('int').alias('medium_severity_high_confidence'),
        _sum(
            'ms_mc'
        ).cast('int').alias('medium_severity_medium_confidence'),
        _sum(
            'ms_lc'
        ).cast('int').alias('medium_severity_low_confidence'),
        _sum(
            'ls_hc'
        ).cast('int').alias('low_severity_high_confidence'),
        _sum(
            'ls_mc'
        ).cast('int').alias('low_severity_medium_confidence'),
        _sum(
            'ls_lc'
        ).cast('int').alias('low_severity_low_confidence')
    )
Example No. 22
def useSpark(sourceFile: str, targetTsvFile: str) -> None:
    """[Process the input source files using Spark to transform to target data]

    Args:
        sourceFile (str): [Path to the location of the input data]
        targetTsvFile (str): [Path to the location of the target data]
    """

    # secrets for access to postgres database are held in .env file
    # this loads that into the application environment
    load_dotenv(verbose=True)

    spark = SparkSession.builder \
        .appName('Aquis2') \
        .master("local[2]") \
        .config(conf=getSparkConf(getJars())) \
        .getOrCreate()

    # clean data from source file
    cleanDf = spark.read.text(sourceFile) \
        .filter(col("value").contains("msgType_") & ~col("value").contains('msgType_":11')) \
        .withColumn("value", expr("substring(value,2)")) \
        .withColumn("value", regexp_replace("value", '\{\{', r'\{"header":\{')) \
        .withColumn("value", regexp_replace("value", 'SELL,', '"SELL",')) \
        .withColumn("value", regexp_replace("value", 'BUY,', '"BUY",')) \
        .withColumn("value", regexp_replace("value", '"flags_":"\{"', '"flags_":\{"'))

    # figure out schema on message 8, keep for re-use later as a technology demonstration
    msg8Schema = spark.read.json(
        cleanDf.filter(col("value").contains('"msgType_":8')).select(
            col("value").cast("string")).rdd.map(
                lambda r: r.value))._jdf.schema().toDDL()
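    # The private _jdf handle above is part of the "technology demonstration"; a sketch of
    # the same schema capture through the public API (from_json also accepts a StructType):
    # msg8Schema = spark.read.json(
    #     cleanDf.filter(col("value").contains('"msgType_":8'))
    #            .select(col("value").cast("string")).rdd.map(lambda r: r.value)).schema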
    msg8Df = cleanDf.filter(col("value").contains('"msgType_":8')).withColumn("value", from_json("value", msg8Schema)) \
        .select("value.security_.securityId_", "value.security_.isin_", "value.security_.currency_") \
        .repartition(2, ["securityId_"])
    # msg8Df.printSchema()
    # root
    # | -- securityId_: long(nullable=true)
    # | -- isin_: string(nullable=true)
    # | -- currency_: string(nullable=true)

    # figure out schema on message 12, keep for re-use later as a technology demonstration
    msg12Schema = spark.read.json(
        cleanDf.filter(col("value").contains('"msgType_":12')).select(
            col("value").cast("string")).rdd.map(
                lambda r: r.value))._jdf.schema().toDDL()
    msg12Df = cleanDf.filter(col("value").contains('"msgType_":12')) \
        .withColumn("value", from_json("value", msg12Schema)) \
        .repartition(2, ["value.bookEntry_.securityId_"])

    # msg12Df.printSchema()
    # msg12Df.select("value.bookEntry_.side_").show()
    # root
    # | -- value: struct(nullable=true)
    # | | -- bookEntry_: struct(nullable=true)
    # | | | -- orderId_: long(nullable=true)
    # | | | -- price_: long(nullable=true)
    # | | | -- quantity_: long(nullable=true)
    # | | | -- securityId_: long(nullable=true)
    # | | | -- side_: string(nullable=true)
    # | | -- header: struct(nullable=true)
    # | | | -- length_: long(nullable=true)
    # | | | -- msgType_: long(nullable=true)
    # | | | -- seqNo_: long(nullable=true)

    # now aggregate messageType12 by securityId_ and side_
    aggDfSells = msg12Df.filter("value.bookEntry_.side_ == 'SELL'") \
        .select("*", (col("value.bookEntry_.quantity_") * col("value.bookEntry_.price_")).alias("TotalSellAmount")) \
        .groupby("value.bookEntry_.securityId_") \
        .agg(count("value.bookEntry_.securityId_").alias("Total Sell Count"),
             _sum("value.bookEntry_.quantity_").alias("Total Sell Quantity"),
             _min("value.bookEntry_.price_").alias("Min Sell Price"),
             _sum("TotalSellAmount").alias("Weighted Average Sell Price")
             ) \
        .withColumn("Weighted Average Sell Price", col("Weighted Average Sell Price") / col("Total Sell Quantity"))

    # now aggregate messageType12 by securityId_ and side_
    aggDfBuys = msg12Df.filter("value.bookEntry_.side_ == 'BUY'") \
        .select("*", (col("value.bookEntry_.quantity_") * col("value.bookEntry_.price_")).alias("TotalBuyAmount")) \
        .groupby("value.bookEntry_.securityId_") \
        .agg(count("value.bookEntry_.securityId_").alias("Total Buy Count"),
             _sum("value.bookEntry_.quantity_").alias("Total Buy Quantity"),
             _max("value.bookEntry_.price_").alias("Max Buy Price"),
             _sum("TotalBuyAmount").alias("Weighted Average Buy Price")) \
        .withColumn("Weighted Average Buy Price", col("Weighted Average Buy Price") / col("Total Buy Quantity"))

    # bring it together with joins, use outer join with the security data due to missing ids
    # select columns in the following order..
    outputColList = [
        col("isin_").alias("ISIN"),
        col("currency_").alias("Currency"), "Total Buy Count",
        "Total Sell Count", "Total Buy Quantity", "Total Sell Quantity",
        "Weighted Average Buy Price", "Weighted Average Sell Price",
        "Max Buy Price", "Min Sell Price"
    ]

    outputDf = aggDfBuys.join(aggDfSells, ["securityId_"], "full_outer") \
        .join(msg8Df, ["securityId_"], "left_outer") \
        .na.fill(0, outputColList[2:]) \
        .na.fill("MISSING", ["isin_", "currency_"]) \
        .select(outputColList)

    # collect into a single file
    outputDf.coalesce(1).write.option("sep", "\t").csv(targetTsvFile,
                                                       header=True)

    # Demo writing to postgresql (msg8 dataframe)
    # will append records to table AcquisExample. Table will
    # be created on the fly if it does not exist.
    dburl = getDbConnectionUrl(db=os.getenv("POSTGRES_DB"),
                               user=os.getenv("POSTGRES_USER"),
                               secret=os.getenv("POSTGRES_SECRET"))
    msg8Df.write.format("jdbc") \
        .option("url", dburl) \
        .option("dbtable", "AcquisExample") \
        .option("driver", "org.postgresql.Driver") \
        .save(mode="append")

    spark.stop()
Example No. 23
def main(
    output_folder="./www/stepchain",
    start_date=None,
    end_date=None,
    last_n_days=15,
):
    """Get step data in wmarchive.

    Each step array contains multiple steps. The udf function returns each step as a
    separate row in a list, and flatMap flattens that list so every step becomes an
    individual row in the dataframe.
    """
    # Borrowed logic from condor_cpu_efficiency
    _yesterday = datetime.combine(date.today() - timedelta(days=1),
                                  datetime.min.time())
    if not (start_date or end_date):
        end_date = _yesterday
        start_date = end_date - timedelta(days=last_n_days)
    elif not start_date:
        start_date = end_date - timedelta(days=last_n_days)
    elif not end_date:
        end_date = min(start_date + timedelta(days=last_n_days), _yesterday)
    if start_date > end_date:
        raise ValueError(
            f"start date ({start_date}) should be earlier than end date({end_date})"
        )

    spark = get_spark_session()
    df_raw = spark.read.option("basePath", _DEFAULT_HDFS_FOLDER).json(
        get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER)
    ) \
        .select(["data.*", "metadata.timestamp"]) \
        .filter(
        f"""data.meta_data.jobstate='success'
                  AND data.meta_data.jobtype='Production'
                  AND data.wmats >= {start_date.timestamp()}
                  AND data.wmats < {end_date.timestamp()}
                  """
    )
    df_rdd = df_raw.rdd.flatMap(lambda r: udf_step_extract(r))
    df = spark.createDataFrame(df_rdd,
                               schema=get_schema()).dropDuplicates().where(
                                   _col("ncores").isNotNull()).cache()
    df_details = df.groupby(["task", "site", "step_name"]).agg(
        (100 * (_sum("jobCPU") / _mean("nthreads")) /
         _sum("jobTime")).alias("avg_cpueff"),
        _count(lit(1)).alias("#jobs"),
        _mean("steps_len").alias("#steps"),
        _mean("nthreads").alias("#nthreads"),
        _mean("ncores").alias("#ncores"),
        (_sum("jobCPU") / _count(lit(1))).alias("avg_jobCPU"),
        (_sum("jobTime") / _count(lit(1))).alias("avg_jobTime"),
        _collect_set("acquisitionEra").alias("acquisitionEra"),
    ).withColumn("avg_cpueff",
                 _col("avg_cpueff").cast(IntegerType())).toPandas()
    df_task = df.groupby(["task"]).agg(
        (100 * (_sum("jobCPU") / _mean("nthreads")) /
         _sum("jobTime")).alias("avg_cpueff"),
        _count(lit(1)).alias("#jobs"),
        _mean("steps_len").alias("#steps"),
        _mean("nthreads").alias("#nthreads"),
        _mean("ncores").alias("#ncores"),
        (_sum("jobCPU") / _count(lit(1))).alias("avg_jobCPU"),
        (_sum("jobTime") / _count(lit(1))).alias("avg_jobTime"),
    ).withColumn("avg_cpueff",
                 _col("avg_cpueff").cast(IntegerType())).toPandas()
    write_htmls(df_details, df_task, start_date, end_date, output_folder)