def get_df_main_datasets_never_read(df_dataset_file_rse_ts_size, filtered_rses_id_name_map, min_tb_limit):
    """Get never accessed datasets' dataframes"""
    reverted_filtered_rses_id_name_map = get_reverted_rses_id_name_map(filtered_rses_id_name_map)

    df_sub_datasets_never_read = df_dataset_file_rse_ts_size \
        .groupby(['rse_id', 'dataset']) \
        .agg(_round(_sum(col('f_size')) / TB_DENOMINATOR, 5).alias('dataset_size_in_rse_tb'),
             _max(col('accessed_at')).alias('last_access_time_of_dataset_in_rse'),
             _max(col('created_at')).alias('last_create_time_of_dataset_in_rse'),
             _count(lit(1)).alias('#_files_of_dataset_in_rse'),
             ) \
        .filter(col('last_access_time_of_dataset_in_rse').isNull()) \
        .filter(col('dataset_size_in_rse_tb') > min_tb_limit) \
        .replace(reverted_filtered_rses_id_name_map, subset=['rse_id']) \
        .withColumnRenamed('rse_id', 'RSE name') \
        .select(['RSE name',
                 'dataset',
                 'dataset_size_in_rse_tb',
                 'last_create_time_of_dataset_in_rse',
                 '#_files_of_dataset_in_rse'
                 ]) \
        .cache()

    df_main_datasets_never_read = df_sub_datasets_never_read \
        .groupby(['dataset']) \
        .agg(_max(col('dataset_size_in_rse_tb')).alias('max_dataset_size_in_rses(TB)'),
             _min(col('dataset_size_in_rse_tb')).alias('min_dataset_size_in_rses(TB)'),
             _avg(col('dataset_size_in_rse_tb')).alias('avg_dataset_size_in_rses(TB)'),
             _sum(col('dataset_size_in_rse_tb')).alias('sum_dataset_size_in_rses(TB)'),
             _max(col('last_create_time_of_dataset_in_rse')).alias('last_create_time_of_dataset_in_all_rses'),
             concat_ws(', ', collect_list('RSE name')).alias('RSE(s)'),
             ) \
        .cache()
    return df_main_datasets_never_read, df_sub_datasets_never_read
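# Note: the snippets in this collection call PySpark SQL functions through
# underscore-prefixed aliases (_sum, _max, _min, ...). The import block below is a
# presumed sketch for reference only; the exact subset each snippet needs varies
# and is an assumption, not taken from the original sources.
from pyspark.sql.functions import (
    col, lit, when, concat_ws, collect_list,
    abs as _abs,
    avg as _avg,
    count as _count,
    max as _max,
    mean as _mean,
    min as _min,
    round as _round,
    sum as _sum,
)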
def get_prcp_day(df_in):
    # Group the values, aggregating selected fields per city and day
    print('BUILDING DATAFRAME WITH VALUES AGGREGATED PER DAY')
    df_out = df_in.groupby('city', 'yr', 'mo', 'da').agg(
        _abs(_max('lat')).alias('latitude'),
        _sum('prcp').alias('prcp_dia'),
        _max('tmax').alias('tmax'),
        _min('tmin').alias('tmin'),
        _mean('temp').alias('med_temp')).orderBy('city', 'yr', 'mo', 'da')
    return df_out
def updateStatsPerDate(request): df_stathour = spark.read.json("/dataLake/"+request.POST.get("bdname")+"/stat_hour") statsPerDate = {"clic" : [] ,"open" : [] , "recieved" : [], "dates" : []} # After Loading page if request.POST.get("etat") == "0" : if request.POST.get("withCompanes") == "1": lstCompanes = list(request.POST.get("companes").split(",")) df_stats = df_stathour.where(df_stathour["mail_sending_id"].isin(lstCompanes)).groupBy(year("date").alias("year"), month("date").alias("month")).agg(_sum("opens").alias("opens"),_sum("clics").alias("clics"),_sum("recieved").alias("recieved")).orderBy("year", "month") else: df_stats = df_stathour.groupBy(year("date").alias("year"), month("date").alias("month")).agg(_sum("opens").alias("opens"),_sum("clics").alias("clics"),_sum("recieved").alias("recieved")).orderBy("year", "month") for content in df_stats.collect(): statsPerDate["dates"].append(calendar.month_name[content["month"]]+" "+str(content["year"])) statsPerDate["clic"].append(content["clics"]) statsPerDate["open"].append(content["opens"]) statsPerDate["recieved"].append(content["recieved"]) #Actions with listener elif request.POST.get("etat") == "1" : if request.POST.get("action") == "week" or request.POST.get("action") == "month" : if request.POST.get("action") == "week": nbday = 7 else: nbday = 30 if request.POST.get("withCompanes") == "1": lstCompanes = list(request.POST.get("companes").split(",")) maxDateInStatHour = (datetime.datetime.strptime(df_stathour.where(df_stathour["mail_sending_id"].isin(lstCompanes)).agg(_max("date").alias("date")).collect()[0]["date"], "%Y-%m-%d") - datetime.timedelta(days=nbday)).strftime("%Y-%m-%d") df_stats = df_stathour.where(df_stathour["mail_sending_id"].isin(lstCompanes)).where("date >'"+maxDateInStatHour+"'").groupBy(year("date").alias("year"), month("date").alias("month"), dayofmonth("date").alias("day")).agg(_sum("opens").alias("opens"),_sum("clics").alias("clics"),_sum("recieved").alias("recieved")).orderBy("year", "month", "day") else: maxDateInStatHour = (datetime.datetime.strptime(df_stathour.agg(_max("date").alias("date")).collect()[0]["date"], "%Y-%m-%d") - datetime.timedelta(days=nbday)).strftime("%Y-%m-%d") df_stats = df_stathour.where("date >'"+maxDateInStatHour+"'").groupBy(year("date").alias("year"), month("date").alias("month"), dayofmonth("date").alias("day")).agg(_sum("opens").alias("opens"),_sum("clics").alias("clics"),_sum("recieved").alias("recieved")).orderBy("year", "month", "day") for content in df_stats.collect(): statsPerDate["dates"].append(str(content["day"])+" "+calendar.month_name[content["month"]]) statsPerDate["clic"].append(content["clics"]) statsPerDate["open"].append(content["opens"]) statsPerDate["recieved"].append(content["recieved"]) elif request.POST.get("action") == "sixMonth" : if request.POST.get("withCompanes") == "1": lstCompanes = list(request.POST.get("companes").split(",")) maxDateInStatHour = (datetime.datetime.strptime(df_stathour.where(df_stathour["mail_sending_id"].isin(lstCompanes)).agg(_max("date").alias("date")).collect()[0]["date"], "%Y-%m-%d") - datetime.timedelta(days=180)).strftime("%Y-%m-%d") df_stats = df_stathour.where(df_stathour["mail_sending_id"].isin(lstCompanes)).where("date >'"+maxDateInStatHour+"'").groupBy(year("date").alias("year"), month("date").alias("month")).agg(_sum("opens").alias("opens"),_sum("clics").alias("clics"),_sum("recieved").alias("recieved")).orderBy("year", "month") else: maxDateInStatHour = 
(datetime.datetime.strptime(df_stathour.agg(_max("date").alias("date")).collect()[0]["date"], "%Y-%m-%d") - datetime.timedelta(days=180)).strftime("%Y-%m-%d") df_stats = df_stathour.where("date >'"+maxDateInStatHour+"'").groupBy(year("date").alias("year"), month("date").alias("month")).agg(_sum("opens").alias("opens"),_sum("clics").alias("clics"),_sum("recieved").alias("recieved")).orderBy("year", "month") for content in df_stats.collect(): statsPerDate["dates"].append(calendar.month_name[content["month"]]+" "+str(content["year"])) statsPerDate["clic"].append(content["clics"]) statsPerDate["open"].append(content["opens"]) statsPerDate["recieved"].append(content["recieved"]) elif request.POST.get("etat") == "2" : pass return HttpResponse(json.dumps(statsPerDate), content_type="application/json")
def get_df_main_not_read_since(df_sub_not_read_since):
    """Get dataframe of datasets not read since N months, for the main HTML pages.

    Gets the last access time of each dataset across all RSE(s).
    """
    return df_sub_not_read_since \
        .groupby(['dataset']) \
        .agg(_max(col('dataset_size_in_rse_tb')).alias('max_dataset_size_in_rses(TB)'),
             _min(col('dataset_size_in_rse_tb')).alias('min_dataset_size_in_rses(TB)'),
             _avg(col('dataset_size_in_rse_tb')).alias('avg_dataset_size_in_rses(TB)'),
             _sum(col('dataset_size_in_rse_tb')).alias('sum_dataset_size_in_rses(TB)'),
             _max(col('last_access_time_of_dataset_in_rse')).alias('last_access_time_of_dataset_in_all_rses'),
             concat_ws(', ', collect_list('RSE name')).alias('RSE(s)'),
             ) \
        .cache()
def get_device_tracking_observations(df):
    """
    Calculates first_observed_at and last_observed_at for each device from the supplied observations.

    :param df: A DataFrame of network connection observations.
    :return: A DataFrame of (organization, mac, first_observed_at, last_observed_at)
    """
    if df and df.columns:
        outbound_df = df.select(
            'organization',
            col('network_src_mac').name('mac'),
            'occurred_at'
        )

        inbound_df = df.select(
            'organization',
            col('network_dest_mac').name('mac'),
            'occurred_at'
        )

        return outbound_df.union(
            inbound_df
        ).where(
            col('mac').isNotNull()
        ).groupBy(
            'organization',
            'mac'
        ).agg(
            _min('occurred_at').name('first_observed_at'),
            _max('occurred_at').name('last_observed_at'),
        )
def prepare_cyclomatic_complexity_data(
        file_tree_dataframe: DataFrame,
        cyclomatic_complexity_dataframe: DataFrame) -> DataFrame:
    # yapf: disable
    cyclomatic_complexity_max_dataframe = cyclomatic_complexity_dataframe.groupBy(
        'file_hash'
    ).agg(
        _max('complexity').alias('max_complexity_in_file')
    )
    cyclomatic_complexity_stats_df = cyclomatic_complexity_max_dataframe.join(
        file_tree_dataframe, 'file_hash'
    ).groupBy(
        'firmware_hash'
    ).agg(
        _max('max_complexity_in_file').alias('max_complexity')
    )
    # yapf: enable
    return cyclomatic_complexity_stats_df
def get_metrics(df_in):
    # Group the values, aggregating selected fields per year and city
    print('BUILDING DATAFRAME WITH AGGREGATED VALUES')
    df_out = df_in.groupby('yr', 'city').agg(
        _max('elvt').alias('ele_max'),
        _mean('temp').alias('med_temp'),
        _mean('tmin').alias('med_temp_min'),
        _mean('tmax').alias('med_temp_max'),
        _sum('prcp').alias('prcp'),
        _mean('hmdy').alias('med_umi'),
        _mean('hmin').alias('med_umi_min'),
        _mean('hmax').alias('med_umi_max'),
        _mean('wdsp').alias('med_velo_vento'),
        _mean('gust').alias('med_velo_rajadas_vento')).orderBy('yr', 'city')
    return df_out
def get_crab_popularity_ds(start_date, end_date, verbose=False, base=_BASE_PATH):
    """
    Query the HDFS data and return a pandas dataframe with:
    Datatier, Dataset, CMSPrimaryPrimaryDataset, job_count, workflow_count, ChirpCMSSWReadBytes

    args:
    - start_date datetime Start of the query period (RecordTime)
    - end_date datetime End of the query period
    """
    start = int(start_date.timestamp() * 1000)
    end = int(end_date.timestamp() * 1000)
    spark = get_spark_session(yarn=True, verbose=verbose)
    dfs_crabdb = (
        spark.read.option("basePath", base)
        .json(_get_candidate_files(start_date, end_date, spark, base=base),
              schema=_get_crab_condor_schema())
        .select("metadata.timestamp", "data.*")
        .filter("""Status in ('Completed', 'Removed') AND
                   CRAB_DataBlock is not NULL AND
                   timestamp >= {} AND timestamp <= {}""".format(start, end))
        .repartition("CRAB_DataBlock")
        .drop_duplicates(["GlobalJobId"])
        .withColumnRenamed("CMSPrimaryPrimaryDataset", "PrimaryDataset")
        .withColumn("Dataset", regexp_extract("CRAB_DataBlock", "^(.*)/([^/]*)#.*$", 1))
        .withColumn("Datatier", regexp_extract("CRAB_DataBlock", "^(.*)/([^/]*)#.*$", 2))
    )
    dfs_crabdb = (
        dfs_crabdb.groupBy("Datatier", "PrimaryDataset", "Dataset")
        .agg(_max(col("RecordTime")),
             _min(col("RecordTime")),
             count(lit(1)),
             countDistinct("CRAB_Workflow"),
             _sum(col("ChirpCMSSWReadBytes")),
             )
        .withColumnRenamed("count(1)", "job_count")
        .withColumnRenamed("count(DISTINCT CRAB_Workflow)", "workflow_count")
        .withColumnRenamed("sum(ChirpCMSSWReadBytes)", "ChirpCMSSWReadBytes")
        .na.fill("Unknown", ["Datatier", "PrimaryDataset", "Dataset"])
    )
    return dfs_crabdb.toPandas()
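# Hypothetical invocation of get_crab_popularity_ds for a one-month window; the
# dates below are illustrative only and assume the module-level helpers
# (_BASE_PATH, get_spark_session, etc.) are available.
from datetime import datetime

pdf = get_crab_popularity_ds(datetime(2022, 1, 1), datetime(2022, 2, 1))
print(pdf.head())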
def useSpark(sourceFile: str, targetTsvFile: str) -> None: """[Process the input source files using Spark to transform to target data] Args: sourceFile (str): [Path to the location of the input data] targetTsvFile (str): [Path to the location of the target data] """ # secrets for access to postgres database are held in .env file # this loads that into the application environment load_dotenv(verbose=True) spark = SparkSession.builder \ .appName('Aquis2') \ .master("local[2]") \ .config(conf=getSparkConf(getJars())) \ .getOrCreate() # clean data from source file cleanDf = spark.read.text(sourceFile) \ .filter(col("value").contains("msgType_") & ~col("value").contains('msgType_":11')) \ .withColumn("value", expr("substring(value,2)")) \ .withColumn("value", regexp_replace("value", '\{\{', r'\{"header":\{')) \ .withColumn("value", regexp_replace("value", 'SELL,', '"SELL",')) \ .withColumn("value", regexp_replace("value", 'BUY,', '"BUY",')) \ .withColumn("value", regexp_replace("value", '"flags_":"\{"', '"flags_":\{"')) # figure out schema on message 8, keep for re-use later as a technology demonstration msg8Schema = spark.read.json( cleanDf.filter(col("value").contains('"msgType_":8')).select( col("value").cast("string")).rdd.map( lambda r: r.value))._jdf.schema().toDDL() msg8Df = cleanDf.filter(col("value").contains('"msgType_":8')).withColumn("value", from_json("value", msg8Schema)) \ .select("value.security_.securityId_", "value.security_.isin_", "value.security_.currency_") \ .repartition(2, ["securityId_"]) # msg8Df.printSchema() # root # | -- securityId_: long(nullable=true) # | -- isin_: string(nullable=true) # | -- currency_: string(nullable=true) # figure out schema on message 12, keep for re-use later as a technology demonstration msg12Schema = spark.read.json( cleanDf.filter(col("value").contains('"msgType_":12')).select( col("value").cast("string")).rdd.map( lambda r: r.value))._jdf.schema().toDDL() msg12Df = cleanDf.filter(col("value").contains('"msgType_":12')) \ .withColumn("value", from_json("value", msg12Schema)) \ .repartition(2, ["value.bookEntry_.securityId_"]) # msg12Df.printSchema() # msg12Df.select("value.bookEntry_.side_").show() # root # | -- value: struct(nullable=true) # | | -- bookEntry_: struct(nullable=true) # | | | -- orderId_: long(nullable=true) # | | | -- price_: long(nullable=true) # | | | -- quantity_: long(nullable=true) # | | | -- securityId_: long(nullable=true) # | | | -- side_: string(nullable=true) # | | -- header: struct(nullable=true) # | | | -- length_: long(nullable=true) # | | | -- msgType_: long(nullable=true) # | | | -- seqNo_: long(nullable=true) # now aggregate messageType12 by securityId_ and side_ aggDfSells = msg12Df.filter("value.bookEntry_.side_ == 'SELL'") \ .select("*", (col("value.bookEntry_.quantity_") * col("value.bookEntry_.price_")).alias("TotalSellAmount")) \ .groupby("value.bookEntry_.securityId_") \ .agg(count("value.bookEntry_.securityId_").alias("Total Sell Count"), _sum("value.bookEntry_.quantity_").alias("Total Sell Quantity"), _min("value.bookEntry_.price_").alias("Min Sell Price"), _sum("TotalSellAmount").alias("Weighted Average Sell Price") ) \ .withColumn("Weighted Average Sell Price", col("Weighted Average Sell Price") / col("Total Sell Quantity")) # now aggregate messageType12 by securityId_ and side_ aggDfBuys = msg12Df.filter("value.bookEntry_.side_ == 'BUY'") \ .select("*", (col("value.bookEntry_.quantity_") * col("value.bookEntry_.price_")).alias("TotalBuyAmount")) \ .groupby("value.bookEntry_.securityId_") \ 
.agg(count("value.bookEntry_.securityId_").alias("Total Buy Count"), _sum("value.bookEntry_.quantity_").alias("Total Buy Quantity"), _max("value.bookEntry_.price_").alias("Max Buy Price"), _sum("TotalBuyAmount").alias("Weighted Average Buy Price")) \ .withColumn("Weighted Average Buy Price", col("Weighted Average Buy Price") / col("Total Buy Quantity")) # bring it together with joins, use outer join with the security data due to missing ids # select columns in the following order.. outputColList = [ col("isin_").alias("ISIN"), col("currency_").alias("Currency"), "Total Buy Count", "Total Sell Count", "Total Buy Quantity", "Total Sell Quantity", "Weighted Average Buy Price", "Weighted Average Sell Price", "Max Buy Price", "Min Sell Price" ] outputDf = aggDfBuys.join(aggDfSells, ["securityId_"], "full_outer") \ .join(msg8Df, ["securityId_"], "left_outer") \ .na.fill(0, outputColList[2:]) \ .na.fill("MISSING", ["isin_", "currency_"]) \ .select(outputColList) # collect into a single file outputDf.coalesce(1).write.option("sep", "\t").csv(targetTsvFile, header=True) # Demo writing to postgresql (msg8 dataframe) # will append records to table AcquisExample. Table will # be created on the fly it it does not exist. dburl = getDbConnectionUrl(db=os.getenv("POSTGRES_DB"), user=os.getenv("POSTGRES_USER"), secret=os.getenv("POSTGRES_SECRET")) msg8Df.write.format("jdbc") \ .option("url", dburl) \ .option("dbtable", "AcquisExample") \ .option("driver", "org.postgresql.Driver") \ .save(mode="append") spark.stop()
# remove useless data
init_flat_data = init_flat_data.fillna(0)

# cast columns
init_flat_data = init_flat_data \
    .withColumn('order_date', to_date(unix_timestamp("order_date", "yyyy-MM-dd").cast('timestamp'))) \
    .withColumn('NBI', col('NBI').cast('float')) \
    .withColumn('order_id', col('order_id').cast('int')) \
    .withColumn('company_id', col('company_id').cast('int'))

# print schema
init_flat_data.printSchema()

# calculate min and max of order date in order to calculate recency
max_order_date, min_order_date = init_flat_data \
    .select(_max(col('order_date')), _min(col('order_date'))) \
    .take(1)[0]

# calculate recency/frequency and monetary
calculate_diff_day = udf(lambda x: (max_order_date - x).days, IntegerType())

rfm_table = init_flat_data \
    .withColumn('recency', calculate_diff_day('order_date')) \
    .groupby(['company_id', 'company_name', 'country']) \
    .agg(
        _mean(col('recency')).alias('recency'),
        _count(col('order_id')).alias('frequency'),
        _sum(col('NBI')).alias('monetary')
    )

# calculate quantiles for each variable
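# A minimal sketch of the quantile step hinted at by the final comment above,
# assuming the usual RFM approach of quartile cut points per variable; the
# 0.25/0.5/0.75 probabilities and 0.01 relative error are illustrative, not taken
# from the original code.
quantiles = {
    c: rfm_table.approxQuantile(c, [0.25, 0.5, 0.75], 0.01)
    for c in ('recency', 'frequency', 'monetary')
}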
schema = t.StructType() \
    .add("time", t.StringType(), True) \
    .add("open", t.DoubleType(), True) \
    .add("close", t.DoubleType(), True) \
    .add("high", t.DoubleType(), True) \
    .add("low", t.DoubleType(), True) \
    .add("volume", t.DoubleType(), True) \
    .add("input_file", t.StringType(), True)

df = spark.read.format(file_type).options(header="true", inferSchema="true").schema(schema).load(file_location).withColumn("input_file", input_file_name())

# Get and split the file name to create a column with the corresponding coin pair for each row
split_col = split(df['input_file'], '/')
df = df.withColumn('coin_pair', split(split_col.getItem(3), '\.').getItem(0))
df = df.drop("input_file")

# We have a timestamp and we want a date
df = df.withColumn('Date', from_unixtime((col('time')/1000)))

# Aggregate data to have a daily result, ready to insert into the database
df = df.groupBy("coin_pair", window("Date", "1 day")) \
    .agg(_sum('volume'),
         last('close')-first('open'),
         _min('low'),
         _max('high')
         ) \
    .select(col("coin_pair"),
            to_date(col("window.start")).alias("date"),
            col("sum(volume)").alias("dailyVolume"),
            col("(last(close) - first(open))").alias("dailyResult"),
            col("min(low)").alias("dailyLower"),
            col("max(high)").alias("dailyHigher")) \
    .orderBy("coin_pair", "date")

# Write data into the MongoDB database coins and daily_coins_data collection
df.write.format("com.mongodb.spark.sql.DefaultSource").option("uri", "mongodb://XXXXXXX/coins.daily_coins_data").mode("append").save()
def get_df_sub_not_read_since(df_dataset_file_rse_ts_size, filtered_rses_id_name_map, min_tb_limit, n_months_filter):
    """Get dataframe of datasets that are not read since N months for sub details htmls

    Group by 'dataset' and 'rse_id' of get_df_dataset_file_rse_ts_size

    Filters:
        - If a dataset contains EVEN a single file with null accessed_at, filter out

    Access time filter logic:
        - If 'last_access_time_of_dataset_in_rse' is older than 'n_months_filter' months, ...
          ... set 'is_not_read_since_{n_months_filter}_months' column as True

    Columns:
        - 'dataset_size_in_rse_tb'
            Total size of a dataset in an RSE.
            Produced by summing up the dataset's all files in that RSE.
        - 'last_access_time_of_dataset_in_rse'
            Last access time of a dataset in an RSE.
            Produced by getting max `accessed_at` (a single file's access time) of the dataset in an RSE.
        - '#_accessed_files_of_dataset_in_rse'
            Number of files of a dataset in an RSE which have a non-NULL `accessed_at` value.
            This is important to know in order to filter out datasets with any NULL `accessed_at` value.
        - '#_files_of_dataset_in_rse'
            Number of files of a dataset in an RSE

    df_main_datasets_and_rses: RSE name, dataset and their size and access time calculations
    """
    # New boolean column name to mark whether dataset-rse_id couples have not been read for at least n_months_filter months
    bool_column_is_not_read_since_n_months = 'is_not_read_since_{}_months'.format(str(n_months_filter))

    # Get reverted dict to get RSE names from id
    reverted_filtered_rses_id_name_map = get_reverted_rses_id_name_map(filtered_rses_id_name_map)

    return df_dataset_file_rse_ts_size \
        .groupby(['rse_id', 'dataset']) \
        .agg(_round(_sum(col('f_size')) / TB_DENOMINATOR, 5).alias('dataset_size_in_rse_tb'),
             _max(col('accessed_at')).alias('last_access_time_of_dataset_in_rse'),
             _sum(
                 when(col('accessed_at').isNull(), 0).otherwise(1)
             ).alias('#_accessed_files_of_dataset_in_rse'),
             _count(lit(1)).alias('#_files_of_dataset_in_rse'),
             ) \
        .withColumn(bool_column_is_not_read_since_n_months,
                    when(
                        col('last_access_time_of_dataset_in_rse') < get_n_months_ago_epoch_msec(n_months_filter),
                        True).otherwise(False)
                    ) \
        .filter(col('last_access_time_of_dataset_in_rse').isNotNull()) \
        .filter(col(bool_column_is_not_read_since_n_months)) \
        .filter(col('dataset_size_in_rse_tb') > min_tb_limit) \
        .replace(reverted_filtered_rses_id_name_map, subset=['rse_id']) \
        .withColumnRenamed('rse_id', 'RSE name') \
        .select(['RSE name',
                 'dataset',
                 'dataset_size_in_rse_tb',
                 'last_access_time_of_dataset_in_rse',
                 '#_files_of_dataset_in_rse',
                 '#_accessed_files_of_dataset_in_rse',
                 ]) \
        .cache()
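# Hypothetical sketch of the get_n_months_ago_epoch_msec helper referenced above
# (its definition is not part of this snippet): the epoch time in milliseconds of
# roughly N months ago, approximating a month as 30 days.
from datetime import datetime, timedelta, timezone

def get_n_months_ago_epoch_msec(n_months_ago):
    dt = datetime.now(timezone.utc) - timedelta(days=30 * n_months_ago)
    return int(dt.timestamp() * 1000)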
def main():
    """
    TODO: Create html page

    Access time filter logic:
        - If "last_access_ts" is older than 3 months, then set "months_old" as 3,
        - If "last_access_ts" is older than 6 months, then set "months_old" as 6,
        - If "last_access_ts" is older than 12 months, then set "months_old" as 12
    The result includes only the datasets whose last access time is more than 12, 6 or 3 months ago.
    """
    spark = get_spark_session()
    (df_contents_f_to_b, df_contents_b_to_d, df_replicas, df_dids_files,
     df_replicas_j_dids, df_files_complete) = prepare_spark_dataframes(spark)

    # ===============================================================================
    # Continue with joins
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    # -------------------------------------------------------------------------------
    # --- STEP-10 / Tests to check dataframes are okay ---:
    #     df_block_file_rse.select("file").distinct().count() = is 29921156
    #     df_block_file_rse.filter(col("file").isNull()).count() = 0
    #     df_block_file_rse.filter(col("block").isNull()).count() = 57892
    #     Above line means we cannot extract the block names of 57892 files from the CONTENTS table ..
    #       .. which provides F:D and D:C mapping (file, dataset, container in Rucio terms)
    #     df_block_file_rse.filter(col("rse_id").isNull()).count() = 0
    #     df_block_file_rse.filter(col("fsize").isNull()).count() = 0
    #     We are all good, just drop null block names.

    # STEP-10: Left join df_files_complete and df_contents_f_to_b to get block names of files.
    #   - There are some files whose block names we cannot extract from the CONTENTS table, so filter them out.
    df_block_file_rse = df_files_complete \
        .join(df_contents_f_to_b, ["file"], how="left") \
        .select(['block', 'file', 'rse_id', 'accessed_at', 'fsize', ]) \
        .filter(col("block").isNotNull()) \
        .cache()

    # --- STEP-11 / Tests to check dataframes are okay ---:
    #     df_all.filter(col("dataset").isNull()).count() = 280821

    # STEP-11: Left join df_block_file_rse and df_contents_b_to_d to get dataset names of blocks&files.
    #   - There are some blocks whose dataset names we cannot extract from the CONTENTS table, so filter them out.
    df_all = df_block_file_rse \
        .join(df_contents_b_to_d, ["block"], how="left") \
        .select(['dataset', 'block', 'file', 'rse_id', 'accessed_at', 'fsize']) \
        .filter(col("dataset").isNotNull()) \
        .cache()

    # STEP-12: Group by "dataset" and "rse_id" to calculate:
    #   - dataset_size_in_rse: total size of the dataset in an RSE by summing up the dataset's all files in that RSE.
    #   - last_access_time_of_dataset_per_rse: last access time of the dataset in an RSE ...
    #       ... by getting the max of the file `accessed_at` field of the dataset's all files in that RSE.
    #   - #files_null_access_time_per_rse: number of files which have a NULL `accessed_at` field ...
    #       ... in each dataset in an RSE. ...
    #       ... This is important to know to filter out if there is any NULL accessed_at file in the calculation.
    #   - #files_per_rse: number of files of the dataset in that RSE
    #   - #files_unique_per_rse: unique count of the dataset's files in that RSE
    # Final result will be like: one dataset can be in multiple RSEs and presumably ...
    #   ... it may have different sizes since a dataset may have lost one of its blocks or files in an RSE.
    df_final_dataset_rse = df_all \
        .groupby(["dataset", "rse_id"]) \
        .agg(_sum(col("fsize")).alias("dataset_size_in_rse"),
             _max(col("accessed_at")).alias("last_access_time_of_dataset_per_rse"),
             _sum(when(col("accessed_at").isNull(), 1).otherwise(0)).alias("#files_null_access_time_per_rse"),
             _count(lit(1)).alias("#files_per_rse"),
             countDistinct(col("file")).alias("#files_unique_per_rse"),
             ) \
        .cache()

    # STEP-13: Get thresholds. They are unix timestamps which are 3, 6 and 12 months ago from today.
    ts_thresholds = get_ts_thresholds()

    # STEP-14:
    # Filter for calculating last_accessed_at_least_{12|6|3}_months_ago columns.
    #   - To produce correct results, the "last_access_time_of_dataset_per_rse" field should not be null,
    #     which means all of a dataset's files have their accessed_at fields filled.
    #   - And "#files_null_access_time_per_rse"==0 means that there should not be ...
    #     any file with a NULL "accessed_at" field.
    # Group by dataset to get the final result from all RSEs' datasets.
    #   - max_dataset_size(TB): max size of the dataset in all RSEs that contain this dataset
    #   - min_dataset_size(TB): min size of the dataset in all RSEs that contain this dataset
    #   - avg_dataset_size(TB): avg size of the dataset in all RSEs that contain this dataset
    #   - last_access_time_of_dataset: last access time of the dataset in all RSEs
    df = df_final_dataset_rse \
        .filter(col("last_access_time_of_dataset_per_rse").isNotNull() &
                (col("#files_null_access_time_per_rse") == 0)
                ) \
        .groupby(["dataset"]) \
        .agg(_round(_max(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("max_dataset_size(TB)"),
             _round(_min(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("min_dataset_size(TB)"),
             _round(_avg(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("avg_dataset_size(TB)"),
             _sum(col("#files_null_access_time_per_rse")).alias("#files_null_access_time_per_dataset"),
             _max(col("last_access_time_of_dataset_per_rse")).alias("last_access_time_of_dataset"),
             ) \
        .withColumn('last_access_more_than_12_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[12], 1).otherwise(0)
                    ) \
        .withColumn('last_access_more_than_6_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[6], 1).otherwise(0)
                    ) \
        .withColumn('last_access_more_than_3_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[3], 1).otherwise(0)
                    ) \
        .filter((col('last_access_more_than_12_months_ago') == 1) |
                (col('last_access_more_than_6_months_ago') == 1) |
                (col('last_access_more_than_3_months_ago') == 1)
                ) \
        .cache()

    # STEP-15: Find datasets which have only null accessed_at fields in their files
    df_all_null_accessed_at = df_final_dataset_rse \
        .filter(col("last_access_time_of_dataset_per_rse").isNull()) \
        .groupby(["dataset"]) \
        .agg(_round(_max(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("max_dataset_size(TB)"),
             _round(_min(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("min_dataset_size(TB)"),
             _round(_avg(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("avg_dataset_size(TB)"),
             _sum(col("#files_null_access_time_per_rse")).alias("#files_null_access_time_per_dataset"),
             _max(col("last_access_time_of_dataset_per_rse")).alias("last_access_time_of_dataset"),
             ) \
        .cache()

    # Totals for not-null data: not read for more than 3, 6, 12 months; this equals the "more than 3 months" values.
    df.select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()

    # For 12 months
    df.filter(col("last_access_more_than_12_months_ago") == 1).select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df.filter(col("last_access_more_than_12_months_ago") == 1).count())

    # For 6 months
    df.filter(col("last_access_more_than_6_months_ago") == 1).select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df.filter(col("last_access_more_than_6_months_ago") == 1).count())

    # For 3 months
    df.filter(col("last_access_more_than_3_months_ago") == 1).select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df.filter(col("last_access_more_than_3_months_ago") == 1).count())

    # For all null accessed_at(all files) datasets
    df_all_null_accessed_at.select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df_all_null_accessed_at.count())

    return df, df_all_null_accessed_at
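# Hypothetical sketch of the get_ts_thresholds helper used in STEP-13 above (not
# shown in this snippet). It is assumed to return Unix timestamps for 3, 6 and 12
# months ago, keyed by the number of months and approximating a month as 30 days;
# the seconds-vs-milliseconds unit is an assumption.
from datetime import datetime, timedelta, timezone

def get_ts_thresholds():
    now = datetime.now(timezone.utc)
    return {m: int((now - timedelta(days=30 * m)).timestamp()) for m in (3, 6, 12)}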
def process_and_get_pd_dfs(spark, start_date, end_date): schema = _get_schema() raw_df = ( spark.read.option('basePath', DEFAULT_HDFS_FOLDER).json( get_candidate_files(start_date, end_date, spark, base=DEFAULT_HDFS_FOLDER), schema=schema, ).select('data.*').filter( col("RecordTime").between(f"{start_date.timestamp() * 1000}", f"{end_date.timestamp() * 1000}")). filter( (col('Site') == 'T3_US_ANL') | # ANL (col('Site') == 'T3_US_NERSC') | # NERSC (col('Site') == 'T3_US_OSG') | # OSG (col('Site') == 'T3_US_PSC') | # PSC (col('Site') == 'T3_US_SDSC') | # SDSC (col('Site') == 'T3_US_TACC') | # TACC ((col('Site').endswith('_ES_PIC_BSC')) & (col('MachineAttrCMSSubSiteName0') == 'PIC-BSC')) | # BSC ((col('Site') == 'T1_IT_CNAF') & (col('MachineAttrCMSSubSiteName0') == 'CNAF-CINECA')) | # CINECA ((col('Site') == 'T1_DE_KIT') & (col('MachineAttrCMSSubSiteName0') == 'KIT-HOREKA')) | # HOREKA ((col('Site') == 'T2_DE_RWTH') & (col('MachineAttrCMSSubSiteName0') == 'RWTH-HPC')) # RWTH ).filter(col('Status').isin([ 'Running', 'Completed' ])).withColumn('date', from_unixtime( (col('RecordTime') / 1000))).withColumn( 'site_name', when(col('Site') == 'T3_US_ANL', lit("ANL")).when( col('Site') == 'T3_US_NERSC', lit("NERSC")).when( col('Site') == 'T3_US_OSG', lit("OSG")).when( col('Site') == 'T3_US_PSC', lit("PSC")).when( col('Site') == 'T3_US_SDSC', lit("SDSC")).when( col('Site') == 'T3_US_TACC', lit("TACC")). when( col('Site').endswith('_ES_PIC_BSC'), lit("BSC")).when( col('MachineAttrCMSSubSiteName0') == 'CNAF-CINECA', lit("CINECA")).when( col('MachineAttrCMSSubSiteName0') == 'KIT-HOREKA', lit("HOREKA")). when( col('MachineAttrCMSSubSiteName0') == 'RWTH-HPC', lit("RWTH"))).withColumn( "RequestCpus", when( col("RequestCpus").isNotNull(), col("RequestCpus")).otherwise(lit(1)), ).withColumn('dayofmonth', _dayofmonth( col('date'))).withColumn( 'month', concat_ws( '-', _year(col('date')), format_string('%02d', _month( col('date')))) # 2-digit month, default 1 ).drop( 'Site', 'MachineAttrCMSSubSiteName0').withColumnRenamed( 'site_name', 'site')) # There should be only Completed status for a GlobalJobId df_core_hr = raw_df.filter(col('Status') == 'Completed') \ .drop_duplicates(["GlobalJobId"]) df_core_hr_daily = df_core_hr.groupby(['site', 'month', 'dayofmonth']) \ .agg(_round(_sum("CoreHr")).alias("sum CoreHr")) df_core_hr_monthly = df_core_hr.groupby(['site', 'month']) \ .agg(_round(_sum("CoreHr")).alias("sum CoreHr")) sec_12_min = 60 * 12 time_window_12m = from_unixtime( unix_timestamp('date') - unix_timestamp('date') % sec_12_min) # 1st group-by includes GlobaljobId to get running cores of GlobaljobId without duplicates in each 12 minutes window # 2nd group-by gets sum of RequestCpus in 12 minutes window # 3rd group-by gets avg of RequestCpus(12 minutes window) for each site for each month df_running_cores_daily = raw_df \ .withColumn('12m_window', time_window_12m) \ .groupby(['site', 'month', 'dayofmonth', '12m_window', 'GlobalJobId']) \ .agg(_max(col('RequestCpus')).alias('running_cores_of_single_job_in_12m')) \ .groupby(['site', 'month', 'dayofmonth', '12m_window']) \ .agg(_sum(col('running_cores_of_single_job_in_12m')).alias('running_cores_12m_sum')) \ .groupby(['site', 'month', 'dayofmonth']) \ .agg(_round(_avg(col('running_cores_12m_sum'))).alias('running_cores_avg_over_12m_sum')) return df_core_hr_daily.toPandas(), df_running_cores_daily.toPandas( ), df_core_hr_monthly.toPandas()
def recommandationAction(request): dateCreation = datetime.datetime.strptime(request.POST.get('date'), '%Y-%m-%d') recieved = int(model_recieved.predict([[request.POST.get("compSent"), dateCreation.month, request.POST.get("hour"), dateCreation.day, (dateCreation.weekday() + 1)]])[0]) df_globalStat = spark.read.json("/dataLake/Tisseo/mail_sending_global_stats") nearnbSend = (df_globalStat.where("volumesend <= "+str(request.POST.get("compSent"))).agg(_max("volumesend").alias("nbMailSend")).collect())[0][0] bestClic = (df_globalStat.where("volumesend = "+str(nearnbSend)).collect())[0][1] opens = int(model_opens.predict([[request.POST.get("compSent"), recieved, dateCreation.month, request.POST.get("hour"), dateCreation.day, (int(dateCreation.weekday()) + 1), bestClic]])) clics = int(model_clic.predict([[request.POST.get("compSent"), recieved, dateCreation.month, request.POST.get("hour"), dateCreation.day , (int(dateCreation.weekday()) + 1), opens ]])) data = {'data_sent_recieved_inqueue' : [request.POST.get("compSent"), recieved, (int(request.POST.get("compSent")) - recieved)], "data_open_clic" : [request.POST.get("compSent"), opens, clics]} if request.POST.get("databaseSave") == "1" : if request.method == "POST": obj = {'user' : request.user,'compName' : request.POST.get('compName'),'sent' : request.POST.get('compSent'),'recieved' : recieved,'opens' : opens,'clics' : clics,'dateRecomendation' : request.POST.get('date') ,'hour' : request.POST.get('hour')} recom = recommandationForm(obj) recom.save() recom_Analyse = recommandation.objects.aggregate(Sum('sent'), Sum('recieved'), Sum('clics'), Sum('opens')) data["sent"] = recom_Analyse["sent__sum"] data["recieved"] = recom_Analyse["recieved__sum"] data["clics"] = recom_Analyse["clics__sum"] data["opens"] = recom_Analyse["opens__sum"] data["in_queue"] = recom_Analyse["sent__sum"] - recom_Analyse["recieved__sum"] data["nbComp"] = recommandation.objects.count() data['comp_list'] = render_to_string('ajax_template_dashbord/listCompagnes.html',{'recommandation' : recommandation.objects.all()}, request=request) dataPerTimes = recommandation.objects.values('dateRecomendation').order_by('dateRecomendation').annotate(clics=Sum('clics'), opens=Sum('opens'), recieved=Sum('recieved')) labels, clicsLst, opensLst, recievedLst = [], [], [], [] for dataPerTime in dataPerTimes: recievedLst.append(dataPerTime["recieved"]) clicsLst.append(dataPerTime["clics"]) opensLst.append(dataPerTime["opens"]) dateRec = dataPerTime["dateRecomendation"].split("-") labels.append(str(dateRec[2]+" "+(calendar.month_name[int(dateRec[1])])+" "+dateRec[0])) data["recievedLst"] = recievedLst data["clicsLst"] = clicsLst data["opensLst"] = opensLst data["labels"] = labels return HttpResponse(json.dumps(data), content_type="application/json")
# Define schema and retrieve data from the blob storage
schema = t.StructType() \
    .add("time", t.StringType(), True) \
    .add("open", t.DoubleType(), True) \
    .add("close", t.DoubleType(), True) \
    .add("high", t.DoubleType(), True) \
    .add("low", t.DoubleType(), True) \
    .add("volume", t.DoubleType(), True) \
    .add("input_file", t.StringType(), True)

df = spark.read.format(file_type).options(
    header="true", inferSchema="true").schema(schema).load(file_location).withColumn(
        "input_file", input_file_name())

# Get and split the file name to create a column with the corresponding coin pair for each row
split_col = split(df['input_file'], '/')
df = df.withColumn('coin_pair', split(split_col.getItem(3), '\.').getItem(0))
df = df.drop("input_file")

# We have a timestamp and we want a date
df = df.withColumn('Date', from_unixtime((col('time') / 1000)))

# Aggregate data to have a monthly result, ready to insert into the database
df = df.withColumn("date", to_date(concat_ws("-", year('Date'), month('Date')))) \
    .groupBy("coin_pair", "date") \
    .agg(_sum('volume').alias("monthlyVolume"),
         (last('close')-first('open')).alias("monthlyResult"),
         _min('low').alias("monthlyLower"),
         _max('high').alias("monthlyHigher")) \
    .orderBy("coin_pair", "date")

# Write data into the MongoDB database coins and monthly_coins_data collection
df.write.format("com.mongodb.spark.sql.DefaultSource").option(
    "uri", "mongodb://XXXXXXX/coins.monthly_coins_data").mode("append").save()
N = 5

"""
repartition based on 'LNAME' and 'Address' and generate spark_partition_id,
then run the mapPartitions() function and create an in-partition idx
"""
df1 = df.repartition(N, 'LNAME', 'Address') \
    .rdd.mapPartitionsWithIndex(func) \
    .toDF()

# get the number of unique rows (based on Address+LNAME) per partition, which is max_idx (cnt)
# and then grab the running SUM of this cnt as rcnt
# the new df should be small, so just cache it
w1 = Window.partitionBy().orderBy('partition_id').rowsBetween(Window.unboundedPreceding, -1)

df2 = df1.groupby('partition_id') \
    .agg((_max('idx')).alias('cnt')) \
    .withColumn('rcnt', coalesce(_sum('cnt').over(w1), lit(0))) \
    .cache()
df2.show()
#+------------+---+----+
#|partition_id|cnt|rcnt|
#+------------+---+----+
#|           0|  3|   0|
#|           1|  1|   3|
#|           2|  1|   4|
#|           4|  1|   5|
#+------------+---+----+

"""join df1 with df2 and create id = idx + rcnt"""
df_new = df1.join(df2, on=['partition_id']).withColumn('id', col('idx') + col('rcnt'))
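# Hypothetical sketch of the `func` passed to mapPartitionsWithIndex above (it is
# not defined in the snippet): it tags every row with its partition_id and a
# 1-based in-partition idx, so the later join with df2 can build the global id.
from pyspark.sql import Row

def func(partition_id, iterator):
    for idx, row in enumerate(iterator, start=1):
        yield Row(**row.asDict(), partition_id=partition_id, idx=idx)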
    .filter('Earnings < 0')\
    .select('Instrument').distinct().orderBy('Instrument')

positive_earnings = fund_data\
    .groupBy('Instrument', year('Date').alias('actual_year'))\
    .agg(avg('Normalized_Income_Avail_to_Cmn_Shareholders').alias('Earnings'))\
    .orderBy('Instrument', 'actual_year')\
    .filter('actual_year >= 2015')\
    .filter('Earnings > 0')

positive_earnings = positive_earnings.join(negative_earnings, 'Instrument', 'leftanti')
positive_earnings.show()

positive_earnings = positive_earnings.withColumn('metric4', lit(1))\
    .drop('actual_year', 'Earnings')\
    .groupBy('Instrument')\
    .agg(_max('metric4').alias('metric4'))\
    .orderBy('Instrument')
positive_earnings.show()

# Metric 5: Dividend payment for past 5 years
no_dividend = fund_data\
    .groupBy('Instrument', year('Date').alias('actual_year'))\
    .agg((_sum('Gross_Dividends___Common_Stock') / avg('Total_Common_Shares_Outstanding')).alias('Dividend')) \
    .filter('actual_year >= 2015')\
    .filter('Dividend == 0')\
    .select('Instrument').distinct().orderBy('Instrument')

dividend = fund_data\
    .groupBy('Instrument', year('Date').alias('actual_year'))\
    .agg((_sum('Gross_Dividends___Common_Stock') / avg('Total_Common_Shares_Outstanding')).alias('Dividend')) \
def create_main_df(spark, hdfs_paths, base_eos_dir): # UTC timestamp of start hour of spark job ts_current_hour = int(datetime.utcnow().replace( minute=0, second=0, microsecond=0, tzinfo=timezone.utc).timestamp() * 1000) # ----------------------------------------------------------------------------------------------------------------- # -- ================== Prepare main Spark dataframes =========================== # Get RSES id, name, type, tier, country, kind from RSES table dump df_rses = spark.read.format("com.databricks.spark.avro").load(hdfs_paths['RSES']) \ .filter(col('DELETED_AT').isNull()) \ .withColumn('replica_rse_id', lower(_hex(col('ID')))) \ .withColumnRenamed('RSE', 'rse') \ .withColumnRenamed('RSE_TYPE', 'rse_type') \ .withColumn('rse_tier', _split(col('rse'), '_').getItem(0)) \ .withColumn('rse_country', _split(col('rse'), '_').getItem(1)) \ .withColumn('rse_kind', when(col("rse").endswith('Temp'), 'temp') .when(col("rse").endswith('Test'), 'test') .otherwise('prod') ) \ .select(['replica_rse_id', 'rse', 'rse_type', 'rse_tier', 'rse_country', 'rse_kind']) # Rucio Dataset(D) refers to dbs block, so we used DBS terminology from the beginning df_contents_f_to_b = spark.read.format("com.databricks.spark.avro").load(hdfs_paths['CONTENTS']) \ .filter(col("SCOPE") == "cms") \ .filter(col("DID_TYPE") == "D") \ .filter(col("CHILD_TYPE") == "F") \ .withColumnRenamed("NAME", "block") \ .withColumnRenamed("CHILD_NAME", "file") \ .select(["block", "file"]) # Rucio Dataset(D) refers to dbs block; Rucio Container(C) refers to dbs dataset. # We used DBS terminology from the beginning df_contents_b_to_d = spark.read.format("com.databricks.spark.avro").load(hdfs_paths['CONTENTS']) \ .filter(col("SCOPE") == "cms") \ .filter(col("DID_TYPE") == "C") \ .filter(col("CHILD_TYPE") == "D") \ .withColumnRenamed("NAME", "dataset") \ .withColumnRenamed("CHILD_NAME", "block") \ .select(["dataset", "block"]) # Get file to dataset map df_contents_ds_files = df_contents_f_to_b.join(df_contents_b_to_d, ["block"], how="left") \ .filter(col('file').isNotNull()) \ .filter(col('dataset').isNotNull()) \ .withColumnRenamed('dataset', 'contents_dataset') \ .withColumn('is_d_name_from_rucio', lit(BOOL_STR[True])) \ .select(["contents_dataset", "file", "is_d_name_from_rucio"]) dbs_files = spark.read.format('avro').load(hdfs_paths['FILES']) \ .withColumnRenamed('LOGICAL_FILE_NAME', 'file') \ .withColumnRenamed('DATASET_ID', 'dbs_file_ds_id') \ .withColumnRenamed('FILE_SIZE', 'dbs_file_size') \ .select(['file', 'dbs_file_ds_id', 'dbs_file_size']) dbs_datasets = spark.read.format('avro').load(hdfs_paths['DATASETS']) df_dbs_ds_files = dbs_files.join(dbs_datasets.select(['DATASET_ID', 'DATASET']), dbs_files.dbs_file_ds_id == dbs_datasets.DATASET_ID, how='left') \ .filter(col('file').isNotNull()) \ .filter(col('DATASET').isNotNull()) \ .withColumnRenamed('dbs_file_ds_id', 'dbs_dataset_id') \ .withColumnRenamed('DATASET', 'dbs_dataset') \ .withColumn('is_d_name_from_dbs', lit(BOOL_STR[True])) \ .select(['file', 'dbs_dataset', 'is_d_name_from_dbs']) # Prepare replicas df_replicas = spark.read.format('avro').load(hdfs_paths['REPLICAS']) \ .filter(col("SCOPE") == "cms") \ .withColumn('replica_rse_id', lower(_hex(col('RSE_ID')))) \ .withColumn('replica_file_size', col('BYTES').cast(LongType())) \ .withColumnRenamed('NAME', 'file') \ .withColumnRenamed('ACCESSED_AT', 'replica_accessed_at') \ .withColumnRenamed('CREATED_AT', 'replica_created_at') \ .withColumnRenamed('LOCK_CNT', 'lock_cnt') \ .withColumnRenamed('STATE', 
'state') \ .select(['file', 'replica_rse_id', 'replica_file_size', 'replica_accessed_at', 'replica_created_at', 'lock_cnt']) # Create enriched file df which adds dbs file size to replicas files. Left join select only replicas files df_files_enriched_with_dbs = df_replicas \ .join(dbs_files.select(['file', 'dbs_file_size']), ['file'], how='left') \ .withColumn('joint_file_size', when(col('replica_file_size').isNotNull(), col('replica_file_size')) .when(col('dbs_file_size').isNotNull(), col('dbs_file_size')) ) \ .select(['file', 'replica_rse_id', 'replica_accessed_at', 'replica_created_at', 'lock_cnt', 'replica_file_size', 'dbs_file_size', 'joint_file_size']) # ----------------------------------------------------------------------------------------------------------------- # -- ================== only Rucio: Replicas and Contents ======================= -- df_only_from_rucio = df_replicas \ .join(df_contents_ds_files, ['file'], how='left') \ .select(['contents_dataset', 'file', 'replica_rse_id', 'replica_file_size', 'replica_accessed_at', 'replica_created_at', 'is_d_name_from_rucio', 'lock_cnt']) # Use them in outer join # _max(col('replica_accessed_at')).alias('rucio_last_accessed_at'), # _max(col('replica_created_at')).alias('rucio_last_created_at'), df_only_from_rucio = df_only_from_rucio \ .groupby(['replica_rse_id', 'contents_dataset']) \ .agg(_sum(col('replica_file_size')).alias('rucio_size'), _count(lit(1)).alias('rucio_n_files'), _sum( when(col('replica_accessed_at').isNull(), 0) .otherwise(1) ).alias('rucio_n_accessed_files'), _first(col("is_d_name_from_rucio")).alias("is_d_name_from_rucio"), _sum(col('lock_cnt')).alias('rucio_locked_files') ) \ .withColumn('rucio_is_d_locked', when(col('rucio_locked_files') > 0, IS_DATASET_LOCKED[True]) .otherwise(IS_DATASET_LOCKED[False]) ) \ .select(['contents_dataset', 'replica_rse_id', 'rucio_size', 'rucio_n_files', 'rucio_n_accessed_files', 'is_d_name_from_rucio', 'rucio_locked_files', 'rucio_is_d_locked', ]) # ----------------------------------------------------------------------------------------------------------------- # -- ================= only DBS: Replicas, Files, Datasets ====================== -- # Of course only files from Replicas processed, select only dbs related fields df_only_from_dbs = df_files_enriched_with_dbs \ .select(['file', 'replica_rse_id', 'dbs_file_size', 'replica_accessed_at', 'lock_cnt']) \ .join(df_dbs_ds_files, ['file'], how='left') \ .filter(col('dbs_dataset').isNotNull()) \ .select(['file', 'dbs_dataset', 'replica_rse_id', 'dbs_file_size', 'replica_accessed_at', 'is_d_name_from_dbs', 'lock_cnt']) df_only_from_dbs = df_only_from_dbs \ .groupby(['replica_rse_id', 'dbs_dataset']) \ .agg(_sum(col('dbs_file_size')).alias('dbs_size'), _count(lit(1)).alias('dbs_n_files'), _sum( when(col('replica_accessed_at').isNull(), 0) .otherwise(1) ).alias('dbs_n_accessed_files'), _first(col("is_d_name_from_dbs")).alias("is_d_name_from_dbs"), _sum(col('lock_cnt')).alias('dbs_locked_files') ) \ .withColumn('dbs_is_d_locked', when(col('dbs_locked_files') > 0, IS_DATASET_LOCKED[True]) .otherwise(IS_DATASET_LOCKED[False]) ) \ .select(['dbs_dataset', 'replica_rse_id', 'dbs_size', 'dbs_n_files', 'dbs_n_accessed_files', 'is_d_name_from_dbs', 'dbs_locked_files', 'dbs_is_d_locked']) # Full outer join of Rucio and DBS to get all dataset-file maps df_dataset_file_map_enr = df_contents_ds_files.join(df_dbs_ds_files, ['file'], how='full') # 
----------------------------------------------------------------------------------------------------------------- # -- ====== check files do not have dataset name ============ -- # Check Replicas files do not have dataset name in Contents, DBS or both x = df_replicas.join(df_dataset_file_map_enr, ['file'], how='left') \ .select(['contents_dataset', 'dbs_dataset', 'file']) y_contents = x.filter(col('contents_dataset').isNull()) z_dbs = x.filter(col('dbs_dataset').isNull()) t_both = x.filter( col('contents_dataset').isNull() & col('dbs_dataset').isNull()) stats_dict = { "Replicas files do not have dataset name in Contents": y_contents.select('file').distinct().count(), "Replicas files do not have dataset name in DBS": z_dbs.select('file').distinct().count(), "Replicas files do not have dataset name neither in Contents nor DBS": t_both.select('file').distinct().count() } write_stats_to_eos(base_eos_dir, stats_dict) del x, y_contents, z_dbs, t_both # ----------------------------------------------------------------------------------------------------------------- # -- ====== joint Rucio and DBS: Replicas, Contents, Files, Datasets ============ -- # Main aim is to get all datasets of files df_dataset_file_map_enr = df_dataset_file_map_enr \ .withColumn("dataset", when(col("contents_dataset").isNotNull(), col("contents_dataset")) .when(col("dbs_dataset").isNotNull(), col("dbs_dataset")) ) \ .withColumn("is_ds_from_rucio", when(col("is_d_name_from_rucio").isNotNull(), 1).otherwise(0)) \ .withColumn("is_ds_from_dbs", when(col("is_d_name_from_dbs").isNotNull(), 1).otherwise(0)) \ .select(['dataset', 'file', 'is_ds_from_dbs', 'is_ds_from_rucio']) df_joint_ds_files = df_files_enriched_with_dbs \ .select(['file', 'replica_rse_id', 'replica_accessed_at', 'replica_created_at', 'joint_file_size', 'lock_cnt']) \ .join(df_dataset_file_map_enr, ['file'], how='left') \ .filter(col('dataset').isNotNull()) \ .select(['dataset', 'file', 'is_ds_from_dbs', 'is_ds_from_rucio', 'replica_rse_id', 'replica_accessed_at', 'replica_created_at', 'joint_file_size', 'lock_cnt']) df_joint_main = df_joint_ds_files \ .groupby(['replica_rse_id', 'dataset']) \ .agg(_sum(col('joint_file_size')).alias('joint_size'), _max(col('replica_accessed_at')).alias('joint_last_accessed_at'), _max(col('replica_created_at')).alias('joint_last_created_at'), _sum(col('is_ds_from_dbs')).alias('joint_dbs_n_files'), _sum(col('is_ds_from_rucio')).alias('joint_rucio_n_files'), _count(lit(1)).alias('joint_n_files'), _sum( when(col('replica_accessed_at').isNull(), 0).otherwise(1) ).alias('joint_n_accessed_files'), _sum(col('lock_cnt')).alias('joint_locked_files') ) \ .withColumn('all_f_in_dbs', when((col('joint_dbs_n_files') == 0) & (col('joint_dbs_n_files').isNull()), IS_ALL_DATASET_FILES_EXISTS['n']) .when(col('joint_dbs_n_files') == col('joint_n_files'), IS_ALL_DATASET_FILES_EXISTS['a']) .when(col('joint_dbs_n_files') > 0, IS_ALL_DATASET_FILES_EXISTS['p']) ) \ .withColumn('all_f_in_rucio', when((col('joint_rucio_n_files') == 0) & (col('joint_rucio_n_files').isNull()), IS_ALL_DATASET_FILES_EXISTS['n']) .when(col('joint_rucio_n_files') == col('joint_n_files'), IS_ALL_DATASET_FILES_EXISTS['a']) .when(col('joint_rucio_n_files') > 0, IS_ALL_DATASET_FILES_EXISTS['p']) ) \ .withColumn('joint_is_d_locked', when(col('joint_locked_files') > 0, IS_DATASET_LOCKED[True]) .otherwise(IS_DATASET_LOCKED[False]) ) \ .withColumnRenamed("replica_rse_id", "rse_id") \ .select(['dataset', 'rse_id', 'joint_size', 'joint_last_accessed_at', 'joint_last_created_at', 
'joint_dbs_n_files', 'joint_rucio_n_files', 'joint_n_files', 'joint_n_accessed_files', 'all_f_in_dbs', 'all_f_in_rucio', 'joint_locked_files', 'joint_is_d_locked' ]) # ----------------------------------------------------------------------------------------------------------------- # -- ============ Dataset enrichment with Dataset tags ============ -- # Enrich dbs dataset with names from id properties of other tables dbs_data_tiers = spark.read.format('avro').load(hdfs_paths['DATA_TIERS']) dbs_physics_group = spark.read.format('avro').load( hdfs_paths['PHYSICS_GROUPS']) dbs_acquisition_era = spark.read.format('avro').load( hdfs_paths['ACQUISITION_ERAS']) dbs_dataset_access_type = spark.read.format('avro').load( hdfs_paths['DATASET_ACCESS_TYPES']) dbs_datasets_enr = dbs_datasets \ .join(dbs_data_tiers, ['data_tier_id'], how='left') \ .join(dbs_physics_group, ['physics_group_id'], how='left') \ .join(dbs_acquisition_era, ['acquisition_era_id'], how='left') \ .join(dbs_dataset_access_type, ['dataset_access_type_id'], how='left') \ .select(['dataset', 'dataset_id', 'is_dataset_valid', 'primary_ds_id', 'processed_ds_id', 'prep_id', 'data_tier_id', 'data_tier_name', 'physics_group_id', 'physics_group_name', 'acquisition_era_id', 'acquisition_era_name', 'dataset_access_type_id', 'dataset_access_type']) # ----------------------------------------------------------------------------------------------------------------- # -- ============ Main: join all ============ -- cond_with_only_rucio = [ df_joint_main.dataset == df_only_from_rucio.contents_dataset, df_joint_main.rse_id == df_only_from_rucio.replica_rse_id ] cond_with_only_dbs = [ df_joint_main.dataset == df_only_from_dbs.dbs_dataset, df_joint_main.rse_id == df_only_from_dbs.replica_rse_id ] # Left joins: since df_join_main has outer join, should have all datasets of both Rucio and DBS df_main = df_joint_main.join(df_only_from_rucio, cond_with_only_rucio, how='left').drop('replica_rse_id') df_main = df_main.join(df_only_from_dbs, cond_with_only_dbs, how='left').drop('replica_rse_id') df_main = df_main \ .withColumn('rucio_has_ds_name', when(col('is_d_name_from_rucio').isNotNull(), col('is_d_name_from_rucio')) .otherwise(BOOL_STR[False])) \ .withColumn('dbs_has_ds_name', when(col('is_d_name_from_dbs').isNotNull(), col('is_d_name_from_dbs')) .otherwise(BOOL_STR[False])) # Remove unneeded columns by selecting specific ones df_main = df_main.select([ 'dataset', 'rse_id', 'joint_size', 'joint_last_accessed_at', 'joint_last_created_at', 'joint_dbs_n_files', 'joint_rucio_n_files', 'joint_n_files', 'joint_n_accessed_files', 'all_f_in_dbs', 'all_f_in_rucio', 'rucio_size', 'rucio_n_files', 'rucio_n_accessed_files', 'rucio_has_ds_name', 'dbs_size', 'dbs_n_files', 'dbs_n_accessed_files', 'dbs_has_ds_name', 'rucio_locked_files', 'rucio_is_d_locked', 'dbs_locked_files', 'dbs_is_d_locked', 'joint_locked_files', 'joint_is_d_locked' ]) # Add DBS dataset enrichment's to main df df_main = df_main.join(dbs_datasets_enr, ['dataset'], how='left') # Add RSES name, type, tier, country, kind to dataset df_main = df_main \ .join(df_rses, df_main.rse_id == df_rses.replica_rse_id, how='left') \ .drop('rse_id', 'replica_rse_id') # UTC timestamp of start hour of the spark job df_main = df_main.withColumn('tstamp_hour', lit(ts_current_hour)) # Fill null values of string type columns. Null values is hard to handle in ES queries. df_main = df_main.fillna(value=NULL_STR_TYPE_COLUMN_VALUE, subset=STR_TYPE_COLUMNS) return df_main