Code example #1
def get_df_main_datasets_never_read(df_dataset_file_rse_ts_size,
                                    filtered_rses_id_name_map, min_tb_limit):
    """Get never accessed datasets' dataframes"""
    reverted_filtered_rses_id_name_map = get_reverted_rses_id_name_map(
        filtered_rses_id_name_map)
    df_sub_datasets_never_read = df_dataset_file_rse_ts_size \
        .groupby(['rse_id', 'dataset']) \
        .agg(_round(_sum(col('f_size')) / TB_DENOMINATOR, 5).alias('dataset_size_in_rse_tb'),
             _max(col('accessed_at')).alias('last_access_time_of_dataset_in_rse'),
             _max(col('created_at')).alias('last_create_time_of_dataset_in_rse'),
             _count(lit(1)).alias('#_files_of_dataset_in_rse'),
             ) \
        .filter(col('last_access_time_of_dataset_in_rse').isNull()) \
        .filter(col('dataset_size_in_rse_tb') > min_tb_limit) \
        .replace(reverted_filtered_rses_id_name_map, subset=['rse_id']) \
        .withColumnRenamed('rse_id', 'RSE name') \
        .select(['RSE name',
                 'dataset',
                 'dataset_size_in_rse_tb',
                 'last_create_time_of_dataset_in_rse',
                 '#_files_of_dataset_in_rse'
                 ]) \
        .cache()

    df_main_datasets_never_read = df_sub_datasets_never_read \
        .groupby(['dataset']) \
        .agg(_max(col('dataset_size_in_rse_tb')).alias('max_dataset_size_in_rses(TB)'),
             _min(col('dataset_size_in_rse_tb')).alias('min_dataset_size_in_rses(TB)'),
             _avg(col('dataset_size_in_rse_tb')).alias('avg_dataset_size_in_rses(TB)'),
             _sum(col('dataset_size_in_rse_tb')).alias('sum_dataset_size_in_rses(TB)'),
             _max(col('last_create_time_of_dataset_in_rse')).alias('last_create_time_of_dataset_in_all_rses'),
             concat_ws(', ', collect_list('RSE name')).alias('RSE(s)'),
             ) \
        .cache()
    return df_main_datasets_never_read, df_sub_datasets_never_read
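
The aliased names used throughout these snippets (_max, _min, _sum, _avg, _count, _round, _mean, _abs and so on) are the standard pyspark.sql.functions, renamed on import so they do not shadow Python built-ins such as max and sum. A minimal import block along these lines is assumed by most of the examples; project-specific helpers such as get_reverted_rses_id_name_map and constants such as TB_DENOMINATOR are not shown here.

from pyspark.sql.functions import (
    col, lit, when, concat_ws, collect_list, countDistinct,
    max as _max, min as _min, sum as _sum, avg as _avg, mean as _mean,
    count as _count, round as _round, abs as _abs, first as _first,
)
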
Code example #2
def get_prcp_day(df_in):
    # Group the values, aggregating certain fields
    print('BUILDING DATAFRAME WITH VALUES AGGREGATED BY DAY')
    df_out = df_in.groupby('city', 'yr', 'mo', 'da').agg(
        _abs(_max('lat')).alias('latitude'),
        _sum('prcp').alias('prcp_dia'),
        _max('tmax').alias('tmax'),
        _min('tmin').alias('tmin'),
        _mean('temp').alias('med_temp')).orderBy('city', 'yr', 'mo', 'da')
    return df_out
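
A minimal usage sketch for get_prcp_day, assuming a local SparkSession and a toy DataFrame with the columns the function expects (the city, coordinates and weather values below are made up):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("prcp-demo").getOrCreate()
sample = spark.createDataFrame(
    [("Recife", 2020, 1, 1, -8.05, 3.2, 30.1, 22.4, 26.0),
     ("Recife", 2020, 1, 1, -8.05, 0.0, 31.0, 23.0, 27.5)],
    ["city", "yr", "mo", "da", "lat", "prcp", "tmax", "tmin", "temp"])
get_prcp_day(sample).show()
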
Code example #3
def updateStatsPerDate(request):
    df_stathour = spark.read.json("/dataLake/"+request.POST.get("bdname")+"/stat_hour")
    statsPerDate = {"clic" : [] ,"open" : [] , "recieved" : [], "dates" : []}
    # After Loading page
    if request.POST.get("etat") == "0" :
        if  request.POST.get("withCompanes") == "1":
            lstCompanes = list(request.POST.get("companes").split(","))
            df_stats = df_stathour.where(df_stathour["mail_sending_id"].isin(lstCompanes)).groupBy(year("date").alias("year"), month("date").alias("month")).agg(_sum("opens").alias("opens"),_sum("clics").alias("clics"),_sum("recieved").alias("recieved")).orderBy("year", "month")
        else:
            df_stats = df_stathour.groupBy(year("date").alias("year"), month("date").alias("month")).agg(_sum("opens").alias("opens"),_sum("clics").alias("clics"),_sum("recieved").alias("recieved")).orderBy("year", "month")
        for content in df_stats.collect():
            statsPerDate["dates"].append(calendar.month_name[content["month"]]+" "+str(content["year"]))
            statsPerDate["clic"].append(content["clics"])
            statsPerDate["open"].append(content["opens"])
            statsPerDate["recieved"].append(content["recieved"])
    #Actions with listener
    elif request.POST.get("etat") == "1" :
        if request.POST.get("action") == "week" or request.POST.get("action") == "month"  :
            if request.POST.get("action") == "week":
                nbday = 7
            else:
                nbday = 30
            if  request.POST.get("withCompanes") == "1":
                lstCompanes = list(request.POST.get("companes").split(","))
                maxDateInStatHour =  (datetime.datetime.strptime(df_stathour.where(df_stathour["mail_sending_id"].isin(lstCompanes)).agg(_max("date").alias("date")).collect()[0]["date"], "%Y-%m-%d") - datetime.timedelta(days=nbday)).strftime("%Y-%m-%d")
                df_stats =  df_stathour.where(df_stathour["mail_sending_id"].isin(lstCompanes)).where("date >'"+maxDateInStatHour+"'").groupBy(year("date").alias("year"), month("date").alias("month"), dayofmonth("date").alias("day")).agg(_sum("opens").alias("opens"),_sum("clics").alias("clics"),_sum("recieved").alias("recieved")).orderBy("year", "month", "day")
            else:
                maxDateInStatHour =  (datetime.datetime.strptime(df_stathour.agg(_max("date").alias("date")).collect()[0]["date"], "%Y-%m-%d") - datetime.timedelta(days=nbday)).strftime("%Y-%m-%d")
                df_stats =  df_stathour.where("date >'"+maxDateInStatHour+"'").groupBy(year("date").alias("year"), month("date").alias("month"), dayofmonth("date").alias("day")).agg(_sum("opens").alias("opens"),_sum("clics").alias("clics"),_sum("recieved").alias("recieved")).orderBy("year", "month", "day")

            for content in df_stats.collect():
                statsPerDate["dates"].append(str(content["day"])+" "+calendar.month_name[content["month"]])
                statsPerDate["clic"].append(content["clics"])
                statsPerDate["open"].append(content["opens"])
                statsPerDate["recieved"].append(content["recieved"])
        elif request.POST.get("action") == "sixMonth" :
            if  request.POST.get("withCompanes") == "1":
                lstCompanes = list(request.POST.get("companes").split(","))
                maxDateInStatHour =  (datetime.datetime.strptime(df_stathour.where(df_stathour["mail_sending_id"].isin(lstCompanes)).agg(_max("date").alias("date")).collect()[0]["date"], "%Y-%m-%d") - datetime.timedelta(days=180)).strftime("%Y-%m-%d")
                df_stats = df_stathour.where(df_stathour["mail_sending_id"].isin(lstCompanes)).where("date >'"+maxDateInStatHour+"'").groupBy(year("date").alias("year"), month("date").alias("month")).agg(_sum("opens").alias("opens"),_sum("clics").alias("clics"),_sum("recieved").alias("recieved")).orderBy("year", "month")
            else:
                maxDateInStatHour =  (datetime.datetime.strptime(df_stathour.agg(_max("date").alias("date")).collect()[0]["date"], "%Y-%m-%d") - datetime.timedelta(days=180)).strftime("%Y-%m-%d")
                df_stats = df_stathour.where("date >'"+maxDateInStatHour+"'").groupBy(year("date").alias("year"), month("date").alias("month")).agg(_sum("opens").alias("opens"),_sum("clics").alias("clics"),_sum("recieved").alias("recieved")).orderBy("year", "month")
            for content in df_stats.collect():
                statsPerDate["dates"].append(calendar.month_name[content["month"]]+" "+str(content["year"]))
                statsPerDate["clic"].append(content["clics"])
                statsPerDate["open"].append(content["opens"])
                statsPerDate["recieved"].append(content["recieved"])
    elif  request.POST.get("etat") == "2" :
        pass
    return HttpResponse(json.dumps(statsPerDate), content_type="application/json")
Code example #4
def get_df_main_not_read_since(df_sub_not_read_since):
    """Get dataframe of datasets not read since N months for main htmls.

    Get the last access time of each dataset across all RSE(s)
    """
    return df_sub_not_read_since \
        .groupby(['dataset']) \
        .agg(_max(col('dataset_size_in_rse_tb')).alias('max_dataset_size_in_rses(TB)'),
             _min(col('dataset_size_in_rse_tb')).alias('min_dataset_size_in_rses(TB)'),
             _avg(col('dataset_size_in_rse_tb')).alias('avg_dataset_size_in_rses(TB)'),
             _sum(col('dataset_size_in_rse_tb')).alias('sum_dataset_size_in_rses(TB)'),
             _max(col('last_access_time_of_dataset_in_rse')).alias('last_access_time_of_dataset_in_all_rses'),
             concat_ws(', ', collect_list('RSE name')).alias('RSE(s)'),
             ) \
        .cache()
Code example #5
def get_device_tracking_observations(df):
    """
    Calculates first_observed_at and last_observed_at for each device from the supplied observations.

    :param df: A DataFrame of network connection observations.
    :return: A DataFrame of (organization, mac, first_observed_at, last_observed_at)
    """

    if df and df.columns:
        outbound_df = df.select(
            'organization',
            col('network_src_mac').name('mac'),
            'occurred_at'
        )

        inbound_df = df.select(
            'organization',
            col('network_dest_mac').name('mac'),
            'occurred_at'
        )

        return outbound_df.union(
            inbound_df
        ).where(
            col('mac').isNotNull()
        ).groupBy(
            'organization',
            'mac'
        ).agg(
            _min('occurred_at').name('first_observed_at'),
            _max('occurred_at').name('last_observed_at'),
        )
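
Column.name() as used above is simply an alias of Column.alias() in PySpark, so the two spellings are interchangeable:

col('network_src_mac').name('mac')    # same result as
col('network_src_mac').alias('mac')
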
Code example #6
def prepare_cyclomatic_complexity_data(
        file_tree_dataframe: DataFrame,
        cyclomatic_complexity_dataframe: DataFrame) -> DataFrame:
    # yapf: disable
    cyclomatic_complexity_max_dataframe = cyclomatic_complexity_dataframe.groupBy(
        'file_hash'
    ).agg(
        _max('complexity').alias('max_complexity_in_file')
    )

    cyclomatic_complexity_stats_df = cyclomatic_complexity_max_dataframe.join(
        file_tree_dataframe,
        'file_hash'
    ).groupBy(
        'firmware_hash'
    ).agg(
        _max('max_complexity_in_file').alias('max_complexity')
    )
    # yapf: enable
    return cyclomatic_complexity_stats_df
Code example #7
def get_metrics(df_in):
    # Group the values, aggregating certain fields
    print('BUILDING DATAFRAME WITH AGGREGATED VALUES')
    df_out = df_in.groupby('yr', 'city').agg(
        _max('elvt').alias('ele_max'),
        _mean('temp').alias('med_temp'),
        _mean('tmin').alias('med_temp_min'),
        _mean('tmax').alias('med_temp_max'),
        _sum('prcp').alias('prcp'),
        _mean('hmdy').alias('med_umi'),
        _mean('hmin').alias('med_umi_min'),
        _mean('hmax').alias('med_umi_max'),
        _mean('wdsp').alias('med_velo_vento'),
        _mean('gust').alias('med_velo_rajadas_vento')).orderBy('yr', 'city')
    return df_out
Code example #8
def get_crab_popularity_ds(start_date,
                           end_date,
                           verbose=False,
                           base=_BASE_PATH):
    """
    Query the HDFS data and return a pandas dataframe with:
    Datatier, Dataset, CMSPrimaryPrimaryDataset, job_count, workflow_count, ChirpCMSSWReadBytes
    args:
        - start_date datetime Start of the query period (RecordTime)
        - end_date datetime End of the query period
    """
    start = int(start_date.timestamp() * 1000)
    end = int(end_date.timestamp() * 1000)
    spark = get_spark_session(yarn=True, verbose=verbose)

    dfs_crabdb = (spark.read.option("basePath", base).json(
        _get_candidate_files(start_date, end_date, spark, base=base),
        schema=_get_crab_condor_schema(),
    ).select("metadata.timestamp",
             "data.*").filter("""Status in ('Completed', 'Removed') AND
                              CRAB_DataBlock is not NULL  AND
                              timestamp >= {} AND
                              timestamp <= {}""".format(
                 start, end)).repartition("CRAB_DataBlock").drop_duplicates([
                     "GlobalJobId"
                 ]).withColumnRenamed(
                     "CMSPrimaryPrimaryDataset", "PrimaryDataset").withColumn(
                         "Dataset",
                         regexp_extract("CRAB_DataBlock", "^(.*)/([^/]*)#.*$",
                                        1)).withColumn(
                                            "Datatier",
                                            regexp_extract(
                                                "CRAB_DataBlock",
                                                "^(.*)/([^/]*)#.*$", 2)))
    dfs_crabdb = (dfs_crabdb.groupBy(
        "Datatier", "PrimaryDataset", "Dataset").agg(
            _max(col("RecordTime")),
            _min(col("RecordTime")),
            count(lit(1)),
            countDistinct("CRAB_Workflow"),
            _sum(col("ChirpCMSSWReadBytes")),
        ).withColumnRenamed("count(1)", "job_count").withColumnRenamed(
            "count(DISTINCT CRAB_Workflow)",
            "workflow_count").withColumnRenamed(
                "sum(ChirpCMSSWReadBytes)", "ChirpCMSSWReadBytes").na.fill(
                    "Unknown", ["Datatier", "PrimaryDataset", "Dataset"]))
    return dfs_crabdb.toPandas()
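
The regexp_extract pattern above splits a CRAB_DataBlock value of the form "dataset#block-uuid" at the last path separator before the '#'. A quick illustration with Python's re module on a hypothetical block name (Spark's Java regex behaves the same way for this pattern):

import re

m = re.match(r"^(.*)/([^/]*)#.*$", "/Primary/Processed-v1/AODSIM#abcd-1234")
print(m.group(1))  # "/Primary/Processed-v1"  -> stored as "Dataset"
print(m.group(2))  # "AODSIM"                 -> stored as "Datatier"
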
Code example #9
File: AquisSpark.py Project: ketanpurohit0/python
def useSpark(sourceFile: str, targetTsvFile: str) -> None:
    """[Process the input source files using Spark to transform to target data]

    Args:
        sourceFile (str): [Path to the location of the input data]
        targetTsvFile (str): [Path to the location of the target data]
    """

    # secrets for access to postgres database are held in .env file
    # this loads that into the application environment
    load_dotenv(verbose=True)

    spark = SparkSession.builder \
        .appName('Aquis2') \
        .master("local[2]") \
        .config(conf=getSparkConf(getJars())) \
        .getOrCreate()

    # clean data from source file
    cleanDf = spark.read.text(sourceFile) \
        .filter(col("value").contains("msgType_") & ~col("value").contains('msgType_":11')) \
        .withColumn("value", expr("substring(value,2)")) \
        .withColumn("value", regexp_replace("value", '\{\{', r'\{"header":\{')) \
        .withColumn("value", regexp_replace("value", 'SELL,', '"SELL",')) \
        .withColumn("value", regexp_replace("value", 'BUY,', '"BUY",')) \
        .withColumn("value", regexp_replace("value", '"flags_":"\{"', '"flags_":\{"'))

    # figure out schema on message 8, keep for re-use later as a technology demonstration
    msg8Schema = spark.read.json(
        cleanDf.filter(col("value").contains('"msgType_":8')).select(
            col("value").cast("string")).rdd.map(
                lambda r: r.value))._jdf.schema().toDDL()
    msg8Df = cleanDf.filter(col("value").contains('"msgType_":8')).withColumn("value", from_json("value", msg8Schema)) \
        .select("value.security_.securityId_", "value.security_.isin_", "value.security_.currency_") \
        .repartition(2, ["securityId_"])
    # msg8Df.printSchema()
    # root
    # | -- securityId_: long(nullable=true)
    # | -- isin_: string(nullable=true)
    # | -- currency_: string(nullable=true)

    # figure out schema on message 12, keep for re-use later as a technology demonstration
    msg12Schema = spark.read.json(
        cleanDf.filter(col("value").contains('"msgType_":12')).select(
            col("value").cast("string")).rdd.map(
                lambda r: r.value))._jdf.schema().toDDL()
    msg12Df = cleanDf.filter(col("value").contains('"msgType_":12')) \
        .withColumn("value", from_json("value", msg12Schema)) \
        .repartition(2, ["value.bookEntry_.securityId_"])

    # msg12Df.printSchema()
    # msg12Df.select("value.bookEntry_.side_").show()
    # root
    # | -- value: struct(nullable=true)
    # | | -- bookEntry_: struct(nullable=true)
    # | | | -- orderId_: long(nullable=true)
    # | | | -- price_: long(nullable=true)
    # | | | -- quantity_: long(nullable=true)
    # | | | -- securityId_: long(nullable=true)
    # | | | -- side_: string(nullable=true)
    # | | -- header: struct(nullable=true)
    # | | | -- length_: long(nullable=true)
    # | | | -- msgType_: long(nullable=true)
    # | | | -- seqNo_: long(nullable=true)

    # now aggregate messageType12 by securityId_ and side_
    aggDfSells = msg12Df.filter("value.bookEntry_.side_ == 'SELL'") \
        .select("*", (col("value.bookEntry_.quantity_") * col("value.bookEntry_.price_")).alias("TotalSellAmount")) \
        .groupby("value.bookEntry_.securityId_") \
        .agg(count("value.bookEntry_.securityId_").alias("Total Sell Count"),
             _sum("value.bookEntry_.quantity_").alias("Total Sell Quantity"),
             _min("value.bookEntry_.price_").alias("Min Sell Price"),
             _sum("TotalSellAmount").alias("Weighted Average Sell Price")
             ) \
        .withColumn("Weighted Average Sell Price", col("Weighted Average Sell Price") / col("Total Sell Quantity"))

    # now aggregate messageType12 by securityId_ and side_
    aggDfBuys = msg12Df.filter("value.bookEntry_.side_ == 'BUY'") \
        .select("*", (col("value.bookEntry_.quantity_") * col("value.bookEntry_.price_")).alias("TotalBuyAmount")) \
        .groupby("value.bookEntry_.securityId_") \
        .agg(count("value.bookEntry_.securityId_").alias("Total Buy Count"),
             _sum("value.bookEntry_.quantity_").alias("Total Buy Quantity"),
             _max("value.bookEntry_.price_").alias("Max Buy Price"),
             _sum("TotalBuyAmount").alias("Weighted Average Buy Price")) \
        .withColumn("Weighted Average Buy Price", col("Weighted Average Buy Price") / col("Total Buy Quantity"))

    # bring it together with joins, use outer join with the security data due to missing ids
    # select columns in the following order..
    outputColList = [
        col("isin_").alias("ISIN"),
        col("currency_").alias("Currency"), "Total Buy Count",
        "Total Sell Count", "Total Buy Quantity", "Total Sell Quantity",
        "Weighted Average Buy Price", "Weighted Average Sell Price",
        "Max Buy Price", "Min Sell Price"
    ]

    outputDf = aggDfBuys.join(aggDfSells, ["securityId_"], "full_outer") \
        .join(msg8Df, ["securityId_"], "left_outer") \
        .na.fill(0, outputColList[2:]) \
        .na.fill("MISSING", ["isin_", "currency_"]) \
        .select(outputColList)

    # collect into a single file
    outputDf.coalesce(1).write.option("sep", "\t").csv(targetTsvFile,
                                                       header=True)

    # Demo writing to postgresql (msg8 dataframe)
    # will append records to table AcquisExample. Table will
    # be created on the fly if it does not exist.
    dburl = getDbConnectionUrl(db=os.getenv("POSTGRES_DB"),
                               user=os.getenv("POSTGRES_USER"),
                               secret=os.getenv("POSTGRES_SECRET"))
    msg8Df.write.format("jdbc") \
        .option("url", dburl) \
        .option("dbtable", "AcquisExample") \
        .option("driver", "org.postgresql.Driver") \
        .save(mode="append")

    spark.stop()
Code example #10
    # remove useless data
    init_flat_data = init_flat_data.fillna(0)

    #cast columns
    init_flat_data = init_flat_data \
        .withColumn('order_date', to_date(unix_timestamp("order_date", "yyyy-MM-dd").cast('timestamp')) ) \
        .withColumn('NBI', col('NBI').cast('float') ) \
        .withColumn('order_id', col('order_id').cast('int')) \
        .withColumn('company_id', col('company_id').cast('int'))

    # print schema
    print(init_flat_data.printSchema(), '\n')

    # calculate min and max of order date in order to calculate recency
    max_order_date, min_order_date = init_flat_data \
        .select( _max(col('order_date')), _min(col('order_date'))) \
        .take(1)[0]

    # calculate recency/frequency and monetary
    calculate_diff_day = udf(lambda x: (max_order_date - x).days,
                             IntegerType())
    rfm_table = init_flat_data \
        .withColumn('recency', calculate_diff_day('order_date')) \
        .groupby(['company_id', 'company_name', 'country']) \
        .agg(
            _mean(col('recency')).alias('recency'),
            _count(col('order_id')).alias('frequency'),
            _sum(col('NBI')).alias('monetary')
        )

    # calculate quantiles for each variable
Code example #11
schema = t.StructType() \
        .add("time", t.StringType(), True) \
        .add("open", t.DoubleType(), True) \
        .add("close", t.DoubleType(), True) \
        .add("high", t.DoubleType(), True) \
        .add("low", t.DoubleType(), True) \
        .add("volume", t.DoubleType(), True) \
        .add("input_file", t.StringType(), True)
df = spark.read.format(file_type).options(header="true",inferSchema="true").schema(schema).load(file_location).withColumn("input_file", input_file_name())

# Get and split the file name to create a column with the corresponding coin pair for each row
split_col = split(df['input_file'], '/')
df = df.withColumn('coin_pair', split(split_col.getItem(3),'\.').getItem(0))
df = df.drop("input_file")

# We have a timestamp and we want a date
df = df.withColumn('Date', from_unixtime((col('time')/1000)))

# Aggregate data to have a daily result, ready to insert into the database
df = df.groupBy("coin_pair",window("Date","1 day")) \
          .agg(_sum('volume'), last('close')-first('open'), _min('low'), _max('high') ) \
          .select(col("coin_pair"), \
                  to_date(col("window.start")).alias("date"), \
                  col("sum(volume)").alias("dailyVolume"), \
                  col("(last(close) - first(open))").alias("dailyResult"), \
                  col("min(low)").alias("dailyLower"), \
                  col("max(high)").alias("dailyHigher")) \
          .orderBy("coin_pair","date")

# Write data into the MongoDB database coins and daily_coins_data collection
df.write.format("com.mongodb.spark.sql.DefaultSource").option("uri","mongodb://XXXXXXX/coins.daily_coins_data").mode("append").save()
Code example #12
def get_df_sub_not_read_since(df_dataset_file_rse_ts_size,
                              filtered_rses_id_name_map, min_tb_limit,
                              n_months_filter):
    """Get dataframe of datasets that are not read since N months for sub details htmls

    Group by 'dataset' and 'rse_id' of get_df_dataset_file_rse_ts_size

    Filters:
        - If a dataset contains EVEN a single file with null accessed_at, filter out

    Access time filter logic:
        - If 'last_access_time_of_dataset_in_rse' is older than 'n_months_filter' months, ...
          ... set 'is_not_read_since_{n_months_filter}_months' column as True

    Columns:
        - 'dataset_size_in_rse_tb'
                Total size of a Dataset in an RSE.
                Produced by summing up datasets' all files in that RSE.
        - 'last_access_time_of_dataset_in_rse'
                Last access time of a Dataset in an RSE.
                Produced by getting max `accessed_at`(represents single file's access time) of a dataset in an RSE.
        - '#files_with_null_access_time_of_dataset_in_rse'
                Number of files count, which have NULL `accessed_at` values, of a Dataset in an RSE.
                This is important to know to filter out if there is any NULL `accessed_at` value of a Dataset.
        - '#files_of_dataset_in_rse'
                Number of files count of a Dataset in an RSE
        - '#distinct_files_of_dataset_in_rse'
                Number of unique files count of dataset in an RSE

    df_main_datasets_and_rses: RSE name, dataset and their size and access time calculations
    """
    # New boolean column name to map dataset-rse_id couples are not read at least n_months_filter or not
    bool_column_is_not_read_since_n_months = 'is_not_read_since_{}_months'.format(
        str(n_months_filter))

    # Get reverted dict to get RSE names from id
    reverted_filtered_rses_id_name_map = get_reverted_rses_id_name_map(
        filtered_rses_id_name_map)

    return df_dataset_file_rse_ts_size \
        .groupby(['rse_id', 'dataset']) \
        .agg(_round(_sum(col('f_size')) / TB_DENOMINATOR, 5).alias('dataset_size_in_rse_tb'),
             _max(col('accessed_at')).alias('last_access_time_of_dataset_in_rse'),
             _sum(
                 when(col('accessed_at').isNull(), 0).otherwise(1)
             ).alias('#_accessed_files_of_dataset_in_rse'),
             _count(lit(1)).alias('#_files_of_dataset_in_rse'),
             ) \
        .withColumn(bool_column_is_not_read_since_n_months,
                    when(
                        col('last_access_time_of_dataset_in_rse') < get_n_months_ago_epoch_msec(n_months_filter),
                        True).otherwise(False)
                    ) \
        .filter(col('last_access_time_of_dataset_in_rse').isNotNull()) \
        .filter(col(bool_column_is_not_read_since_n_months)) \
        .filter(col('dataset_size_in_rse_tb') > min_tb_limit) \
        .replace(reverted_filtered_rses_id_name_map, subset=['rse_id']) \
        .withColumnRenamed('rse_id', 'RSE name') \
        .select(['RSE name',
                 'dataset',
                 'dataset_size_in_rse_tb',
                 'last_access_time_of_dataset_in_rse',
                 '#_files_of_dataset_in_rse',
                 '#_accessed_files_of_dataset_in_rse',
                 ]) \
        .cache()
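
get_n_months_ago_epoch_msec is a project helper that is not shown in the snippet; judging by its name and by how accessed_at is compared against it, it returns the epoch timestamp in milliseconds of a point n months in the past. A hypothetical sketch (the real helper may compute calendar months rather than 30-day blocks):

from datetime import datetime, timedelta, timezone

def get_n_months_ago_epoch_msec(n_months_filter):
    """Hypothetical sketch: epoch milliseconds, roughly n months (30 days each) ago."""
    dt = datetime.now(timezone.utc) - timedelta(days=30 * n_months_filter)
    return int(dt.timestamp() * 1000)
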
Code example #13
def main():
    """
    TODO: Create html page

    Access time filter logic:
        - If "last_access_ts" is less than 3 months ago, then set "months_old" as 3,
        - If "last_access_ts" is less than 6 monthsa ago, then set "months_old" as 6,
        - If "last_access_ts" is less than 12 months ago, then set "months_old" as 12

    The result includes only the datasets whose last access time are 12, 6 or 3 months ago.
    """
    spark = get_spark_session()
    (df_contents_f_to_b, df_contents_b_to_d, df_replicas, df_dids_files,
     df_replicas_j_dids, df_files_complete) = prepare_spark_dataframes(spark)

    # ===============================================================================
    # Continue with joins
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    # -------------------------------------------------------------------------------

    # --- STEP-10 / Tests to check dataframes are okay ---:
    #         df_block_file_rse.select("file").distinct().count() =  is 29921156
    #         df_block_file_rse.filter(col("file").isNull()).count() = 0
    #         df_block_file_rse.filter(col("block").isNull()).count() = 57892
    #         Above line means we cannot extract block names of 57892 files from CONTENTS table ..
    #         .. which provides F:D and D:C mapping (file, dataset, container in Rucio terms)
    #         df_block_file_rse.filter(col("rse_id").isNull()).count() = 0
    #         df_block_file_rse.filter(col("fsize").isNull()).count() = 0
    #         We are all good, just drop null block names.

    # STEP-10: Left join df_files_complete and df_contents_f_to_b to get block names of files.
    #   - There are some files that we cannot extract their block names from CONTENTS table
    #   - So filter out them.
    df_block_file_rse = df_files_complete \
        .join(df_contents_f_to_b, ["file"], how="left") \
        .select(['block', 'file', 'rse_id', 'accessed_at', 'fsize', ]) \
        .filter(col("block").isNotNull()) \
        .cache()

    # --- STEP-11 / Tests to check dataframes are okay ---:
    #         df_all.filter(col("dataset").isNull()).count() = 280821

    # STEP-11: Left join df_block_file_rse and df_contents_b_to_d to get dataset names of blocks&files.
    #   - There are some blocks that we cannot extract their dataset names from CONTENTS table.
    #   - So filter out them.
    df_all = df_block_file_rse \
        .join(df_contents_b_to_d, ["block"], how="left") \
        .select(['dataset', 'block', 'file', 'rse_id', 'accessed_at', 'fsize']) \
        .filter(col("dataset").isNotNull()) \
        .cache()

    # STEP-12: Group by "dataset" and "rses" to calculate:
    #       - dataset_size_in_rse: total size of dataset in a RSE by summing up dataset's all files in that RSE.
    #       - `last_access_time_of_dataset_per_rse`: last access time of dataset in a RSE ...
    #           ... by getting max of file `accessed_at` field of dataset's all files in that RSE.
    #       - `#files_null_access_time_per_rse`: number of files which have NULL `accessed_at` field ...
    #           ... in each dataset in a RSE. ...
    #           ... This is important to know to filter out if there is any NULL accessed_at file in calculation.
    #       - `#files_per_rse`: number of files of the dataset in that RSE
    #       - `#files_unique_per_rse`: unique count of dataset files in that RSE
    #       Final result will be like: one dataset can be in multiple RSEs and presumably ...
    #           ... it may have different sizes since a dataset may have lost one of its blocks or files in an RSE.
    df_final_dataset_rse = df_all \
        .groupby(["dataset", "rse_id"]) \
        .agg(_sum(col("fsize")).alias("dataset_size_in_rse"),
             _max(col("accessed_at")).alias("last_access_time_of_dataset_per_rse"),
             _sum(when(col("accessed_at").isNull(), 1).otherwise(0)).alias("#files_null_access_time_per_rse"),
             _count(lit(1)).alias("#files_per_rse"),
             countDistinct(col("file")).alias("#files_unique_per_rse"),
             ) \
        .cache()

    # STEP-13: Get thresholds. They are unix timestamps which are 3, 6 and 12 months ago from today.
    ts_thresholds = get_ts_thresholds()

    # STEP-14:
    #   Filter for calculating last_accessed_at_least_{12|6|3}_months_ago columns.
    #       - To produce correct results, "last_access_time_of_dataset_per_rse" field should not be null
    #           which means a dataset's all files' accessed_at fields are filled.
    #       - And "#files_null_access_time_per_rse"==0 means that there should not be ...
    #           any file with NULL "accessed_at" field.
    # Group by dataset to get final result from all RSEs' datasets.
    #   - max_dataset_size(TB): max size of dataset in all RSEs that contain this dataset
    #   - min_dataset_size(TB): min size of dataset in all RSEs that contain this dataset
    #   - avg_dataset_size(TB): avg size of dataset in all RSEs that contain this dataset
    #   - last_access_time_of_dataset: last access time of dataset in all RSEs
    df = df_final_dataset_rse \
        .filter(col("last_access_time_of_dataset_per_rse").isNotNull() &
                (col("#files_null_access_time_per_rse") == 0)
                ) \
        .groupby(["dataset"]) \
        .agg(_round(_max(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("max_dataset_size(TB)"),
             _round(_min(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("min_dataset_size(TB)"),
             _round(_avg(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("avg_dataset_size(TB)"),
             _sum(col("#files_null_access_time_per_rse")).alias("#files_null_access_time_per_dataset"),
             _max(col("last_access_time_of_dataset_per_rse")).alias("last_access_time_of_dataset"),
             ) \
        .withColumn('last_access_more_than_12_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[12], 1).otherwise(0)
                    ) \
        .withColumn('last_access_more_than_6_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[6], 1).otherwise(0)
                    ) \
        .withColumn('last_access_more_than_3_months_ago',
                    when(col('last_access_time_of_dataset') < ts_thresholds[3], 1).otherwise(0)
                    ) \
        .filter((col('last_access_more_than_12_months_ago') == 1) |
                (col('last_access_more_than_6_months_ago') == 1) |
                (col('last_access_more_than_3_months_ago') == 1)
                ) \
        .cache()

    # STEP-15: Find datasets which have only null accessed_at fields in its files
    df_all_null_accessed_at = df_final_dataset_rse \
        .filter(col("last_access_time_of_dataset_per_rse").isNull()) \
        .groupby(["dataset"]) \
        .agg(_round(_max(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("max_dataset_size(TB)"),
             _round(_min(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("min_dataset_size(TB)"),
             _round(_avg(col("dataset_size_in_rse")) / (10 ** 12), 2).alias("avg_dataset_size(TB)"),
             _sum(col("#files_null_access_time_per_rse")).alias("#files_null_access_time_per_dataset"),
             _max(col("last_access_time_of_dataset_per_rse")).alias("last_access_time_of_dataset"),
             ) \
        .cache()

    # Total for not null data: not read more than 3,6,12 months which is equal to more than 3 months values.
    df.select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()

    # For 12 months
    df.filter(col("last_access_more_than_12_months_ago") == 1).select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df.filter(col("last_access_more_than_12_months_ago") == 1).count())

    # For 6 months
    df.filter(col("last_access_more_than_6_months_ago") == 1).select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df.filter(col("last_access_more_than_6_months_ago") == 1).count())

    # For 3 months
    df.filter(col("last_access_more_than_3_months_ago") == 1).select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df.filter(col("last_access_more_than_3_months_ago") == 1).count())

    # For all null accessed_at(all files) datasets
    df_all_null_accessed_at.select([
        "max_dataset_size(TB)", "min_dataset_size(TB)", "avg_dataset_size(TB)"
    ]).groupBy().sum().show()
    print(df_all_null_accessed_at.count())

    return df, df_all_null_accessed_at
Code example #14
def process_and_get_pd_dfs(spark, start_date, end_date):
    schema = _get_schema()
    raw_df = (
        spark.read.option('basePath', DEFAULT_HDFS_FOLDER).json(
            get_candidate_files(start_date,
                                end_date,
                                spark,
                                base=DEFAULT_HDFS_FOLDER),
            schema=schema,
        ).select('data.*').filter(
            col("RecordTime").between(f"{start_date.timestamp() * 1000}",
                                      f"{end_date.timestamp() * 1000}")).
        filter(
            (col('Site') == 'T3_US_ANL') |  # ANL
            (col('Site') == 'T3_US_NERSC') |  # NERSC
            (col('Site') == 'T3_US_OSG') |  # OSG
            (col('Site') == 'T3_US_PSC') |  # PSC
            (col('Site') == 'T3_US_SDSC') |  # SDSC
            (col('Site') == 'T3_US_TACC') |  # TACC
            ((col('Site').endswith('_ES_PIC_BSC'))
             & (col('MachineAttrCMSSubSiteName0') == 'PIC-BSC')) |  # BSC
            ((col('Site') == 'T1_IT_CNAF')
             & (col('MachineAttrCMSSubSiteName0') == 'CNAF-CINECA'))
            |  # CINECA
            ((col('Site') == 'T1_DE_KIT')
             & (col('MachineAttrCMSSubSiteName0') == 'KIT-HOREKA')) |  # HOREKA
            ((col('Site') == 'T2_DE_RWTH')
             & (col('MachineAttrCMSSubSiteName0') == 'RWTH-HPC'))  # RWTH
        ).filter(col('Status').isin([
            'Running',
            'Completed'
        ])).withColumn('date', from_unixtime(
            (col('RecordTime') /
             1000))).withColumn(
                 'site_name',
                 when(col('Site') == 'T3_US_ANL',
                      lit("ANL")).when(
                          col('Site') == 'T3_US_NERSC', lit("NERSC")).when(
                              col('Site') == 'T3_US_OSG', lit("OSG")).when(
                                  col('Site') == 'T3_US_PSC', lit("PSC")).when(
                                      col('Site') == 'T3_US_SDSC',
                                      lit("SDSC")).when(
                                          col('Site') == 'T3_US_TACC',
                                          lit("TACC")).
                 when(
                     col('Site').endswith('_ES_PIC_BSC'), lit("BSC")).when(
                         col('MachineAttrCMSSubSiteName0') == 'CNAF-CINECA',
                         lit("CINECA")).when(
                             col('MachineAttrCMSSubSiteName0') == 'KIT-HOREKA',
                             lit("HOREKA")).
                 when(
                     col('MachineAttrCMSSubSiteName0') == 'RWTH-HPC',
                     lit("RWTH"))).withColumn(
                         "RequestCpus",
                         when(
                             col("RequestCpus").isNotNull(),
                             col("RequestCpus")).otherwise(lit(1)),
                     ).withColumn('dayofmonth', _dayofmonth(
                         col('date'))).withColumn(
                             'month',
                             concat_ws(
                                 '-', _year(col('date')),
                                 format_string('%02d', _month(
                                     col('date'))))  # 2-digit month, default 1
                         ).drop(
                             'Site',
                             'MachineAttrCMSSubSiteName0').withColumnRenamed(
                                 'site_name', 'site'))

    # There should be only Completed status for a GlobalJobId
    df_core_hr = raw_df.filter(col('Status') == 'Completed') \
        .drop_duplicates(["GlobalJobId"])

    df_core_hr_daily = df_core_hr.groupby(['site', 'month', 'dayofmonth']) \
        .agg(_round(_sum("CoreHr")).alias("sum CoreHr"))

    df_core_hr_monthly = df_core_hr.groupby(['site', 'month']) \
        .agg(_round(_sum("CoreHr")).alias("sum CoreHr"))

    sec_12_min = 60 * 12
    time_window_12m = from_unixtime(
        unix_timestamp('date') - unix_timestamp('date') % sec_12_min)

    # 1st group-by includes GlobaljobId to get running cores of GlobaljobId without duplicates in each 12 minutes window
    # 2nd group-by gets sum of RequestCpus in 12 minutes window
    # 3rd group-by gets avg of RequestCpus(12 minutes window) for each site for each month
    df_running_cores_daily = raw_df \
        .withColumn('12m_window', time_window_12m) \
        .groupby(['site', 'month', 'dayofmonth', '12m_window', 'GlobalJobId']) \
        .agg(_max(col('RequestCpus')).alias('running_cores_of_single_job_in_12m')) \
        .groupby(['site', 'month', 'dayofmonth', '12m_window']) \
        .agg(_sum(col('running_cores_of_single_job_in_12m')).alias('running_cores_12m_sum')) \
        .groupby(['site', 'month', 'dayofmonth']) \
        .agg(_round(_avg(col('running_cores_12m_sum'))).alias('running_cores_avg_over_12m_sum'))
    return df_core_hr_daily.toPandas(), df_running_cores_daily.toPandas(
    ), df_core_hr_monthly.toPandas()
Code example #15
def recommandationAction(request):
    dateCreation = datetime.datetime.strptime(request.POST.get('date'), '%Y-%m-%d')
    recieved = int(model_recieved.predict([[request.POST.get("compSent"), dateCreation.month, request.POST.get("hour"), dateCreation.day, (dateCreation.weekday() + 1)]])[0])
    df_globalStat = spark.read.json("/dataLake/Tisseo/mail_sending_global_stats")
    nearnbSend = (df_globalStat.where("volumesend <= "+str(request.POST.get("compSent"))).agg(_max("volumesend").alias("nbMailSend")).collect())[0][0]
    bestClic = (df_globalStat.where("volumesend = "+str(nearnbSend)).collect())[0][1]
    opens = int(model_opens.predict([[request.POST.get("compSent"), recieved, dateCreation.month, request.POST.get("hour"), dateCreation.day, (int(dateCreation.weekday()) + 1), bestClic]]))
    clics = int(model_clic.predict([[request.POST.get("compSent"), recieved, dateCreation.month, request.POST.get("hour"), dateCreation.day , (int(dateCreation.weekday()) + 1), opens ]]))
    data = {'data_sent_recieved_inqueue' : [request.POST.get("compSent"), recieved, (int(request.POST.get("compSent")) - recieved)], "data_open_clic" : [request.POST.get("compSent"), opens, clics]}
    if request.POST.get("databaseSave") == "1" :
        if request.method == "POST":
            obj = {'user' : request.user,'compName' : request.POST.get('compName'),'sent' : request.POST.get('compSent'),'recieved' : recieved,'opens' : opens,'clics' : clics,'dateRecomendation' : request.POST.get('date') ,'hour' : request.POST.get('hour')}
            recom = recommandationForm(obj)
            recom.save()
            recom_Analyse = recommandation.objects.aggregate(Sum('sent'), Sum('recieved'), Sum('clics'), Sum('opens'))
            data["sent"] = recom_Analyse["sent__sum"]
            data["recieved"] = recom_Analyse["recieved__sum"]
            data["clics"] = recom_Analyse["clics__sum"]
            data["opens"] = recom_Analyse["opens__sum"]
            data["in_queue"] = recom_Analyse["sent__sum"] - recom_Analyse["recieved__sum"]
            data["nbComp"] = recommandation.objects.count()
            data['comp_list'] = render_to_string('ajax_template_dashbord/listCompagnes.html',{'recommandation' : recommandation.objects.all()}, request=request)
            dataPerTimes = recommandation.objects.values('dateRecomendation').order_by('dateRecomendation').annotate(clics=Sum('clics'), opens=Sum('opens'), recieved=Sum('recieved'))
            labels, clicsLst, opensLst, recievedLst = [], [], [], []
            for dataPerTime in dataPerTimes:
                recievedLst.append(dataPerTime["recieved"])
                clicsLst.append(dataPerTime["clics"])
                opensLst.append(dataPerTime["opens"])
                dateRec = dataPerTime["dateRecomendation"].split("-")
                labels.append(str(dateRec[2]+" "+(calendar.month_name[int(dateRec[1])])+" "+dateRec[0]))
            data["recievedLst"] = recievedLst
            data["clicsLst"] = clicsLst
            data["opensLst"] = opensLst
            data["labels"] = labels
    return HttpResponse(json.dumps(data), content_type="application/json")
Code example #16
# Define schema and retrieve data from the blob storage
schema = t.StructType() \
        .add("time", t.StringType(), True) \
        .add("open", t.DoubleType(), True) \
        .add("close", t.DoubleType(), True) \
        .add("high", t.DoubleType(), True) \
        .add("low", t.DoubleType(), True) \
        .add("volume", t.DoubleType(), True) \
        .add("input_file", t.StringType(), True)
df = spark.read.format(file_type).options(
    header="true",
    inferSchema="true").schema(schema).load(file_location).withColumn(
        "input_file", input_file_name())

# Get and split the file name to create a column with the corresponding coin pair for each row
split_col = split(df['input_file'], '/')
df = df.withColumn('coin_pair', split(split_col.getItem(3), '\.').getItem(0))
df = df.drop("input_file")

# We have a timestamp and we want a date
df = df.withColumn('Date', from_unixtime((col('time') / 1000)))

# Aggregate data to have a monthly result, ready to insert into the database
df = df.withColumn("date",to_date(concat_ws("-",year('Date'),month('Date')))) \
          .groupBy("coin_pair","date") \
          .agg(_sum('volume').alias("monthlyVolume"), (last('close')-first('open')).alias("monthlyResult"), _min('low').alias("monthlyLower"), _max('high').alias("monthlyHigher")) \
          .orderBy("coin_pair","date")

# Write data into the MongoDB database coins and monthly_coins_data collection
df.write.format("com.mongodb.spark.sql.DefaultSource").option(
    "uri", "mongodb://XXXXXXX/coins.monthly_coins_data").mode("append").save()
Code example #17
    N = 5
    """ repartition based on 'LNAME' and 'Address' and generate spark_partiion_id
    then run mapPartitions() function and create in-partition idx
    """
    df1 = df.repartition(N, 'LNAME', 'Address') \
            .rdd.mapPartitionsWithIndex(func) \
            .toDF()

    # get number of unique rows (based on Address+LNAME) which is max_idx
    # and then grab the running SUM of this rcnt
    # the new df should be small and just cache it
    w1 = Window.partitionBy().orderBy('partition_id').rowsBetween(
        Window.unboundedPreceding, -1)

    df2 = df1.groupby('partition_id') \
             .agg((_max('idx')).alias('cnt')) \
             .withColumn('rcnt', coalesce(_sum('cnt').over(w1),lit(0))) \
             .cache()
    df2.show()
    #+------------+---+----+
    #|partition_id|cnt|rcnt|
    #+------------+---+----+
    #|           0|  3|   0|
    #|           1|  1|   3|
    #|           2|  1|   4|
    #|           4|  1|   5|
    #+------------+---+----+
    """join df1 with df2 and create id = idx + rcnt"""
    df_new = df1.join(df2,
                      on=['partition_id']).withColumn('id',
                                                      col('idx') + col('rcnt'))
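
func is not shown in the snippet. A hypothetical reconstruction that matches the surrounding comments: mapPartitionsWithIndex passes (partition_id, iterator), and each row is tagged with the partition id plus a 1-based index of its distinct (LNAME, Address) key within that partition, so that idx + rcnt later yields a globally unique id per key:

from pyspark.sql import Row

def func(partition_id, rows):
    seen = {}  # distinct (LNAME, Address) keys seen in this partition -> 1-based idx
    for row in rows:
        key = (row['LNAME'], row['Address'])
        if key not in seen:
            seen[key] = len(seen) + 1
        yield Row(partition_id=partition_id, idx=seen[key], **row.asDict())
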
Code example #18
File: graham_metrics.py Project: satyakig/seng550
    .filter('Earnings < 0')\
    .select('Instrument').distinct().orderBy('Instrument')

positive_earnings = fund_data\
    .groupBy('Instrument', year('Date').alias('actual_year'))\
    .agg(avg('Normalized_Income_Avail_to_Cmn_Shareholders').alias('Earnings'))\
    .orderBy('Instrument', 'actual_year')\
    .filter('actual_year >= 2015')\
    .filter('Earnings > 0')
positive_earnings = positive_earnings.join(negative_earnings, 'Instrument', 'leftanti')

positive_earnings.show()
positive_earnings = positive_earnings.withColumn('metric4', lit(1))\
    .drop('actual_year', 'Earnings')\
    .groupBy('Instrument')\
    .agg(_max('metric4').alias('metric4'))\
    .orderBy('Instrument')
positive_earnings.show()


# Metric 5: Dividend payment for past 5 years
no_dividend = fund_data\
    .groupBy('Instrument', year('Date').alias('actual_year'))\
    .agg((_sum('Gross_Dividends___Common_Stock') / avg('Total_Common_Shares_Outstanding')).alias('Dividend')) \
    .filter('actual_year >= 2015')\
    .filter('Dividend == 0')\
    .select('Instrument').distinct().orderBy('Instrument')

dividend = fund_data\
    .groupBy('Instrument', year('Date').alias('actual_year'))\
    .agg((_sum('Gross_Dividends___Common_Stock') / avg('Total_Common_Shares_Outstanding')).alias('Dividend')) \
Code example #19
def create_main_df(spark, hdfs_paths, base_eos_dir):
    # UTC timestamp of start hour of spark job
    ts_current_hour = int(datetime.utcnow().replace(
        minute=0, second=0, microsecond=0, tzinfo=timezone.utc).timestamp() *
                          1000)
    # -----------------------------------------------------------------------------------------------------------------
    #                -- ==================  Prepare main Spark dataframes  ===========================

    # Get RSES id, name, type, tier, country, kind from RSES table dump
    df_rses = spark.read.format("com.databricks.spark.avro").load(hdfs_paths['RSES']) \
        .filter(col('DELETED_AT').isNull()) \
        .withColumn('replica_rse_id', lower(_hex(col('ID')))) \
        .withColumnRenamed('RSE', 'rse') \
        .withColumnRenamed('RSE_TYPE', 'rse_type') \
        .withColumn('rse_tier', _split(col('rse'), '_').getItem(0)) \
        .withColumn('rse_country', _split(col('rse'), '_').getItem(1)) \
        .withColumn('rse_kind',
                    when(col("rse").endswith('Temp'), 'temp')
                    .when(col("rse").endswith('Test'), 'test')
                    .otherwise('prod')
                    ) \
        .select(['replica_rse_id', 'rse', 'rse_type', 'rse_tier', 'rse_country', 'rse_kind'])

    # Rucio Dataset(D) refers to dbs block, so we used DBS terminology from the beginning
    df_contents_f_to_b = spark.read.format("com.databricks.spark.avro").load(hdfs_paths['CONTENTS']) \
        .filter(col("SCOPE") == "cms") \
        .filter(col("DID_TYPE") == "D") \
        .filter(col("CHILD_TYPE") == "F") \
        .withColumnRenamed("NAME", "block") \
        .withColumnRenamed("CHILD_NAME", "file") \
        .select(["block", "file"])

    # Rucio Dataset(D) refers to dbs block; Rucio Container(C) refers to dbs dataset.
    # We used DBS terminology from the beginning
    df_contents_b_to_d = spark.read.format("com.databricks.spark.avro").load(hdfs_paths['CONTENTS']) \
        .filter(col("SCOPE") == "cms") \
        .filter(col("DID_TYPE") == "C") \
        .filter(col("CHILD_TYPE") == "D") \
        .withColumnRenamed("NAME", "dataset") \
        .withColumnRenamed("CHILD_NAME", "block") \
        .select(["dataset", "block"])

    # Get file to dataset map
    df_contents_ds_files = df_contents_f_to_b.join(df_contents_b_to_d, ["block"], how="left") \
        .filter(col('file').isNotNull()) \
        .filter(col('dataset').isNotNull()) \
        .withColumnRenamed('dataset', 'contents_dataset') \
        .withColumn('is_d_name_from_rucio', lit(BOOL_STR[True])) \
        .select(["contents_dataset", "file", "is_d_name_from_rucio"])

    dbs_files = spark.read.format('avro').load(hdfs_paths['FILES']) \
        .withColumnRenamed('LOGICAL_FILE_NAME', 'file') \
        .withColumnRenamed('DATASET_ID', 'dbs_file_ds_id') \
        .withColumnRenamed('FILE_SIZE', 'dbs_file_size') \
        .select(['file', 'dbs_file_ds_id', 'dbs_file_size'])

    dbs_datasets = spark.read.format('avro').load(hdfs_paths['DATASETS'])

    df_dbs_ds_files = dbs_files.join(dbs_datasets.select(['DATASET_ID', 'DATASET']),
                                     dbs_files.dbs_file_ds_id == dbs_datasets.DATASET_ID, how='left') \
        .filter(col('file').isNotNull()) \
        .filter(col('DATASET').isNotNull()) \
        .withColumnRenamed('dbs_file_ds_id', 'dbs_dataset_id') \
        .withColumnRenamed('DATASET', 'dbs_dataset') \
        .withColumn('is_d_name_from_dbs', lit(BOOL_STR[True])) \
        .select(['file', 'dbs_dataset', 'is_d_name_from_dbs'])

    # Prepare replicas
    df_replicas = spark.read.format('avro').load(hdfs_paths['REPLICAS']) \
        .filter(col("SCOPE") == "cms") \
        .withColumn('replica_rse_id', lower(_hex(col('RSE_ID')))) \
        .withColumn('replica_file_size', col('BYTES').cast(LongType())) \
        .withColumnRenamed('NAME', 'file') \
        .withColumnRenamed('ACCESSED_AT', 'replica_accessed_at') \
        .withColumnRenamed('CREATED_AT', 'replica_created_at') \
        .withColumnRenamed('LOCK_CNT', 'lock_cnt') \
        .withColumnRenamed('STATE', 'state') \
        .select(['file', 'replica_rse_id', 'replica_file_size',
                 'replica_accessed_at', 'replica_created_at', 'lock_cnt'])

    # Create enriched file df which adds dbs file size to replicas files. Left join keeps only replicas' files
    df_files_enriched_with_dbs = df_replicas \
        .join(dbs_files.select(['file', 'dbs_file_size']), ['file'], how='left') \
        .withColumn('joint_file_size',
                    when(col('replica_file_size').isNotNull(), col('replica_file_size'))
                    .when(col('dbs_file_size').isNotNull(), col('dbs_file_size'))
                    ) \
        .select(['file', 'replica_rse_id', 'replica_accessed_at', 'replica_created_at', 'lock_cnt',
                 'replica_file_size', 'dbs_file_size', 'joint_file_size'])

    # -----------------------------------------------------------------------------------------------------------------
    #            -- ==================  only Rucio: Replicas and Contents  ======================= --

    df_only_from_rucio = df_replicas \
        .join(df_contents_ds_files, ['file'], how='left') \
        .select(['contents_dataset', 'file', 'replica_rse_id', 'replica_file_size',
                 'replica_accessed_at', 'replica_created_at', 'is_d_name_from_rucio', 'lock_cnt'])

    # Use them in outer join
    # _max(col('replica_accessed_at')).alias('rucio_last_accessed_at'),
    # _max(col('replica_created_at')).alias('rucio_last_created_at'),

    df_only_from_rucio = df_only_from_rucio \
        .groupby(['replica_rse_id', 'contents_dataset']) \
        .agg(_sum(col('replica_file_size')).alias('rucio_size'),
             _count(lit(1)).alias('rucio_n_files'),
             _sum(
                 when(col('replica_accessed_at').isNull(), 0)
                     .otherwise(1)
             ).alias('rucio_n_accessed_files'),
             _first(col("is_d_name_from_rucio")).alias("is_d_name_from_rucio"),
             _sum(col('lock_cnt')).alias('rucio_locked_files')
             ) \
        .withColumn('rucio_is_d_locked',
                    when(col('rucio_locked_files') > 0, IS_DATASET_LOCKED[True])
                    .otherwise(IS_DATASET_LOCKED[False])
                    ) \
        .select(['contents_dataset', 'replica_rse_id', 'rucio_size', 'rucio_n_files', 'rucio_n_accessed_files',
                 'is_d_name_from_rucio', 'rucio_locked_files', 'rucio_is_d_locked', ])

    # -----------------------------------------------------------------------------------------------------------------
    #             -- =================  only DBS: Replicas, Files, Datasets  ====================== --

    # Of course only files from Replicas are processed; select only DBS-related fields
    df_only_from_dbs = df_files_enriched_with_dbs \
        .select(['file', 'replica_rse_id', 'dbs_file_size', 'replica_accessed_at', 'lock_cnt']) \
        .join(df_dbs_ds_files, ['file'], how='left') \
        .filter(col('dbs_dataset').isNotNull()) \
        .select(['file', 'dbs_dataset', 'replica_rse_id', 'dbs_file_size', 'replica_accessed_at',
                 'is_d_name_from_dbs', 'lock_cnt'])

    df_only_from_dbs = df_only_from_dbs \
        .groupby(['replica_rse_id', 'dbs_dataset']) \
        .agg(_sum(col('dbs_file_size')).alias('dbs_size'),
             _count(lit(1)).alias('dbs_n_files'),
             _sum(
                 when(col('replica_accessed_at').isNull(), 0)
                     .otherwise(1)
             ).alias('dbs_n_accessed_files'),
             _first(col("is_d_name_from_dbs")).alias("is_d_name_from_dbs"),
             _sum(col('lock_cnt')).alias('dbs_locked_files')
             ) \
        .withColumn('dbs_is_d_locked',
                    when(col('dbs_locked_files') > 0, IS_DATASET_LOCKED[True])
                    .otherwise(IS_DATASET_LOCKED[False])
                    ) \
        .select(['dbs_dataset', 'replica_rse_id', 'dbs_size', 'dbs_n_files', 'dbs_n_accessed_files',
                 'is_d_name_from_dbs', 'dbs_locked_files', 'dbs_is_d_locked'])

    # Full outer join of Rucio and DBS to get all dataset-file maps
    df_dataset_file_map_enr = df_contents_ds_files.join(df_dbs_ds_files,
                                                        ['file'],
                                                        how='full')

    # -----------------------------------------------------------------------------------------------------------------
    #               -- ======  check files do not have dataset name  ============ --

    # Check Replicas files do not have dataset name in Contents, DBS or both
    x = df_replicas.join(df_dataset_file_map_enr, ['file'], how='left') \
        .select(['contents_dataset', 'dbs_dataset', 'file'])

    y_contents = x.filter(col('contents_dataset').isNull())
    z_dbs = x.filter(col('dbs_dataset').isNull())
    t_both = x.filter(
        col('contents_dataset').isNull() & col('dbs_dataset').isNull())
    stats_dict = {
        "Replicas files do not have dataset name in Contents":
        y_contents.select('file').distinct().count(),
        "Replicas files do not have dataset name in DBS":
        z_dbs.select('file').distinct().count(),
        "Replicas files do not have dataset name neither in Contents nor DBS":
        t_both.select('file').distinct().count()
    }
    write_stats_to_eos(base_eos_dir, stats_dict)
    del x, y_contents, z_dbs, t_both

    # -----------------------------------------------------------------------------------------------------------------
    #              -- ======  joint Rucio and DBS: Replicas, Contents, Files, Datasets  ============ --

    # Main aim is to get all datasets of files
    df_dataset_file_map_enr = df_dataset_file_map_enr \
        .withColumn("dataset",
                    when(col("contents_dataset").isNotNull(), col("contents_dataset"))
                    .when(col("dbs_dataset").isNotNull(), col("dbs_dataset"))
                    ) \
        .withColumn("is_ds_from_rucio", when(col("is_d_name_from_rucio").isNotNull(), 1).otherwise(0)) \
        .withColumn("is_ds_from_dbs", when(col("is_d_name_from_dbs").isNotNull(), 1).otherwise(0)) \
        .select(['dataset', 'file', 'is_ds_from_dbs', 'is_ds_from_rucio'])

    df_joint_ds_files = df_files_enriched_with_dbs \
        .select(['file', 'replica_rse_id', 'replica_accessed_at', 'replica_created_at',
                 'joint_file_size', 'lock_cnt']) \
        .join(df_dataset_file_map_enr, ['file'], how='left') \
        .filter(col('dataset').isNotNull()) \
        .select(['dataset', 'file', 'is_ds_from_dbs', 'is_ds_from_rucio',
                 'replica_rse_id', 'replica_accessed_at', 'replica_created_at', 'joint_file_size', 'lock_cnt'])

    df_joint_main = df_joint_ds_files \
        .groupby(['replica_rse_id', 'dataset']) \
        .agg(_sum(col('joint_file_size')).alias('joint_size'),
             _max(col('replica_accessed_at')).alias('joint_last_accessed_at'),
             _max(col('replica_created_at')).alias('joint_last_created_at'),
             _sum(col('is_ds_from_dbs')).alias('joint_dbs_n_files'),
             _sum(col('is_ds_from_rucio')).alias('joint_rucio_n_files'),
             _count(lit(1)).alias('joint_n_files'),
             _sum(
                 when(col('replica_accessed_at').isNull(), 0).otherwise(1)
             ).alias('joint_n_accessed_files'),
             _sum(col('lock_cnt')).alias('joint_locked_files')
             ) \
        .withColumn('all_f_in_dbs',
                    when((col('joint_dbs_n_files') == 0) & (col('joint_dbs_n_files').isNull()),
                         IS_ALL_DATASET_FILES_EXISTS['n'])
                    .when(col('joint_dbs_n_files') == col('joint_n_files'), IS_ALL_DATASET_FILES_EXISTS['a'])
                    .when(col('joint_dbs_n_files') > 0, IS_ALL_DATASET_FILES_EXISTS['p'])
                    ) \
        .withColumn('all_f_in_rucio',
                    when((col('joint_rucio_n_files') == 0) & (col('joint_rucio_n_files').isNull()),
                         IS_ALL_DATASET_FILES_EXISTS['n'])
                    .when(col('joint_rucio_n_files') == col('joint_n_files'), IS_ALL_DATASET_FILES_EXISTS['a'])
                    .when(col('joint_rucio_n_files') > 0, IS_ALL_DATASET_FILES_EXISTS['p'])
                    ) \
        .withColumn('joint_is_d_locked',
                    when(col('joint_locked_files') > 0, IS_DATASET_LOCKED[True])
                    .otherwise(IS_DATASET_LOCKED[False])
                    ) \
        .withColumnRenamed("replica_rse_id", "rse_id") \
        .select(['dataset',
                 'rse_id',
                 'joint_size',
                 'joint_last_accessed_at',
                 'joint_last_created_at',
                 'joint_dbs_n_files',
                 'joint_rucio_n_files',
                 'joint_n_files',
                 'joint_n_accessed_files',
                 'all_f_in_dbs',
                 'all_f_in_rucio',
                 'joint_locked_files',
                 'joint_is_d_locked'
                 ])
    # -----------------------------------------------------------------------------------------------------------------
    #          -- ============  Dataset enrichment with Dataset tags  ============ --

    # Enrich dbs dataset with names from id properties of other tables
    dbs_data_tiers = spark.read.format('avro').load(hdfs_paths['DATA_TIERS'])
    dbs_physics_group = spark.read.format('avro').load(
        hdfs_paths['PHYSICS_GROUPS'])
    dbs_acquisition_era = spark.read.format('avro').load(
        hdfs_paths['ACQUISITION_ERAS'])
    dbs_dataset_access_type = spark.read.format('avro').load(
        hdfs_paths['DATASET_ACCESS_TYPES'])

    dbs_datasets_enr = dbs_datasets \
        .join(dbs_data_tiers, ['data_tier_id'], how='left') \
        .join(dbs_physics_group, ['physics_group_id'], how='left') \
        .join(dbs_acquisition_era, ['acquisition_era_id'], how='left') \
        .join(dbs_dataset_access_type, ['dataset_access_type_id'], how='left') \
        .select(['dataset', 'dataset_id', 'is_dataset_valid', 'primary_ds_id', 'processed_ds_id', 'prep_id',
                 'data_tier_id', 'data_tier_name',
                 'physics_group_id', 'physics_group_name',
                 'acquisition_era_id', 'acquisition_era_name',
                 'dataset_access_type_id', 'dataset_access_type'])

    # -----------------------------------------------------------------------------------------------------------------
    #                       -- ============  Main: join all  ============ --

    cond_with_only_rucio = [
        df_joint_main.dataset == df_only_from_rucio.contents_dataset,
        df_joint_main.rse_id == df_only_from_rucio.replica_rse_id
    ]

    cond_with_only_dbs = [
        df_joint_main.dataset == df_only_from_dbs.dbs_dataset,
        df_joint_main.rse_id == df_only_from_dbs.replica_rse_id
    ]

    # Left joins: since df_join_main has outer join, should have all datasets of both Rucio and DBS
    df_main = df_joint_main.join(df_only_from_rucio,
                                 cond_with_only_rucio,
                                 how='left').drop('replica_rse_id')
    df_main = df_main.join(df_only_from_dbs, cond_with_only_dbs,
                           how='left').drop('replica_rse_id')

    df_main = df_main \
        .withColumn('rucio_has_ds_name',
                    when(col('is_d_name_from_rucio').isNotNull(), col('is_d_name_from_rucio'))
                    .otherwise(BOOL_STR[False])) \
        .withColumn('dbs_has_ds_name',
                    when(col('is_d_name_from_dbs').isNotNull(), col('is_d_name_from_dbs'))
                    .otherwise(BOOL_STR[False]))

    # Remove unneeded columns by selecting specific ones
    df_main = df_main.select([
        'dataset', 'rse_id', 'joint_size', 'joint_last_accessed_at',
        'joint_last_created_at', 'joint_dbs_n_files', 'joint_rucio_n_files',
        'joint_n_files', 'joint_n_accessed_files', 'all_f_in_dbs',
        'all_f_in_rucio', 'rucio_size', 'rucio_n_files',
        'rucio_n_accessed_files', 'rucio_has_ds_name', 'dbs_size',
        'dbs_n_files', 'dbs_n_accessed_files', 'dbs_has_ds_name',
        'rucio_locked_files', 'rucio_is_d_locked', 'dbs_locked_files',
        'dbs_is_d_locked', 'joint_locked_files', 'joint_is_d_locked'
    ])

    # Add DBS dataset enrichments to main df
    df_main = df_main.join(dbs_datasets_enr, ['dataset'], how='left')

    # Add RSES name, type, tier, country, kind to dataset
    df_main = df_main \
        .join(df_rses, df_main.rse_id == df_rses.replica_rse_id, how='left') \
        .drop('rse_id', 'replica_rse_id')

    # UTC timestamp of start hour of the spark job
    df_main = df_main.withColumn('tstamp_hour', lit(ts_current_hour))

    # Fill null values of string type columns. Null values are hard to handle in ES queries.
    df_main = df_main.fillna(value=NULL_STR_TYPE_COLUMN_VALUE,
                             subset=STR_TYPE_COLUMNS)
    return df_main
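
Several module-level constants used above (BOOL_STR, IS_DATASET_LOCKED, IS_ALL_DATASET_FILES_EXISTS, NULL_STR_TYPE_COLUMN_VALUE, STR_TYPE_COLUMNS) come from the surrounding project and are not part of the snippet. A hypothetical shape, inferred only from how they are indexed here:

# Hypothetical placeholders; the real values live in the original project.
BOOL_STR = {True: 'True', False: 'False'}
IS_DATASET_LOCKED = {True: 'LOCKED', False: 'UNLOCKED'}
IS_ALL_DATASET_FILES_EXISTS = {'a': 'ALL', 'p': 'PARTIAL', 'n': 'NONE'}
NULL_STR_TYPE_COLUMN_VALUE = 'UNKNOWN'
STR_TYPE_COLUMNS = ['all_f_in_dbs', 'all_f_in_rucio', 'rucio_has_ds_name', 'dbs_has_ds_name']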