Example #1
def run_eos(date, fout, ctx, sql_context, verbose=False):

    if verbose:
        print('Starting EOS part')

    # Create fout by adding stream name and date paths
    fout = fout + "/EOS/" + short_date_string(date)

    # Convert date
    date = short_date_string(date)

    # Create EOS tables in sql_context
    eos_tables(sql_context, date=date, verbose=verbose)

    if verbose:
        print('Will build query for EOS and DBS tables')

    # Columns we want in the output (+ = provided by this query):
    # - file name       +
    # - file size       +
    # - primds          +
    # - procds          +
    # - tier            +
    # - site name
    # - file replicas
    # - user dn         +
    # - start/end time  +
    # - read bytes
    # - cpu/wc values
    # - source: eos     +

    # EOS columns
    # Same timestamp is used in both start and end times
    eos_cols = ['file_lfn AS file_name',
                'user_dn',
                '"eos" AS source',
                'timestamp AS start_time',
                'timestamp AS end_time']

    # DBS columns
    ddf_cols = ['d_dataset']
    fdf_cols = ['f_file_size AS file_size']

    # Concatenate the column-name lists (i.e. use all column names from the lists above)
    cols = eos_cols + ddf_cols + fdf_cols

    # Build a query with "cols" columns. Join DDF, FDF and EOS tables
    query = ("SELECT %s FROM ddf "
             "JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id "
             "JOIN eos_df ON fdf.f_logical_file_name = eos_df.file_lfn") % ','.join(cols)

    result = run_query(query, sql_context, verbose)

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset(result, 'd_dataset')

    output_dataframe(fout, result, verbose)

    if verbose:
        print('Finished EOS part')
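The split_dataset helper used in this example comes from the surrounding package and is not shown here. A minimal sketch of what it could look like, assuming DBS dataset names follow the /primds/procds/tier convention (the real helper may differ):

from pyspark.sql.functions import split, col

def split_dataset(df, dcol):
    # Sketch only: split a /primds/procds/tier dataset name into three columns
    # and drop the original column.
    parts = split(col(dcol), '/')
    return df.withColumn('primds', parts.getItem(1))\
             .withColumn('procds', parts.getItem(2))\
             .withColumn('tier', parts.getItem(3))\
             .drop(dcol)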
Example #2
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run the PySpark job. It requires a schema file, an HDFS directory
    with data and an optional script with mapper/reducer functions.
    """
    # define the Spark context, the main object used to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    ddf = tables['ddf']  # dataset table
    fdf = tables['fdf']  # file table

    if verbose:
        for row in ddf.head(1):
            print("### ddf row", row)

    # read CMSSW avro rdd
    date = eos_date(date)
    tables.update(eos_tables(sqlContext, date=date, verbose=verbose))
    eos_df = tables['eos_df']  # EOS table

    if verbose:
        for row in eos_df.head(1):
            print("### eos_df row", row)

    # merge DBS and CMSSW data
    cols = ['d_dataset', 'd_dataset_id', 'f_logical_file_name', 'file_lfn']
    stmt = ('SELECT %s FROM ddf '
            'JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id '
            'JOIN eos_df ON fdf.f_logical_file_name = eos_df.file_lfn') % ','.join(cols)
    joins = sqlContext.sql(stmt)
    print_rows(joins, stmt, verbose)

    # perform aggregation
    fjoin = joins.groupBy(['d_dataset'])\
            .agg({'file_lfn':'count'})\
            .withColumnRenamed('count(file_lfn)', 'count')\
            .withColumnRenamed('d_dataset', 'dataset')\
            .withColumn('date', lit(eos_date_unix(date)))\
            .withColumn('count_type', lit('eos'))

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write results back to HDFS; the fout parameter defines the output area on HDFS,
    # either an absolute path or an area under /user/USERNAME
    if fout:
        ndf = split_dataset(fjoin, 'dataset')
        ndf.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
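The eos_date and eos_date_unix helpers are imported from the surrounding package. A minimal sketch of their behavior, assuming dates arrive as YYYYMMDD strings (an assumption; the real helpers may accept other formats):

import time
import calendar

def eos_date(date):
    # Assumed: turn 'YYYYMMDD' into the 'YYYY/MM/DD' layout used by the EOS HDFS area.
    return '%s/%s/%s' % (date[:4], date[4:6], date[6:8])

def eos_date_unix(date):
    # Assumed: turn the 'YYYY/MM/DD' string into a UNIX timestamp at UTC midnight.
    return calendar.timegm(time.strptime(date, '%Y/%m/%d'))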
Example #3
def generate_parquet(
    date,
    # hdir="hdfs:///project/monitoring/archive/eos/logs/reports/cms", #before 2020
    hdir="hdfs:///project/monitoring/archive/eos-report/logs/cms",  # after 2020
    parquetLocation=DEFAULT_PARQUET_LOCATION,
    spark=None,
    mode="append",
    verbose=False,
):
    """
    Creates or appends to the given parquet file.
    Args:
        date: date string in the format yyyy/MM/dd or a date glob expression,
              e.g. '2019/[0-1][0-9]/[0-3][0-9]', '2019/02/18'
        hdir: raw EOS dataset location in HDFS
        parquetLocation: location of the parquet dataset in HDFS
        spark: the spark session object.
        mode: write mode; it can be 'append' to add records to the existing dataset (in new partitions),
              'overwrite' to replace the current parquet file,
              or 'ErrorIfExists' to fail if the parquet file already exists.
        verbose: True if you want to see additional output, False otherwise.
                 The verbose mode increases the execution time.
    """
    if spark is None:
        spark = get_spark_session(True, False)
    # spark.conf.set('spark.sql.session.timeZone', 'UTC')
    tables = eos_tables(spark, date=date, verbose=verbose, hdir=hdir)
    df = tables["eos_df"]
    # Repartitioning by day could reduce the number of files written and improve query time,
    # but it would make this process slower.
    write_df = df.write.partitionBy("day").mode(mode)
    if mode == "overwrite":
        # Restrict the overwrite to the affected partitions only;
        # otherwise Spark would delete the existing dataset and create a new one.
        spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
        write_df = write_df.option("partitionOverwriteMode", "dynamic")

    write_df.parquet(parquetLocation)
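A possible way to drive generate_parquet for a whole month and inspect the result; get_spark_session and DEFAULT_PARQUET_LOCATION are taken from the example above, while the date glob is only an illustration:

spark = get_spark_session(True, False)

# Append all of February 2019 to the parquet dataset, one day per partition.
generate_parquet('2019/02/[0-3][0-9]', spark=spark, mode='append')

# Read the result back; the dataset is partitioned by "day".
df = spark.read.parquet(DEFAULT_PARQUET_LOCATION)
df.select('day').distinct().show()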
Example #4
def run_agg_eos(date, ctx, sql_context, verbose=False):
    """
    Runs aggregation for the EOS stream for a certain date.
    The function produces a dataframe that contains site name, dataset name, number of accesses, distinct users and stream.
    Site name is taken from the f_b_s_df table, which is joined on file name.
    The result dataframe is sorted by nacc.
    """
    print('Starting EOS part')

    # Make a UNIX timestamp from date
    unix_date = short_date_to_unix(short_date_string(date))

    # Convert date
    date = short_date_string(date)

    # Create EOS tables in sql_context
    eos_df = eos_tables(sql_context, date=date, verbose=verbose)

    if verbose:
        print('Found %s records in EOS stream' % eos_df['eos_df'].count())

    # Columns we want in the output (+ = provided, -1 = placeholder value):
    # - site name                +
    # - dataset name             +
    # - app                      +
    # - uid                      +
    # - dn                       +
    # - number of access (nacc)  +
    # - distinct users           +
    # - stream: eos              +
    # - timestamp                +
    # - site tier                +
    # - cpu time                -1

    cols = [
        'site_name', 'dataset_name', 'count(dataset_name) AS nacc',
        'count(distinct(eos_df.user_dn)) AS distinct_users',
        '\"eos\" as stream',
        '%s AS timestamp' % unix_date,
        'first(tier_from_site_name(site_name)) AS site_tier', '-1 AS cpu_time'
    ]

    # The original single-step query (kept below for reference) used the "cols" list above
    # and grouped directly by site and dataset:
    #    query = ("SELECT %s FROM eos_df " \
    #             "JOIN f_b_s_df ON f_b_s_df.file_name = eos_df.file_lfn " \
    #             "GROUP BY site_name, dataset_name") \
    #             % ','.join(cols)
    cols = [
        'site_name', 'dataset_name', 'parse_app(eos_df.application) AS app',
        'dn2uuid(eos_df.user_dn) AS uid', 'parse_dn(eos_df.user_dn) AS dn',
        '\"eos\" as stream',
        '%s AS timestamp' % unix_date, '-1 AS cpu'
    ]
    query = "SELECT %s FROM eos_df " \
             "JOIN f_b_s_df ON f_b_s_df.file_name = eos_df.file_lfn " \
             % ','.join(cols)
    cols = [
        'dn', 'dataset_name', 'site_name', 'app', 'first(uid) as uid',
        'first(stream) as stream', 'first(timestamp) as timestamp',
        'count(dataset_name) AS nacc', 'count(dn) AS distinct_users',
        'first(tier_from_site_name(site_name)) AS site_tier', '-1 AS cpu_time',
        '-1 AS wc_time'
    ]
    query = "SELECT %s FROM (%s) QUERY1 GROUP BY dn, dataset_name, site_name, app" \
            % (','.join(cols), query)

    result = run_query(query, sql_context, verbose)

    # result = result.sort(desc("nacc"))

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset_col(result, 'dataset_name')

    if verbose:
        print('Finished EOS part (output is %s records)' % result.count())
    else:
        print('Finished EOS part')

    return result
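The SQL above relies on tier_from_site_name, parse_app, dn2uuid and parse_dn being registered as UDFs on sql_context beforehand. A hedged sketch of such a registration step; the lambda bodies are placeholders, not the real implementations:

import hashlib
from pyspark.sql.types import StringType

def register_udfs(sql_context):
    # Placeholder bodies; the real helpers live elsewhere in the package.
    sql_context.registerFunction('tier_from_site_name',
                                 lambda site: (site or '').split('_')[0], StringType())
    sql_context.registerFunction('parse_app', lambda app: app or 'unknown', StringType())
    sql_context.registerFunction('dn2uuid',
                                 lambda dn: hashlib.md5((dn or '').encode()).hexdigest(), StringType())
    sql_context.registerFunction('parse_dn', lambda dn: dn, StringType())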