Code example #1
File: data_aggregation.py Project: MircoT/CMSSpark
def run_agg_jm(date, ctx, sql_context, verbose=False):
    """
    Runs aggregation for JobMonitoring stream for a certain date.
    Function produces a dataframe that contains site name, dataset name, number of access, distinct users and stream.
    Result dataframe is sorted by nacc.
    """
    print('Starting JobMonitoring part')

    # Make a UNIX timestamp from date
    unix_date = short_date_to_unix(short_date_string(date))

    # Convert date
    date = long_date_string(date)

    # Create JobMonitoring tables in sql_context
    jm_df = jm_tables(ctx, sql_context, date=date, verbose=verbose)

    if verbose:
        print('Found %s records in JobMonitoring stream' %
              jm_df['jm_df'].count())

    # - site name                +
    # - dataset name             +
    # - app                      +
    # - uid                      +
    # - dn                       +
    # - number of access (nacc)  +
    # - distinct users           +
    # - stream: crab             +
    # - timestamp                +
    # - site tier                +
    # - cpu time                 +

    # Legacy single-step aggregation, kept commented out for reference:
    #    cols = [
    #        'SiteName AS site_name', 'dataset_name', 'count(dataset_name) AS nacc',
    #        'count(distinct(UserId)) AS distinct_users', '"crab" AS stream',
    #        '%s AS timestamp' % unix_date,
    #        'first(tier_from_site_name(SiteName)) AS site_tier',
    #        'SUM(WrapCPU) AS cpu_time'
    #    ]
    #    query = ("SELECT %s FROM jm_df "
    #             "JOIN f_b_s_df ON f_b_s_df.file_name = jm_df.FileName "
    #             "GROUP BY jm_df.SiteName, dataset_name") \
    #             % ','.join(cols)

    # Inner query: select per-record columns and join JobMonitoring records
    # with file/block/dataset information by file name
    cols = [
        'SiteName AS site_name', 'dataset_name',
        'stream4app(jm_df.SubmissionTool) AS app', 'dn2uuid(GridName) AS uid',
        'parse_dn(GridName) AS dn', '"crab" AS stream',
        '%s AS timestamp' % unix_date, 'WrapCPU AS cpu', 'WrapWC AS wc'
    ]
    query = "SELECT %s FROM jm_df " \
            "JOIN f_b_s_df ON f_b_s_df.file_name = jm_df.FileName " \
            % ','.join(cols)

    # Outer query: aggregate per (dn, dataset, site, app) group
    cols = [
        'dn', 'dataset_name', 'site_name', 'app', 'first(uid) AS uid',
        'first(stream) AS stream', 'first(timestamp) AS timestamp',
        'count(dataset_name) AS nacc', 'count(dn) AS distinct_users',
        'first(tier_from_site_name(site_name)) AS site_tier',
        'SUM(cpu) AS cpu_time', 'SUM(wc) AS wc_time'
    ]
    query = "SELECT %s FROM (%s) QUERY1 GROUP BY dn, dataset_name, site_name, app" \
            % (','.join(cols), query)

    result = run_query(query, sql_context, verbose)

    # result = result.sort(desc("nacc"))

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset_col(result, 'dataset_name')

    if verbose:
        print('Finished JobMonitoring part (output is %s records)' %
              result.count())
    else:
        print('Finished JobMonitoring part')

    return result
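
A minimal usage sketch for run_agg_jm (not part of the original file: the context setup, the registration step and the 'YYYYMMDD' date format are assumptions; the f_b_s_df join table and the UDFs used above must be available in sql_context before the call):

# Sketch only: setup names below are assumptions, not code from
# data_aggregation.py itself.
from pyspark import SparkContext
from pyspark.sql import SQLContext

ctx = SparkContext(appName='cms_data_aggregation')  # hypothetical app name
sql_context = SQLContext(ctx)
# ... register f_b_s_df and the UDFs (tier_from_site_name, stream4app,
#     dn2uuid, parse_dn) in sql_context before calling the aggregator ...
jm_result = run_agg_jm('20170228', ctx, sql_context, verbose=True)  # assumed date format
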
Code example #2
File: data_aggregation.py Project: MircoT/CMSSpark
def run_agg_aaa(date,
                ctx,
                sql_context,
                hdir='hdfs:///project/monitoring/archive/xrootd/enr/gled',
                verbose=False):
    """
    Runs aggregation for JobMonitoring stream for a certain date.
    Function produces a dataframe that contains site name, dataset name, number of access, distinct users and stream.
    Data is taken from /project/monitoring/archive/xrootd/enr/gled and not the default location (raw instead of enr)
    because enr records have src_experiment_site. src_experiment_site is used as site_name.
    Result dataframe is sorted by nacc.
    """
    print('Starting AAA part')

    # Make a UNIX timestamp from date
    unix_date = short_date_to_unix(short_date_string(date))

    # Convert date
    date = short_date_string(date)

    # Create AAA tables in sql_context
    aaa_df = aaa_tables_enr(sql_context, hdir=hdir, date=date, verbose=verbose)

    if verbose:
        print('Found %s records in AAA stream' % aaa_df['aaa_df'].count())

    # - site name                +
    # - dataset name             +
    # - app                      +
    # - uid                      +
    # - dn                       +
    # - number of access (nacc)  +
    # - distinct users           +
    # - stream: aaa              +
    # - timestamp                +
    # - site tier                +
    # - cpu time                -1

    # Legacy single-step aggregation, kept commented out for reference:
    #    cols = [
    #        'src_experiment_site AS site_name', 'dataset_name',
    #        'count(dataset_name) AS nacc',
    #        'count(distinct(aaa_df.user_dn)) AS distinct_users',
    #        '"aaa" AS stream',
    #        '%s AS timestamp' % unix_date,
    #        'first(tier_from_site_name(src_experiment_site)) AS site_tier',
    #        '-1 AS cpu_time'
    #    ]
    #    query = ("SELECT %s FROM aaa_df "
    #             "JOIN f_b_s_df ON f_b_s_df.file_name = aaa_df.file_lfn "
    #             "GROUP BY src_experiment_site, dataset_name") \
    #             % ','.join(cols)

    # Inner query: select per-record columns and join AAA records
    # with file/block/dataset information by file LFN
    cols = [
        'src_experiment_site AS site_name', 'dataset_name',
        '"xrootd" AS app', 'dn2uuid(aaa_df.user_dn) AS uid',
        'parse_dn(aaa_df.user_dn) AS dn', '"aaa" AS stream',
        '%s AS timestamp' % unix_date, '-1 AS cpu'
    ]
    query = "SELECT %s FROM aaa_df " \
            "JOIN f_b_s_df ON f_b_s_df.file_name = aaa_df.file_lfn " \
            % ','.join(cols)

    # Outer query: aggregate per (dn, dataset, site, app) group;
    # CPU and wall-clock times are not available in this stream (-1)
    cols = [
        'dn', 'dataset_name', 'site_name', 'app', 'first(uid) AS uid',
        'first(stream) AS stream', 'first(timestamp) AS timestamp',
        'count(dataset_name) AS nacc', 'count(dn) AS distinct_users',
        'first(tier_from_site_name(site_name)) AS site_tier', '-1 AS cpu_time',
        '-1 AS wc_time'
    ]
    query = "SELECT %s FROM (%s) QUERY1 GROUP BY dn, dataset_name, site_name, app" \
            % (','.join(cols), query)

    result = run_query(query, sql_context, verbose)

    # result = result.sort(desc("nacc"))

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset_col(result, 'dataset_name')

    if verbose:
        print('Finished AAA part (output is %s records)' % result.count())
    else:
        print('Finished AAA part')

    return result
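
Since hdir is a keyword argument, the read location can be overridden per call. A sketch under the assumption that a sibling raw/gled directory exists (the docstring implies raw is the default elsewhere); per the docstring, raw records would lack src_experiment_site:

# Sketch only: the raw/gled path is inferred from the docstring, not verified.
aaa_raw = run_agg_aaa('20170228', ctx, sql_context,
                      hdir='hdfs:///project/monitoring/archive/xrootd/raw/gled',
                      verbose=True)
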
Code example #3
File: data_aggregation.py Project: MircoT/CMSSpark
def run_agg_cmssw(date, ctx, sql_context, verbose=False):
    """
    Runs aggregation for CMSSW stream for a certain date.
    Function produces a dataframe that contains site name, dataset name, number of access, distinct users and stream.
    Result dataframe is sorted by nacc.
    """
    print('Starting CMSSW part')

    # Make a UNIX timestamp from date
    unix_date = short_date_to_unix(short_date_string(date))

    # Convert date
    date = long_date_string(date)

    # Create CMSSW tables in sql_context
    cmssw_df = cmssw_tables(ctx, sql_context, date=date, verbose=verbose)

    if verbose:
        print('Found %s records in CMSSW stream' %
              cmssw_df['cmssw_df'].count())

    # - site name                +
    # - dataset name             +
    # - app                      +
    # - uid                      +
    # - dn                       +
    # - number of access (nacc)  +
    # - distinct users           +
    # - stream: cmssw            +
    # - timestamp                +
    # - site tier                +
    # - cpu time                -1

    # Legacy single-step aggregation, kept commented out for reference:
    #    cols = [
    #        'cmssw_df.SITE_NAME AS site_name', 'dataset_name',
    #        'count(dataset_name) AS nacc',
    #        'count(distinct(USER_DN)) AS distinct_users', '"cmssw" AS stream',
    #        '%s AS timestamp' % unix_date,
    #        'first(tier_from_site_name(cmssw_df.SITE_NAME)) AS site_tier',
    #        '-1 AS cpu_time'
    #    ]
    #    query = ("SELECT %s FROM cmssw_df "
    #             "JOIN f_b_s_df ON f_b_s_df.file_name = cmssw_df.FILE_LFN "
    #             "GROUP BY cmssw_df.SITE_NAME, dataset_name") \
    #             % ','.join(cols)

    # Inner query: select per-record columns and join CMSSW records
    # with file/block/dataset information by file LFN
    cols = [
        'cmssw_df.SITE_NAME AS site_name', 'dataset_name',
        'parse_app(cmssw_df.APP_INFO) AS app',
        'dn2uuid(cmssw_df.USER_DN) AS uid', 'parse_dn(cmssw_df.USER_DN) AS dn',
        'stream4app(cmssw_df.APP_INFO) AS stream',
        '%s AS timestamp' % unix_date, '-1 AS cpu'
    ]
    query = "SELECT %s FROM cmssw_df " \
            "JOIN f_b_s_df ON f_b_s_df.file_name = cmssw_df.FILE_LFN " \
            % ','.join(cols)

    # Outer query: aggregate per (dn, dataset, site, app) group;
    # CPU and wall-clock times are not available in this stream (-1)
    cols = [
        'dn', 'dataset_name', 'site_name', 'app', 'first(uid) AS uid',
        'first(stream) AS stream', 'first(timestamp) AS timestamp',
        'count(dataset_name) AS nacc', 'count(dn) AS distinct_users',
        'first(tier_from_site_name(site_name)) AS site_tier', '-1 AS cpu_time',
        '-1 AS wc_time'
    ]
    query = "SELECT %s FROM (%s) QUERY1 GROUP BY dn, dataset_name, site_name, app" \
            % (','.join(cols), query)

    result = run_query(query, sql_context, verbose)

    # result = result.sort(desc("nacc"))

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset_col(result, 'dataset_name')

    if verbose:
        print('Finished CMSSW part (output is %s records)' % result.count())
    else:
        print('Finished CMSSW part')

    return result
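
All four aggregators are built to emit the same column set so their outputs can be combined. A quick sanity-check sketch, continuing the earlier setup, under the assumption that split_dataset_col replaces dataset_name with primds, procds and tier (inferred from its comment, not verified):

# Sketch only: the post-split column names are assumptions based on the
# "Split 'dataset' column" comment above.
expected = {'dn', 'primds', 'procds', 'tier', 'site_name', 'app', 'uid',
            'stream', 'timestamp', 'nacc', 'distinct_users', 'site_tier',
            'cpu_time', 'wc_time'}
cmssw_result = run_agg_cmssw('20170228', ctx, sql_context)
assert expected.issubset(set(cmssw_result.columns))
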
Code example #4
File: data_aggregation.py Project: MircoT/CMSSpark
def run_agg_eos(date, ctx, sql_context, verbose=False):
    """
    Runs aggregation for EOS stream for a certain date.
    Function produces a dataframe that contains site name, dataset name, number of access, distinct users and stream.
    Site name is taken from f_b_s_df table which is joined by file name.
    Result dataframe is sorted by nacc.
    """
    print('Starting EOS part')

    # Make a UNIX timestamp from date
    unix_date = short_date_to_unix(short_date_string(date))

    # Convert date
    date = short_date_string(date)

    # Create EOS tables in sql_context
    eos_df = eos_tables(sql_context, date=date, verbose=verbose)

    if verbose:
        print('Found %s records in EOS stream' % eos_df['eos_df'].count())

    # - site name                +
    # - dataset name             +
    # - app                      +
    # - uid                      +
    # - dn                       +
    # - number of access (nacc)  +
    # - distinct users           +
    # - stream: eos              +
    # - timestamp                +
    # - site tier                +
    # - cpu time                -1

    # Legacy single-step aggregation, kept commented out for reference:
    #    cols = [
    #        'site_name', 'dataset_name', 'count(dataset_name) AS nacc',
    #        'count(distinct(eos_df.user_dn)) AS distinct_users',
    #        '"eos" AS stream',
    #        '%s AS timestamp' % unix_date,
    #        'first(tier_from_site_name(site_name)) AS site_tier', '-1 AS cpu_time'
    #    ]
    #    query = ("SELECT %s FROM eos_df "
    #             "JOIN f_b_s_df ON f_b_s_df.file_name = eos_df.file_lfn "
    #             "GROUP BY site_name, dataset_name") \
    #             % ','.join(cols)

    # Inner query: select per-record columns and join EOS records
    # with file/block/dataset information by file LFN
    cols = [
        'site_name', 'dataset_name', 'parse_app(eos_df.application) AS app',
        'dn2uuid(eos_df.user_dn) AS uid', 'parse_dn(eos_df.user_dn) AS dn',
        '"eos" AS stream',
        '%s AS timestamp' % unix_date, '-1 AS cpu'
    ]
    query = "SELECT %s FROM eos_df " \
            "JOIN f_b_s_df ON f_b_s_df.file_name = eos_df.file_lfn " \
            % ','.join(cols)

    # Outer query: aggregate per (dn, dataset, site, app) group;
    # CPU and wall-clock times are not available in this stream (-1)
    cols = [
        'dn', 'dataset_name', 'site_name', 'app', 'first(uid) AS uid',
        'first(stream) AS stream', 'first(timestamp) AS timestamp',
        'count(dataset_name) AS nacc', 'count(dn) AS distinct_users',
        'first(tier_from_site_name(site_name)) AS site_tier', '-1 AS cpu_time',
        '-1 AS wc_time'
    ]
    query = "SELECT %s FROM (%s) QUERY1 GROUP BY dn, dataset_name, site_name, app" \
            % (','.join(cols), query)

    result = run_query(query, sql_context, verbose)

    # result = result.sort(desc("nacc"))

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset_col(result, 'dataset_name')

    if verbose:
        print('Finished EOS part (output is %s records)' % result.count())
    else:
        print('Finished EOS part')

    return result
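
Because the four functions share one output schema, their per-day results can be stacked into a single dataframe. A sketch using the Spark 1.x unionAll API (matching the SQLContext-era style above); combining the streams this way is an assumption about intended use, not code from the original file:

# Sketch only: a hypothetical driver that unions the four per-stream results.
def run_all_streams(date, ctx, sql_context, verbose=False):
    """Union the four per-stream aggregations for one day."""
    result = run_agg_jm(date, ctx, sql_context, verbose)
    for agg in (run_agg_aaa, run_agg_cmssw, run_agg_eos):
        result = result.unionAll(agg(date, ctx, sql_context, verbose=verbose))
    return result
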