Code Example #1
# pyspark imports needed by this snippet; the CMSSpark helper functions used below
# (spark_context, jm_tables, dbs_tables, print_rows, jm_date, jm_date_unix, split_dataset)
# come from the CMSSpark project's own utility modules.
from pyspark import StorageLevel
from pyspark.sql import HiveContext
from pyspark.sql.functions import lit


def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run the PySpark job. It requires a schema file, an HDFS
    directory with data, and an optional script with mapper/reducer functions.
    """
    # define the Spark context, the main object used to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and JobMonitoring tables
    tables = {}

    # read JobMonitoring avro rdd
    date = jm_date(date)
    jm_df = jm_tables(ctx, sqlContext, date=date, verbose=verbose)

    # DBS tables
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    ddf = tables['ddf']  # dataset table
    fdf = tables['fdf']  # file table

    # merge DBS and JobMonitoring data
    cols = [
        'd_dataset', 'd_dataset_id', 'f_logical_file_name', 'FileName',
        'FileType', 'Type', 'SiteName', 'WrapWC', 'WrapCPU', 'JobExecExitCode'
    ]
    stmt = ("SELECT %s FROM ddf "
            "JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id "
            "JOIN jm_df ON fdf.f_logical_file_name = jm_df.FileName") % ','.join(cols)
    joins = sqlContext.sql(stmt)
    print_rows(joins, stmt, verbose)

    # perform aggregation
    fjoin = joins.groupBy(['SiteName','JobExecExitCode','FileType','Type','d_dataset'])\
            .agg({'WrapWC':'sum','WrapCPU':'sum','JobExecExitCode':'count','FileType':'count','Type':'count'})\
            .withColumnRenamed('sum(WrapWC)', 'tot_wc')\
            .withColumnRenamed('sum(WrapCPU)', 'tot_cpu')\
            .withColumnRenamed('count(JobExecExitCode)', 'ecode_count')\
            .withColumnRenamed('count(FileType)', 'file_type_count')\
            .withColumnRenamed('count(Type)', 'type_count')\
            .withColumnRenamed('d_dataset', 'dataset')\
            .withColumn('date', lit(jm_date_unix(date)))\
            .withColumn('count_type', lit('jm'))

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write results back to HDFS; the fout parameter defines the output area on
    # HDFS, either an absolute path or an area under /user/USERNAME
    if fout:
        ndf = split_dataset(fjoin, 'dataset')
        ndf.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
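
A minimal sketch of how run() could be driven from the command line; the argparse wiring, option names, and defaults below are illustrative assumptions rather than part of the original example.

import argparse


def main():
    """Hypothetical driver for run(); option names and defaults are illustrative only."""
    parser = argparse.ArgumentParser(description='Join DBS and JobMonitoring data on Spark')
    parser.add_argument('--date', required=True, help='date to process')
    parser.add_argument('--fout', default='', help='HDFS output area (absolute path or under /user/USERNAME)')
    parser.add_argument('--yarn', action='store_true', help='run on YARN instead of locally')
    parser.add_argument('--verbose', action='store_true', help='verbose output')
    parser.add_argument('--inst', default='GLOBAL', help='DBS instance name')
    opts = parser.parse_args()
    run(opts.date, opts.fout, opts.yarn, opts.verbose, opts.inst)


if __name__ == '__main__':
    main()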
Code Example #2
File: data_aggregation.py Project: MircoT/CMSSpark
def run_agg_jm(date, ctx, sql_context, verbose=False):
    """
    Runs the aggregation of the JobMonitoring stream for a given date.
    The function produces a dataframe with site name, dataset name, number of accesses, distinct users and stream.
    The resulting dataframe is sorted by nacc.
    """
    print('Starting JobMonitoring part')

    # Make a UNIX timestamp from date
    unix_date = short_date_to_unix(short_date_string(date))

    # Convert date
    date = long_date_string(date)

    # Create JobMonitoring tables in sql_context
    jm_df = jm_tables(ctx, sql_context, date=date, verbose=verbose)

    if verbose:
        print('Found %s records in JobMonitoring stream' %
              jm_df['jm_df'].count())

    # - site name                +
    # - dataset name             +
    # - app                      +
    # - uid                      +
    # - dn                       +
    # - number of access (nacc)  +
    # - distinct users           +
    # - stream: crab             +
    # - timestamp                +
    # - site tier                +
    # - cpu time                 +

    # NOTE: this first "cols" list and the commented-out query below are an earlier
    # variant of the aggregation; "cols" is redefined further down before it is used.
    cols = [
        'SiteName AS site_name', 'dataset_name', 'count(dataset_name) AS nacc',
        'count(distinct(UserId)) AS distinct_users', '\"crab\" AS stream',
        '%s AS timestamp' % unix_date,
        'first(tier_from_site_name(SiteName)) AS site_tier',
        'SUM(WrapCPU) AS cpu_time'
    ]

    # Build a query with "cols" columns
    #    query = ("SELECT %s FROM jm_df "\
    #             "JOIN f_b_s_df ON f_b_s_df.file_name = jm_df.FileName "\
    #             "GROUP BY jm_df.SiteName, dataset_name") \
    cols = [
        'SiteName AS site_name', 'dataset_name',
        'stream4app(jm_df.SubmissionTool) AS app', 'dn2uuid(GridName) AS uid',
        'parse_dn(GridName) AS dn', '\"crab\" AS stream',
        '%s AS timestamp' % unix_date, 'WrapCPU AS cpu', 'WrapWC as wc'
    ]
    query = "SELECT %s FROM jm_df "\
             "JOIN f_b_s_df ON f_b_s_df.file_name = jm_df.FileName " \
             % ','.join(cols)
    cols = [
        'dn', 'dataset_name', 'site_name', 'app', 'first(uid) as uid',
        'first(stream) as stream', 'first(timestamp) as timestamp',
        'count(dataset_name) AS nacc', 'count(dn) AS distinct_users',
        'first(tier_from_site_name(site_name)) AS site_tier',
        'SUM(cpu) AS cpu_time', 'SUM(wc) AS wc_time'
    ]
    query = "SELECT %s FROM (%s) QUERY1 GROUP BY dn, dataset_name, site_name, app" \
            % (','.join(cols), query)

    result = run_query(query, sql_context, verbose)

    # result = result.sort(desc("nacc"))

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset_col(result, 'dataset_name')

    if verbose:
        print('Finished JobMonitoring part (output is %s records)' %
              result.count())
    else:
        print('Finished JobMonitoring part')

    return result
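
A minimal usage sketch for run_agg_jm(), assuming the same spark_context/HiveContext setup as in code example #1; the date literal is an illustrative assumption.

# Hypothetical call site; assumes the CMSSpark helpers shown in code example #1.
# Note: the query above also joins an f_b_s_df table (file/block/site info),
# which must already be registered in sql_context.
ctx = spark_context('cms', yarn=None, verbose=True)
sql_context = HiveContext(ctx)
jm_agg = run_agg_jm('20170101', ctx, sql_context, verbose=True)
jm_agg.show(10)  # inspect a few aggregated rows
ctx.stop()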
Code Example #3
def run_jm(date, fout, ctx, sql_context, verbose=False):
    """
    Runs the JobMonitoring (CRAB) part: joins JobMonitoring records with the DBS
    dataset and file tables and writes the result to HDFS under fout/CRAB/<date>.
    """

    if verbose:
        print('Starting JobMonitoring part')

    # Create fout by adding stream name and date paths
    fout = fout + "/CRAB/" + short_date_string(date)

    # Convert date
    date = long_date_string(date)

    # Create JobMonitoring tables in sql_context
    jm_tables(ctx, sql_context, date=date, verbose=verbose)

    if verbose:
        print('Will build query for JM and DBS tables')

    # - file name       +
    # - file size       +
    # - primds          +
    # - procds          +
    # - tier            +
    # - site name       +
    # - file replicas
    # - user dn
    # - start/end time  +
    # - read bytes
    # - cpu/wc values   +
    # - source: crab    +

    # JobMonitoring (CRAB) columns
    # For cpu value WrapCPU is used. Records also have ExeCPU.
    jm_cols = ['FileName AS file_name',
               'SiteName AS site_name',
               'WrapWC AS wc',
               'WrapCPU AS cpu',
               'StartedRunningTimeStamp AS start_time',
               'FinishedTimeStamp AS end_time',
               '"crab" AS source']

    # DBS columns
    ddf_cols = ['d_dataset']
    fdf_cols = ['f_file_size AS file_size']

    # Concatenate the lists of column names (i.e. use all columns from every list)
    cols = jm_cols + ddf_cols + fdf_cols

    # Build a query with "cols" columns. Join DDF, FDF and JobMonitoring tables
    query = ("SELECT %s FROM ddf "
             "JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id "
             "JOIN jm_df ON fdf.f_logical_file_name = jm_df.FileName") % ','.join(cols)

    result = run_query(query, sql_context, verbose)

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset(result, 'd_dataset')

    output_dataframe(fout, result, verbose)

    if verbose:
        print('Finished JobMonitoring part')
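
A minimal usage sketch for run_jm(); the date literal and the HDFS output path are illustrative assumptions.

# Hypothetical call site; the result is written under <fout>/CRAB/<short date>.
# Note: the query joins the DBS tables ddf/fdf, which must already be registered
# in sql_context (e.g. via dbs_tables() as in code example #1).
ctx = spark_context('cms', yarn=None, verbose=False)
sql_context = HiveContext(ctx)
run_jm('20170101', 'hdfs:///cms/users/USERNAME/jm', ctx, sql_context, verbose=True)
ctx.stop()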