Example #1
def run_eos(date, fout, ctx, sql_context, verbose=False):

    if verbose:
        print('Starting EOS part')

    # Create fout by adding stream name and date paths
    fout = fout + "/EOS/" + short_date_string(date)

    # Convert date
    date = short_date_string(date)

    # Create EOS tables in sql_context
    eos_tables(sql_context, date=date, verbose=verbose)

    if verbose:
        print('Will build query for EOS and DBS tables')

    # - file name       +
    # - file size       +
    # - primds          +
    # - procds          +
    # - tier            +
    # - site name
    # - file replicas
    # - user dn         +
    # - start/end time  +
    # - read bytes
    # - cpu/wc values
    # - source: eos     +

    # EOS columns
    # The same timestamp is used for both the start and end times
    eos_cols = ['file_lfn AS file_name',
                'user_dn',
                '"eos" AS source',
                'timestamp AS start_time',
                'timestamp AS end_time']

    # DBS columns
    ddf_cols = ['d_dataset']
    fdf_cols = ['f_file_size AS file_size']

    # Concatenate the column-name lists (i.e. use all column names from every list)
    cols = eos_cols + ddf_cols + fdf_cols

    # Build a query with "cols" columns. Join DDF, FDF and EOS tables
    query = ("SELECT %s FROM ddf "
             "JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id "
             "JOIN eos_df ON fdf.f_logical_file_name = eos_df.file_lfn") % ','.join(cols)

    result = run_query(query, sql_context, verbose)

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset(result, 'd_dataset')

    output_dataframe(fout, result, verbose)

    if verbose:
        print('Finished EOS part')
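
The date helpers used by these run_* functions are not shown here. Below is a minimal sketch of what short_date_string could look like, assuming dates arrive either as datetime objects or as YYYYMMDD strings; this is a hypothetical implementation, not the project's own.

import datetime

def short_date_string(date):
    "Hypothetical helper: render a date as a YYYY/MM/DD path fragment"
    if isinstance(date, (datetime.date, datetime.datetime)):
        return date.strftime('%Y/%m/%d')
    date = str(date)  # assume YYYYMMDD, e.g. '20170101' -> '2017/01/01'
    return '%s/%s/%s' % (date[:4], date[4:6], date[6:8])
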
Example #2
def run_aaa(date, fout, ctx, sql_context, verbose=False):

    if verbose:
        print('Starting AAA part')

    # Create fout by adding stream name and date paths
    fout = fout + "/AAA/" + short_date_string(date)

    # Convert date
    date = short_date_string(date)

    # Create AAA tables in sql_context
    aaa_tables(sql_context, date=date, verbose=verbose)

    if verbose:
        print('Will build query for AAA and DBS tables')

    # - file name         +
    # - file size         +
    # - primds            +
    # - procds            +
    # - tier              +
    # - site name
    # - file replicas
    # - user dn           +
    # - start/end time    +
    # - read bytes        +
    # - cpu/wc values
    # - source: xrootd    +

    # AAA columns
    aaa_cols = ['file_lfn AS file_name',
                'file_size',
                'user_dn',
                'start_time',
                'end_time',
                'read_bytes',
                '"xrootd" AS source']

    # DBS columns
    ddf_cols = ['d_dataset']

    # Concatenate the column-name lists (i.e. use all column names from every list)
    cols = aaa_cols + ddf_cols

    # Build a query with "cols" columns. Join DDF, FDF and AAA tables
    query = ("SELECT %s FROM ddf "
             "JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id "
             "JOIN aaa_df ON fdf.f_logical_file_name = aaa_df.file_lfn") % ','.join(cols)

    result = run_query(query, sql_context, verbose)

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset(result, 'd_dataset')

    output_dataframe(fout, result, verbose)

    if verbose:
        print('Finished AAA part')
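
Each run_* function ends by splitting the dataset path into its components. The following sketch shows how split_dataset might do this with PySpark column functions, assuming the usual /primds/procds/tier dataset naming; the real helper may differ.

from pyspark.sql.functions import col, split

def split_dataset(df, dcol):
    "Hypothetical helper: split /primds/procds/tier into separate columns"
    ndf = df.withColumn('primds', split(col(dcol), '/').getItem(1))\
            .withColumn('procds', split(col(dcol), '/').getItem(2))\
            .withColumn('tier', split(col(dcol), '/').getItem(3))
    return ndf.drop(dcol)
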
Example #3
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run the PySpark job. It requires a schema file, an HDFS
    directory with data, and an optional script with mapper/reducer functions.
    """
    # define the Spark context, the main object used to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    ddf = tables['ddf']  # dataset table
    fdf = tables['fdf']  # file table

    if verbose:
        for row in ddf.head(1):
            print("### ddf row", row)

    # read EOS data for the given date
    date = eos_date(date)
    tables.update(eos_tables(sqlContext, date=date, verbose=verbose))
    eos_df = tables['eos_df']  # EOS table

    if verbose:
        for row in eos_df.head(1):
            print("### eos_df row", row)

    # merge DBS and EOS data
    cols = ['d_dataset', 'd_dataset_id', 'f_logical_file_name', 'file_lfn']
    stmt = ('SELECT %s FROM ddf '
            'JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id '
            'JOIN eos_df ON fdf.f_logical_file_name = eos_df.file_lfn') % ','.join(cols)
    joins = sqlContext.sql(stmt)
    print_rows(joins, stmt, verbose)

    # perform aggregation
    fjoin = joins.groupBy(['d_dataset'])\
            .agg({'file_lfn':'count'})\
            .withColumnRenamed('count(file_lfn)', 'count')\
            .withColumnRenamed('d_dataset', 'dataset')\
            .withColumn('date', lit(eos_date_unix(date)))\
            .withColumn('count_type', lit('eos'))

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        ndf = split_dataset(fjoin, 'dataset')
        ndf.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
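
The dict-style agg() used above is what forces the withColumnRenamed calls that follow. An equivalent formulation with explicit aggregate functions and aliases is shown below; this is a sketch of the same aggregation, not code from the source project.

from pyspark.sql.functions import count, lit

fjoin = joins.groupBy('d_dataset')\
        .agg(count('file_lfn').alias('count'))\
        .withColumnRenamed('d_dataset', 'dataset')\
        .withColumn('date', lit(eos_date_unix(date)))\
        .withColumn('count_type', lit('eos'))
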
Example #4
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run the PySpark job. It requires a schema file, an HDFS
    directory with data, and an optional script with mapper/reducer functions.
    """
    # define the Spark context, the main object used to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and JobMonitoring tables
    tables = {}

    # read JobMonitoring avro rdd
    date = jm_date(date)
    jm_df = jm_tables(ctx, sqlContext, date=date, verbose=verbose)

    # DBS tables
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    ddf = tables['ddf']  # dataset table
    fdf = tables['fdf']  # file table

    # merge DBS and JobMonitoring data
    cols = [
        'd_dataset', 'd_dataset_id', 'f_logical_file_name', 'FileName',
        'FileType', 'Type', 'SiteName', 'WrapWC', 'WrapCPU', 'JobExecExitCode'
    ]
    stmt = ('SELECT %s FROM ddf '
            'JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id '
            'JOIN jm_df ON fdf.f_logical_file_name = jm_df.FileName') % ','.join(cols)
    joins = sqlContext.sql(stmt)
    print_rows(joins, stmt, verbose)

    # perform aggregation
    fjoin = joins.groupBy(['SiteName','JobExecExitCode','FileType','Type','d_dataset'])\
            .agg({'WrapWC':'sum','WrapCPU':'sum','JobExecExitCode':'count','FileType':'count','Type':'count'})\
            .withColumnRenamed('sum(WrapWC)', 'tot_wc')\
            .withColumnRenamed('sum(WrapCPU)', 'tot_cpu')\
            .withColumnRenamed('count(JobExecExitCode)', 'ecode_count')\
            .withColumnRenamed('count(FileType)', 'file_type_count')\
            .withColumnRenamed('count(Type)', 'type_count')\
            .withColumnRenamed('d_dataset', 'dataset')\
            .withColumn('date', lit(jm_date_unix(date)))\
            .withColumn('count_type', lit('jm'))

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        ndf = split_dataset(fjoin, 'dataset')
        ndf.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
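
A hypothetical command-line driver for a run() job of this shape is sketched below; the option names are illustrative and not taken from the source project.

import argparse

def main():
    "Hypothetical entry point wiring command-line options into run()"
    parser = argparse.ArgumentParser(description='DBS+JobMonitoring aggregation job')
    parser.add_argument('--date', required=True, help='date to process, e.g. 20170101')
    parser.add_argument('--fout', default='', help='HDFS output area')
    parser.add_argument('--yarn', action='store_true', help='run via YARN instead of locally')
    parser.add_argument('--verbose', action='store_true', help='enable verbose output')
    parser.add_argument('--inst', default='GLOBAL', help='DBS instance name')
    args = parser.parse_args()
    run(args.date, args.fout, args.yarn, args.verbose, args.inst)

if __name__ == '__main__':
    main()
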
Example #5
def run(fout,
        yarn=None,
        verbose=None,
        patterns=None,
        antipatterns=None,
        inst='GLOBAL'):
    """
    Main function to run the PySpark job. It requires a schema file, an HDFS
    directory with data, and an optional script with mapper/reducer functions.
    """
    # define the Spark context, the main object used to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(phedex_tables(sqlContext, verbose=verbose))
    phedex_df = tables['phedex_df']
    daf = tables['daf']
    ddf = tables['ddf']
    bdf = tables['bdf']
    fdf = tables['fdf']
    aef = tables['aef']
    pef = tables['pef']
    mcf = tables['mcf']
    ocf = tables['ocf']
    rvf = tables['rvf']

    print("### ddf from main", ddf)

    # aggregate phedex info into dataframe
    phedex_cols = [
        'node_name', 'dataset_name', 'dataset_is_open', 'block_bytes',
        'replica_time_create'
    ]
    newpdf = phedex_df.select(phedex_cols).groupBy(['node_name', 'dataset_name', 'dataset_is_open'])\
            .agg({'block_bytes':'sum', 'replica_time_create':'max'})\
            .withColumnRenamed('sum(block_bytes)', 'pbr_size')\
            .withColumnRenamed('max(replica_time_create)', 'max_replica_time')
    newpdf.registerTempTable('newpdf')
    #    print_rows(newpdf, 'newpdf', verbose)
    #    newpdf.persist(StorageLevel.MEMORY_AND_DISK)

    # columns to select from the joined tables
    # cols = ['*'] would select all fields; keep only the columns needed downstream
    cols = [
        'd_dataset_id', 'd_dataset', 'd_creation_date', 'd_is_dataset_valid',
        'f_event_count', 'f_file_size', 'dataset_access_type',
        'acquisition_era_name', 'processing_version'
    ]

    # join tables
    stmt = ('SELECT %s FROM ddf '
            'JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id '
            'JOIN daf ON ddf.d_dataset_access_type_id = daf.dataset_access_type_id '
            'JOIN aef ON ddf.d_acquisition_era_id = aef.acquisition_era_id '
            'JOIN pef ON ddf.d_processing_era_id = pef.processing_era_id') % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)
    #    print_rows(joins, 'joins', verbose)

    # keep joins table around
    #    joins.persist(StorageLevel.MEMORY_AND_DISK)

    # construct conditions
    cond = 'dataset_access_type = "VALID" AND d_is_dataset_valid = 1'
    fjoin = joins.where(cond).distinct().select(cols)

    # at this step fjoin contains rows of the form:
    # Row(d_dataset_id=9413359, d_dataset=u'/SingleMu/CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000/DQM',
    #     d_creation_date=1406060166.0, d_is_dataset_valid=1, f_event_count=5318, f_file_size=21132638.0,
    #     dataset_access_type=u'DELETED', acquisition_era_name=u'CMSSW_7_1_0_pre9', processing_version=u'6000')

    newdf = fjoin\
            .groupBy(['d_dataset','d_dataset_id','dataset_access_type','acquisition_era_name','processing_version'])\
            .agg({'f_event_count':'sum', 'f_file_size':'sum', 'd_creation_date':'max'})\
            .withColumnRenamed('sum(f_event_count)', 'evts')\
            .withColumnRenamed('sum(f_file_size)', 'size')\
            .withColumnRenamed('max(d_creation_date)', 'date')

    # at this point we have the newdf dataframe with our collected stats for every dataset
    # let's join it with release info
    newdf.registerTempTable('newdf')
    cols = [
        'd_dataset_id', 'd_dataset', 'evts', 'size', 'date',
        'dataset_access_type', 'acquisition_era_name', 'processing_version',
        'r_release_version'
    ]
    stmt = ('SELECT %s FROM newdf '
            'JOIN mcf ON newdf.d_dataset_id = mcf.mc_dataset_id '
            'JOIN ocf ON mcf.mc_output_mod_config_id = ocf.oc_output_mod_config_id '
            'JOIN rvf ON ocf.oc_release_version_id = rvf.r_release_version_id') % ','.join(cols)
    agg_dbs_df = sqlContext.sql(stmt)
    agg_dbs_df.registerTempTable('agg_dbs_df')
    #    print_rows(agg_dbs_df, 'agg_dbs_df', verbose)

    # keep agg_dbs_df table around
    #    agg_dbs_df.persist(StorageLevel.MEMORY_AND_DISK)

    # join dbs and phedex tables
    #    cols = ['d_dataset_id','d_dataset','evts','size','date','dataset_access_type','acquisition_era_name','processing_version','r_release_version','dataset_name','node_name','pbr_size','dataset_is_open','max_replica_time']
    cols = [
        'd_dataset', 'evts', 'size', 'date', 'dataset_access_type',
        'acquisition_era_name', 'r_release_version', 'node_name', 'pbr_size',
        'dataset_is_open', 'max_replica_time'
    ]
    stmt = ('SELECT %s FROM agg_dbs_df '
            'JOIN newpdf ON agg_dbs_df.d_dataset = newpdf.dataset_name') % ','.join(cols)
    finaldf = sqlContext.sql(stmt)

    # keep finaldf table around
    #    finaldf.persist(StorageLevel.MEMORY_AND_DISK)
    #    print_rows(finaldf, stmt, verbose)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        ndf = split_dataset(finaldf, 'd_dataset')
        ndf.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
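
The CSV output goes through the external com.databricks.spark.csv package, which Spark 1.x requires. On Spark 2.0+ the built-in writer covers the same case; a sketch, assuming the rest of the job is unchanged:

# Spark 2.0+ alternative to the com.databricks.spark.csv package used above
if fout:
    ndf = split_dataset(finaldf, 'd_dataset')
    ndf.write.option('header', 'true').csv(fout)
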
Example #6
def run_jm(date, fout, ctx, sql_context, verbose=False):

    if verbose:
        print('Starting JobMonitoring part')

    # Create fout by adding stream name and date paths
    fout = fout + "/CRAB/" + short_date_string(date)

    # Convert date
    date = long_date_string(date)

    # Create JobMonitoring tables in sql_context
    jm_tables(ctx, sql_context, date=date, verbose=verbose)

    if verbose:
        print('Will build query for JM and DBS tables')

    # - file name       +
    # - file size       +
    # - primds          +
    # - procds          +
    # - tier            +
    # - site name       +
    # - file replicas
    # - user dn
    # - start/end time  +
    # - read bytes
    # - cpu/wc values   +
    # - source: crab    +

    # JobMonitoring (CRAB) columns
    # WrapCPU is used for the cpu value; records also provide ExeCPU.
    jm_cols = ['FileName AS file_name',
               'SiteName AS site_name',
               'WrapWC AS wc',
               'WrapCPU AS cpu',
               'StartedRunningTimeStamp AS start_time',
               'FinishedTimeStamp AS end_time',
               '"crab" AS source']

    # DBS columns
    ddf_cols = ['d_dataset']
    fdf_cols = ['f_file_size AS file_size']

    # Concatenate the column-name lists (i.e. use all column names from every list)
    cols = jm_cols + ddf_cols + fdf_cols

    # Build a query with "cols" columns. Join DDF, FDF and JobMonitoring tables
    query = ("SELECT %s FROM ddf "
             "JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id "
             "JOIN jm_df ON fdf.f_logical_file_name = jm_df.FileName") % ','.join(cols)

    result = run_query(query, sql_context, verbose)

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset(result, 'd_dataset')

    output_dataframe(fout, result, verbose)

    if verbose:
        print('Finished JobMonitoring part')
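
run_query and output_dataframe are shared by all of the run_* functions but are not shown here. Below are minimal sketches of what they might look like, kept consistent with the CSV writer used in the other examples; both are hypothetical implementations.

def run_query(query, sql_context, verbose=False):
    "Hypothetical helper: execute an SQL query and return the resulting DataFrame"
    if verbose:
        print('Executing query: %s' % query)
    return sql_context.sql(query)

def output_dataframe(fout, df, verbose=False):
    "Hypothetical helper: write the DataFrame to HDFS as CSV with a header row"
    if verbose:
        print('Writing dataframe to %s' % fout)
    df.write.format('com.databricks.spark.csv')\
      .option('header', 'true').save(fout)
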
Example #7
def run_cmssw(date, fout, ctx, sql_context, verbose=False):

    if verbose:
        print('Starting CMSSW part')

    # Create fout by adding stream name and date paths
    fout = fout + "/CMSSW/" + short_date_string(date)

    # Convert date
    date = long_date_string(date)

    # Create CMSSW tables in sql_context
    cmssw_tables(ctx, sql_context, date=date, verbose=verbose)

    if verbose:
        print('Will build query for CMSSW and DBS tables')

    # - file name         +
    # - file size         +
    # - primds            +
    # - procds            +
    # - tier              +
    # - site name         +
    # - file replicas
    # - user dn           +
    # - start/end time    +
    # - read bytes        +
    # - cpu/wc values
    # - source: cmssw     +

    # CMSSW columns
    cmssw_cols = ['FILE_LFN AS file_name',
                  'FILE_SIZE AS file_size',
                  'SITE_NAME AS site_name',
                  'user_dn',
                  'START_TIME AS start_time',
                  'END_TIME AS end_time',
                  'READ_BYTES AS read_bytes',
                  '"cmssw" AS source']

    # DBS columns
    ddf_cols = ['d_dataset']

    # Concatenate the column-name lists (i.e. use all column names from every list)
    cols = cmssw_cols + ddf_cols

    # Build a query with "cols" columns. Join DDF, FDF and CMSSW tables
    query = ("SELECT %s FROM ddf "
             "JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id "
             "JOIN cmssw_df ON fdf.f_logical_file_name = cmssw_df.FILE_LFN") % ','.join(cols)

    result = run_query(query, sql_context, verbose)

    if verbose:
        print('Query done. Will split "dataset" column')

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset(result, 'd_dataset')

    output_dataframe(fout, result, verbose)

    if verbose:
        print('Finished CMSSW part')
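
Finally, a hypothetical driver that runs all four streams for a single day; the wiring below is illustrative only and assumes dbs_tables registers the ddf/fdf tables that every run_* function joins against.

def run_all(date, fout, verbose=False):
    "Hypothetical driver: run the EOS, AAA, JobMonitoring and CMSSW parts for one day"
    ctx = spark_context('cms', True, verbose)
    sql_context = HiveContext(ctx)
    dbs_tables(sql_context, inst='GLOBAL', verbose=verbose)
    for part in (run_eos, run_aaa, run_jm, run_cmssw):
        part(date, fout, ctx, sql_context, verbose=verbose)
    ctx.stop()
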