Example #1
def run(fin, attrs, yarn, fout, verbose, nparts=3000):

    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # Reading all the files in a directory
    paths = [fin]
    res = sqlContext.read.json(paths)

    data = res.select("data.*")
    data = data.repartition(nparts)
    print("### number of new data partitions", data.rdd.getNumPartitions())

    anonymize = udf(hash_private_info, returnType=StringType())

    # Use the above udf to anonymize data
    for attr in attrs:
        col = attr+'_hash'
        data = data.withColumn(col, anonymize(getattr(data, attr)))

    # drop attributes
    data = data.drop(*attrs)

    # Save as gzipped JSON
    data.write.option("compression","gzip").json(fout)
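
The hash_private_info helper referenced above is not shown in this example; a minimal sketch of what such a UDF callable could look like (hypothetical, the real helper may salt or format the digest differently):

import hashlib

def hash_private_info(value):
    "Hypothetical sketch: return a SHA-256 hex digest for a private attribute value"
    if value is None:
        return ''
    return hashlib.sha256(str(value).encode('utf-8')).hexdigest()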
Example #2
def run(fout,
        yarn=None,
        verbose=None,
        patterns=None,
        antipatterns=None,
        inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    bdf = tables['bdf']
    fdf = tables['fdf']
    flf = tables['flf']

    # join tables
    cols = ['*']  # to select all fields from table
    cols = [
        'b_block_id', 'b_block_name', 'f_block_id', 'f_file_id', 'fl_file_id',
        'fl_lumi_section_num'
    ]

    # join tables
    stmt = 'SELECT %s FROM bdf JOIN fdf on bdf.b_block_id = fdf.f_block_id JOIN flf on fdf.f_file_id=flf.fl_file_id' % ','.join(
        cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # keep table around
    joins.persist(StorageLevel.MEMORY_AND_DISK)

    # construct conditions
    #    adler = ['ad8f6ad2','9c441343','f68d5dca','81c90e2a','471d2524','a3c1f077','6f0018a0','8bb03b60','d504882c','5ede357f','b05303c3','716d1776','7e9cf258','1945804b','ec7bc1d7','12c87747','94f2aa32']
    #    cond = 'f_adler32 in %s' % adler
    #    cond = cond.replace('[', '(').replace(']', ')')
    #    fjoin = joins.where(cond).distinct().select(cols)

    #    print_rows(fjoin, stmt, verbose)
    fjoin = joins\
            .groupBy(['b_block_name'])\
            .agg({'fl_lumi_section_num':'count'})\
            .withColumnRenamed('count(fl_lumi_section_num)', 'nlumis')

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        fjoin.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
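
The SQL-string join above can also be expressed with the DataFrame API; a sketch, assuming bdf, fdf and flf are the DataFrames registered by dbs_tables:

from pyspark.sql import functions as F

block_lumis = bdf.join(fdf, bdf.b_block_id == fdf.f_block_id)\
                 .join(flf, fdf.f_file_id == flf.fl_file_id)\
                 .groupBy('b_block_name')\
                 .agg(F.count('fl_lumi_section_num').alias('nlumis'))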
Example #3
def run(path, amq, stomp, yarn=None, aggregation_schema=False, verbose=False):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    if stomp and os.path.isfile(stomp):
        ctx.addPyFile(stomp)
    else:
        raise Exception('No stomp module egg is provided')
    if amq and os.path.isfile(amq):
        if amq.split('/')[-1] == 'amq_broker.json':
            ctx.addFile(amq)
        else:
            raise Exception(
                'Wrong AMQ broker file name, please name it as amq_broker.json'
            )
    else:
        raise Exception('No AMQ credential file is provided')
    sqlContext = HiveContext(ctx)

    hpath = "hadoop fs -ls %s | awk '{print $8}'" % path
    if verbose:
        print("### Read files: %s" % hpath)
    pipe = Popen(hpath,
                 shell=True,
                 stdin=PIPE,
                 stdout=PIPE,
                 stderr=PIPE,
                 close_fds=True)
    pipe.wait()
    pfiles = [
        f for f in pipe.stdout.read().split('\n') if f.find('part-') != -1
    ]
    df = []

    if aggregation_schema:
        df = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                    .options(treatEmptyValuesAsNulls='true', nullValue='null', header='true') \
                    .load(fname, schema=aggregated_data_schema()) for fname in pfiles])
    else:
        df = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                    .options(treatEmptyValuesAsNulls='true', nullValue='null', header='true') \
                    .load(fname) for fname in pfiles])

    # Register temporary tables to be able to use sqlContext.sql
    df.registerTempTable('df')
    print_rows(df, "DataFrame", verbose)

    print('Schema:')
    df.printSchema()

    # for testing uncomment line below
    # df.toJSON().foreachPartition(print_data)
    # send data to CERN MONIT via stomp AMQ, see send2monit function
    df.toJSON().foreachPartition(send2monit)

    ctx.stop()
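
The Popen call above shells out to hadoop fs -ls and keeps the part- files; a sketch of an equivalent helper using subprocess.check_output (assumes the hadoop CLI is available on the driver node):

from subprocess import check_output

def list_part_files(path):
    "Sketch: return HDFS part-* file paths under the given directory"
    out = check_output("hadoop fs -ls %s" % path, shell=True)
    return [line.split()[-1] for line in out.decode('utf-8').splitlines() if 'part-' in line]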
Example #4
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()

    print("Input arguments: %s" % opts)

    start_time = time.time()
    verbose = opts.verbose
    yarn = opts.yarn
    inst = opts.inst
    date = opts.date
    fout = opts.fout

    if  inst.lower() in ['global', 'phys01', 'phys02', 'phys03']:
        inst = inst.upper()
    else:
        raise Exception('Unsupported DBS instance "%s"' % inst)

    # Create spark context
    ctx = spark_context('cms', yarn, verbose)

    # Create SQL context to be used for SQL queries
    sql_context = HiveContext(ctx)

    # Initialize DBS tables (will be used with AAA, CMSSW)
    dbs_tables(sql_context, inst=inst, verbose=verbose)

    aaa_start_time = time.time()

    run_aaa(date, fout, ctx, sql_context, verbose)

    aaa_elapsed_time = elapsed_time(aaa_start_time)
    cmssw_start_time = time.time()

    run_cmssw(date, fout, ctx, sql_context, verbose)

    cmssw_elapsed_time = elapsed_time(cmssw_start_time)
    eos_start_time = time.time()

    run_eos(date, fout, ctx, sql_context, verbose)

    eos_elapsed_time = elapsed_time(eos_start_time)
    jm_start_time = time.time()

    run_jm(date, fout, ctx, sql_context, verbose)

    jm_elapsed_time = elapsed_time(jm_start_time)

    ctx.stop()

    print('Start time         : %s' % time.strftime('%Y-%m-%d %H:%M:%S GMT', time.gmtime(start_time)))
    print('End time           : %s' % time.strftime('%Y-%m-%d %H:%M:%S GMT', time.gmtime(time.time())))
    print('Total elapsed time : %s' % elapsed_time(start_time))

    print('AAA elapsed time   : %s' % aaa_elapsed_time)
    print('CMSSW elapsed time : %s' % cmssw_elapsed_time)
    print('EOS elapsed time   : %s' % eos_elapsed_time)
    print('JM elapsed time    : %s' % jm_elapsed_time)
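
The elapsed_time helper used above comes from the job's utility module; a hypothetical sketch of such a helper (the real implementation may format the output differently):

import time

def elapsed_time(start):
    "Hypothetical sketch: human-readable time elapsed since the given start timestamp"
    secs = int(time.time() - start)
    return '%dh %dm %ds' % (secs // 3600, (secs % 3600) // 60, secs % 60)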
Example #5
def run(date, fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate
    # read Phedex tables
    tables = {}
    tables.update(
        phedex_tables(sqlContext,
                      verbose=verbose,
                      fromdate=fromdate,
                      todate=todate))
    phedex_df = tables['phedex_df']

    # register user defined function
    unix2date = udf(unix2human, StringType())
    siteFilter = udf(site_filter, IntegerType())
    one_day = 60 * 60 * 24

    # aggregate phedex info into dataframe
    cols = [
        'node_name', 'dataset_name', 'block_bytes', 'replica_time_create',
        'br_user_group_id'
    ]
    pdf = phedex_df.select(cols).where(siteFilter(col('node_name')) == 1)\
            .groupBy(['node_name', 'dataset_name', 'replica_time_create', 'br_user_group_id'])\
            .agg({'block_bytes':'sum'})\
            .withColumn('date', lit(date))\
            .withColumn('replica_date', unix2date(col('replica_time_create')))\
            .withColumnRenamed('sum(block_bytes)', 'size')\
            .withColumnRenamed('dataset_name', 'dataset')\
            .withColumnRenamed('node_name', 'site')\
            .withColumnRenamed('br_user_group_id', 'groupid')
    pdf.registerTempTable('pdf')
    pdf.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        year, month, day = split_date(date)
        out = '%s/%s/%s/%s' % (fout, year, month, day)
        cols = ['date', 'site', 'dataset', 'size', 'replica_date', 'groupid']
        # don't write header since when we'll read back the data it will
        # mismatch the data types, i.e. headers are string and rows
        # may be different data types
        pdf.select(cols)\
            .write.format("com.databricks.spark.csv")\
            .option("header", "true").save(out)

    ctx.stop()
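
The unix2human function wrapped into the unix2date UDF above is not shown; a hypothetical sketch of such a converter (the real helper may use a different date format):

import time

def unix2human(tstamp):
    "Hypothetical sketch: convert a UNIX timestamp into a YYYY-MM-DD string"
    return time.strftime('%Y-%m-%d', time.gmtime(float(tstamp)))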
Example #6
def run(fout,
        yarn=None,
        verbose=None,
        patterns=None,
        antipatterns=None,
        inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    ddf = tables['ddf']
    bdf = tables['bdf']
    fdf = tables['fdf']

    # join tables
    cols = [
        'd_dataset', 'd_dataset_id', 'b_block_id', 'b_file_count',
        'f_block_id', 'f_file_id', 'f_dataset_id', 'f_event_count',
        'f_file_size'
    ]

    # join tables
    stmt = 'SELECT %s FROM ddf JOIN bdf on ddf.d_dataset_id = bdf.b_dataset_id JOIN fdf on bdf.b_block_id=fdf.f_block_id' % ','.join(
        cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # keep table around
    joins.persist(StorageLevel.MEMORY_AND_DISK)

    # construct aggregation
    fjoin = joins\
            .groupBy(['d_dataset'])\
            .agg({'b_file_count':'sum', 'f_event_count':'sum', 'f_file_size':'sum'})\
            .withColumnRenamed('d_dataset', 'dataset')\
            .withColumnRenamed('sum(b_file_count)', 'nfiles')\
            .withColumnRenamed('sum(f_event_count)', 'nevents')\
            .withColumnRenamed('sum(f_file_size)', 'size')

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        fjoin.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
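
The sum(...) column renames above can be avoided by aliasing the aggregates directly; an equivalent sketch, assuming joins is the DataFrame built by the SQL statement:

from pyspark.sql import functions as F

fjoin = joins.groupBy('d_dataset')\
             .agg(F.sum('b_file_count').alias('nfiles'),
                  F.sum('f_event_count').alias('nevents'),
                  F.sum('f_file_size').alias('size'))\
             .withColumnRenamed('d_dataset', 'dataset')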
Example #7
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()

    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', opts.yarn, opts.verbose)
    sqlContext = SQLContext(ctx)

    convert(ctx, sqlContext, opts.date, opts.fin, opts.fout, opts.verbose)
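
The OptionParser class used by these main() functions is defined elsewhere; a hypothetical argparse-based sketch with the flags the examples read (fin, fout, date, inst, yarn, verbose):

import argparse

class OptionParser(object):
    "Hypothetical sketch of the option parser wrapper used by these main() functions"
    def __init__(self):
        self.parser = argparse.ArgumentParser(prog='PROG')
        self.parser.add_argument("--fin", dest="fin", default="", help="input HDFS path")
        self.parser.add_argument("--fout", dest="fout", default="", help="output HDFS path")
        self.parser.add_argument("--date", dest="date", default="", help="date in YYYYMMDD format")
        self.parser.add_argument("--inst", dest="inst", default="GLOBAL", help="DBS instance")
        self.parser.add_argument("--yarn", action="store_true", default=False, help="run job on YARN")
        self.parser.add_argument("--verbose", action="store_true", default=False, help="verbose output")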
Example #8
def run(fout,
        yarn=None,
        verbose=None,
        patterns=None,
        antipatterns=None,
        inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(phedex_tables(sqlContext, verbose=verbose))
    phedex_df = tables['phedex_df']
    ddf = tables['ddf']
    fdf = tables['fdf']

    print("### ddf from main", ddf)

    # join tables
    cols = ['*']  # to select all fields from table
    cols = ['d_dataset_id', 'd_dataset', 'f_logical_file_name', 'f_adler32']

    # join tables
    stmt = 'SELECT %s FROM ddf JOIN fdf on ddf.d_dataset_id = fdf.f_dataset_id' % ','.join(
        cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # construct conditions
    adler = [
        'ad8f6ad2', '9c441343', 'f68d5dca', '81c90e2a', '471d2524', 'a3c1f077',
        '6f0018a0', '8bb03b60', 'd504882c', '5ede357f', 'b05303c3', '716d1776',
        '7e9cf258', '1945804b', 'ec7bc1d7', '12c87747', '94f2aa32'
    ]
    cond = 'f_adler32 in %s' % adler
    cond = cond.replace('[', '(').replace(']', ')')
    #    scols = ['f_logical_file_name']
    fjoin = joins.where(cond).distinct().select(cols)

    print_rows(fjoin, stmt, verbose)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        fjoin.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
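
The string manipulation that turns the Python list into a SQL IN clause can be replaced by Column.isin; an equivalent sketch, assuming joins, adler and cols as defined above:

from pyspark.sql.functions import col

fjoin = joins.where(col('f_adler32').isin(adler)).distinct().select(cols)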
Example #9
def run(fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read Phedex tables
    tables = {}
    hdir = 'hdfs:///cms/users/vk/phedex'
    tables.update(phedex_summary_tables(sqlContext, hdir=hdir,
                                        verbose=verbose))
    phedex_summary_df = tables['phedex_summary_df']
    phedex_summary_df.persist(StorageLevel.MEMORY_AND_DISK)
    print("### schema", phedex_summary_df.printSchema())
    phedex_summary_df.show(5)

    # register user defined function
    days = udf(days_present, IntegerType())

    # aggregate phedex info into dataframe
    #    pdf = phedex_summary_df\
    #            .groupBy(['site', 'dataset'])\
    #            .agg(
    #                    F.min(col('date')).alias('min_date'),
    #                    F.max(col('date')).alias('max_date'),
    #                    F.min(col('replica_date')).alias('min_rdate'),
    #                    F.max(col('replica_date')).alias('max_rdate'),
    #                    F.max(col('size')).alias('max_size'),
    #                    F.min(col('size')).alias('min_size')
    #                )\
    #            .withColumn('days', days(col('min_date'), col('max_date'), col('min_rdate'), col('max_rdate')))
    pdf = phedex_summary_df\
            .groupBy(['site', 'dataset', 'size'])\
            .agg(
                    F.min(col('date')).alias('min_date'),
                    F.max(col('date')).alias('max_date'),
                    F.min(col('replica_date')).alias('min_rdate'),
                    F.max(col('replica_date')).alias('max_rdate')
                )\
            .withColumn('days', days(col('min_date'), col('max_date'), col('min_rdate'), col('max_rdate')))
    pdf.persist(StorageLevel.MEMORY_AND_DISK)
    pdf.show(5)
    print_rows(pdf, 'pdf', verbose=1)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        pdf.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
Example #10
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and JobMonitoring tables
    tables = {}

    # read JobMonitoring avro rdd
    date = jm_date(date)
    jm_df = jm_tables(ctx, sqlContext, date=date, verbose=verbose)

    # DBS tables
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    ddf = tables['ddf']  # dataset table
    fdf = tables['fdf']  # file table

    # merge DBS and JobMonitoring data
    cols = [
        'd_dataset', 'd_dataset_id', 'f_logical_file_name', 'FileName',
        'FileType', 'Type', 'SiteName', 'WrapWC', 'WrapCPU', 'JobExecExitCode'
    ]
    stmt = 'SELECT %s FROM ddf JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id JOIN jm_df ON fdf.f_logical_file_name = jm_df.FileName' % ','.join(
        cols)
    joins = sqlContext.sql(stmt)
    print_rows(joins, stmt, verbose)

    # perform aggregation
    fjoin = joins.groupBy(['SiteName','JobExecExitCode','FileType','Type','d_dataset'])\
            .agg({'WrapWC':'sum','WrapCPU':'sum','JobExecExitCode':'count','FileType':'count','Type':'count'})\
            .withColumnRenamed('sum(WrapWC)', 'tot_wc')\
            .withColumnRenamed('sum(WrapCPU)', 'tot_cpu')\
            .withColumnRenamed('count(JobExecExitCode)', 'ecode_count')\
            .withColumnRenamed('count(FileType)', 'file_type_count')\
            .withColumnRenamed('count(Type)', 'type_count')\
            .withColumnRenamed('d_dataset', 'dataset')\
            .withColumn('date', lit(jm_date_unix(date)))\
            .withColumn('count_type', lit('jm'))

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        ndf = split_dataset(fjoin, 'dataset')
        ndf.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
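
The split_dataset helper used before writing is not shown; a hypothetical sketch of what such a helper could do, splitting /primary/processed/tier dataset names into separate columns (the real helper may differ):

from pyspark.sql.functions import col, split

def split_dataset(df, dcol):
    "Hypothetical sketch: split dataset names of the form /primds/procds/tier into columns"
    ndf = df.withColumn('primds', split(col(dcol), '/').getItem(1))\
            .withColumn('procds', split(col(dcol), '/').getItem(2))\
            .withColumn('tier', split(col(dcol), '/').getItem(3))
    return ndf.drop(dcol)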
Example #11
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    ddf = tables['ddf']  # dataset table
    fdf = tables['fdf']  # file table

    if verbose:
        for row in ddf.head(1):
            print("### ddf row", row)

    # read CMSSW avro rdd
    date = eos_date(date)
    tables.update(eos_tables(sqlContext, date=date, verbose=verbose))
    eos_df = tables['eos_df']  # EOS table

    if verbose:
        for row in eos_df.head(1):
            print("### eos_df row", row)

    # merge DBS and CMSSW data
    cols = ['d_dataset', 'd_dataset_id', 'f_logical_file_name', 'file_lfn']
    stmt = 'SELECT %s FROM ddf JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id JOIN eos_df ON fdf.f_logical_file_name = eos_df.file_lfn' % ','.join(
        cols)
    joins = sqlContext.sql(stmt)
    print_rows(joins, stmt, verbose)

    # perform aggregation
    fjoin = joins.groupBy(['d_dataset'])\
            .agg({'file_lfn':'count'})\
            .withColumnRenamed('count(file_lfn)', 'count')\
            .withColumnRenamed('d_dataset', 'dataset')\
            .withColumn('date', lit(eos_date_unix(date)))\
            .withColumn('count_type', lit('eos'))

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        ndf = split_dataset(fjoin, 'dataset')
        ndf.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
Example #12
def run(fout,
        yarn=None,
        verbose=None,
        patterns=None,
        antipatterns=None,
        inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    bdf = tables['bdf']
    fdf = tables['fdf']
    flf = tables['flf']

    # join tables
    cols = ['*']  # to select all fields from table
    cols = ['b_block_id', 'b_block_name', 'f_block_id', 'f_logical_file_name']

    # join tables
    stmt = 'SELECT %s FROM bdf JOIN fdf on bdf.b_block_id = fdf.f_block_id' % ','.join(
        cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # keep table around
    joins.persist(StorageLevel.MEMORY_AND_DISK)

    # construct conditions
    cols = ['b_block_name', 'f_logical_file_name']
    pat = '%00047DB7-9F77-E011-ADC8-00215E21D9A8.root'
    #    pat = '%02ACAA1A-9F32-E111-BB31-0002C90B743A.root'
    fjoin = joins.select(cols).where(col('f_logical_file_name').like(pat))

    print_rows(fjoin, stmt, verbose)

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        fjoin.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
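
Since the LIKE pattern above only anchors a file-name suffix, the same filter can be written with Column.endswith; a sketch, assuming joins and cols as defined above:

fname = '00047DB7-9F77-E011-ADC8-00215E21D9A8.root'
fjoin = joins.select(cols).where(col('f_logical_file_name').endswith(fname))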
Example #13
def run(date, fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read FTS tables
    date = fts_date(date)
    tables = {}
    tables.update(fts_tables(sqlContext, date=date, verbose=verbose))
    fts_df = tables['fts_df']  # fts table

    # example to extract transfer records for ASO
    # VK: the commented lines show how to extract some info from fts_df via SQL
    #    cols = ['data.job_metadata.issuer', 'data.f_size']
    #    stmt = 'SELECT %s FROM fts_df' % ','.join(cols)
    #    joins = sqlContext.sql(stmt)
    #    fjoin = joins.groupBy(['issuer'])\
    #            .agg({'f_size':'sum'})\
    #            .withColumnRenamed('sum(f_size)', 'sum_file_size')\
    #            .withColumnRenamed('issuer', 'issuer')

    # we can use fts_df directly for groupby/aggregated tasks
    fjoin = fts_df.groupBy(['job_metadata.issuer'])\
            .agg(agg_sum(fts_df.f_size).alias("sum_f_size"))

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        fjoin.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
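
agg_sum above is presumably pyspark.sql.functions.sum imported under a different name; the same aggregation can carry an extra per-issuer transfer count, sketched here:

from pyspark.sql import functions as F

fjoin = fts_df.groupBy('job_metadata.issuer')\
              .agg(F.sum(fts_df.f_size).alias('sum_f_size'),
                   F.count(fts_df.f_size).alias('n_transfers'))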
Example #14
def run(fout, date, yarn=None, verbose=None, inst='GLOBAL', limit=100):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """

    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)

    quiet_logs(ctx)

    sqlContext = HiveContext(ctx)
    
    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read Phedex and DBS tables
    tables = {}

    tables.update(phedex_tables(sqlContext, verbose=verbose, fromdate=fromdate, todate=todate))
    phedex = tables['phedex_df']

    instances = ['GLOBAL'] # , 'PHYS01', 'PHYS02', 'PHYS03'
    for instance in instances:
        dbs_dict = dbs_tables(sqlContext, inst=instance, verbose=verbose)
        for key, val in dbs_dict.items():
            new_key = '%s_%s' % (key, instance)
            tables[new_key] = val
    
    daf = reduce(lambda a,b: a.unionAll(b), [tables['daf_%s' % x] for x in instances])
    ddf = reduce(lambda a,b: a.unionAll(b), [tables['ddf_%s' % x] for x in instances])
    fdf = reduce(lambda a,b: a.unionAll(b), [tables['fdf_%s' % x] for x in instances])

    dbs_fdf_cols = ['f_dataset_id', 'f_file_size']
    dbs_ddf_cols = ['d_dataset_id', 'd_dataset', 'd_dataset_access_type_id']
    dbs_daf_cols = ['dataset_access_type_id', 'dataset_access_type']

    fdf_df = fdf.select(dbs_fdf_cols)
    ddf_df = ddf.select(dbs_ddf_cols)
    daf_df = daf.select(dbs_daf_cols)

    # d_dataset_id, d_dataset, dataset_access_type
    dbs_df = ddf_df.join(daf_df, ddf_df.d_dataset_access_type_id == daf_df.dataset_access_type_id)\
                   .drop(ddf_df.d_dataset_access_type_id)\
                   .drop(daf_df.dataset_access_type_id)

    # dataset, dbs_size
    dbs_df = dbs_df.where(dbs_df.dataset_access_type == 'VALID')\
                   .join(fdf_df, dbs_df.d_dataset_id == fdf_df.f_dataset_id)\
                   .withColumnRenamed('d_dataset', 'dataset')\
                   .withColumnRenamed('f_file_size', 'dbs_size')\
                   .drop(dbs_df.d_dataset_id)\
                   .drop(fdf_df.f_dataset_id)\
                   .drop(dbs_df.dataset_access_type)
    
    # dataset, dbs_size
    dbs_df = dbs_df.groupBy(['dataset'])\
                   .agg({'dbs_size':'sum'})\
                   .withColumnRenamed('sum(dbs_size)', 'dbs_size')

    # dataset_name, block_bytes, node_name
    phedex_cols = ['dataset_name', 'block_bytes', 'node_name']
    phedex_all_df = phedex.select(phedex_cols)
    
    aggregate(sqlContext, fout, phedex_all_df, dbs_df, limit)
    aggregate(sqlContext, fout, phedex_all_df, dbs_df, limit, disk_only=True)

    ctx.stop()
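
The reduce(lambda a, b: a.unionAll(b), ...) pattern above (and the unionAll helper used in other examples) can be factored into a small utility; a sketch:

from functools import reduce

def union_all(dfs):
    "Sketch: fold a list of DataFrames with identical schemas into a single DataFrame"
    return reduce(lambda a, b: a.unionAll(b), dfs)

# e.g. daf = union_all([tables['daf_%s' % x] for x in instances])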
Example #15
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    dtables = ['daf', 'ddf', 'bdf', 'fdf', 'aef', 'pef', 'mcf', 'ocf', 'rvf']
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose, tables=dtables))
#    tables.update(phedex_tables(sqlContext, verbose=verbose))
#    phedex_df = tables['phedex_df']
    daf = tables['daf'] # dataset access table
    ddf = tables['ddf'] # dataset table
    bdf = tables['bdf'] # block table
    fdf = tables['fdf'] # file table
    aef = tables['aef'] # acquisition era
    pef = tables['pef'] # processing era table
    mcf = tables['mcf'] # output mod config table
    ocf = tables['ocf'] # output module table
    rvf = tables['rvf'] # release version table

    # read Condor rdd
#    tables.update(condor_tables(sqlContext, hdir='hdfs:///cms/users/vk/condor', date=condor_date(date), verbose=verbose))
    tables.update(condor_tables(sqlContext, date=condor_date(date), verbose=verbose))
    condor_df = tables['condor_df'] # aaa table

    # aggregate dbs info into dataframe
    cols = ['d_dataset_id', 'd_dataset','d_creation_date','d_is_dataset_valid','f_event_count','f_file_size','dataset_access_type','acquisition_era_name','processing_version']
    stmt = 'SELECT %s FROM ddf JOIN fdf on ddf.d_dataset_id = fdf.f_dataset_id JOIN daf ON ddf.d_dataset_access_type_id = daf.dataset_access_type_id JOIN aef ON ddf.d_acquisition_era_id = aef.acquisition_era_id JOIN pef ON ddf.d_processing_era_id = pef.processing_era_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # construct conditions
    cond = 'dataset_access_type = "VALID" AND d_is_dataset_valid = 1'
    fjoin = joins.where(cond).distinct().select(cols)

    # at this step we have fjoin table with Row(d_dataset_id=9413359, d_dataset=u'/SingleMu/CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000/DQM', d_creation_date=1406060166.0, d_is_dataset_valid=1, f_event_count=5318, f_file_size=21132638.0, dataset_access_type=u'DELETED', acquisition_era_name=u'CMSSW_7_1_0_pre9', processing_version=u'6000'))

    newdf = fjoin\
            .groupBy(['d_dataset','d_dataset_id','dataset_access_type','acquisition_era_name','processing_version'])\
            .agg({'f_event_count':'sum', 'f_file_size':'sum', 'd_creation_date':'max'})\
            .withColumnRenamed('sum(f_event_count)', 'evts')\
            .withColumnRenamed('sum(f_file_size)', 'size')\
            .withColumnRenamed('max(d_creation_date)', 'date')

    # at this point we have the newdf dataframe with our collected stats for every dataset
    # let's join it with release info
    newdf.registerTempTable('newdf')
    cols = ['d_dataset_id','d_dataset','evts','size','date','dataset_access_type','acquisition_era_name','processing_version','r_release_version']
    stmt = 'SELECT %s FROM newdf JOIN mcf ON newdf.d_dataset_id = mcf.mc_dataset_id JOIN ocf ON mcf.mc_output_mod_config_id = ocf.oc_output_mod_config_id JOIN rvf ON ocf.oc_release_version_id = rvf.r_release_version_id' % ','.join(cols)
    agg_dbs_df = sqlContext.sql(stmt)
    agg_dbs_df.registerTempTable('agg_dbs_df')

    # merge dbs+phedex and Condor data
    cols = ['d_dataset','evts','size','date','dataset_access_type','acquisition_era_name','r_release_version']
    cols = cols + ['data.KEvents', 'data.CMSSWKLumis', 'data.CMSSWWallHrs', 'data.Campaign', 'data.Workflow', 'data.CpuEff', 'data.CoreHr', 'data.QueueHrs', 'data.CRAB_UserHN', 'data.Type', 'data.ExitCode', 'data.TaskType', 'data.RecordTime']
    stmt = 'SELECT %s FROM condor_df JOIN agg_dbs_df ON agg_dbs_df.d_dataset = condor_df.data.DESIRED_CMSDataset WHERE condor_df.data.KEvents > 0' % ','.join(cols)
#     stmt = 'SELECT %s FROM condor_df JOIN dbs_phedex_df ON dbs_phedex_df.d_dataset = condor_df.data.DESIRED_CMSDataset WHERE condor_df.data.KEvents > 0' % ','.join(cols)

    final_df = sqlContext.sql(stmt)
    print_rows(final_df, stmt, verbose)

    # keep table around
    final_df.persist(StorageLevel.MEMORY_AND_DISK)

    # user defined function
    def rate(evts, cores):
        "Calculate the rate of events vs cores, if they're not defineed return -1"
        if evts and cores:
            return float(evts)/float(cores)
        return -1.
    func_rate = udf(rate, DoubleType())

    # our output
    store = {}

    # conditions

    # load pyspark functions to be used here to redefine any previous usage of those names
    from pyspark.sql.functions import lit, sum, count, col, split

    # here we split the dataframe based on exitcode conditions to reduce the dimensionality
    # of the input, otherwise the job crashes with an Integer.MAX_VALUE exception which
    # basically tells us that the input dataframe exceeds the number of available partitions
    for ecode in [0,1]:
        if ecode == 0:
            refdf = final_df.where(col('ExitCode') == 0)
            condf = condor_df.where(col('data.ExitCode') == 0)
        else:
            refdf = final_df.where(col('ExitCode') != 0)
            condf = condor_df.where(col('data.ExitCode') != 0)
        refdf.persist(StorageLevel.MEMORY_AND_DISK)
        condf.persist(StorageLevel.MEMORY_AND_DISK)

        # aggregate CMS datasets
        cols = ['data.DESIRED_CMSDataset', 'data.CRAB_UserHN', 'data.ExitCode', 'data.Type', 'data.TaskType', 'data.RecordTime']
        xdf = condf.groupBy(cols)\
                .agg(sum('data.KEvents').alias('sum_evts'),sum('data.CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'),col('sum_chr')))\
                .withColumn("tier", split(col('DESIRED_CMSDataset'), "/").alias('tier').getItem(3))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('DESIRED_CMSDataset', 'dataset')
        store.setdefault('dataset', []).append(xdf)

        # aggregate across campaign
        cols = ['data.Campaign', 'data.CRAB_UserHN', 'data.ExitCode', 'data.Type', 'data.TaskType', 'data.RecordTime']
        xdf = condf.groupBy(cols)\
                .agg(sum('data.KEvents').alias('sum_evts'),sum('data.CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'),col('sum_chr')))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('Campaign', 'campaign')
        store.setdefault('campaign', []).append(xdf)

        # aggregate across DBS releases
        cols = ['r_release_version', 'CRAB_UserHN', 'ExitCode', 'Type', 'TaskType', 'RecordTime']
        xdf = refdf.groupBy(cols)\
                .agg(sum('KEvents').alias('sum_evts'),sum('CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'),col('sum_chr')))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('r_release_version', 'release')
        store.setdefault('release', []).append(xdf)

        # aggregate across DBS eras
        cols = ['acquisition_era_name', 'CRAB_UserHN', 'ExitCode', 'Type', 'TaskType', 'RecordTime']
        xdf = refdf.groupBy(cols)\
                .agg(sum('KEvents').alias('sum_evts'),sum('CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'),col('sum_chr')))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('acquisition_era_name', 'era')
        store.setdefault('era', []).append(xdf)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if  fout:
        year, month, day = split_date(date)
        for col in store.keys():
            out = '%s/%s/%s/%s/%s' % (fout, col, year, month, day)
            print("output: %s" % out)
            odf = unionAll(store[col])
            print("%s rows: %s" % (col, odf.count()))
            print_rows(odf, col, verbose=1)
            odf.write.format("com.databricks.spark.csv")\
                    .option("header", "true").save(out)

    ctx.stop()
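
The rate UDF above can also be expressed with built-in column expressions, which avoids Python UDF overhead; a sketch (null or zero inputs fall through to -1, like the missing-value branch of rate):

from pyspark.sql.functions import col, lit, when

rate_col = when((col('sum_evts') > 0) & (col('sum_chr') > 0),
                col('sum_evts') / col('sum_chr')).otherwise(lit(-1.0))
# e.g. .withColumn('rate', rate_col) instead of func_rate(col('sum_evts'), col('sum_chr'))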
Example #16
def run(fout, date, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """

    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)

    quiet_logs(ctx)

    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read Phedex and DBS tables
    tables = {}

    tables.update(
        phedex_tables(sqlContext,
                      verbose=verbose,
                      fromdate=fromdate,
                      todate=todate))
    phedex = tables['phedex_df']

    instances = [inst]
    if inst == 'all':
        instances = ['GLOBAL', 'PHYS01', 'PHYS02', 'PHYS03']

    for instance in instances:
        dbs_dict = dbs_tables(sqlContext, inst=instance, verbose=verbose)
        for key, val in dbs_dict.items():
            new_key = '%s_%s' % (key, instance)
            tables[new_key] = val

    daf = reduce(lambda a, b: a.unionAll(b),
                 [tables['daf_%s' % x] for x in instances])
    ddf = reduce(lambda a, b: a.unionAll(b),
                 [tables['ddf_%s' % x] for x in instances])
    fdf = reduce(lambda a, b: a.unionAll(b),
                 [tables['fdf_%s' % x] for x in instances])

    dbs_fdf_cols = ['f_dataset_id', 'f_file_size']
    dbs_ddf_cols = ['d_dataset_id', 'd_dataset', 'd_dataset_access_type_id']
    dbs_daf_cols = ['dataset_access_type_id', 'dataset_access_type']

    fdf_df = fdf.select(dbs_fdf_cols)
    ddf_df = ddf.select(dbs_ddf_cols)
    daf_df = daf.select(dbs_daf_cols)

    # Aggregate by campaign and find total PhEDEx and DBS size of each campaign

    # d_dataset_id, d_dataset, dataset_access_type
    dbs_df = ddf_df.join(daf_df, ddf_df.d_dataset_access_type_id == daf_df.dataset_access_type_id)\
                   .drop(ddf_df.d_dataset_access_type_id)\
                   .drop(daf_df.dataset_access_type_id)

    # dataset, dbs_size
    dbs_df = dbs_df.where(dbs_df.dataset_access_type == 'VALID')\
                   .join(fdf_df, dbs_df.d_dataset_id == fdf_df.f_dataset_id)\
                   .withColumnRenamed('d_dataset', 'dataset')\
                   .withColumnRenamed('f_file_size', 'dbs_size')\
                   .drop(dbs_df.d_dataset_id)\
                   .drop(fdf_df.f_dataset_id)\
                   .drop(dbs_df.dataset_access_type)

    # dataset, dbs_size
    dbs_df = dbs_df.groupBy(['dataset'])\
                   .agg({'dbs_size':'sum'})\
                   .withColumnRenamed('sum(dbs_size)', 'dbs_size')

    # dataset, site, phedex_size
    phedex_cols = ['dataset_name', 'block_bytes', 'node_name']
    phedex_df = phedex.select(phedex_cols)
    phedex_df = phedex_df.withColumnRenamed('block_bytes', 'phedex_size')\
                         .withColumnRenamed('dataset_name', 'dataset')\
                         .withColumnRenamed('node_name', 'site')

    # dataset, sites, phedex_size
    phedex_df = phedex_df.groupBy(['dataset'])\
                   .agg({'phedex_size':'sum', 'site': 'collect_set'})\
                   .withColumnRenamed('sum(phedex_size)', 'phedex_size')\
                   .withColumnRenamed('collect_set(site)', 'sites')

    # Subtract to get leftovers

    extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2])

    # dataset
    leftover_datasets_df = phedex_df.select('dataset').subtract(
        dbs_df.select('dataset'))

    # dataset, campaign, sites, phedex_size
    leftovers_df = leftover_datasets_df.select('dataset').join(
        phedex_df, 'dataset')  #.join(dbs_df, 'dataset')
    leftovers_df = leftovers_df.withColumn('campaign', extract_campaign_udf(leftovers_df.dataset))\
                               .select(['dataset', 'campaign', 'sites', 'phedex_size'])

    # Subtract to get leftovers that don't even exist in DBS (orphans)

    ddf_datasets_df = ddf_df.withColumnRenamed('d_dataset',
                                               'dataset').select('dataset')
    leftover_datasets_df = phedex_df.select('dataset').subtract(
        ddf_datasets_df)

    # dataset, campaign, sites, phedex_size
    leftovers_orphans_df = leftover_datasets_df.select('dataset').join(
        phedex_df, 'dataset')  #.join(dbs_df, 'dataset')
    leftovers_orphans_df = leftovers_orphans_df.withColumn('campaign', extract_campaign_udf(leftovers_orphans_df.dataset))\
                                               .select(['dataset', 'campaign', 'sites', 'phedex_size'])

    # Sum total size of leftovers
    all_leftovers_size = leftovers_df.select(
        'phedex_size').groupBy().sum().rdd.map(lambda x: x[0]).collect()[0]
    orphan_leftovers_size = leftovers_orphans_df.select(
        'phedex_size').groupBy().sum().rdd.map(lambda x: x[0]).collect()[0]

    print('All leftovers PhEDEx size: %s' % bytes_to_readable(
        all_leftovers_size))
    print('Orphan leftovers PhEDEx size: %s' % bytes_to_readable(
        orphan_leftovers_size))

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        leftovers_df.write.format("com.databricks.spark.csv")\
                          .option("header", "true").save('%s/all' % fout)

        leftovers_orphans_df.write.format("com.databricks.spark.csv")\
                            .option("header", "true").save('%s/orphans' % fout)

    ctx.stop()
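
bytes_to_readable is provided by the job's utility module; a hypothetical sketch of such a formatter (unit base and precision assumed):

def bytes_to_readable(size):
    "Hypothetical sketch: render a byte count as a human-readable string"
    size = float(size or 0)
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if abs(size) < 1000.0:
            return '%3.1f%s' % (size, unit)
        size /= 1000.0
    return '%3.1f%s' % (size, 'PB')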
Example #17
def run(yarn=None, verbose=None, campaign=None, tier=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """

    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)

    quiet_logs(ctx)

    sqlContext = HiveContext(ctx)

    sqlContext.setConf("spark.sql.files.ignoreCorruptFiles", "true")
    sqlContext.sql("set spark.sql.files.ignoreCorruptFiles=true")

    df = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(header='true', treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load('hdfs:///cms/aggregation/sizes/part-*')

    if campaign != None and tier != None:
        campaign_tier_df = df.where(df.campaign == campaign)\
                             .where(df.tier == tier)

        campaign_tier = [row.asDict() for row in campaign_tier_df.collect()]

        print('Average size: %s' % bytes_to_readable(
            float(campaign_tier[0]['size_average'])))
        print('Average in period of existence: %s' % bytes_to_readable(
            float(campaign_tier[0]['average_size_in_period'])))
        print('Max size: %s' % bytes_to_readable(
            float(campaign_tier[0]['size_max'])))
        print('T1 size: %s' % bytes_to_readable(
            float(campaign_tier[0]['t1_size'])))
        print('T2 size: %s' % bytes_to_readable(
            float(campaign_tier[0]['t2_size'])))
        print('T3 size: %s' % bytes_to_readable(
            float(campaign_tier[0]['t3_size'])))

    date_to_timestamp_udf = udf(lambda date: time.mktime(
        datetime.datetime.strptime(date, "%Y%m%d").timetuple()))

    months = [1, 2, 3, 4, 5, 6, 9, 12]

    for month in months:
        now = (datetime.datetime.now() -
               datetime.datetime(1970, 1, 1)).total_seconds()
        seconds = month * 30 * 24 * 60 * 60
        not_accessed_df = df.withColumn(
            'date_timestamp', date_to_timestamp_udf(df.last_access_date))
        not_accessed_df = not_accessed_df.where(
            now - not_accessed_df.date_timestamp > seconds)
        not_accessed_df = not_accessed_df.withColumn(
            "size_average", not_accessed_df["size_average"].cast(DoubleType()))

        total_size = not_accessed_df.groupBy().sum('size_average').rdd.map(
            lambda x: x[0]).collect()[0] or 0

        print('Size of data not accessed for last %d month(s): %s' % (
            month, bytes_to_readable(total_size)))

    ctx.stop()
Example #18
def run(fout, hdir, date, yarn=None, verbose=None):
    """
    Main function to run pyspark job.
    """
    if  not date:
        raise Exception("Not date is provided")

    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read DBS and Phedex tables
    # construct here hdfs path and pass empty string as a date
    rdd = avro_rdd(ctx, sqlContext, hdfs_path(hdir, date), date='', verbose=verbose)

    def getdata(row):
        """
        Helper function to extract useful data from WMArchive records.
        You may adjust it to your needs. Given row is a dict object.
        """
        meta = row.get('meta_data', {})
        sites = []
        out = {'host': meta.get('host', ''), 'task': row.get('task', '')}
        for step in row['steps']:
            if step['name'].lower().startswith('cmsrun'):
                site = step.get('site', '')
                output = step.get('output', [])
                perf = step.get('performance', {})
                cpu = perf.get('cpu', {})
                mem = perf.get('memory', {})
                storage = perf.get('storage', {})
                out['ncores'] = cpu['NumberOfStreams']
                out['nthreads'] = cpu['NumberOfThreads']
                out['site'] = site
                out['jobCPU'] = cpu['TotalJobCPU']
                out['jobTime'] = cpu['TotalJobTime']
                out['evtCPU'] = cpu['TotalEventCPU']
                out['evtThroughput'] = cpu['EventThroughput']
                if output:
                    output = output[0]
                    out['appName'] = output.get('applicationName', '')
                    out['appVer'] = output.get('applicationVersion', '')  # assumed field name; original read applicationName, likely a copy-paste slip
                    out['globalTag'] = output.get('globalTag', '')
                    out['era'] = output.get('acquisitionEra', '')
                else:
                    out['appName'] = ''
                    out['appVer'] = ''
                    out['globalTag'] = ''
                    out['era'] = ''
                break
        return out

    out = rdd.map(lambda r: getdata(r))
    if  verbose:
        print(out.take(1)) # out here is RDD object

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if  fout:
        # output will be saved as-is; in this case out is an RDD of dict records
        # produced by the getdata helper above, and saveAsTextFile writes their
        # string representation
        out.saveAsTextFile(fout)

    ctx.stop()
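
saveAsTextFile on an RDD of dicts writes Python string representations rather than strict JSON; if genuine JSON lines are needed downstream, each record can be dumped explicitly, as sketched here:

import json

out.map(json.dumps).saveAsTextFile(fout)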
Example #19
def run(fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """

    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)

    quiet_logs(ctx)

    sqlContext = HiveContext(ctx)
    
    sqlContext.setConf("spark.sql.files.ignoreCorruptFiles","true")
    sqlContext.sql("set spark.sql.files.ignoreCorruptFiles=true")

    # date, site, dataset, size, replica_date, groupid
    schema = StructType([
        StructField("date", StringType(), True),
        StructField("site", StringType(), True),
        StructField("dataset", StringType(), True),
        StructField("size", DoubleType(), True),
        StructField("replica_date", StringType(), True),
        StructField("groupid", StringType(), True)
    ])

    df = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load('hdfs:///cms/phedex/*/*/*/part-*', schema=schema)
                        # .load('hdfs:///cms/phedex/2017/03/*/part-00000', schema=schema)

    # Remove all tape sites
    is_tape = lambda site: site.endswith('_MSS') | site.endswith('_Buffer') | site.endswith('_Export')
    df = df.where(is_tape(df.site) == False)

    extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2].split('-')[0])
    extract_tier_udf = udf(lambda dataset: dataset.split('/')[3])
    date_to_timestamp_udf = udf(lambda date: time.mktime(datetime.datetime.strptime(date, "%Y%m%d").timetuple()))
    timestamp_to_date_udf = udf(lambda timestamp: datetime.datetime.fromtimestamp(float(timestamp)).strftime('%Y%m%d'))
    days_delta_udf = udf(lambda t1, t2: (datetime.datetime.fromtimestamp(float(t1)) - datetime.datetime.fromtimestamp(float(t2))).days + 1)
    count_udf = udf(lambda list: len(list))

    df = df.withColumn('campaign', extract_campaign_udf(df.dataset))
    df = df.withColumn('tier', extract_tier_udf(df.dataset))
    df = df.withColumn('date_min', date_to_timestamp_udf(df.date))
    df = df.withColumn('date_max', date_to_timestamp_udf(df.date))
    df = df.withColumn('size_average', df.size)

    df = df.groupBy(['campaign', 'tier'])\
           .agg({'date_min': 'min', 'date_max': 'max', 'date': 'collect_set', 'size_average': 'avg', 'size': 'max'})\
           .withColumnRenamed('min(date_min)', 'date_min')\
           .withColumnRenamed('max(date_max)', 'date_max')\
           .withColumnRenamed('collect_set(date)', 'days_count')\
           .withColumnRenamed('avg(size_average)', 'size_average')\
           .withColumnRenamed('max(size)', 'size_max')

    df = df.withColumn('period_days', days_delta_udf(df.date_max, df.date_min))\
           .withColumn('days_count', count_udf(df.days_count))\
           .withColumn('date_min', timestamp_to_date_udf(df.date_min))\
           .withColumn('date_max', timestamp_to_date_udf(df.date_max))
        
    df = df.withColumn('existence_in_period', df.days_count / df.period_days)
    df = df.withColumn('average_size_in_period', df.size_average * df.existence_in_period)

    # campaign, tier, date_max, date_min, days_count, size_max, size_average, period_days, existence_in_period, average_size_in_period

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        df.write.format("com.databricks.spark.csv")\
                          .option("header", "true").save(fout)
    
    ctx.stop()
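
The count_udf above (length of the collected date set) can be replaced by the built-in size function, which avoids a Python UDF; a sketch, assuming df holds the aggregated frame with the days_count array column:

from pyspark.sql.functions import col, size

df = df.withColumn('days_count', size(col('days_count')))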
Example #20
def run(fout,
        date,
        yarn=None,
        verbose=None,
        patterns=None,
        antipatterns=None,
        inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """

    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)

    quiet_logs(ctx)

    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read Phedex and DBS tables
    tables = {}

    tables.update(
        phedex_tables(sqlContext,
                      verbose=verbose,
                      fromdate=fromdate,
                      todate=todate))
    phedex = tables['phedex_df']

    instances = ['GLOBAL', 'PHYS01', 'PHYS02', 'PHYS03']
    for instance in instances:
        dbs_dict = dbs_tables(sqlContext, inst=instance, verbose=verbose)
        for key, val in dbs_dict.items():
            new_key = '%s_%s' % (key, instance)
            tables[new_key] = val

    daf = reduce(lambda a, b: a.unionAll(b),
                 [tables['daf_%s' % x] for x in instances])
    ddf = reduce(lambda a, b: a.unionAll(b),
                 [tables['ddf_%s' % x] for x in instances])
    fdf = reduce(lambda a, b: a.unionAll(b),
                 [tables['fdf_%s' % x] for x in instances])

    dbs_fdf_cols = ['f_dataset_id', 'f_file_size']
    dbs_ddf_cols = ['d_dataset_id', 'd_dataset', 'd_dataset_access_type_id']
    dbs_daf_cols = ['dataset_access_type_id', 'dataset_access_type']

    fdf_df = fdf.select(dbs_fdf_cols)
    ddf_df = ddf.select(dbs_ddf_cols)
    daf_df = daf.select(dbs_daf_cols)

    # Aggregate by campaign and find total PhEDEx and DBS size of each campaign

    extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2])

    # dataset, size, dataset_access_type_id
    dbs_df = fdf_df.join(ddf_df, fdf_df.f_dataset_id == ddf_df.d_dataset_id)\
                   .drop('f_dataset_id')\
                   .drop('d_dataset_id')\
                   .withColumnRenamed('d_dataset', 'dataset')\
                   .withColumnRenamed('f_file_size', 'size')\
                   .withColumnRenamed('d_dataset_access_type_id', 'dataset_access_type_id')

    # dataset, size, dataset_access_type
    dbs_df = dbs_df.join(daf_df, dbs_df.dataset_access_type_id == daf_df.dataset_access_type_id)\
                   .drop(dbs_df.dataset_access_type_id)\
                   .drop(daf_df.dataset_access_type_id)

    # campaign, dbs_size
    dbs_df = dbs_df.where(dbs_df.dataset_access_type == 'VALID')\
                   .withColumn('campaign', extract_campaign_udf(dbs_df.dataset))\
                   .groupBy(['campaign'])\
                   .agg({'size':'sum'})\
                   .withColumnRenamed('sum(size)', 'dbs_size')\
                   .drop('dataset')

    # campaign, phedex_size
    phedex_cols = ['dataset_name', 'block_bytes']
    phedex_df = phedex.select(phedex_cols)
    phedex_df = phedex_df.withColumn('campaign', extract_campaign_udf(phedex_df.dataset_name))\
                .groupBy(['campaign'])\
                .agg({'block_bytes':'sum'})\
                .withColumnRenamed('sum(block_bytes)', 'phedex_size')

    # campaign, dbs_size, phedex_size
    dbs_phedex_df = dbs_df.join(phedex_df, dbs_df.campaign == phedex_df.campaign)\
                          .drop(dbs_df.campaign)

    print('DISTINCT DBS AND PHEDEX CAMPAIGN COUNT:')
    print(dbs_phedex_df.select('campaign').distinct().count())

    # Select campaign - site pairs and their sizes (from PhEDEx)

    # campaign, site, size
    phedex_cols = ['dataset_name', 'node_name', 'block_bytes']
    campaign_site_df = phedex.select(phedex_cols)
    campaign_site_df = campaign_site_df.withColumn('campaign', extract_campaign_udf(campaign_site_df.dataset_name))\
                .groupBy(['campaign', 'node_name'])\
                .agg({'block_bytes':'sum'})\
                .withColumnRenamed('sum(block_bytes)', 'size')\
                .withColumnRenamed('node_name', 'site')

    # Aggregate data for site - campaign count table

    # site, count
    site_campaign_count_df = campaign_site_df.groupBy(['site'])\
                                             .agg(countDistinct('campaign').alias('campaign_count'))\
                                             .orderBy('campaign_count', ascending=False)\
                                             .limit(LIMIT)

    # Find two most significant sites for each campaign

    columns_before_pivot = campaign_site_df.columns

    result = campaign_site_df.groupBy(['campaign'])\
                             .pivot('site')\
                             .sum('size')\
                             .na.fill(0)

    columns_after_pivot = result.columns
    sites_columns = [
        x for x in columns_after_pivot if x not in columns_before_pivot
    ]

    number_of_sites_udf = udf(lambda row: len([x for x in row if x != 0]),
                              IntegerType())
    mss_udf = udf(get_mss, LongType())
    second_mss_udf = udf(get_second_mss, LongType())
    mss_name_udf = udf(lambda row: get_mss_name(row, sites_columns),
                       StringType())
    second_mss_name_udf = udf(
        lambda row: get_second_mss_name(row, sites_columns), StringType())

    result = result.withColumn('sites', number_of_sites_udf(struct([result[x] for x in sites_columns])))\
                   .withColumn('mss', mss_udf(struct([result[x] for x in sites_columns])))\
                   .withColumn('mss_name', mss_name_udf(struct([result[x] for x in sites_columns])))\
                   .withColumn('second_mss', second_mss_udf(struct([result[x] for x in sites_columns])))\
                   .withColumn('second_mss_name', second_mss_name_udf(struct([result[x] for x in sites_columns])))

    # campaign, phedex_size, dbs_size, mss, mss_name, second_mss, second_mss_name, sites
    result = result.join(dbs_phedex_df, result.campaign == dbs_phedex_df.campaign)\
                   .drop(result.campaign)

    sorted_by_phedex = result.orderBy(result.phedex_size,
                                      ascending=False).limit(LIMIT)
    sorted_by_dbs = result.orderBy(result.dbs_size,
                                   ascending=False).limit(LIMIT)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        sorted_by_phedex.write.format("com.databricks.spark.csv")\
                              .option("header", "true").save('%s/phedex' % fout)

        sorted_by_dbs.write.format("com.databricks.spark.csv")\
                           .option("header", "true").save('%s/dbs' % fout)

        site_campaign_count_df.write.format("com.databricks.spark.csv")\
                              .option("header", "true").save('%s/site_campaign_count' % fout)

    ctx.stop()
Example #21
def run(fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """

    # define spark context, the main object that allows us to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)

    quiet_logs(ctx)

    sqlContext = HiveContext(ctx)
    
    sqlContext.setConf("spark.sql.files.ignoreCorruptFiles","true")
    sqlContext.sql("set spark.sql.files.ignoreCorruptFiles=true")

    # date, site, dataset, size, replica_date, groupid
    schema = StructType([
        StructField("date", StringType(), True),
        StructField("site", StringType(), True),
        StructField("dataset", StringType(), True),
        StructField("size", DoubleType(), True),
        StructField("replica_date", StringType(), True),
        StructField("groupid", StringType(), True)
    ])

    df = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(PHEDEX_HDFS_URL, schema=schema)
                        
    # Remove all tape sites
    is_tape = lambda site: site.endswith('_MSS') | site.endswith('_Buffer') | site.endswith('_Export')
    df = df.where(~is_tape(df.site))

    # Remove all non-VALID datasets
    remove_invalid_datasets(df, sqlContext, verbose)

    # Get accesses data frame
    accesses_df = get_dataset_access_dates(sqlContext)

    # extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2].split('-')[0])
    extract_tier_udf = udf(lambda dataset: dataset.split('/')[3])
    days_delta_udf = udf(lambda t1, t2: (datetime.datetime.fromtimestamp(float(t1)) - datetime.datetime.fromtimestamp(float(t2))).days + 1)
    count_udf = udf(lambda values: len(values))
    get_t1_size = udf(lambda size, site: size if site.startswith('T1') else 0)
    get_t2_size = udf(lambda size, site: size if site.startswith('T2') else 0)
    get_t3_size = udf(lambda size, site: size if site.startswith('T3') else 0)
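    # NOTE: get_extract_campaign_udf, get_date_to_timestamp_udf and get_timestamp_to_date_udf
    # are factory helpers defined elsewhere in this script. A plausible sketch, assuming the
    # 'date' column uses a YYYYMMDD string format and that the time module is imported:
    #
    #     def get_extract_campaign_udf():
    #         # '/Primary/Campaign-Processing/TIER' -> 'Campaign'
    #         return udf(lambda dataset: dataset.split('/')[2].split('-')[0])
    #
    #     def get_date_to_timestamp_udf():
    #         # 'YYYYMMDD' -> unix timestamp in seconds
    #         return udf(lambda d: time.mktime(time.strptime(d, '%Y%m%d')))
    #
    #     def get_timestamp_to_date_udf():
    #         # unix timestamp in seconds -> 'YYYYMMDD'
    #         return udf(lambda ts: datetime.datetime.fromtimestamp(float(ts)).strftime('%Y%m%d'))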

    df = df.withColumn('campaign', get_extract_campaign_udf()(df.dataset))\
           .withColumn('tier', extract_tier_udf(df.dataset))\
           .withColumn('date_min', get_date_to_timestamp_udf()(df.date))\
           .withColumn('date_max', get_date_to_timestamp_udf()(df.date))\
           .withColumn('size_average', df.size)\
           .withColumn('t1_size', get_t1_size(df.size, df.site))\
           .withColumn('t2_size', get_t2_size(df.size, df.site))\
           .withColumn('t3_size', get_t3_size(df.size, df.site))

    df = df.groupBy(['campaign', 'tier'])\
           .agg({'date_min': 'min', 'date_max': 'max', 'date': 'collect_set', 'size_average': 'avg', 'size': 'max', 't1_size': 'avg', 't2_size': 'avg', 't3_size': 'avg'})\
           .withColumnRenamed('min(date_min)', 'date_min')\
           .withColumnRenamed('max(date_max)', 'date_max')\
           .withColumnRenamed('collect_set(date)', 'days_count')\
           .withColumnRenamed('avg(size_average)', 'size_average')\
           .withColumnRenamed('max(size)', 'size_max')\
           .withColumnRenamed('avg(t1_size)', 't1_size')\
           .withColumnRenamed('avg(t2_size)', 't2_size')\
           .withColumnRenamed('avg(t3_size)', 't3_size')

    df = df.withColumn('period_days', days_delta_udf(df.date_max, df.date_min))\
           .withColumn('days_count', count_udf(df.days_count))\
           .withColumn('date_min', get_timestamp_to_date_udf()(df.date_min))\
           .withColumn('date_max', get_timestamp_to_date_udf()(df.date_max))
        
    df = df.withColumn('existence_in_period', df.days_count / df.period_days)
    df = df.withColumn('average_size_in_period', df.size_average * df.existence_in_period)
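    # Worked example (illustrative): a campaign/tier combination seen on 30 distinct days
    # of a 60-day period has existence_in_period = 30 / 60 = 0.5, so with an average size
    # of 10 TB its average_size_in_period is 10 TB * 0.5 = 5 TB.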

    df.show(100, truncate=False)

    # campaign, tier, date_max, date_min, days_count, size_max, size_average, period_days, existence_in_period, average_size_in_period, t1_size, t2_size, t3_size, last_access_date
    df = df.join(accesses_df, 'campaign')
    
    df.show(100, truncate=False)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        df.write.format("com.databricks.spark.csv")\
                          .option("header", "true").save(fout)
    
    ctx.stop()
Example #22
0
def get_spark_session(yarn=True, verbose=False):
    """
        Get or create the spark context and session.
    """
    sc = spark_context("cms-eos-dataset", yarn, verbose)
    return SparkSession.builder.config(conf=sc._conf).getOrCreate()
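
# Illustrative usage sketch (not part of the original example): reuse the helper above to
# obtain a session and read a dataset lazily; the input path below is hypothetical.
def example_read_with_session():
    "Open a session via get_spark_session and count the records of a CSV dataset"
    spark = get_spark_session(yarn=False, verbose=True)
    df = spark.read.csv("hdfs:///path/to/input", header=True, inferSchema=True)
    print("number of records: %s" % df.count())
    spark.stop()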
Example #23
0
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()

    print("Input arguments: %s" % opts)

    start_time = time.time()
    verbose = opts.verbose
    yarn = opts.yarn
    inst = opts.inst
    date = opts.date
    fout = opts.fout
    aaa_hdir = opts.aaa_hdir

    if inst.lower() in ['global', 'phys01', 'phys02', 'phys03']:
        inst = inst.upper()
    else:
        raise Exception('Unsupported DBS instance "%s"' % inst)

    # Create spark context
    ctx = spark_context('cms', yarn, verbose)

    quiet_logs(ctx)

    # Create SQL context to be used for SQL queries
    sql_context = SQLContext(ctx)

    # Initialize DBS tables
    dbs_tables(sql_context,
               inst=inst,
               verbose=verbose,
               tables=['fdf', 'bdf', 'ddf'])

    # Initialize PhEDEx table to be used in file_block_site table
    phedex_tables(sql_context, verbose=verbose)

    # Register clean_site_name to be used with SQL queries
    sql_context.udf.register("clean_site_name", clean_site_name)

    # Register tier_from_site_name to be used with SQL queries
    sql_context.udf.register("tier_from_site_name", tier_from_site_name)

    # Register dn2uuid to be used with SQL queries
    sql_context.udf.register("dn2uuid", dn2uuid)

    # Register parse_app to be used with SQL queries
    sql_context.udf.register("parse_app", parse_app)

    # Register stream4app to be used with SQL queries
    sql_context.udf.register("stream4app", stream4app)

    # Register parse_dn to be used with SQL queries
    sql_context.udf.register("parse_dn", parse_dn)

    f_b_s_start_time = time.time()
    # Create temp table with file name, block name, site name and site from PhEDEx
    create_file_block_site_table(ctx, sql_context, verbose)
    f_b_s_elapsed_time = elapsed_time(f_b_s_start_time)

    cmssw_start_time = time.time()
    aggregated_cmssw_df = run_agg_cmssw(date, ctx, sql_context, verbose)
    cmssw_elapsed_time = elapsed_time(cmssw_start_time)

    aaa_start_time = time.time()
    if len(aaa_hdir) > 0:
        aggregated_aaa_df = run_agg_aaa(date, ctx, sql_context, aaa_hdir,
                                        verbose)
    else:
        aggregated_aaa_df = run_agg_aaa(date,
                                        ctx,
                                        sql_context,
                                        verbose=verbose)

    aaa_elapsed_time = elapsed_time(aaa_start_time)

    eos_start_time = time.time()
    aggregated_eos_df = run_agg_eos(date, ctx, sql_context, verbose)
    eos_elapsed_time = elapsed_time(eos_start_time)

    jm_start_time = time.time()
    aggregated_jm_df = run_agg_jm(date, ctx, sql_context, verbose)
    jm_elapsed_time = elapsed_time(jm_start_time)

    if verbose:
        print('Will union outputs from all streams to a single dataframe')
    # Schema for output is:
    # site name, dataset name, number of accesses, distinct users, stream
    all_df = aggregated_cmssw_df.unionAll(aggregated_aaa_df)
    all_df = all_df.unionAll(aggregated_eos_df)
    all_df = all_df.unionAll(aggregated_jm_df)
    all_df = all_df.sort(desc("nacc"))
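    # Note: unionAll concatenates by column position, so all four aggregated dataframes
    # are expected to share the same column order as listed above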

    if verbose:
        print('Done joining all outputs to a single dataframe')

    fout = fout + "/" + short_date_string(date)

    # output_dataframe(fout + "/Aggregated/CMSSW/" + short_date_string(date), aggregated_cmssw_df, verbose)
    # output_dataframe(fout + "/Aggregated/AAA/" + short_date_string(date), aggregated_aaa_df, verbose)
    # output_dataframe(fout + "/Aggregated/EOS/" + short_date_string(date), aggregated_eos_df, verbose)
    # output_dataframe(fout + "/Aggregated/JobMonitoring/" + short_date_string(date), aggregated_jm_df, verbose)

    output_dataframe(fout, all_df, verbose)

    if verbose:
        cmssw_df_size = aggregated_cmssw_df.count()
        aaa_df_size = aggregated_aaa_df.count()
        eos_df_size = aggregated_eos_df.count()
        jm_df_size = aggregated_jm_df.count()
        all_df_size = all_df.count()

        print('CMSSW:')
        aggregated_cmssw_df.show(10)
        aggregated_cmssw_df.printSchema()

        print('AAA:')
        aggregated_aaa_df.show(10)
        aggregated_aaa_df.printSchema()

        print('EOS:')
        aggregated_eos_df.show(10)
        aggregated_eos_df.printSchema()

        print('JobMonitoring:')
        aggregated_jm_df.show(10)
        aggregated_jm_df.printSchema()

        print('Aggregated all:')
        all_df.show(10)
        all_df.printSchema()

        print('Output record count:')
        print('Output record count CMSSW         : %s' % cmssw_df_size)
        print('Output record count AAA           : %s' % aaa_df_size)
        print('Output record count EOS           : %s' % eos_df_size)
        print('Output record count JobMonitoring : %s' % jm_df_size)
        print('Output record count Total         : %s' % all_df_size)

    ctx.stop()

    print('Start time         : %s' %
          time.strftime('%Y-%m-%d %H:%M:%S GMT', time.gmtime(start_time)))
    print('End time           : %s' %
          time.strftime('%Y-%m-%d %H:%M:%S GMT', time.gmtime(time.time())))
    print('Total elapsed time : %s' % elapsed_time(start_time))

    print('FileBlockSite elapsed time : %s' % f_b_s_elapsed_time)
    print('AAA elapsed time           : %s' % aaa_elapsed_time)
    print('CMSSW elapsed time         : %s' % cmssw_elapsed_time)
    print('EOS elapsed time           : %s' % eos_elapsed_time)
    print('JobMonitoring elapsed time : %s' % jm_elapsed_time)
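
# The elapsed_time helper used above is defined elsewhere in this script. A minimal sketch
# of a compatible implementation, shown for illustration only (assumes the time module is
# imported):
def elapsed_time(start):
    "Return a human-readable hh:mm:ss string for the wall-clock time since `start`"
    secs = int(time.time() - start)
    return '%02d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)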
Example #24
0
def run(date, fout, yarn=None, verbose=None):
    """
    Main function to run the pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """
    # define spark context, it is the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read ASO and FTS tables
    date = aso_date(date)
    tables = {}
    tables.update(fts_tables(sqlContext, date=date, verbose=verbose))
    fts_df = tables['fts_df']  # fts table
    print_rows(fts_df, 'fts_df', verbose)
    tables.update(aso_tables(sqlContext, verbose=verbose))
    aso_df = tables['aso_df']  # aso table
    print_rows(aso_df, 'aso_df', verbose)

    fts = fts_df.select([
        'job_metadata.issuer', 'job_id', 'src_url', 't_final_transfer_state',
        'tr_timestamp_start', 'tr_timestamp_complete'
    ])

    fts = fts.filter("issuer = 'ASO'")
    fts_udf = udf(lambda x: x.split("/")[-1], StringType())
    fts = fts.withColumn("filename", fts_udf(fts.src_url))
    fts = fts.withColumn("fts_duration",
                         (fts.tr_timestamp_complete - fts.tr_timestamp_start) *
                         1. / 1000)
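    # tr_timestamp_start/tr_timestamp_complete appear to be in milliseconds, hence the
    # scaling by 1/1000 to obtain a duration in seconds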

    aso = aso_df.select([
        'tm_source_lfn', 'tm_fts_id', 'tm_jobid', 'tm_type', 'tm_last_update',
        'tm_start_time', 'tm_transfer_state', 'tm_source', 'tm_destination',
        'tm_transfer_retry_count', 'tm_publish'
    ]).withColumnRenamed('tm_fts_id', 'job_id')

    aso_udf = udf(lambda x: x.split("/")[-1], StringType())
    aso = aso.withColumn("filename", aso_udf(aso.tm_source_lfn))
    aso = aso.filter((col("tm_transfer_state") == 3)
                     | (col("tm_transfer_state") == 2))
    aso = aso.filter((aso.tm_transfer_state.isNotNull()))
    aso = aso.filter((aso.job_id.isNotNull()))

    new_df = fts.join(aso, on=['filename', 'job_id'], how='left_outer')

    new_df = new_df.groupby([
        'job_id', 'tm_transfer_state', 'tm_publish', 'tm_transfer_retry_count'
    ]).agg(
        count(lit(1)).alias('Num Of Records'),
        mean(aso.tm_last_update - aso.tm_start_time).alias('Aso duration'),
        mean(new_df.tr_timestamp_start * 1. / 1000 -
             new_df.tm_start_time).alias('Aso delay start'),
        mean(new_df.tm_last_update -
             new_df.tr_timestamp_complete * 1. / 1000).alias('Aso delay'),
        mean(new_df.fts_duration).alias('fts duration'),
    )
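    # In the aggregation above, 'Aso delay start' is the mean gap between the ASO task start
    # and the FTS transfer start, and 'Aso delay' the mean gap between the FTS transfer
    # completion and the last ASO update (FTS timestamps converted from ms to seconds)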

    # keep table around
    new_df.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        new_df.write.format("com.databricks.spark.csv")\
                    .option("header", "true").save(fout)

    ctx.stop()
Example #25
0
def run(fout,
        date,
        yarn=None,
        verbose=None,
        patterns=None,
        antipatterns=None,
        inst='GLOBAL'):
    """
    Main function to run the pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """

    # define spark context, it is the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)

    quiet_logs(ctx)

    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(
        phedex_tables(sqlContext,
                      verbose=verbose,
                      fromdate=fromdate,
                      todate=todate))
    phedex = tables['phedex_df']

    daf = tables['daf']
    ddf = tables['ddf']
    fdf = tables['fdf']

    # DBS
    dbs_fdf_cols = ['f_dataset_id', 'f_file_size']
    dbs_ddf_cols = ['d_dataset_id', 'd_dataset', 'd_dataset_access_type_id']
    dbs_daf_cols = ['dataset_access_type_id', 'dataset_access_type']

    fdf_df = fdf.select(dbs_fdf_cols)
    ddf_df = ddf.select(dbs_ddf_cols)
    daf_df = daf.select(dbs_daf_cols)

    # dataset, dbs_size, dataset_access_type_id
    dbs_df = fdf_df.join(ddf_df, fdf_df.f_dataset_id == ddf_df.d_dataset_id)\
                   .drop('f_dataset_id')\
                   .drop('d_dataset_id')\
                   .withColumnRenamed('d_dataset', 'dataset')\
                   .withColumnRenamed('f_file_size', 'size')\
                   .withColumnRenamed('d_dataset_access_type_id', 'dataset_access_type_id')

    # dataset, size, dataset_access_type
    dbs_df = dbs_df.join(daf_df, dbs_df.dataset_access_type_id == daf_df.dataset_access_type_id)\
                   .drop(dbs_df.dataset_access_type_id)\
                   .drop(daf_df.dataset_access_type_id)

    # dataset, dbs_size
    dbs_df = dbs_df.where(dbs_df.dataset_access_type == 'VALID')\
                   .groupBy('dataset')\
                   .agg({'size':'sum'})\
                   .withColumnRenamed('sum(size)', 'dbs_size')

    # PhEDEx

    size_on_disk_udf = udf(lambda site, size: 0 if site.endswith(
        ('_MSS', '_Buffer', '_Export')) else size)
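    # tape endpoints (sites ending in _MSS, _Buffer or _Export) contribute 0 here, so the
    # sum over sites below yields the disk-resident size only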

    # dataset, size, site
    phedex_cols = ['dataset_name', 'block_bytes', 'node_name']
    phedex_df = phedex.select(phedex_cols)\
                      .withColumnRenamed('dataset_name', 'dataset')\
                      .withColumnRenamed('block_bytes', 'size')\
                      .withColumnRenamed('node_name', 'site')

    # dataset, phedex_size, size_on_disk
    phedex_df = phedex_df.withColumn('size_on_disk', size_on_disk_udf(phedex_df.site, phedex_df.size))\
                         .groupBy('dataset')\
                         .agg({'size':'sum', 'size_on_disk': 'sum'})\
                         .withColumnRenamed('sum(size)', 'phedex_size')\
                         .withColumnRenamed('sum(size_on_disk)', 'size_on_disk')

    # dataset, dbs_size, phedex_size, size_on_disk
    result = phedex_df.join(dbs_df, phedex_df.dataset == dbs_df.dataset)\
                      .drop(dbs_df.dataset)

    extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2])
    extract_tier_udf = udf(lambda dataset: dataset.split('/')[3])
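    # e.g. for '/SingleMu/CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000/DQM' the
    # campaign is 'CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000' and the tier is 'DQM'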

    # campaign, tier, dbs_size, phedex_size, size_on_disk
    result = result.withColumn('campaign', extract_campaign_udf(result.dataset))\
                   .withColumn('tier', extract_tier_udf(result.dataset))\
                   .drop('dataset')\
                   .groupBy(['campaign', 'tier'])\
                   .agg({'dbs_size':'sum', 'phedex_size': 'sum', 'size_on_disk': 'sum'})\
                   .withColumnRenamed('sum(dbs_size)', 'dbs_size')\
                   .withColumnRenamed('sum(phedex_size)', 'phedex_size')\
                   .withColumnRenamed('sum(size_on_disk)', 'size_on_disk')

    # campaign, tier, dbs_size, phedex_size, size_on_disk
    result = result.withColumn('sum_size',
                               result.dbs_size + result.phedex_size)
    result = result.orderBy(result.sum_size, ascending=False)\
                   .drop('sum_size')\
                   .limit(LIMIT)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        result.write.format("com.databricks.spark.csv")\
                    .option("header", "true").save(fout)

    ctx.stop()
Example #26
0
def run(fout,
        yarn=None,
        verbose=None,
        patterns=None,
        antipatterns=None,
        inst='GLOBAL'):
    """
    Main function to run the pyspark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """
    # define spark context, it is the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(phedex_tables(sqlContext, verbose=verbose))
    phedex_df = tables['phedex_df']
    daf = tables['daf']
    ddf = tables['ddf']
    bdf = tables['bdf']
    fdf = tables['fdf']
    aef = tables['aef']
    pef = tables['pef']
    mcf = tables['mcf']
    ocf = tables['ocf']
    rvf = tables['rvf']

    print("### ddf from main", ddf)

    # aggregate phedex info into dataframe
    phedex_cols = [
        'node_name', 'dataset_name', 'dataset_is_open', 'block_bytes',
        'replica_time_create'
    ]
    newpdf = phedex_df.select(phedex_cols).groupBy(['node_name', 'dataset_name', 'dataset_is_open'])\
            .agg({'block_bytes':'sum', 'replica_time_create':'max'})\
            .withColumnRenamed('sum(block_bytes)', 'pbr_size')\
            .withColumnRenamed('max(replica_time_create)', 'max_replica_time')
    newpdf.registerTempTable('newpdf')
    #    print_rows(newpdf, 'newpdf', verbose)
    #    newpdf.persist(StorageLevel.MEMORY_AND_DISK)

    # join tables
    cols = ['*']  # to select all fields from table
    cols = [
        'd_dataset_id', 'd_dataset', 'd_creation_date', 'd_is_dataset_valid',
        'f_event_count', 'f_file_size', 'dataset_access_type',
        'acquisition_era_name', 'processing_version'
    ]

    # join tables
    stmt = 'SELECT %s FROM ddf JOIN fdf on ddf.d_dataset_id = fdf.f_dataset_id JOIN daf ON ddf.d_dataset_access_type_id = daf.dataset_access_type_id JOIN aef ON ddf.d_acquisition_era_id = aef.acquisition_era_id JOIN pef ON ddf.d_processing_era_id = pef.processing_era_id' % ','.join(
        cols)
    print(stmt)
    joins = sqlContext.sql(stmt)
    #    print_rows(joins, 'joins', verbose)

    # keep joins table around
    #    joins.persist(StorageLevel.MEMORY_AND_DISK)

    # construct conditions
    cond = 'dataset_access_type = "VALID" AND d_is_dataset_valid = 1'
    fjoin = joins.where(cond).distinct().select(cols)

    # at this step we have fjoin table with Row(d_dataset_id=9413359, d_dataset=u'/SingleMu/CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000/DQM', d_creation_date=1406060166.0, d_is_dataset_valid=1, f_event_count=5318, f_file_size=21132638.0, dataset_access_type=u'DELETED', acquisition_era_name=u'CMSSW_7_1_0_pre9', processing_version=u'6000'))

    newdf = fjoin\
            .groupBy(['d_dataset','d_dataset_id','dataset_access_type','acquisition_era_name','processing_version'])\
            .agg({'f_event_count':'sum', 'f_file_size':'sum', 'd_creation_date':'max'})\
            .withColumnRenamed('sum(f_event_count)', 'evts')\
            .withColumnRenamed('sum(f_file_size)', 'size')\
            .withColumnRenamed('max(d_creation_date)', 'date')

    # at this point we have the newdf dataframe with our collected stats for every dataset
    # let's join it with release info
    newdf.registerTempTable('newdf')
    cols = [
        'd_dataset_id', 'd_dataset', 'evts', 'size', 'date',
        'dataset_access_type', 'acquisition_era_name', 'processing_version',
        'r_release_version'
    ]
    stmt = 'SELECT %s FROM newdf JOIN mcf ON newdf.d_dataset_id = mcf.mc_dataset_id JOIN ocf ON mcf.mc_output_mod_config_id = ocf.oc_output_mod_config_id JOIN rvf ON ocf.oc_release_version_id = rvf.r_release_version_id' % ','.join(
        cols)
    agg_dbs_df = sqlContext.sql(stmt)
    agg_dbs_df.registerTempTable('agg_dbs_df')
    #    print_rows(agg_dbs_df, 'agg_dbs_df', verbose)

    # keep agg_dbs_df table around
    #    agg_dbs_df.persist(StorageLevel.MEMORY_AND_DISK)

    # join dbs and phedex tables
    #    cols = ['d_dataset_id','d_dataset','evts','size','date','dataset_access_type','acquisition_era_name','processing_version','r_release_version','dataset_name','node_name','pbr_size','dataset_is_open','max_replica_time']
    cols = [
        'd_dataset', 'evts', 'size', 'date', 'dataset_access_type',
        'acquisition_era_name', 'r_release_version', 'node_name', 'pbr_size',
        'dataset_is_open', 'max_replica_time'
    ]
    stmt = 'SELECT %s FROM agg_dbs_df JOIN newpdf ON agg_dbs_df.d_dataset = newpdf.dataset_name' % ','.join(
        cols)
    finaldf = sqlContext.sql(stmt)

    # keep finaldf table around
    #    finaldf.persist(StorageLevel.MEMORY_AND_DISK)
    #    print_rows(finaldf, stmt, verbose)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        ndf = split_dataset(finaldf, 'd_dataset')
        ndf.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
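
# The split_dataset helper used above is defined elsewhere in this script. A plausible
# minimal sketch, shown for illustration only (assumption): split the
# '/primary/processed/tier' dataset path into separate columns before the CSV is written.
from pyspark.sql.functions import col, split

def split_dataset(df, dcol):
    "Split the dataset column '/primary/processed/tier' into separate columns"
    return df.withColumn('primds', split(col(dcol), '/').getItem(1))\
             .withColumn('procds', split(col(dcol), '/').getItem(2))\
             .withColumn('tier', split(col(dcol), '/').getItem(3))\
             .drop(dcol)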