Example #1
def run(path, amq, stomp, yarn=None, aggregation_schema=False, verbose=False):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    # define spark context, it's main object which allow to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    if stomp and os.path.isfile(stomp):
        ctx.addPyFile(stomp)
    else:
        raise Exception('No stomp module egg is provided')
    if amq and os.path.isfile(amq):
        if amq.split('/')[-1] == 'amq_broker.json':
            ctx.addFile(amq)
        else:
            raise Exception(
                'Wrong AMQ broker file name, please name it as amq_broker.json'
            )
    else:
        raise Exception('No AMQ credential file is provided')
    sqlContext = HiveContext(ctx)

    hpath = "hadoop fs -ls %s | awk '{print $8}'" % path
    if verbose:
        print("### Read files: %s" % hpath)
    pipe = Popen(hpath,
                 shell=True,
                 stdin=PIPE,
                 stdout=PIPE,
                 stderr=PIPE,
                 close_fds=True)
    pipe.wait()
    pfiles = [
        f for f in pipe.stdout.read().split('\n') if f.find('part-') != -1
    ]
    df = []

    if aggregation_schema:
        df = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                    .options(treatEmptyValuesAsNulls='true', nullValue='null', header='true') \
                    .load(fname, schema=aggregated_data_schema()) for fname in pfiles])
    else:
        df = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                    .options(treatEmptyValuesAsNulls='true', nullValue='null', header='true') \
                    .load(fname) for fname in pfiles])

    # Register temporary tables to be able to use sqlContext.sql
    df.registerTempTable('df')
    print_rows(df, "DataFrame", verbose)

    print('Schema:')
    df.printSchema()

    # for testing uncomment line below
    # df.toJSON().foreachPartition(print_data)
    # send data to CERN MONIT via stomp AMQ, see send2monit function
    df.toJSON().foreachPartition(send2monit)

    ctx.stop()
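
A note on the helpers (hedged): the snippet relies on a unionAll function that is not shown here. A minimal sketch of such a helper, assuming every per-file DataFrame shares the same schema, is:

from functools import reduce

def unionAll(dfs):
    "Union a list of DataFrames with identical schemas into a single DataFrame."
    # DataFrame.unionAll matches the Spark 1.x API used above; on Spark 2.x the
    # same fold can be written with DataFrame.union
    return reduce(lambda left, right: left.unionAll(right), dfs)
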
Example #2
def run(fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    # define spark context, it's main object which allow to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read Phedex tables
    tables = {}
    hdir = 'hdfs:///cms/users/vk/phedex'
    tables.update(phedex_summary_tables(sqlContext, hdir=hdir,
                                        verbose=verbose))
    phedex_summary_df = tables['phedex_summary_df']
    phedex_summary_df.persist(StorageLevel.MEMORY_AND_DISK)
    print("### schema", phedex_summary_df.printSchema())
    phedex_summary_df.show(5)

    # register user defined function
    days = udf(days_present, IntegerType())

    # aggregate phedex info into dataframe
    #    pdf = phedex_summary_df\
    #            .groupBy(['site', 'dataset'])\
    #            .agg(
    #                    F.min(col('date')).alias('min_date'),
    #                    F.max(col('date')).alias('max_date'),
    #                    F.min(col('replica_date')).alias('min_rdate'),
    #                    F.max(col('replica_date')).alias('max_rdate'),
    #                    F.max(col('size')).alias('max_size'),
    #                    F.min(col('size')).alias('min_size')
    #                )\
    #            .withColumn('days', days(col('min_date'), col('max_date'), col('min_rdate'), col('max_rdate')))
    pdf = phedex_summary_df\
            .groupBy(['site', 'dataset', 'size'])\
            .agg(
                    F.min(col('date')).alias('min_date'),
                    F.max(col('date')).alias('max_date'),
                    F.min(col('replica_date')).alias('min_rdate'),
                    F.max(col('replica_date')).alias('max_rdate')
                )\
            .withColumn('days', days(col('min_date'), col('max_date'), col('min_rdate'), col('max_rdate')))
    pdf.persist(StorageLevel.MEMORY_AND_DISK)
    pdf.show(5)
    print_rows(pdf, 'pdf', verbose=1)

    # write out the results back to HDFS; the fout parameter defines the area on HDFS,
    # either an absolute path or an area under /user/USERNAME
    if fout:
        pdf.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
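
The days user defined function wraps a days_present helper that is not shown. A hedged sketch of one plausible implementation, assuming the date and replica_date columns hold unix timestamps in seconds (the real helper may use a different convention):

def days_present(min_date, max_date, min_rdate, max_rdate):
    "Number of days between the earliest and latest known timestamps, or -1 if any is missing."
    if not (min_date and max_date and min_rdate and max_rdate):
        return -1
    start = min(min_date, min_rdate)
    end = max(max_date, max_rdate)
    return int((end - start) / (24 * 60 * 60))
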
Example #3
def run(fout,
        yarn=None,
        verbose=None,
        patterns=None,
        antipatterns=None,
        inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    # define spark context, it's main object which allow to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(phedex_tables(sqlContext, verbose=verbose))
    phedex_df = tables['phedex_df']
    ddf = tables['ddf']
    fdf = tables['fdf']

    print("### ddf from main", ddf)

    # columns to select; use ['*'] to select all fields from the tables
    cols = ['d_dataset_id', 'd_dataset', 'f_logical_file_name', 'f_adler32']

    # join tables
    stmt = 'SELECT %s FROM ddf JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # construct conditions
    adler = [
        'ad8f6ad2', '9c441343', 'f68d5dca', '81c90e2a', '471d2524', 'a3c1f077',
        '6f0018a0', '8bb03b60', 'd504882c', '5ede357f', 'b05303c3', '716d1776',
        '7e9cf258', '1945804b', 'ec7bc1d7', '12c87747', '94f2aa32'
    ]
    cond = 'f_adler32 in %s' % adler
    cond = cond.replace('[', '(').replace(']', ')')
    #    scols = ['f_logical_file_name']
    fjoin = joins.where(cond).distinct().select(cols)

    print_rows(fjoin, stmt, verbose)

    # write out the results back to HDFS; the fout parameter defines the area on HDFS,
    # either an absolute path or an area under /user/USERNAME
    if fout:
        fjoin.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
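
The adler filter above is built by string manipulation; PySpark can express the same condition directly with Column.isin, which avoids the bracket-to-parenthesis replacement (sketch, reusing the joins, adler and cols objects from the example):

from pyspark.sql.functions import col

fjoin = joins.where(col('f_adler32').isin(adler)).distinct().select(cols)
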
Example #4
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    # define spark context, it's main object which allow to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    ddf = tables['ddf']  # dataset table
    fdf = tables['fdf']  # file table

    if verbose:
        for row in ddf.head(1):
            print("### ddf row", row)

    # read CMSSW avro rdd
    date = eos_date(date)
    tables.update(eos_tables(sqlContext, date=date, verbose=verbose))
    eos_df = tables['eos_df']  # EOS table

    if verbose:
        for row in eos_df.head(1):
            print("### eos_df row", row)

    # merge DBS and CMSSW data
    cols = ['d_dataset', 'd_dataset_id', 'f_logical_file_name', 'file_lfn']
    stmt = 'SELECT %s FROM ddf JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id JOIN eos_df ON fdf.f_logical_file_name = eos_df.file_lfn' % ','.join(
        cols)
    joins = sqlContext.sql(stmt)
    print_rows(joins, stmt, verbose)

    # perform aggregation
    fjoin = joins.groupBy(['d_dataset'])\
            .agg({'file_lfn':'count'})\
            .withColumnRenamed('count(file_lfn)', 'count')\
            .withColumnRenamed('d_dataset', 'dataset')\
            .withColumn('date', lit(eos_date_unix(date)))\
            .withColumn('count_type', lit('eos'))

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out the results back to HDFS; the fout parameter defines the area on HDFS,
    # either an absolute path or an area under /user/USERNAME
    if fout:
        ndf = split_dataset(fjoin, 'dataset')
        ndf.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
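
The split_dataset helper used before writing the output is defined elsewhere. A hedged sketch of what such a helper could look like, assuming CMS dataset names of the form /primary/processed/tier (the hypothetical name split_dataset_col marks this as an illustration, not the original implementation):

from pyspark.sql.functions import col, split

def split_dataset_col(df, dcol):
    "Split a /primary/processed/tier dataset column into three separate columns."
    ndf = df.withColumn('primds', split(col(dcol), '/').getItem(1))\
            .withColumn('procds', split(col(dcol), '/').getItem(2))\
            .withColumn('tier', split(col(dcol), '/').getItem(3))
    return ndf.drop(dcol)
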
Example #5
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    # define spark context, it's main object which allow to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and JobMonitoring tables
    tables = {}

    # read JobMonitoring avro rdd
    date = jm_date(date)
    jm_df = jm_tables(ctx, sqlContext, date=date, verbose=verbose)

    # DBS tables
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    ddf = tables['ddf']  # dataset table
    fdf = tables['fdf']  # file table

    # merge DBS and JobMonitoring data
    cols = [
        'd_dataset', 'd_dataset_id', 'f_logical_file_name', 'FileName',
        'FileType', 'Type', 'SiteName', 'WrapWC', 'WrapCPU', 'JobExecExitCode'
    ]
    stmt = 'SELECT %s FROM ddf JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id JOIN jm_df ON fdf.f_logical_file_name = jm_df.FileName' % ','.join(
        cols)
    joins = sqlContext.sql(stmt)
    print_rows(joins, stmt, verbose)

    # perform aggregation
    fjoin = joins.groupBy(['SiteName','JobExecExitCode','FileType','Type','d_dataset'])\
            .agg({'WrapWC':'sum','WrapCPU':'sum','JobExecExitCode':'count','FileType':'count','Type':'count'})\
            .withColumnRenamed('sum(WrapWC)', 'tot_wc')\
            .withColumnRenamed('sum(WrapCPU)', 'tot_cpu')\
            .withColumnRenamed('count(JobExecExitCode)', 'ecode_count')\
            .withColumnRenamed('count(FileType)', 'file_type_count')\
            .withColumnRenamed('count(Type)', 'type_count')\
            .withColumnRenamed('d_dataset', 'dataset')\
            .withColumn('date', lit(jm_date_unix(date)))\
            .withColumn('count_type', lit('jm'))

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out the results back to HDFS; the fout parameter defines the area on HDFS,
    # either an absolute path or an area under /user/USERNAME
    if fout:
        ndf = split_dataset(fjoin, 'dataset')
        ndf.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
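
The chain of withColumnRenamed calls above depends on the auto-generated 'sum(...)'/'count(...)' column names. An equivalent formulation of the same aggregation with explicit aliases (sketch, reusing joins, date and jm_date_unix from the example):

import pyspark.sql.functions as F

fjoin = joins.groupBy(['SiteName', 'JobExecExitCode', 'FileType', 'Type', 'd_dataset'])\
        .agg(F.sum('WrapWC').alias('tot_wc'),
             F.sum('WrapCPU').alias('tot_cpu'),
             F.count('JobExecExitCode').alias('ecode_count'),
             F.count('FileType').alias('file_type_count'),
             F.count('Type').alias('type_count'))\
        .withColumnRenamed('d_dataset', 'dataset')\
        .withColumn('date', F.lit(jm_date_unix(date)))\
        .withColumn('count_type', F.lit('jm'))
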
Example #6
def run(fout,
        yarn=None,
        verbose=None,
        patterns=None,
        antipatterns=None,
        inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    # define spark context, it's main object which allow to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    bdf = tables['bdf']
    fdf = tables['fdf']
    flf = tables['flf']

    # columns to select; use ['*'] to select all fields from the tables
    cols = ['b_block_id', 'b_block_name', 'f_block_id', 'f_logical_file_name']

    # join tables
    stmt = 'SELECT %s FROM bdf JOIN fdf ON bdf.b_block_id = fdf.f_block_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # keep table around
    joins.persist(StorageLevel.MEMORY_AND_DISK)

    # construct conditions
    cols = ['b_block_name', 'f_logical_file_name']
    pat = '%00047DB7-9F77-E011-ADC8-00215E21D9A8.root'
    #    pat = '%02ACAA1A-9F32-E111-BB31-0002C90B743A.root'
    fjoin = joins.select(cols).where(col('f_logical_file_name').like(pat))

    print_rows(fjoin, stmt, verbose)

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out the results back to HDFS; the fout parameter defines the area on HDFS,
    # either an absolute path or an area under /user/USERNAME
    if fout:
        fjoin.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
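
The com.databricks.spark.csv format targets Spark 1.x, where CSV support was an external package. If these jobs run on Spark 2.x or later, the built-in csv source does the same job (sketch):

if fout:
    fjoin.write.format("csv").option("header", "true").save(fout)
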
Example #7
def create_file_block_site_table(ctx, sql_context, verbose=False):
    """
    Joins fdf, bdf, ddf and PhEDEx tables and produces table with file name, block name, dataset name and site name.
    Site name is obtained from PhEDEx. Before site name is used, it is cleaned with clean_site_name function.
    After join is complete, only unique records are left in the table by using DISTINCT function.
    """
    print('Starting file_block_site generation')

    cols = [
        'f_logical_file_name AS file_name', 'b_block_name AS block_name',
        'clean_site_name(node_name) AS site_name', 'd_dataset AS dataset_name'
    ]

    # Join FDF and BDF by f_block_id and b_block_id
    query = ("SELECT %s FROM fdf " \
             "JOIN bdf ON fdf.f_block_id = bdf.b_block_id "\
             "JOIN ddf ON fdf.f_dataset_id = ddf.d_dataset_id "\
             "JOIN phedex_df ON bdf.b_block_name = phedex_df.block_name") % ','.join(cols)

    if verbose:
        print('Will run query to generate temp file_block_site table')

    result = run_query(query, sql_context, verbose)
    result.registerTempTable('f_b_all_df')

    query_distinct = ("SELECT DISTINCT * FROM f_b_all_df ORDER  BY file_name")
    result_distinct = run_query(query_distinct, sql_context, verbose)
    result_distinct.registerTempTable('f_b_s_df')

    if verbose:
        print('Temp table from joined DDF, FDF, BDF and PhEDEx')
        print('After DISTINCT query count changed %s -> %s' %
              (result.count(), result_distinct.count()))
        result_distinct.show(20)
        print_rows(result_distinct, query_distinct, verbose, 5)
        result_distinct.printSchema()

    print('Finished file_block_site generation')
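
The query above calls clean_site_name(node_name) inside SQL, so that function must be registered as a UDF on the same sql_context beforehand. A hedged sketch of such a registration; the suffix-stripping rule is an assumption, the real cleaning logic lives elsewhere in the original module:

from pyspark.sql.types import StringType

def clean_site_name(name):
    "Strip a trailing _Disk/_Buffer/_MSS suffix from a PhEDEx node name (assumed rule)."
    for suffix in ('_Disk', '_Buffer', '_MSS'):
        if name and name.endswith(suffix):
            return name[:-len(suffix)]
    return name

sql_context.registerFunction('clean_site_name', clean_site_name, StringType())
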
Example #8
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    # define spark context, it's main object which allow to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    dtables = ['daf', 'ddf', 'bdf', 'fdf', 'aef', 'pef', 'mcf', 'ocf', 'rvf']
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose, tables=dtables))
#    tables.update(phedex_tables(sqlContext, verbose=verbose))
#    phedex_df = tables['phedex_df']
    daf = tables['daf'] # dataset access table
    ddf = tables['ddf'] # dataset table
    bdf = tables['bdf'] # block table
    fdf = tables['fdf'] # file table
    aef = tables['aef'] # acquisition era
    pef = tables['pef'] # processing era table
    mcf = tables['mcf'] # output mod config table
    ocf = tables['ocf'] # output module table
    rvf = tables['rvf'] # release version table

    # read Condor rdd
#    tables.update(condor_tables(sqlContext, hdir='hdfs:///cms/users/vk/condor', date=condor_date(date), verbose=verbose))
    tables.update(condor_tables(sqlContext, date=condor_date(date), verbose=verbose))
    condor_df = tables['condor_df'] # aaa table

    # aggregate dbs info into dataframe
    cols = ['d_dataset_id', 'd_dataset','d_creation_date','d_is_dataset_valid','f_event_count','f_file_size','dataset_access_type','acquisition_era_name','processing_version']
    stmt = 'SELECT %s FROM ddf JOIN fdf on ddf.d_dataset_id = fdf.f_dataset_id JOIN daf ON ddf.d_dataset_access_type_id = daf.dataset_access_type_id JOIN aef ON ddf.d_acquisition_era_id = aef.acquisition_era_id JOIN pef ON ddf.d_processing_era_id = pef.processing_era_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # construct conditions
    cond = 'dataset_access_type = "VALID" AND d_is_dataset_valid = 1'
    fjoin = joins.where(cond).distinct().select(cols)

    # at this step the fjoin table contains rows like:
    # Row(d_dataset_id=9413359, d_dataset=u'/SingleMu/CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000/DQM',
    #     d_creation_date=1406060166.0, d_is_dataset_valid=1, f_event_count=5318, f_file_size=21132638.0,
    #     dataset_access_type=u'DELETED', acquisition_era_name=u'CMSSW_7_1_0_pre9', processing_version=u'6000')

    newdf = fjoin\
            .groupBy(['d_dataset','d_dataset_id','dataset_access_type','acquisition_era_name','processing_version'])\
            .agg({'f_event_count':'sum', 'f_file_size':'sum', 'd_creation_date':'max'})\
            .withColumnRenamed('sum(f_event_count)', 'evts')\
            .withColumnRenamed('sum(f_file_size)', 'size')\
            .withColumnRenamed('max(d_creation_date)', 'date')

    # at this point the newdf dataframe holds our collected stats for every dataset;
    # let's join it with the release info
    newdf.registerTempTable('newdf')
    cols = ['d_dataset_id','d_dataset','evts','size','date','dataset_access_type','acquisition_era_name','processing_version','r_release_version']
    stmt = 'SELECT %s FROM newdf JOIN mcf ON newdf.d_dataset_id = mcf.mc_dataset_id JOIN ocf ON mcf.mc_output_mod_config_id = ocf.oc_output_mod_config_id JOIN rvf ON ocf.oc_release_version_id = rvf.r_release_version_id' % ','.join(cols)
    agg_dbs_df = sqlContext.sql(stmt)
    agg_dbs_df.registerTempTable('agg_dbs_df')

    # merge dbs+phedex and Condor data
    cols = ['d_dataset','evts','size','date','dataset_access_type','acquisition_era_name','r_release_version']
    cols = cols + ['data.KEvents', 'data.CMSSWKLumis', 'data.CMSSWWallHrs', 'data.Campaign', 'data.Workflow', 'data.CpuEff', 'data.CoreHr', 'data.QueueHrs', 'data.CRAB_UserHN', 'data.Type', 'data.ExitCode', 'data.TaskType', 'data.RecordTime']
    stmt = 'SELECT %s FROM condor_df JOIN agg_dbs_df ON agg_dbs_df.d_dataset = condor_df.data.DESIRED_CMSDataset WHERE condor_df.data.KEvents > 0' % ','.join(cols)
#     stmt = 'SELECT %s FROM condor_df JOIN dbs_phedex_df ON dbs_phedex_df.d_dataset = condor_df.data.DESIRED_CMSDataset WHERE condor_df.data.KEvents > 0' % ','.join(cols)

    final_df = sqlContext.sql(stmt)
    print_rows(final_df, stmt, verbose)

    # keep table around
    final_df.persist(StorageLevel.MEMORY_AND_DISK)

    # user defined function
    def rate(evts, cores):
        "Calculate the rate of events vs cores, if they're not defineed return -1"
        if evts and cores:
            return float(evts)/float(cores)
        return -1.
    func_rate = udf(rate, DoubleType())

    # our output
    store = {}

    # conditions

    # load pyspark functions used below; this rebinds any previous usage of these names
    from pyspark.sql.functions import lit, sum, count, col, split

    # here we split the dataframe based on exit-code conditions to reduce the dimensionality
    # of the input; otherwise the job crashes with an Integer.MAX_VALUE exception, which
    # basically means that the input dataframe exceeds the number of available partitions
    for ecode in [0,1]:
        if ecode == 0:
            refdf = final_df.where(col('ExitCode') == 0)
            condf = condor_df.where(col('data.ExitCode') == 0)
        else:
            refdf = final_df.where(col('ExitCode') != 0)
            condf = condor_df.where(col('data.ExitCode') != 0)
        refdf.persist(StorageLevel.MEMORY_AND_DISK)
        condf.persist(StorageLevel.MEMORY_AND_DISK)

        # aggregate CMS datasets
        cols = ['data.DESIRED_CMSDataset', 'data.CRAB_UserHN', 'data.ExitCode', 'data.Type', 'data.TaskType', 'data.RecordTime']
        xdf = condf.groupBy(cols)\
                .agg(sum('data.KEvents').alias('sum_evts'),sum('data.CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'),col('sum_chr')))\
                .withColumn("tier", split(col('DESIRED_CMSDataset'), "/").alias('tier').getItem(3))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('DESIRED_CMSDataset', 'dataset')
        store.setdefault('dataset', []).append(xdf)

        # aggregate across campaign
        cols = ['data.Campaign', 'data.CRAB_UserHN', 'data.ExitCode', 'data.Type', 'data.TaskType', 'data.RecordTime']
        xdf = condf.groupBy(cols)\
                .agg(sum('data.KEvents').alias('sum_evts'),sum('data.CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'),col('sum_chr')))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('Campaign', 'campaign')
        store.setdefault('campaign', []).append(xdf)

        # aggregate across DBS releases
        cols = ['r_release_version', 'CRAB_UserHN', 'ExitCode', 'Type', 'TaskType', 'RecordTime']
        xdf = refdf.groupBy(cols)\
                .agg(sum('KEvents').alias('sum_evts'),sum('CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'),col('sum_chr')))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('r_release_version', 'release')
        store.setdefault('release', []).append(xdf)

        # aggregate across DBS eras
        cols = ['acquisition_era_name', 'CRAB_UserHN', 'ExitCode', 'Type', 'TaskType', 'RecordTime']
        xdf = refdf.groupBy(cols)\
                .agg(sum('KEvents').alias('sum_evts'),sum('CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'),col('sum_chr')))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('acquisition_era_name', 'era')
        store.setdefault('era', []).append(xdf)

    # write out the results back to HDFS; the fout parameter defines the area on HDFS,
    # either an absolute path or an area under /user/USERNAME
    if fout:
        year, month, day = split_date(date)
        # use 'key' as the loop variable to avoid shadowing pyspark's col function
        for key in store.keys():
            out = '%s/%s/%s/%s/%s' % (fout, key, year, month, day)
            print("output: %s" % out)
            odf = unionAll(store[key])
            print("%s rows: %s" % (key, odf.count()))
            print_rows(odf, key, verbose=1)
            odf.write.format("com.databricks.spark.csv")\
                    .option("header", "true").save(out)

    ctx.stop()
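
The output layout fout/<key>/<year>/<month>/<day> relies on a split_date helper that is not shown. A hedged sketch, assuming the date arrives as a YYYYMMDD string (the hypothetical name split_date_str marks it as an illustration):

def split_date_str(date):
    "Split a 'YYYYMMDD' string into ('YYYY', 'MM', 'DD')."
    if not (isinstance(date, str) and len(date) == 8 and date.isdigit()):
        raise ValueError('Expected date in YYYYMMDD form, got %r' % date)
    return date[:4], date[4:6], date[6:8]

# split_date_str('20170601') -> ('2017', '06', '01')
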
Example #9
def run(date, fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    # define spark context, it's main object which allow to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read ASO and FTS tables
    date = aso_date(date)
    tables = {}
    tables.update(fts_tables(sqlContext, date=date, verbose=verbose))
    fts_df = tables['fts_df']  # fts table
    print_rows(fts_df, 'fts_df', verbose)
    tables.update(aso_tables(sqlContext, verbose=verbose))
    aso_df = tables['aso_df']  # aso table
    print_rows(aso_df, 'aso_df', verbose)

    fts = fts_df.select([
        'job_metadata.issuer', 'job_id', 'src_url', 't_final_transfer_state',
        'tr_timestamp_start', 'tr_timestamp_complete'
    ])

    fts = fts.filter("issuer = 'ASO'")
    fts_udf = udf(lambda x: x.split("/")[-1], StringType())
    fts = fts.withColumn("filename", fts_udf(fts.src_url))
    fts = fts.withColumn("fts_duration",
                         (fts.tr_timestamp_complete - fts.tr_timestamp_start) *
                         1. / 1000)

    aso = aso_df.select([
        'tm_source_lfn', 'tm_fts_id', 'tm_jobid', 'tm_type', 'tm_last_update',
        'tm_start_time', 'tm_transfer_state', 'tm_source', 'tm_destination',
        'tm_transfer_retry_count', 'tm_publish'
    ]).withColumnRenamed('tm_fts_id', 'job_id')

    aso_udf = udf(lambda x: x.split("/")[-1], StringType())
    aso = aso.withColumn("filename", aso_udf(aso.tm_source_lfn))
    aso = aso.filter((col("tm_transfer_state") == 3)
                     | (col("tm_transfer_state") == 2))
    aso = aso.filter((aso.tm_transfer_state.isNotNull()))
    aso = aso.filter((aso.job_id.isNotNull()))

    new_df = fts.join(aso, on=['filename', 'job_id'], how='left_outer')

    new_df = new_df.groupby([
        'job_id', 'tm_transfer_state', 'tm_publish', 'tm_transfer_retry_count'
    ]).agg(
        count(lit(1)).alias('Num Of Records'),
        mean(aso.tm_last_update - aso.tm_start_time).alias('Aso duration'),
        mean(new_df.tr_timestamp_start * 1. / 1000 -
             new_df.tm_start_time).alias('Aso delay start'),
        mean(new_df.tm_last_update -
             new_df.tr_timestamp_complete * 1. / 1000).alias('Aso delay'),
        mean(new_df.fts_duration).alias('fts duration'),
    )

    # keep table around
    new_df.persist(StorageLevel.MEMORY_AND_DISK)

    # write out the results back to HDFS; the fout parameter defines the area on HDFS,
    # either an absolute path or an area under /user/USERNAME
    if fout:
        new_df.write.format("com.databricks.spark.csv")\
                    .option("header", "true").save(fout)

    ctx.stop()
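
The two lambda UDFs that extract the trailing file name can be replaced with the built-in substring_index function, which keeps the work inside the JVM (sketch, same fts and aso DataFrames as above):

from pyspark.sql.functions import substring_index

fts = fts.withColumn("filename", substring_index(fts.src_url, "/", -1))
aso = aso.withColumn("filename", substring_index(aso.tm_source_lfn, "/", -1))
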
Example #10
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    # define spark context, it's main object which allow to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(phedex_tables(sqlContext, verbose=verbose))
    phedex_df = tables['phedex_df']
    daf = tables['daf'] # dataset access table
    ddf = tables['ddf'] # dataset table
    bdf = tables['bdf'] # block table
    fdf = tables['fdf'] # file table
    aef = tables['aef'] # acquisition era
    pef = tables['pef'] # processing era table
    mcf = tables['mcf'] # output mod config table
    ocf = tables['ocf'] # output module table
    rvf = tables['rvf'] # release version table

    # read Condor rdd
    date = condor_date(date)
    tables.update(condor_tables(sqlContext, date=date, verbose=verbose))
    condor_df = tables['condor_df'] # aaa table

    # aggregate phedex info into dataframe
    phedex_cols = ['node_name', 'dataset_name', 'dataset_is_open', 'block_bytes', 'replica_time_create']
    newpdf = phedex_df.select(phedex_cols).groupBy(['node_name', 'dataset_name', 'dataset_is_open'])\
            .agg({'block_bytes':'sum', 'replica_time_create':'max'})\
            .withColumnRenamed('sum(block_bytes)', 'pbr_size')\
            .withColumnRenamed('max(replica_time_create)', 'max_replica_time')
    newpdf.registerTempTable('newpdf')

    # aggregate dbs info into dataframe
    cols = ['d_dataset_id', 'd_dataset','d_creation_date','d_is_dataset_valid','f_event_count','f_file_size','dataset_access_type','acquisition_era_name','processing_version']
    stmt = 'SELECT %s FROM ddf JOIN fdf on ddf.d_dataset_id = fdf.f_dataset_id JOIN daf ON ddf.d_dataset_access_type_id = daf.dataset_access_type_id JOIN aef ON ddf.d_acquisition_era_id = aef.acquisition_era_id JOIN pef ON ddf.d_processing_era_id = pef.processing_era_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # construct conditions
    cond = 'dataset_access_type = "VALID" AND d_is_dataset_valid = 1'
    fjoin = joins.where(cond).distinct().select(cols)

    # at this step the fjoin table contains rows like:
    # Row(d_dataset_id=9413359, d_dataset=u'/SingleMu/CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000/DQM',
    #     d_creation_date=1406060166.0, d_is_dataset_valid=1, f_event_count=5318, f_file_size=21132638.0,
    #     dataset_access_type=u'DELETED', acquisition_era_name=u'CMSSW_7_1_0_pre9', processing_version=u'6000')

    newdf = fjoin\
            .groupBy(['d_dataset','d_dataset_id','dataset_access_type','acquisition_era_name','processing_version'])\
            .agg({'f_event_count':'sum', 'f_file_size':'sum', 'd_creation_date':'max'})\
            .withColumnRenamed('sum(f_event_count)', 'evts')\
            .withColumnRenamed('sum(f_file_size)', 'size')\
            .withColumnRenamed('max(d_creation_date)', 'date')

    # at this point the newdf dataframe holds our collected stats for every dataset;
    # let's join it with the release info
    newdf.registerTempTable('newdf')
    cols = ['d_dataset_id','d_dataset','evts','size','date','dataset_access_type','acquisition_era_name','processing_version','r_release_version']
    stmt = 'SELECT %s FROM newdf JOIN mcf ON newdf.d_dataset_id = mcf.mc_dataset_id JOIN ocf ON mcf.mc_output_mod_config_id = ocf.oc_output_mod_config_id JOIN rvf ON ocf.oc_release_version_id = rvf.r_release_version_id' % ','.join(cols)
    agg_dbs_df = sqlContext.sql(stmt)
    agg_dbs_df.registerTempTable('agg_dbs_df')

    # join dbs and phedex tables
    cols = ['d_dataset','evts','size','date','dataset_access_type','acquisition_era_name','r_release_version','node_name','pbr_size','dataset_is_open','max_replica_time']
    stmt = 'SELECT %s FROM agg_dbs_df JOIN newpdf ON agg_dbs_df.d_dataset = newpdf.dataset_name' % ','.join(cols)
    dbs_phedex_df = sqlContext.sql(stmt)
    dbs_phedex_df.registerTempTable('dbs_phedex_df')

    # merge dbs+phedex and Condor data
#    cols = ['data.DESIRED_CMSDataset', 'data.KEvents', 'data.CMSSWWallHrs']
#    stmt = 'SELECT %s FROM condor_df WHERE condor_df.data.ExitCode=0 AND condor_df.data.KEvents > 0' % ','.join(cols)
    cols = cols + ['data.KEvents', 'data.CMSSWKLumis', 'data.CMSSWWallHrs', 'data.Campaign', 'data.Workflow', 'data.CpuEff', 'data.CoreHr', 'data.QueueHrs', 'data.CRAB_UserHN', 'data.Type']
    stmt = 'SELECT %s FROM condor_df JOIN dbs_phedex_df ON dbs_phedex_df.d_dataset = condor_df.data.DESIRED_CMSDataset WHERE condor_df.data.ExitCode=0 AND condor_df.data.KEvents > 0' % ','.join(cols)
    final_df = sqlContext.sql(stmt)
    print_rows(final_df, stmt, verbose)

    # keep table around
    final_df.persist(StorageLevel.MEMORY_AND_DISK)

    # write out the results back to HDFS; the fout parameter defines the area on HDFS,
    # either an absolute path or an area under /user/USERNAME
    if fout:
        ndf = split_dataset(final_df, 'd_dataset')
        ndf.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
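
Finally, a hedged sketch of how a run() function with this signature might be driven from the command line; the original scripts may use their own option parser, so the flag names here are illustrative only:

import argparse

def main():
    "Parse command line options and invoke the pyspark job."
    parser = argparse.ArgumentParser(description='Run a CMS pyspark job')
    parser.add_argument('--date', default='', help='date in YYYYMMDD form')
    parser.add_argument('--fout', default='', help='output area on HDFS')
    parser.add_argument('--inst', default='GLOBAL', help='DBS instance')
    parser.add_argument('--yarn', action='store_true', help='run on YARN instead of locally')
    parser.add_argument('--verbose', action='store_true', help='enable verbose output')
    opts = parser.parse_args()
    run(opts.date, opts.fout, opts.yarn, opts.verbose, opts.inst)

if __name__ == '__main__':
    main()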