# pyspark imports needed by the examples below
from pyspark import StorageLevel
from pyspark.sql import HiveContext, SQLContext
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import StringType, IntegerType, DoubleType

# CMS-specific helpers (spark_context, phedex_tables, dbs_tables, condor_tables,
# condor_date, unix2human, site_filter, split_date, print_rows, unionAll) are
# assumed to be provided by the surrounding helper modules of the project
# (e.g. CMSSpark.spark_utils and CMSSpark.utils)

def run(date, fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    # define spark context, it's main object which allow to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate
    # read Phedex tables
    tables = {}
    tables.update(
        phedex_tables(sqlContext,
                      verbose=verbose,
                      fromdate=fromdate,
                      todate=todate))
    phedex_df = tables['phedex_df']

    # register user defined function
    unix2date = udf(unix2human, StringType())
    siteFilter = udf(site_filter, IntegerType())
    one_day = 60 * 60 * 24
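    # NOTE: one_day is defined for convenience but is not used in the code shown here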

    # aggregate phedex info into dataframe
    cols = [
        'node_name', 'dataset_name', 'block_bytes', 'replica_time_create',
        'br_user_group_id'
    ]
    pdf = phedex_df.select(cols).where(siteFilter(col('node_name')) == 1)\
            .groupBy(['node_name', 'dataset_name', 'replica_time_create', 'br_user_group_id'])\
            .agg({'block_bytes':'sum'})\
            .withColumn('date', lit(date))\
            .withColumn('replica_date', unix2date(col('replica_time_create')))\
            .withColumnRenamed('sum(block_bytes)', 'size')\
            .withColumnRenamed('dataset_name', 'dataset')\
            .withColumnRenamed('node_name', 'site')\
            .withColumnRenamed('br_user_group_id', 'groupid')
    pdf.registerTempTable('pdf')
    pdf.persist(StorageLevel.MEMORY_AND_DISK)

    # write results back to HDFS; the fout parameter defines the output area on
    # HDFS, either an absolute path or an area under /user/USERNAME
    if fout:
        year, month, day = split_date(date)
        out = '%s/%s/%s/%s' % (fout, year, month, day)
        cols = ['date', 'site', 'dataset', 'size', 'replica_date', 'groupid']
        # NOTE: a header row is written below; when the data is read back, the
        # header must be skipped (or types inferred), otherwise the string header
        # will clash with the numeric row values
        pdf.select(cols)\
            .write.format("com.databricks.spark.csv")\
            .option("header", "true").save(out)

    ctx.stop()
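
# Minimal usage sketch (not part of the original listing): launch the PhEDEx
# aggregation above for a single day. The date value (YYYYMMDD) and the output
# area are illustrative placeholders, not values taken from the original script.
if __name__ == '__main__':
    run('20170101', 'hdfs:///cms/users/USERNAME/phedex', yarn=True, verbose=True)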
Example #2
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    # define spark context, it's main object which allow to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    dtables = ['daf', 'ddf', 'bdf', 'fdf', 'aef', 'pef', 'mcf', 'ocf', 'rvf']
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose, tables=dtables))
#    tables.update(phedex_tables(sqlContext, verbose=verbose))
#    phedex_df = tables['phedex_df']
    daf = tables['daf'] # dataset access table
    ddf = tables['ddf'] # dataset table
    bdf = tables['bdf'] # block table
    fdf = tables['fdf'] # file table
    aef = tables['aef'] # acquisition era
    pef = tables['pef'] # processing era table
    mcf = tables['mcf'] # output mod config table
    ocf = tables['ocf'] # output module table
    rvf = tables['rvf'] # release version table

    # read Condor rdd
#    tables.update(condor_tables(sqlContext, hdir='hdfs:///cms/users/vk/condor', date=condor_date(date), verbose=verbose))
    tables.update(condor_tables(sqlContext, date=condor_date(date), verbose=verbose))
    condor_df = tables['condor_df'] # aaa table

    # aggregate dbs info into dataframe
    cols = [
        'd_dataset_id', 'd_dataset', 'd_creation_date', 'd_is_dataset_valid',
        'f_event_count', 'f_file_size', 'dataset_access_type',
        'acquisition_era_name', 'processing_version'
    ]
    stmt = ('SELECT %s FROM ddf '
            'JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id '
            'JOIN daf ON ddf.d_dataset_access_type_id = daf.dataset_access_type_id '
            'JOIN aef ON ddf.d_acquisition_era_id = aef.acquisition_era_id '
            'JOIN pef ON ddf.d_processing_era_id = pef.processing_era_id') % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # construct conditions
    cond = 'dataset_access_type = "VALID" AND d_is_dataset_valid = 1'
    fjoin = joins.where(cond).distinct().select(cols)

    # at this step the fjoin table contains rows like
    # Row(d_dataset_id=9413359,
    #     d_dataset=u'/SingleMu/CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000/DQM',
    #     d_creation_date=1406060166.0, d_is_dataset_valid=1, f_event_count=5318,
    #     f_file_size=21132638.0, dataset_access_type=u'DELETED',
    #     acquisition_era_name=u'CMSSW_7_1_0_pre9', processing_version=u'6000')

    newdf = fjoin\
            .groupBy(['d_dataset','d_dataset_id','dataset_access_type','acquisition_era_name','processing_version'])\
            .agg({'f_event_count':'sum', 'f_file_size':'sum', 'd_creation_date':'max'})\
            .withColumnRenamed('sum(f_event_count)', 'evts')\
            .withColumnRenamed('sum(f_file_size)', 'size')\
            .withColumnRenamed('max(d_creation_date)', 'date')

    # at this point the newdf dataframe holds the collected stats for every dataset;
    # let's join it with the release info
    newdf.registerTempTable('newdf')
    cols = [
        'd_dataset_id', 'd_dataset', 'evts', 'size', 'date',
        'dataset_access_type', 'acquisition_era_name', 'processing_version',
        'r_release_version'
    ]
    stmt = ('SELECT %s FROM newdf '
            'JOIN mcf ON newdf.d_dataset_id = mcf.mc_dataset_id '
            'JOIN ocf ON mcf.mc_output_mod_config_id = ocf.oc_output_mod_config_id '
            'JOIN rvf ON ocf.oc_release_version_id = rvf.r_release_version_id') % ','.join(cols)
    agg_dbs_df = sqlContext.sql(stmt)
    agg_dbs_df.registerTempTable('agg_dbs_df')

    # merge DBS and Condor data (the PhEDEx part of the join is disabled above)
    cols = ['d_dataset', 'evts', 'size', 'date', 'dataset_access_type',
            'acquisition_era_name', 'r_release_version']
    cols = cols + ['data.KEvents', 'data.CMSSWKLumis', 'data.CMSSWWallHrs',
                   'data.Campaign', 'data.Workflow', 'data.CpuEff', 'data.CoreHr',
                   'data.QueueHrs', 'data.CRAB_UserHN', 'data.Type', 'data.ExitCode',
                   'data.TaskType', 'data.RecordTime']
    stmt = ('SELECT %s FROM condor_df '
            'JOIN agg_dbs_df ON agg_dbs_df.d_dataset = condor_df.data.DESIRED_CMSDataset '
            'WHERE condor_df.data.KEvents > 0') % ','.join(cols)
#     stmt = 'SELECT %s FROM condor_df JOIN dbs_phedex_df ON dbs_phedex_df.d_dataset = condor_df.data.DESIRED_CMSDataset WHERE condor_df.data.KEvents > 0' % ','.join(cols)

    final_df = sqlContext.sql(stmt)
    print_rows(final_df, stmt, verbose)

    # keep table around
    final_df.persist(StorageLevel.MEMORY_AND_DISK)

    # user defined function
    def rate(evts, cores):
        "Calculate the rate of events vs cores, if they're not defineed return -1"
        if evts and cores:
            return float(evts)/float(cores)
        return -1.
    func_rate = udf(rate, DoubleType())

    # our output
    store = {}

    # conditions

    # import pyspark functions locally so they shadow any previous use of these names
    from pyspark.sql.functions import lit, sum, count, col, split

    # split the dataframe on the exit-code condition to reduce the dimensionality
    # of the input; otherwise the job crashes with an Integer.MAX_VALUE exception,
    # which indicates that the input dataframe exceeds the number of available partitions
    for ecode in [0,1]:
        if ecode == 0:
            refdf = final_df.where(col('ExitCode') == 0)
            condf = condor_df.where(col('data.ExitCode') == 0)
        else:
            refdf = final_df.where(col('ExitCode') != 0)
            condf = condor_df.where(col('data.ExitCode') != 0)
        refdf.persist(StorageLevel.MEMORY_AND_DISK)
        condf.persist(StorageLevel.MEMORY_AND_DISK)

        # aggregate CMS datasets
        cols = ['data.DESIRED_CMSDataset', 'data.CRAB_UserHN', 'data.ExitCode', 'data.Type', 'data.TaskType', 'data.RecordTime']
        xdf = condf.groupBy(cols)\
                .agg(sum('data.KEvents').alias('sum_evts'),sum('data.CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'),col('sum_chr')))\
                .withColumn("tier", split(col('DESIRED_CMSDataset'), "/").alias('tier').getItem(3))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('DESIRED_CMSDataset', 'dataset')
        store.setdefault('dataset', []).append(xdf)

        # aggregate across campaign
        cols = ['data.Campaign', 'data.CRAB_UserHN', 'data.ExitCode', 'data.Type', 'data.TaskType', 'data.RecordTime']
        xdf = condf.groupBy(cols)\
                .agg(sum('data.KEvents').alias('sum_evts'),sum('data.CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'),col('sum_chr')))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('Campaign', 'campaign')
        store.setdefault('campaign', []).append(xdf)

        # aggregate across DBS releases
        cols = ['r_release_version', 'CRAB_UserHN', 'ExitCode', 'Type', 'TaskType', 'RecordTime']
        xdf = refdf.groupBy(cols)\
                .agg(sum('KEvents').alias('sum_evts'),sum('CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'),col('sum_chr')))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('r_release_version', 'release')
        store.setdefault('release', []).append(xdf)

        # aggregate across DBS eras
        cols = ['acquisition_era_name', 'CRAB_UserHN', 'ExitCode', 'Type', 'TaskType', 'RecordTime']
        xdf = refdf.groupBy(cols)\
                .agg(sum('KEvents').alias('sum_evts'),sum('CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'),col('sum_chr')))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('acquisition_era_name', 'era')
        store.setdefault('era', []).append(xdf)

    # write results back to HDFS; the fout parameter defines the output area on
    # HDFS, either an absolute path or an area under /user/USERNAME
    if fout:
        year, month, day = split_date(date)
        for key in store.keys():
            out = '%s/%s/%s/%s/%s' % (fout, key, year, month, day)
            print("output: %s" % out)
            odf = unionAll(store[key])
            print("%s rows: %s" % (key, odf.count()))
            print_rows(odf, key, verbose=1)
            odf.write.format("com.databricks.spark.csv")\
                    .option("header", "true").save(out)

    ctx.stop()
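
# Minimal usage sketch (not part of the original listing): run the combined
# DBS + Condor aggregation for one day against the GLOBAL DBS instance. The
# date and output area are illustrative placeholders only.
if __name__ == '__main__':
    run('20170101', 'hdfs:///cms/users/USERNAME/dbs_condor',
        yarn=True, verbose=True, inst='GLOBAL')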