def run(path, amq, stomp, yarn=None, aggregation_schema=False, verbose=False):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and an optional script with mapper/reducer functions.
    """
    # define spark context; it is the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    if stomp and os.path.isfile(stomp):
        ctx.addPyFile(stomp)
    else:
        raise Exception('No stomp module egg is provided')
    if amq and os.path.isfile(amq):
        if amq.split('/')[-1] == 'amq_broker.json':
            ctx.addFile(amq)
        else:
            raise Exception('Wrong AMQ broker file name, please name it as amq_broker.json')
    else:
        raise Exception('No AMQ credential file is provided')
    sqlContext = HiveContext(ctx)

    # fetch the list of HDFS part files via the hadoop CLI
    hpath = "hadoop fs -ls %s | awk '{print $8}'" % path
    if verbose:
        print("### Read files: %s" % hpath)
    pipe = Popen(hpath, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, close_fds=True)
    pipe.wait()
    pfiles = [f for f in pipe.stdout.read().split('\n') if f.find('part-') != -1]

    df = []
    if aggregation_schema:
        df = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                       .options(treatEmptyValuesAsNulls='true', nullValue='null', header='true')
                       .load(fname, schema=aggregated_data_schema()) for fname in pfiles])
    else:
        df = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                       .options(treatEmptyValuesAsNulls='true', nullValue='null', header='true')
                       .load(fname) for fname in pfiles])

    # Register temporary table to be able to use sqlContext.sql
    df.registerTempTable('df')
    print_rows(df, "DataFrame", verbose)
    print('Schema:')
    df.printSchema()

    # for testing uncomment line below
    # df.toJSON().foreachPartition(print_data)
    # send data to CERN MONIT via stomp AMQ, see send2monit function
    df.toJSON().foreachPartition(send2monit)

    ctx.stop()
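# NOTE: send2monit is provided by an external module; the sketch below is only a
# hypothetical illustration of what such a partition-level sender could look like.
# The field names in amq_broker.json (host, port, username, password, topic) and
# the stomp.py calls used here are assumptions, not the actual implementation.
import json
import stomp
from pyspark import SparkFiles

def send2monit(records):
    "Send an iterator of JSON documents to CERN MONIT via stomp AMQ (sketch)"
    # amq_broker.json was distributed to the workers via ctx.addFile(amq)
    with open(SparkFiles.get('amq_broker.json')) as istream:
        creds = json.load(istream)
    conn = stomp.Connection(host_and_ports=[(creds['host'], int(creds['port']))])
    conn.connect(creds['username'], creds['password'], wait=True)
    for rec in records:
        conn.send(destination=creds['topic'], body=rec)
    conn.disconnect()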
def run(fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and an optional script with mapper/reducer functions.
    """
    # define spark context; it is the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read PhEDEx tables
    tables = {}
    hdir = 'hdfs:///cms/users/vk/phedex'
    tables.update(phedex_summary_tables(sqlContext, hdir=hdir, verbose=verbose))
    phedex_summary_df = tables['phedex_summary_df']
    phedex_summary_df.persist(StorageLevel.MEMORY_AND_DISK)
    print("### schema")
    phedex_summary_df.printSchema()
    phedex_summary_df.show(5)

    # register user defined function
    days = udf(days_present, IntegerType())

    # aggregate phedex info into dataframe
    # pdf = phedex_summary_df\
    #     .groupBy(['site', 'dataset'])\
    #     .agg(
    #         F.min(col('date')).alias('min_date'),
    #         F.max(col('date')).alias('max_date'),
    #         F.min(col('replica_date')).alias('min_rdate'),
    #         F.max(col('replica_date')).alias('max_rdate'),
    #         F.max(col('size')).alias('max_size'),
    #         F.min(col('size')).alias('min_size')
    #     )\
    #     .withColumn('days', days(col('min_date'), col('max_date'), col('min_rdate'), col('max_rdate')))
    pdf = phedex_summary_df\
            .groupBy(['site', 'dataset', 'size'])\
            .agg(
                F.min(col('date')).alias('min_date'),
                F.max(col('date')).alias('max_date'),
                F.min(col('replica_date')).alias('min_rdate'),
                F.max(col('replica_date')).alias('max_rdate')
            )\
            .withColumn('days', days(col('min_date'), col('max_date'), col('min_rdate'), col('max_rdate')))
    pdf.persist(StorageLevel.MEMORY_AND_DISK)
    pdf.show(5)
    print_rows(pdf, 'pdf', verbose=1)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        pdf.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)
    ctx.stop()
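# NOTE: days_present is defined elsewhere; this is only a hypothetical sketch.
# Both the YYYYMMDD integer date format and the exact semantics (the span in days
# between the earliest and the latest of the four dates) are assumptions.
import datetime

def days_present(min_date, max_date, min_rdate, max_rdate):
    "Return the number of days covered by the given date/replica-date boundaries (sketch)"
    def to_dt(val):
        return datetime.datetime.strptime(str(val), '%Y%m%d')
    dates = [d for d in (min_date, max_date, min_rdate, max_rdate) if d]
    if not dates:
        return -1
    return (to_dt(max(dates)) - to_dt(min(dates))).days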
def run(fout, yarn=None, verbose=None, patterns=None, antipatterns=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and an optional script with mapper/reducer functions.
    """
    # define spark context; it is the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read DBS and PhEDEx tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(phedex_tables(sqlContext, verbose=verbose))
    phedex_df = tables['phedex_df']
    ddf = tables['ddf']
    fdf = tables['fdf']
    print("### ddf from main", ddf)

    # join tables
    cols = ['*'] # to select all fields from table
    cols = ['d_dataset_id', 'd_dataset', 'f_logical_file_name', 'f_adler32']
    stmt = 'SELECT %s FROM ddf JOIN fdf on ddf.d_dataset_id = fdf.f_dataset_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # construct conditions
    adler = ['ad8f6ad2', '9c441343', 'f68d5dca', '81c90e2a', '471d2524',
             'a3c1f077', '6f0018a0', '8bb03b60', 'd504882c', '5ede357f',
             'b05303c3', '716d1776', '7e9cf258', '1945804b', 'ec7bc1d7',
             '12c87747', '94f2aa32']
    cond = 'f_adler32 in %s' % adler
    cond = cond.replace('[', '(').replace(']', ')')
    # scols = ['f_logical_file_name']
    fjoin = joins.where(cond).distinct().select(cols)
    print_rows(fjoin, stmt, verbose)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        fjoin.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)
    ctx.stop()
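# The adler32 condition above is built from the Python list repr: formatting the
# list with %s yields "['ad8f6ad2', ...]", and swapping brackets for parentheses
# turns it into a valid SQL IN clause. A minimal self-contained illustration with
# a hypothetical helper name:
def adler_in_clause(adler):
    "Build a SQL IN clause for f_adler32 from a list of checksum strings"
    return ('f_adler32 in %s' % adler).replace('[', '(').replace(']', ')')

# adler_in_clause(['ad8f6ad2', '9c441343']) -> "f_adler32 in ('ad8f6ad2', '9c441343')"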
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and an optional script with mapper/reducer functions.
    """
    # define spark context; it is the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    ddf = tables['ddf'] # dataset table
    fdf = tables['fdf'] # file table
    if verbose:
        for row in ddf.head(1):
            print("### ddf row", row)

    # read CMSSW avro rdd
    date = eos_date(date)
    tables.update(eos_tables(sqlContext, date=date, verbose=verbose))
    eos_df = tables['eos_df'] # EOS table
    if verbose:
        for row in eos_df.head(1):
            print("### eos_df row", row)

    # merge DBS and CMSSW data
    cols = ['d_dataset', 'd_dataset_id', 'f_logical_file_name', 'file_lfn']
    stmt = 'SELECT %s FROM ddf JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id JOIN eos_df ON fdf.f_logical_file_name = eos_df.file_lfn' % ','.join(cols)
    joins = sqlContext.sql(stmt)
    print_rows(joins, stmt, verbose)

    # perform aggregation
    fjoin = joins.groupBy(['d_dataset'])\
            .agg({'file_lfn':'count'})\
            .withColumnRenamed('count(file_lfn)', 'count')\
            .withColumnRenamed('d_dataset', 'dataset')\
            .withColumn('date', lit(eos_date_unix(date)))\
            .withColumn('count_type', lit('eos'))

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        ndf = split_dataset(fjoin, 'dataset')
        ndf.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)
    ctx.stop()
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and an optional script with mapper/reducer functions.
    """
    # define spark context; it is the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and JobMonitoring tables
    tables = {}

    # read JobMonitoring avro rdd
    date = jm_date(date)
    jm_df = jm_tables(ctx, sqlContext, date=date, verbose=verbose)

    # DBS tables
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    ddf = tables['ddf'] # dataset table
    fdf = tables['fdf'] # file table

    # merge DBS and JobMonitoring data
    cols = ['d_dataset', 'd_dataset_id', 'f_logical_file_name', 'FileName', 'FileType',
            'Type', 'SiteName', 'WrapWC', 'WrapCPU', 'JobExecExitCode']
    stmt = 'SELECT %s FROM ddf JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id JOIN jm_df ON fdf.f_logical_file_name = jm_df.FileName' % ','.join(cols)
    joins = sqlContext.sql(stmt)
    print_rows(joins, stmt, verbose)

    # perform aggregation
    fjoin = joins.groupBy(['SiteName', 'JobExecExitCode', 'FileType', 'Type', 'd_dataset'])\
            .agg({'WrapWC':'sum', 'WrapCPU':'sum', 'JobExecExitCode':'count', 'FileType':'count', 'Type':'count'})\
            .withColumnRenamed('sum(WrapWC)', 'tot_wc')\
            .withColumnRenamed('sum(WrapCPU)', 'tot_cpu')\
            .withColumnRenamed('count(JobExecExitCode)', 'ecode_count')\
            .withColumnRenamed('count(FileType)', 'file_type_count')\
            .withColumnRenamed('count(Type)', 'type_count')\
            .withColumnRenamed('d_dataset', 'dataset')\
            .withColumn('date', lit(jm_date_unix(date)))\
            .withColumn('count_type', lit('jm'))

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        ndf = split_dataset(fjoin, 'dataset')
        ndf.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)
    ctx.stop()
def run(fout, yarn=None, verbose=None, patterns=None, antipatterns=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and an optional script with mapper/reducer functions.
    """
    # define spark context; it is the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read DBS tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    bdf = tables['bdf']
    fdf = tables['fdf']
    flf = tables['flf']

    # join tables
    cols = ['*'] # to select all fields from table
    cols = ['b_block_id', 'b_block_name', 'f_block_id', 'f_logical_file_name']
    stmt = 'SELECT %s FROM bdf JOIN fdf on bdf.b_block_id = fdf.f_block_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)
    # keep table around
    joins.persist(StorageLevel.MEMORY_AND_DISK)

    # construct conditions
    cols = ['b_block_name', 'f_logical_file_name']
    pat = '%00047DB7-9F77-E011-ADC8-00215E21D9A8.root'
    # pat = '%02ACAA1A-9F32-E111-BB31-0002C90B743A.root'
    fjoin = joins.select(cols).where(col('f_logical_file_name').like(pat))
    print_rows(fjoin, stmt, verbose)
    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        fjoin.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)
    ctx.stop()
def create_file_block_site_table(ctx, sql_context, verbose=False):
    """
    Joins fdf, bdf, ddf and PhEDEx tables and produces a table with file name,
    block name, dataset name and site name. The site name is obtained from PhEDEx
    and cleaned with the clean_site_name function before it is used. After the join
    is complete, only unique records are kept via a DISTINCT query.
    """
    print('Starting file_block_site generation')
    cols = ['f_logical_file_name AS file_name',
            'b_block_name AS block_name',
            'clean_site_name(node_name) AS site_name',
            'd_dataset AS dataset_name']
    # Join FDF and BDF by f_block_id and b_block_id
    query = ("SELECT %s FROM fdf "
             "JOIN bdf ON fdf.f_block_id = bdf.b_block_id "
             "JOIN ddf ON fdf.f_dataset_id = ddf.d_dataset_id "
             "JOIN phedex_df ON bdf.b_block_name = phedex_df.block_name") % ','.join(cols)
    if verbose:
        print('Will run query to generate temp file_block_site table')
    result = run_query(query, sql_context, verbose)
    result.registerTempTable('f_b_all_df')

    query_distinct = "SELECT DISTINCT * FROM f_b_all_df ORDER BY file_name"
    result_distinct = run_query(query_distinct, sql_context, verbose)
    result_distinct.registerTempTable('f_b_s_df')

    if verbose:
        print('Temp table from joined DDF, FDF, BDF and PhEDEx')
        print('After DISTINCT query count changed %s -> %s'
              % (result.count(), result_distinct.count()))
        result_distinct.show(20)
        print_rows(result_distinct, query_distinct, verbose, 5)
        result_distinct.printSchema()

    print('Finished file_block_site generation')
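# NOTE: clean_site_name is registered elsewhere as a SQL UDF and used in the query
# above; the sketch below is hypothetical. The suffix list is an assumption about
# PhEDEx node naming (e.g. T1_US_FNAL_Disk vs T1_US_FNAL), and the registration
# call is shown only as a comment since it needs a live sql_context.
def clean_site_name(name):
    "Strip storage-type suffixes from a PhEDEx node name (sketch)"
    for suffix in ('_Disk', '_Buffer', '_MSS', '_Export'):
        if name and name.endswith(suffix):
            return name[:-len(suffix)]
    return name
# sql_context.registerFunction('clean_site_name', clean_site_name, StringType())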
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and an optional script with mapper/reducer functions.
    """
    # define spark context; it is the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read DBS tables
    tables = {}
    dtables = ['daf', 'ddf', 'bdf', 'fdf', 'aef', 'pef', 'mcf', 'ocf', 'rvf']
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose, tables=dtables))
    # tables.update(phedex_tables(sqlContext, verbose=verbose))
    # phedex_df = tables['phedex_df']
    daf = tables['daf'] # dataset access table
    ddf = tables['ddf'] # dataset table
    bdf = tables['bdf'] # block table
    fdf = tables['fdf'] # file table
    aef = tables['aef'] # acquisition era table
    pef = tables['pef'] # processing era table
    mcf = tables['mcf'] # output mod config table
    ocf = tables['ocf'] # output module table
    rvf = tables['rvf'] # release version table

    # read Condor rdd
    # tables.update(condor_tables(sqlContext, hdir='hdfs:///cms/users/vk/condor', date=condor_date(date), verbose=verbose))
    tables.update(condor_tables(sqlContext, date=condor_date(date), verbose=verbose))
    condor_df = tables['condor_df'] # aaa table

    # aggregate dbs info into dataframe
    cols = ['d_dataset_id', 'd_dataset', 'd_creation_date', 'd_is_dataset_valid',
            'f_event_count', 'f_file_size', 'dataset_access_type',
            'acquisition_era_name', 'processing_version']
    stmt = 'SELECT %s FROM ddf JOIN fdf on ddf.d_dataset_id = fdf.f_dataset_id JOIN daf ON ddf.d_dataset_access_type_id = daf.dataset_access_type_id JOIN aef ON ddf.d_acquisition_era_id = aef.acquisition_era_id JOIN pef ON ddf.d_processing_era_id = pef.processing_era_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # construct conditions
    cond = 'dataset_access_type = "VALID" AND d_is_dataset_valid = 1'
    fjoin = joins.where(cond).distinct().select(cols)

    # at this step the fjoin table contains rows such as
    # Row(d_dataset_id=9413359, d_dataset=u'/SingleMu/CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000/DQM',
    #     d_creation_date=1406060166.0, d_is_dataset_valid=1, f_event_count=5318, f_file_size=21132638.0,
    #     dataset_access_type=u'DELETED', acquisition_era_name=u'CMSSW_7_1_0_pre9', processing_version=u'6000')
    newdf = fjoin\
            .groupBy(['d_dataset', 'd_dataset_id', 'dataset_access_type', 'acquisition_era_name', 'processing_version'])\
            .agg({'f_event_count':'sum', 'f_file_size':'sum', 'd_creation_date':'max'})\
            .withColumnRenamed('sum(f_event_count)', 'evts')\
            .withColumnRenamed('sum(f_file_size)', 'size')\
            .withColumnRenamed('max(d_creation_date)', 'date')

    # at this point we have the newdf dataframe with collected stats for every dataset,
    # let's join it with release info
    newdf.registerTempTable('newdf')
    cols = ['d_dataset_id', 'd_dataset', 'evts', 'size', 'date', 'dataset_access_type',
            'acquisition_era_name', 'processing_version', 'r_release_version']
    stmt = 'SELECT %s FROM newdf JOIN mcf ON newdf.d_dataset_id = mcf.mc_dataset_id JOIN ocf ON mcf.mc_output_mod_config_id = ocf.oc_output_mod_config_id JOIN rvf ON ocf.oc_release_version_id = rvf.r_release_version_id' % ','.join(cols)
    agg_dbs_df = sqlContext.sql(stmt)
    agg_dbs_df.registerTempTable('agg_dbs_df')

    # merge aggregated DBS and Condor data
    cols = ['d_dataset', 'evts', 'size', 'date', 'dataset_access_type',
            'acquisition_era_name', 'r_release_version']
    cols = cols + ['data.KEvents', 'data.CMSSWKLumis', 'data.CMSSWWallHrs',
                   'data.Campaign', 'data.Workflow', 'data.CpuEff', 'data.CoreHr',
                   'data.QueueHrs', 'data.CRAB_UserHN', 'data.Type', 'data.ExitCode',
                   'data.TaskType', 'data.RecordTime']
    stmt = 'SELECT %s FROM condor_df JOIN agg_dbs_df ON agg_dbs_df.d_dataset = condor_df.data.DESIRED_CMSDataset WHERE condor_df.data.KEvents > 0' % ','.join(cols)
    # stmt = 'SELECT %s FROM condor_df JOIN dbs_phedex_df ON dbs_phedex_df.d_dataset = condor_df.data.DESIRED_CMSDataset WHERE condor_df.data.KEvents > 0' % ','.join(cols)
    final_df = sqlContext.sql(stmt)
    print_rows(final_df, stmt, verbose)

    # keep table around
    final_df.persist(StorageLevel.MEMORY_AND_DISK)

    # user defined function
    def rate(evts, cores):
        "Calculate the rate of events vs cores; if either is not defined return -1"
        if evts and cores:
            return float(evts)/float(cores)
        return -1.
    func_rate = udf(rate, DoubleType())

    # our output
    store = {}

    # conditions
    # load pyspark functions to be used here to redefine any previous usage of those names
    from pyspark.sql.functions import lit, sum, count, col, split

    # here we split the dataframe based on exit-code conditions to reduce the dimensionality
    # of the input, otherwise the job crashes with an Integer.MAX_VALUE exception which
    # basically tells us that the input dataframe exceeds the number of available partitions
    for ecode in [0, 1]:
        if ecode == 0:
            refdf = final_df.where(col('ExitCode') == 0)
            condf = condor_df.where(col('data.ExitCode') == 0)
        else:
            refdf = final_df.where(col('ExitCode') != 0)
            condf = condor_df.where(col('data.ExitCode') != 0)
        refdf.persist(StorageLevel.MEMORY_AND_DISK)
        condf.persist(StorageLevel.MEMORY_AND_DISK)

        # aggregate CMS datasets
        cols = ['data.DESIRED_CMSDataset', 'data.CRAB_UserHN', 'data.ExitCode',
                'data.Type', 'data.TaskType', 'data.RecordTime']
        xdf = condf.groupBy(cols)\
                .agg(sum('data.KEvents').alias('sum_evts'), sum('data.CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'), col('sum_chr')))\
                .withColumn('tier', split(col('DESIRED_CMSDataset'), '/').alias('tier').getItem(3))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('DESIRED_CMSDataset', 'dataset')
        store.setdefault('dataset', []).append(xdf)

        # aggregate across campaign
        cols = ['data.Campaign', 'data.CRAB_UserHN', 'data.ExitCode',
                'data.Type', 'data.TaskType', 'data.RecordTime']
        xdf = condf.groupBy(cols)\
                .agg(sum('data.KEvents').alias('sum_evts'), sum('data.CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'), col('sum_chr')))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('Campaign', 'campaign')
        store.setdefault('campaign', []).append(xdf)

        # aggregate across DBS releases
        cols = ['r_release_version', 'CRAB_UserHN', 'ExitCode', 'Type', 'TaskType', 'RecordTime']
        xdf = refdf.groupBy(cols)\
                .agg(sum('KEvents').alias('sum_evts'), sum('CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'), col('sum_chr')))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('r_release_version', 'release')
        store.setdefault('release', []).append(xdf)

        # aggregate across DBS eras
        cols = ['acquisition_era_name', 'CRAB_UserHN', 'ExitCode', 'Type', 'TaskType', 'RecordTime']
        xdf = refdf.groupBy(cols)\
                .agg(sum('KEvents').alias('sum_evts'), sum('CoreHr').alias('sum_chr'))\
                .withColumn('date', lit(date))\
                .withColumn('rate', func_rate(col('sum_evts'), col('sum_chr')))\
                .withColumnRenamed('CRAB_UserHN', 'user')\
                .withColumnRenamed('RecordTime', 'rec_time')\
                .withColumnRenamed('acquisition_era_name', 'era')
        store.setdefault('era', []).append(xdf)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        year, month, day = split_date(date)
        for key in store.keys():
            out = '%s/%s/%s/%s/%s' % (fout, key, year, month, day)
            print("output: %s" % out)
            odf = unionAll(store[key])
            print("%s rows: %s" % (key, odf.count()))
            print_rows(odf, key, verbose=1)
            odf.write.format("com.databricks.spark.csv")\
                    .option("header", "true").save(out)
    ctx.stop()
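# NOTE: split_date is defined elsewhere; this is only a hypothetical sketch and
# the YYYYMMDD string format of the date argument is an assumption.
def split_date(date):
    "Split a YYYYMMDD date into (year, month, day) strings (sketch)"
    sdate = str(date)
    if len(sdate) != 8:
        raise Exception('Unsupported date format: %s' % date)
    return sdate[:4], sdate[4:6], sdate[6:8]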
def run(date, fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and an optional script with mapper/reducer functions.
    """
    # define spark context; it is the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read ASO and FTS tables
    date = aso_date(date)
    tables = {}
    tables.update(fts_tables(sqlContext, date=date, verbose=verbose))
    fts_df = tables['fts_df'] # fts table
    print_rows(fts_df, 'fts_df', verbose)
    tables.update(aso_tables(sqlContext, verbose=verbose))
    aso_df = tables['aso_df'] # aso table
    print_rows(aso_df, 'aso_df', verbose)

    fts = fts_df.select(['job_metadata.issuer', 'job_id', 'src_url',
                         't_final_transfer_state', 'tr_timestamp_start',
                         'tr_timestamp_complete'])
    fts = fts.filter("issuer = 'ASO'")
    fts_udf = udf(lambda x: x.split("/")[-1], StringType())
    fts = fts.withColumn("filename", fts_udf(fts.src_url))
    fts = fts.withColumn("fts_duration",
                         (fts.tr_timestamp_complete - fts.tr_timestamp_start) * 1. / 1000)

    aso = aso_df.select(['tm_source_lfn', 'tm_fts_id', 'tm_jobid', 'tm_type',
                         'tm_last_update', 'tm_start_time', 'tm_transfer_state',
                         'tm_source', 'tm_destination', 'tm_transfer_retry_count',
                         'tm_publish']).withColumnRenamed('tm_fts_id', 'job_id')
    aso_udf = udf(lambda x: x.split("/")[-1], StringType())
    aso = aso.withColumn("filename", aso_udf(aso.tm_source_lfn))
    aso = aso.filter((col("tm_transfer_state") == 3) | (col("tm_transfer_state") == 2))
    aso = aso.filter(aso.tm_transfer_state.isNotNull())
    aso = aso.filter(aso.job_id.isNotNull())

    new_df = fts.join(aso, on=['filename', 'job_id'], how='left_outer')
    new_df = new_df.groupby(['job_id', 'tm_transfer_state', 'tm_publish', 'tm_transfer_retry_count'])\
        .agg(
            count(lit(1)).alias('Num Of Records'),
            mean(aso.tm_last_update - aso.tm_start_time).alias('Aso duration'),
            mean(new_df.tr_timestamp_start * 1. / 1000 - new_df.tm_start_time).alias('Aso delay start'),
            mean(new_df.tm_last_update - new_df.tr_timestamp_complete * 1. / 1000).alias('Aso delay'),
            mean(new_df.fts_duration).alias('fts duration'),
        )

    # keep table around
    new_df.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        new_df.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)
    ctx.stop()
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and an optional script with mapper/reducer functions.
    """
    # define spark context; it is the main object which allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and PhEDEx tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(phedex_tables(sqlContext, verbose=verbose))
    phedex_df = tables['phedex_df']
    daf = tables['daf'] # dataset access table
    ddf = tables['ddf'] # dataset table
    bdf = tables['bdf'] # block table
    fdf = tables['fdf'] # file table
    aef = tables['aef'] # acquisition era table
    pef = tables['pef'] # processing era table
    mcf = tables['mcf'] # output mod config table
    ocf = tables['ocf'] # output module table
    rvf = tables['rvf'] # release version table

    # read Condor rdd
    date = condor_date(date)
    tables.update(condor_tables(sqlContext, date=date, verbose=verbose))
    condor_df = tables['condor_df'] # aaa table

    # aggregate phedex info into dataframe
    phedex_cols = ['node_name', 'dataset_name', 'dataset_is_open', 'block_bytes', 'replica_time_create']
    newpdf = phedex_df.select(phedex_cols)\
            .groupBy(['node_name', 'dataset_name', 'dataset_is_open'])\
            .agg({'block_bytes':'sum', 'replica_time_create':'max'})\
            .withColumnRenamed('sum(block_bytes)', 'pbr_size')\
            .withColumnRenamed('max(replica_time_create)', 'max_replica_time')
    newpdf.registerTempTable('newpdf')

    # aggregate dbs info into dataframe
    cols = ['d_dataset_id', 'd_dataset', 'd_creation_date', 'd_is_dataset_valid',
            'f_event_count', 'f_file_size', 'dataset_access_type',
            'acquisition_era_name', 'processing_version']
    stmt = 'SELECT %s FROM ddf JOIN fdf on ddf.d_dataset_id = fdf.f_dataset_id JOIN daf ON ddf.d_dataset_access_type_id = daf.dataset_access_type_id JOIN aef ON ddf.d_acquisition_era_id = aef.acquisition_era_id JOIN pef ON ddf.d_processing_era_id = pef.processing_era_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # construct conditions
    cond = 'dataset_access_type = "VALID" AND d_is_dataset_valid = 1'
    fjoin = joins.where(cond).distinct().select(cols)

    # at this step the fjoin table contains rows such as
    # Row(d_dataset_id=9413359, d_dataset=u'/SingleMu/CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000/DQM',
    #     d_creation_date=1406060166.0, d_is_dataset_valid=1, f_event_count=5318, f_file_size=21132638.0,
    #     dataset_access_type=u'DELETED', acquisition_era_name=u'CMSSW_7_1_0_pre9', processing_version=u'6000')
    newdf = fjoin\
            .groupBy(['d_dataset', 'd_dataset_id', 'dataset_access_type', 'acquisition_era_name', 'processing_version'])\
            .agg({'f_event_count':'sum', 'f_file_size':'sum', 'd_creation_date':'max'})\
            .withColumnRenamed('sum(f_event_count)', 'evts')\
            .withColumnRenamed('sum(f_file_size)', 'size')\
            .withColumnRenamed('max(d_creation_date)', 'date')

    # at this point we have the newdf dataframe with collected stats for every dataset,
    # let's join it with release info
    newdf.registerTempTable('newdf')
    cols = ['d_dataset_id', 'd_dataset', 'evts', 'size', 'date', 'dataset_access_type',
            'acquisition_era_name', 'processing_version', 'r_release_version']
    stmt = 'SELECT %s FROM newdf JOIN mcf ON newdf.d_dataset_id = mcf.mc_dataset_id JOIN ocf ON mcf.mc_output_mod_config_id = ocf.oc_output_mod_config_id JOIN rvf ON ocf.oc_release_version_id = rvf.r_release_version_id' % ','.join(cols)
    agg_dbs_df = sqlContext.sql(stmt)
    agg_dbs_df.registerTempTable('agg_dbs_df')

    # join dbs and phedex tables
    cols = ['d_dataset', 'evts', 'size', 'date', 'dataset_access_type',
            'acquisition_era_name', 'r_release_version', 'node_name', 'pbr_size',
            'dataset_is_open', 'max_replica_time']
    stmt = 'SELECT %s FROM agg_dbs_df JOIN newpdf ON agg_dbs_df.d_dataset = newpdf.dataset_name' % ','.join(cols)
    dbs_phedex_df = sqlContext.sql(stmt)
    dbs_phedex_df.registerTempTable('dbs_phedex_df')

    # merge dbs+phedex and Condor data
    # cols = ['data.DESIRED_CMSDataset', 'data.KEvents', 'data.CMSSWWallHrs']
    # stmt = 'SELECT %s FROM condor_df WHERE condor_df.data.ExitCode=0 AND condor_df.data.KEvents > 0' % ','.join(cols)
    cols = cols + ['data.KEvents', 'data.CMSSWKLumis', 'data.CMSSWWallHrs',
                   'data.Campaign', 'data.Workflow', 'data.CpuEff', 'data.CoreHr',
                   'data.QueueHrs', 'data.CRAB_UserHN', 'data.Type']
    stmt = 'SELECT %s FROM condor_df JOIN dbs_phedex_df ON dbs_phedex_df.d_dataset = condor_df.data.DESIRED_CMSDataset WHERE condor_df.data.ExitCode=0 AND condor_df.data.KEvents > 0' % ','.join(cols)
    final_df = sqlContext.sql(stmt)
    print_rows(final_df, stmt, verbose)

    # keep table around
    final_df.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        ndf = split_dataset(final_df, 'd_dataset')
        ndf.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)
    ctx.stop()
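# NOTE: split_dataset is defined elsewhere and used before writing CSV output in
# several jobs above; the sketch below is hypothetical and assumes the dataset
# column holds names of the form /primary/processed/tier.
from pyspark.sql.functions import col, split

def split_dataset(df, dcol):
    "Replace the dataset column with primds/procds/tier columns (sketch)"
    ndf = df.withColumn('primds', split(col(dcol), '/').getItem(1))\
            .withColumn('procds', split(col(dcol), '/').getItem(2))\
            .withColumn('tier', split(col(dcol), '/').getItem(3))\
            .drop(dcol)
    return ndf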