def run(fin, attrs, yarn, fout, verbose, nparts=3000):
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)
    # read all files from the given input path(s)
    paths = [fin]
    res = sqlContext.read.json(paths)
    data = res.select("data.*")
    # repartition returns a new dataframe, therefore re-assign it
    data = data.repartition(nparts)
    print("### number of new data partitions", data.rdd.getNumPartitions())
    anonymize = udf(hash_private_info, returnType=StringType())
    # use the above udf to anonymize data
    for attr in attrs:
        col = attr + '_hash'
        data = data.withColumn(col, anonymize(getattr(data, attr)))
    # drop original (non-hashed) attributes
    data = data.drop(*attrs)
    # save as gzip-compressed JSON
    data.write.option("compression", "gzip").json(fout)
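# The hash_private_info callable wrapped by the UDF above is defined elsewhere
# in the codebase. A minimal sketch of what such an anonymization helper could
# look like, assuming a plain SHA-256 hex digest of the attribute value (the
# real helper may use a salted or keyed hash instead):
import hashlib

def hash_private_info_sketch(value):
    "Hypothetical anonymization helper: hex digest of the given attribute value"
    if value is None:
        return None
    return hashlib.sha256(value.encode('utf-8')).hexdigest()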
def run(fout, yarn=None, verbose=None, patterns=None, antipatterns=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    bdf = tables['bdf']
    fdf = tables['fdf']
    flf = tables['flf']

    # columns to select from the joined tables (use ['*'] to select all fields)
    cols = ['b_block_id', 'b_block_name', 'f_block_id', 'f_file_id',
            'fl_file_id', 'fl_lumi_section_num']

    # join tables
    stmt = 'SELECT %s FROM bdf JOIN fdf on bdf.b_block_id = fdf.f_block_id JOIN flf on fdf.f_file_id=flf.fl_file_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)
    # keep table around
    joins.persist(StorageLevel.MEMORY_AND_DISK)

    # construct conditions
    # adler = ['ad8f6ad2','9c441343','f68d5dca','81c90e2a','471d2524','a3c1f077','6f0018a0','8bb03b60','d504882c','5ede357f','b05303c3','716d1776','7e9cf258','1945804b','ec7bc1d7','12c87747','94f2aa32']
    # cond = 'f_adler32 in %s' % adler
    # cond = cond.replace('[', '(').replace(']', ')')
    # fjoin = joins.where(cond).distinct().select(cols)
    # print_rows(fjoin, stmt, verbose)

    fjoin = joins\
        .groupBy(['b_block_name'])\
        .agg({'fl_lumi_section_num': 'count'})\
        .withColumnRenamed('count(fl_lumi_section_num)', 'nlumis')
    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        fjoin.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
def run(path, amq, stomp, yarn=None, aggregation_schema=False, verbose=False):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    if stomp and os.path.isfile(stomp):
        ctx.addPyFile(stomp)
    else:
        raise Exception('No stomp module egg is provided')
    if amq and os.path.isfile(amq):
        if amq.split('/')[-1] == 'amq_broker.json':
            ctx.addFile(amq)
        else:
            raise Exception('Wrong AMQ broker file name, please name it amq_broker.json')
    else:
        raise Exception('No AMQ credential file is provided')
    sqlContext = HiveContext(ctx)

    # list part-* files under the given HDFS path
    hpath = "hadoop fs -ls %s | awk '{print $8}'" % path
    if verbose:
        print("### Read files: %s" % hpath)
    pipe = Popen(hpath, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, close_fds=True)
    pipe.wait()
    pfiles = [f for f in pipe.stdout.read().split('\n') if f.find('part-') != -1]

    if aggregation_schema:
        df = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                       .options(treatEmptyValuesAsNulls='true', nullValue='null', header='true')
                       .load(fname, schema=aggregated_data_schema()) for fname in pfiles])
    else:
        df = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                       .options(treatEmptyValuesAsNulls='true', nullValue='null', header='true')
                       .load(fname) for fname in pfiles])

    # register temporary table to be able to use sqlContext.sql
    df.registerTempTable('df')
    print_rows(df, "DataFrame", verbose)
    print('Schema:')
    df.printSchema()

    # for testing uncomment line below
    # df.toJSON().foreachPartition(print_data)

    # send data to CERN MONIT via stomp AMQ, see send2monit function
    df.toJSON().foreachPartition(send2monit)

    ctx.stop()
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    print("Input arguments: %s" % opts)

    start_time = time.time()
    verbose = opts.verbose
    yarn = opts.yarn
    inst = opts.inst
    date = opts.date
    fout = opts.fout

    if inst.lower() in ['global', 'phys01', 'phys02', 'phys03']:
        inst = inst.upper()
    else:
        raise Exception('Unsupported DBS instance "%s"' % inst)

    # Create spark context
    ctx = spark_context('cms', yarn, verbose)

    # Create SQL context to be used for SQL queries
    sql_context = HiveContext(ctx)

    # Initialize DBS tables (will be used with AAA, CMSSW)
    dbs_tables(sql_context, inst=inst, verbose=verbose)

    aaa_start_time = time.time()
    run_aaa(date, fout, ctx, sql_context, verbose)
    aaa_elapsed_time = elapsed_time(aaa_start_time)

    cmssw_start_time = time.time()
    run_cmssw(date, fout, ctx, sql_context, verbose)
    cmssw_elapsed_time = elapsed_time(cmssw_start_time)

    eos_start_time = time.time()
    run_eos(date, fout, ctx, sql_context, verbose)
    eos_elapsed_time = elapsed_time(eos_start_time)

    jm_start_time = time.time()
    run_jm(date, fout, ctx, sql_context, verbose)
    jm_elapsed_time = elapsed_time(jm_start_time)

    ctx.stop()

    print('Start time         : %s' % time.strftime('%Y-%m-%d %H:%M:%S GMT', time.gmtime(start_time)))
    print('End time           : %s' % time.strftime('%Y-%m-%d %H:%M:%S GMT', time.gmtime(time.time())))
    print('Total elapsed time : %s' % elapsed_time(start_time))
    print('AAA elapsed time   : %s' % aaa_elapsed_time)
    print('CMSSW elapsed time : %s' % cmssw_elapsed_time)
    print('EOS elapsed time   : %s' % eos_elapsed_time)
    print('JM elapsed time    : %s' % jm_elapsed_time)
def run(date, fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read Phedex tables
    tables = {}
    tables.update(phedex_tables(sqlContext, verbose=verbose, fromdate=fromdate, todate=todate))
    phedex_df = tables['phedex_df']

    # register user defined functions
    unix2date = udf(unix2human, StringType())
    siteFilter = udf(site_filter, IntegerType())
    one_day = 60 * 60 * 24

    # aggregate phedex info into dataframe
    cols = ['node_name', 'dataset_name', 'block_bytes', 'replica_time_create', 'br_user_group_id']
    pdf = phedex_df.select(cols).where(siteFilter(col('node_name')) == 1)\
        .groupBy(['node_name', 'dataset_name', 'replica_time_create', 'br_user_group_id'])\
        .agg({'block_bytes': 'sum'})\
        .withColumn('date', lit(date))\
        .withColumn('replica_date', unix2date(col('replica_time_create')))\
        .withColumnRenamed('sum(block_bytes)', 'size')\
        .withColumnRenamed('dataset_name', 'dataset')\
        .withColumnRenamed('node_name', 'site')\
        .withColumnRenamed('br_user_group_id', 'groupid')
    pdf.registerTempTable('pdf')
    pdf.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        year, month, day = split_date(date)
        out = '%s/%s/%s/%s' % (fout, year, month, day)
        cols = ['date', 'site', 'dataset', 'size', 'replica_date', 'groupid']
        # NOTE: when reading the data back, keep in mind that the header row
        # consists of strings while the data rows may carry other data types
        pdf.select(cols)\
            .write.format("com.databricks.spark.csv")\
            .option("header", "true").save(out)

    ctx.stop()
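# The unix2human and site_filter helpers registered as UDFs above are defined
# elsewhere in the codebase. Minimal sketches under stated assumptions: the
# exact date format and site selection rule of the real helpers may differ.
import time

def unix2human_sketch(tstamp):
    "Hypothetical helper: format a UNIX timestamp as a YYYYMMDD string"
    if not tstamp:
        return ''
    return time.strftime('%Y%m%d', time.gmtime(float(tstamp)))

def site_filter_sketch(site):
    "Hypothetical helper: return 1 for sites to keep, 0 for tape endpoints"
    return 0 if site.endswith(('_MSS', '_Buffer', '_Export')) else 1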
def run(fout, yarn=None, verbose=None, patterns=None, antipatterns=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    ddf = tables['ddf']
    bdf = tables['bdf']
    fdf = tables['fdf']

    # columns to select from the joined tables
    cols = ['d_dataset', 'd_dataset_id', 'b_block_id', 'b_file_count',
            'f_block_id', 'f_file_id', 'f_dataset_id', 'f_event_count', 'f_file_size']

    # join tables
    stmt = 'SELECT %s FROM ddf JOIN bdf on ddf.d_dataset_id = bdf.b_dataset_id JOIN fdf on bdf.b_block_id=fdf.f_block_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)
    # keep table around
    joins.persist(StorageLevel.MEMORY_AND_DISK)

    # construct aggregation
    fjoin = joins\
        .groupBy(['d_dataset'])\
        .agg({'b_file_count': 'sum', 'f_event_count': 'sum', 'f_file_size': 'sum'})\
        .withColumnRenamed('d_dataset', 'dataset')\
        .withColumnRenamed('sum(b_file_count)', 'nfiles')\
        .withColumnRenamed('sum(f_event_count)', 'nevents')\
        .withColumnRenamed('sum(f_file_size)', 'size')
    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        fjoin.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', opts.yarn, opts.verbose)
    sqlContext = SQLContext(ctx)
    convert(ctx, sqlContext, opts.date, opts.fin, opts.fout, opts.verbose)
def run(fout, yarn=None, verbose=None, patterns=None, antipatterns=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(phedex_tables(sqlContext, verbose=verbose))
    phedex_df = tables['phedex_df']
    ddf = tables['ddf']
    fdf = tables['fdf']
    print("### ddf from main", ddf)

    # columns to select from the joined tables (use ['*'] to select all fields)
    cols = ['d_dataset_id', 'd_dataset', 'f_logical_file_name', 'f_adler32']

    # join tables
    stmt = 'SELECT %s FROM ddf JOIN fdf on ddf.d_dataset_id = fdf.f_dataset_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # construct conditions
    adler = ['ad8f6ad2', '9c441343', 'f68d5dca', '81c90e2a', '471d2524',
             'a3c1f077', '6f0018a0', '8bb03b60', 'd504882c', '5ede357f',
             'b05303c3', '716d1776', '7e9cf258', '1945804b', 'ec7bc1d7',
             '12c87747', '94f2aa32']
    cond = 'f_adler32 in %s' % adler
    cond = cond.replace('[', '(').replace(']', ')')
    # scols = ['f_logical_file_name']
    fjoin = joins.where(cond).distinct().select(cols)
    print_rows(fjoin, stmt, verbose)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        fjoin.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
def run(fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read Phedex tables
    tables = {}
    hdir = 'hdfs:///cms/users/vk/phedex'
    tables.update(phedex_summary_tables(sqlContext, hdir=hdir, verbose=verbose))
    phedex_summary_df = tables['phedex_summary_df']
    phedex_summary_df.persist(StorageLevel.MEMORY_AND_DISK)
    print("### schema")
    phedex_summary_df.printSchema()
    phedex_summary_df.show(5)

    # register user defined function
    days = udf(days_present, IntegerType())

    # aggregate phedex info into dataframe
    # pdf = phedex_summary_df\
    #     .groupBy(['site', 'dataset'])\
    #     .agg(
    #         F.min(col('date')).alias('min_date'),
    #         F.max(col('date')).alias('max_date'),
    #         F.min(col('replica_date')).alias('min_rdate'),
    #         F.max(col('replica_date')).alias('max_rdate'),
    #         F.max(col('size')).alias('max_size'),
    #         F.min(col('size')).alias('min_size')
    #     )\
    #     .withColumn('days', days(col('min_date'), col('max_date'), col('min_rdate'), col('max_rdate')))
    pdf = phedex_summary_df\
        .groupBy(['site', 'dataset', 'size'])\
        .agg(
            F.min(col('date')).alias('min_date'),
            F.max(col('date')).alias('max_date'),
            F.min(col('replica_date')).alias('min_rdate'),
            F.max(col('replica_date')).alias('max_rdate')
        )\
        .withColumn('days', days(col('min_date'), col('max_date'), col('min_rdate'), col('max_rdate')))
    pdf.persist(StorageLevel.MEMORY_AND_DISK)
    pdf.show(5)
    print_rows(pdf, 'pdf', verbose=1)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        pdf.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
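# The days_present helper used for the 'days' column above is defined elsewhere
# in the codebase. A minimal sketch, assuming the dates are YYYYMMDD strings and
# that the quantity of interest is the span between the earliest and latest
# observation dates (the real helper may also fold in the replica dates):
import datetime

def days_present_sketch(min_date, max_date, min_rdate, max_rdate):
    "Hypothetical helper: number of days between min_date and max_date, or -1"
    fmt = '%Y%m%d'
    try:
        start = datetime.datetime.strptime(str(min_date), fmt)
        end = datetime.datetime.strptime(str(max_date), fmt)
    except (TypeError, ValueError):
        return -1
    return (end - start).days + 1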
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and JobMonitoring tables
    tables = {}

    # read JobMonitoring avro rdd
    date = jm_date(date)
    jm_df = jm_tables(ctx, sqlContext, date=date, verbose=verbose)

    # DBS tables
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    ddf = tables['ddf']  # dataset table
    fdf = tables['fdf']  # file table

    # merge DBS and JobMonitoring data
    cols = ['d_dataset', 'd_dataset_id', 'f_logical_file_name', 'FileName', 'FileType',
            'Type', 'SiteName', 'WrapWC', 'WrapCPU', 'JobExecExitCode']
    stmt = 'SELECT %s FROM ddf JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id JOIN jm_df ON fdf.f_logical_file_name = jm_df.FileName' % ','.join(cols)
    joins = sqlContext.sql(stmt)
    print_rows(joins, stmt, verbose)

    # perform aggregation
    fjoin = joins.groupBy(['SiteName', 'JobExecExitCode', 'FileType', 'Type', 'd_dataset'])\
        .agg({'WrapWC': 'sum', 'WrapCPU': 'sum', 'JobExecExitCode': 'count', 'FileType': 'count', 'Type': 'count'})\
        .withColumnRenamed('sum(WrapWC)', 'tot_wc')\
        .withColumnRenamed('sum(WrapCPU)', 'tot_cpu')\
        .withColumnRenamed('count(JobExecExitCode)', 'ecode_count')\
        .withColumnRenamed('count(FileType)', 'file_type_count')\
        .withColumnRenamed('count(Type)', 'type_count')\
        .withColumnRenamed('d_dataset', 'dataset')\
        .withColumn('date', lit(jm_date_unix(date)))\
        .withColumn('count_type', lit('jm'))
    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        ndf = split_dataset(fjoin, 'dataset')
        ndf.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    ddf = tables['ddf']  # dataset table
    fdf = tables['fdf']  # file table
    if verbose:
        for row in ddf.head(1):
            print("### ddf row", row)

    # read EOS avro rdd
    date = eos_date(date)
    tables.update(eos_tables(sqlContext, date=date, verbose=verbose))
    eos_df = tables['eos_df']  # EOS table
    if verbose:
        for row in eos_df.head(1):
            print("### eos_df row", row)

    # merge DBS and EOS data
    cols = ['d_dataset', 'd_dataset_id', 'f_logical_file_name', 'file_lfn']
    stmt = 'SELECT %s FROM ddf JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id JOIN eos_df ON fdf.f_logical_file_name = eos_df.file_lfn' % ','.join(cols)
    joins = sqlContext.sql(stmt)
    print_rows(joins, stmt, verbose)

    # perform aggregation
    fjoin = joins.groupBy(['d_dataset'])\
        .agg({'file_lfn': 'count'})\
        .withColumnRenamed('count(file_lfn)', 'count')\
        .withColumnRenamed('d_dataset', 'dataset')\
        .withColumn('date', lit(eos_date_unix(date)))\
        .withColumn('count_type', lit('eos'))
    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        ndf = split_dataset(fjoin, 'dataset')
        ndf.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
def run(fout, yarn=None, verbose=None, patterns=None, antipatterns=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    bdf = tables['bdf']
    fdf = tables['fdf']
    flf = tables['flf']

    # columns to select from the joined tables (use ['*'] to select all fields)
    cols = ['b_block_id', 'b_block_name', 'f_block_id', 'f_logical_file_name']

    # join tables
    stmt = 'SELECT %s FROM bdf JOIN fdf on bdf.b_block_id = fdf.f_block_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)
    # keep table around
    joins.persist(StorageLevel.MEMORY_AND_DISK)

    # construct conditions
    cols = ['b_block_name', 'f_logical_file_name']
    pat = '%00047DB7-9F77-E011-ADC8-00215E21D9A8.root'
    # pat = '%02ACAA1A-9F32-E111-BB31-0002C90B743A.root'
    fjoin = joins.select(cols).where(col('f_logical_file_name').like(pat))
    print_rows(fjoin, stmt, verbose)
    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        fjoin.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
def run(date, fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read FTS tables
    date = fts_date(date)
    tables = {}
    tables.update(fts_tables(sqlContext, date=date, verbose=verbose))
    fts_df = tables['fts_df']  # fts table

    # example to extract transfer records for ASO
    # VK: the commented lines show how to extract some info from fts_df via SQL
    # cols = ['data.job_metadata.issuer', 'data.f_size']
    # stmt = 'SELECT %s FROM fts_df' % ','.join(cols)
    # joins = sqlContext.sql(stmt)
    # fjoin = joins.groupBy(['issuer'])\
    #     .agg({'f_size':'sum'})\
    #     .withColumnRenamed('sum(f_size)', 'sum_file_size')\
    #     .withColumnRenamed('issuer', 'issuer')

    # we can use fts_df directly for groupby/aggregation tasks
    fjoin = fts_df.groupBy(['job_metadata.issuer'])\
        .agg(agg_sum(fts_df.f_size).alias("sum_f_size"))
    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        fjoin.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
def run(fout, date, yarn=None, verbose=None, inst='GLOBAL', limit=100):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    quiet_logs(ctx)
    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read Phedex and DBS tables
    tables = {}
    tables.update(phedex_tables(sqlContext, verbose=verbose, fromdate=fromdate, todate=todate))
    phedex = tables['phedex_df']

    instances = ['GLOBAL']  # , 'PHYS01', 'PHYS02', 'PHYS03'
    for instance in instances:
        dbs_dict = dbs_tables(sqlContext, inst=instance, verbose=verbose)
        for key, val in dbs_dict.items():
            new_key = '%s_%s' % (key, instance)
            tables[new_key] = val
    daf = reduce(lambda a, b: a.unionAll(b), [tables['daf_%s' % x] for x in instances])
    ddf = reduce(lambda a, b: a.unionAll(b), [tables['ddf_%s' % x] for x in instances])
    fdf = reduce(lambda a, b: a.unionAll(b), [tables['fdf_%s' % x] for x in instances])

    dbs_fdf_cols = ['f_dataset_id', 'f_file_size']
    dbs_ddf_cols = ['d_dataset_id', 'd_dataset', 'd_dataset_access_type_id']
    dbs_daf_cols = ['dataset_access_type_id', 'dataset_access_type']

    fdf_df = fdf.select(dbs_fdf_cols)
    ddf_df = ddf.select(dbs_ddf_cols)
    daf_df = daf.select(dbs_daf_cols)

    # d_dataset_id, d_dataset, dataset_access_type
    dbs_df = ddf_df.join(daf_df, ddf_df.d_dataset_access_type_id == daf_df.dataset_access_type_id)\
        .drop(ddf_df.d_dataset_access_type_id)\
        .drop(daf_df.dataset_access_type_id)

    # dataset, dbs_size
    dbs_df = dbs_df.where(dbs_df.dataset_access_type == 'VALID')\
        .join(fdf_df, dbs_df.d_dataset_id == fdf_df.f_dataset_id)\
        .withColumnRenamed('d_dataset', 'dataset')\
        .withColumnRenamed('f_file_size', 'dbs_size')\
        .drop(dbs_df.d_dataset_id)\
        .drop(fdf_df.f_dataset_id)\
        .drop(dbs_df.dataset_access_type)

    # dataset, dbs_size
    dbs_df = dbs_df.groupBy(['dataset'])\
        .agg({'dbs_size': 'sum'})\
        .withColumnRenamed('sum(dbs_size)', 'dbs_size')

    # dataset_name, block_bytes, node_name
    phedex_cols = ['dataset_name', 'block_bytes', 'node_name']
    phedex_all_df = phedex.select(phedex_cols)

    aggregate(sqlContext, fout, phedex_all_df, dbs_df, limit)
    aggregate(sqlContext, fout, phedex_all_df, dbs_df, limit, disk_only=True)

    ctx.stop()
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    dtables = ['daf', 'ddf', 'bdf', 'fdf', 'aef', 'pef', 'mcf', 'ocf', 'rvf']
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose, tables=dtables))
    # tables.update(phedex_tables(sqlContext, verbose=verbose))
    # phedex_df = tables['phedex_df']
    daf = tables['daf']  # dataset access table
    ddf = tables['ddf']  # dataset table
    bdf = tables['bdf']  # block table
    fdf = tables['fdf']  # file table
    aef = tables['aef']  # acquisition era table
    pef = tables['pef']  # processing era table
    mcf = tables['mcf']  # output mod config table
    ocf = tables['ocf']  # output module table
    rvf = tables['rvf']  # release version table

    # read Condor rdd
    # tables.update(condor_tables(sqlContext, hdir='hdfs:///cms/users/vk/condor', date=condor_date(date), verbose=verbose))
    tables.update(condor_tables(sqlContext, date=condor_date(date), verbose=verbose))
    condor_df = tables['condor_df']  # condor table

    # aggregate dbs info into dataframe
    cols = ['d_dataset_id', 'd_dataset', 'd_creation_date', 'd_is_dataset_valid',
            'f_event_count', 'f_file_size', 'dataset_access_type',
            'acquisition_era_name', 'processing_version']
    stmt = 'SELECT %s FROM ddf JOIN fdf on ddf.d_dataset_id = fdf.f_dataset_id JOIN daf ON ddf.d_dataset_access_type_id = daf.dataset_access_type_id JOIN aef ON ddf.d_acquisition_era_id = aef.acquisition_era_id JOIN pef ON ddf.d_processing_era_id = pef.processing_era_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # construct conditions
    cond = 'dataset_access_type = "VALID" AND d_is_dataset_valid = 1'
    fjoin = joins.where(cond).distinct().select(cols)

    # at this step fjoin contains rows such as
    # Row(d_dataset_id=9413359, d_dataset=u'/SingleMu/CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000/DQM',
    #     d_creation_date=1406060166.0, d_is_dataset_valid=1, f_event_count=5318, f_file_size=21132638.0,
    #     dataset_access_type=u'DELETED', acquisition_era_name=u'CMSSW_7_1_0_pre9', processing_version=u'6000')
    newdf = fjoin\
        .groupBy(['d_dataset', 'd_dataset_id', 'dataset_access_type', 'acquisition_era_name', 'processing_version'])\
        .agg({'f_event_count': 'sum', 'f_file_size': 'sum', 'd_creation_date': 'max'})\
        .withColumnRenamed('sum(f_event_count)', 'evts')\
        .withColumnRenamed('sum(f_file_size)', 'size')\
        .withColumnRenamed('max(d_creation_date)', 'date')

    # at this point newdf contains our collected stats for every dataset,
    # let's join it with release info
    newdf.registerTempTable('newdf')
    cols = ['d_dataset_id', 'd_dataset', 'evts', 'size', 'date', 'dataset_access_type',
            'acquisition_era_name', 'processing_version', 'r_release_version']
    stmt = 'SELECT %s FROM newdf JOIN mcf ON newdf.d_dataset_id = mcf.mc_dataset_id JOIN ocf ON mcf.mc_output_mod_config_id = ocf.oc_output_mod_config_id JOIN rvf ON ocf.oc_release_version_id = rvf.r_release_version_id' % ','.join(cols)
    agg_dbs_df = sqlContext.sql(stmt)
    agg_dbs_df.registerTempTable('agg_dbs_df')

    # merge dbs+phedex and Condor data
    cols = ['d_dataset', 'evts', 'size', 'date', 'dataset_access_type',
            'acquisition_era_name', 'r_release_version']
    cols = cols + ['data.KEvents', 'data.CMSSWKLumis', 'data.CMSSWWallHrs',
                   'data.Campaign', 'data.Workflow', 'data.CpuEff', 'data.CoreHr',
                   'data.QueueHrs', 'data.CRAB_UserHN', 'data.Type', 'data.ExitCode',
                   'data.TaskType', 'data.RecordTime']
    stmt = 'SELECT %s FROM condor_df JOIN agg_dbs_df ON agg_dbs_df.d_dataset = condor_df.data.DESIRED_CMSDataset WHERE condor_df.data.KEvents > 0' % ','.join(cols)
    # stmt = 'SELECT %s FROM condor_df JOIN dbs_phedex_df ON dbs_phedex_df.d_dataset = condor_df.data.DESIRED_CMSDataset WHERE condor_df.data.KEvents > 0' % ','.join(cols)
    final_df = sqlContext.sql(stmt)
    print_rows(final_df, stmt, verbose)
    # keep table around
    final_df.persist(StorageLevel.MEMORY_AND_DISK)

    # user defined function
    def rate(evts, cores):
        "Calculate the rate of events vs cores; if they're not defined return -1"
        if evts and cores:
            return float(evts) / float(cores)
        return -1.
    func_rate = udf(rate, DoubleType())

    # our output
    store = {}

    # load pyspark functions to be used here to redefine any previous usage of those names
    from pyspark.sql.functions import lit, sum, count, col, split

    # here we split the dataframe based on exit-code conditions to reduce the dimensionality
    # of the input, otherwise the job crashes with an Integer.MAX_VALUE exception which
    # basically tells us that the input dataframe exceeds the number of available partitions
    for ecode in [0, 1]:
        if ecode == 0:
            refdf = final_df.where(col('ExitCode') == 0)
            condf = condor_df.where(col('data.ExitCode') == 0)
        else:
            refdf = final_df.where(col('ExitCode') != 0)
            condf = condor_df.where(col('data.ExitCode') != 0)
        refdf.persist(StorageLevel.MEMORY_AND_DISK)
        condf.persist(StorageLevel.MEMORY_AND_DISK)

        # aggregate across CMS datasets
        cols = ['data.DESIRED_CMSDataset', 'data.CRAB_UserHN', 'data.ExitCode',
                'data.Type', 'data.TaskType', 'data.RecordTime']
        xdf = condf.groupBy(cols)\
            .agg(sum('data.KEvents').alias('sum_evts'), sum('data.CoreHr').alias('sum_chr'))\
            .withColumn('date', lit(date))\
            .withColumn('rate', func_rate(col('sum_evts'), col('sum_chr')))\
            .withColumn("tier", split(col('DESIRED_CMSDataset'), "/").alias('tier').getItem(3))\
            .withColumnRenamed('CRAB_UserHN', 'user')\
            .withColumnRenamed('RecordTime', 'rec_time')\
            .withColumnRenamed('DESIRED_CMSDataset', 'dataset')
        store.setdefault('dataset', []).append(xdf)

        # aggregate across campaigns
        cols = ['data.Campaign', 'data.CRAB_UserHN', 'data.ExitCode',
                'data.Type', 'data.TaskType', 'data.RecordTime']
        xdf = condf.groupBy(cols)\
            .agg(sum('data.KEvents').alias('sum_evts'), sum('data.CoreHr').alias('sum_chr'))\
            .withColumn('date', lit(date))\
            .withColumn('rate', func_rate(col('sum_evts'), col('sum_chr')))\
            .withColumnRenamed('CRAB_UserHN', 'user')\
            .withColumnRenamed('RecordTime', 'rec_time')\
            .withColumnRenamed('Campaign', 'campaign')
        store.setdefault('campaign', []).append(xdf)

        # aggregate across DBS releases
        cols = ['r_release_version', 'CRAB_UserHN', 'ExitCode', 'Type', 'TaskType', 'RecordTime']
        xdf = refdf.groupBy(cols)\
            .agg(sum('KEvents').alias('sum_evts'), sum('CoreHr').alias('sum_chr'))\
            .withColumn('date', lit(date))\
            .withColumn('rate', func_rate(col('sum_evts'), col('sum_chr')))\
            .withColumnRenamed('CRAB_UserHN', 'user')\
            .withColumnRenamed('RecordTime', 'rec_time')\
            .withColumnRenamed('r_release_version', 'release')
        store.setdefault('release', []).append(xdf)

        # aggregate across DBS eras
        cols = ['acquisition_era_name', 'CRAB_UserHN', 'ExitCode', 'Type', 'TaskType', 'RecordTime']
        xdf = refdf.groupBy(cols)\
            .agg(sum('KEvents').alias('sum_evts'), sum('CoreHr').alias('sum_chr'))\
            .withColumn('date', lit(date))\
            .withColumn('rate', func_rate(col('sum_evts'), col('sum_chr')))\
            .withColumnRenamed('CRAB_UserHN', 'user')\
            .withColumnRenamed('RecordTime', 'rec_time')\
            .withColumnRenamed('acquisition_era_name', 'era')
        store.setdefault('era', []).append(xdf)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        year, month, day = split_date(date)
        for col in store.keys():
            out = '%s/%s/%s/%s/%s' % (fout, col, year, month, day)
            print("output: %s" % out)
            odf = unionAll(store[col])
            print("%s rows: %s" % (col, odf.count()))
            print_rows(odf, col, verbose=1)
            odf.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(out)

    ctx.stop()
def run(fout, date, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    quiet_logs(ctx)
    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read Phedex and DBS tables
    tables = {}
    tables.update(phedex_tables(sqlContext, verbose=verbose, fromdate=fromdate, todate=todate))
    phedex = tables['phedex_df']

    instances = [inst]
    if inst == 'all':
        instances = ['GLOBAL', 'PHYS01', 'PHYS02', 'PHYS03']
    for instance in instances:
        dbs_dict = dbs_tables(sqlContext, inst=instance, verbose=verbose)
        for key, val in dbs_dict.items():
            new_key = '%s_%s' % (key, instance)
            tables[new_key] = val
    daf = reduce(lambda a, b: a.unionAll(b), [tables['daf_%s' % x] for x in instances])
    ddf = reduce(lambda a, b: a.unionAll(b), [tables['ddf_%s' % x] for x in instances])
    fdf = reduce(lambda a, b: a.unionAll(b), [tables['fdf_%s' % x] for x in instances])

    dbs_fdf_cols = ['f_dataset_id', 'f_file_size']
    dbs_ddf_cols = ['d_dataset_id', 'd_dataset', 'd_dataset_access_type_id']
    dbs_daf_cols = ['dataset_access_type_id', 'dataset_access_type']

    fdf_df = fdf.select(dbs_fdf_cols)
    ddf_df = ddf.select(dbs_ddf_cols)
    daf_df = daf.select(dbs_daf_cols)

    # Aggregate by campaign and find total PhEDEx and DBS size of each campaign

    # d_dataset_id, d_dataset, dataset_access_type
    dbs_df = ddf_df.join(daf_df, ddf_df.d_dataset_access_type_id == daf_df.dataset_access_type_id)\
        .drop(ddf_df.d_dataset_access_type_id)\
        .drop(daf_df.dataset_access_type_id)

    # dataset, dbs_size
    dbs_df = dbs_df.where(dbs_df.dataset_access_type == 'VALID')\
        .join(fdf_df, dbs_df.d_dataset_id == fdf_df.f_dataset_id)\
        .withColumnRenamed('d_dataset', 'dataset')\
        .withColumnRenamed('f_file_size', 'dbs_size')\
        .drop(dbs_df.d_dataset_id)\
        .drop(fdf_df.f_dataset_id)\
        .drop(dbs_df.dataset_access_type)

    # dataset, dbs_size
    dbs_df = dbs_df.groupBy(['dataset'])\
        .agg({'dbs_size': 'sum'})\
        .withColumnRenamed('sum(dbs_size)', 'dbs_size')

    # dataset, site, phedex_size
    phedex_cols = ['dataset_name', 'block_bytes', 'node_name']
    phedex_df = phedex.select(phedex_cols)
    phedex_df = phedex_df.withColumnRenamed('block_bytes', 'phedex_size')\
        .withColumnRenamed('dataset_name', 'dataset')\
        .withColumnRenamed('node_name', 'site')

    # dataset, sites, phedex_size
    phedex_df = phedex_df.groupBy(['dataset'])\
        .agg({'phedex_size': 'sum', 'site': 'collect_set'})\
        .withColumnRenamed('sum(phedex_size)', 'phedex_size')\
        .withColumnRenamed('collect_set(site)', 'sites')

    # Subtract to get leftovers
    extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2])

    # dataset
    leftover_datasets_df = phedex_df.select('dataset').subtract(dbs_df.select('dataset'))

    # dataset, campaign, sites, phedex_size
    leftovers_df = leftover_datasets_df.select('dataset').join(phedex_df, 'dataset')  # .join(dbs_df, 'dataset')
    leftovers_df = leftovers_df.withColumn('campaign', extract_campaign_udf(leftovers_df.dataset))\
        .select(['dataset', 'campaign', 'sites', 'phedex_size'])

    # Subtract to get leftovers that don't even exist in DBS (orphans)
    ddf_datasets_df = ddf_df.withColumnRenamed('d_dataset', 'dataset').select('dataset')
    leftover_datasets_df = phedex_df.select('dataset').subtract(ddf_datasets_df)

    # dataset, campaign, sites, phedex_size
    leftovers_orphans_df = leftover_datasets_df.select('dataset').join(phedex_df, 'dataset')  # .join(dbs_df, 'dataset')
    leftovers_orphans_df = leftovers_orphans_df.withColumn('campaign', extract_campaign_udf(leftovers_orphans_df.dataset))\
        .select(['dataset', 'campaign', 'sites', 'phedex_size'])

    # Sum total size of leftovers
    all_leftovers_size = leftovers_df.select('phedex_size').groupBy().sum().rdd.map(lambda x: x[0]).collect()[0]
    orphan_leftovers_size = leftovers_orphans_df.select('phedex_size').groupBy().sum().rdd.map(lambda x: x[0]).collect()[0]
    print('All leftovers PhEDEx size: %s' % bytes_to_readable(all_leftovers_size))
    print('Orphan leftovers PhEDEx size: %s' % bytes_to_readable(orphan_leftovers_size))

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        leftovers_df.write.format("com.databricks.spark.csv")\
            .option("header", "true").save('%s/all' % fout)
        leftovers_orphans_df.write.format("com.databricks.spark.csv")\
            .option("header", "true").save('%s/orphans' % fout)

    ctx.stop()
def run(yarn=None, verbose=None, campaign=None, tier=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    quiet_logs(ctx)
    sqlContext = HiveContext(ctx)
    sqlContext.setConf("spark.sql.files.ignoreCorruptFiles", "true")
    sqlContext.sql("set spark.sql.files.ignoreCorruptFiles=true")

    df = sqlContext.read.format('com.databricks.spark.csv')\
        .options(header='true', treatEmptyValuesAsNulls='true', nullValue='null')\
        .load('hdfs:///cms/aggregation/sizes/part-*')

    if campaign is not None and tier is not None:
        campaign_tier_df = df.where(df.campaign == campaign)\
            .where(df.tier == tier)
        campaign_tier = map(lambda row: row.asDict(), campaign_tier_df.collect())
        print('Average size: %s' % bytes_to_readable(float(campaign_tier[0]['size_average'])))
        print('Average in period of existence: %s' % bytes_to_readable(float(campaign_tier[0]['average_size_in_period'])))
        print('Max size: %s' % bytes_to_readable(float(campaign_tier[0]['size_max'])))
        print('T1 size: %s' % bytes_to_readable(float(campaign_tier[0]['t1_size'])))
        print('T2 size: %s' % bytes_to_readable(float(campaign_tier[0]['t2_size'])))
        print('T3 size: %s' % bytes_to_readable(float(campaign_tier[0]['t3_size'])))

    date_to_timestamp_udf = udf(lambda date: time.mktime(datetime.datetime.strptime(date, "%Y%m%d").timetuple()))

    months = [1, 2, 3, 4, 5, 6, 9, 12]
    for month in months:
        now = (datetime.datetime.now() - datetime.datetime(1970, 1, 1)).total_seconds()
        seconds = month * 30 * 24 * 60 * 60
        not_accessed_df = df.withColumn('date_timestamp', date_to_timestamp_udf(df.last_access_date))
        not_accessed_df = not_accessed_df.where(now - not_accessed_df.date_timestamp > seconds)
        not_accessed_df = not_accessed_df.withColumn("size_average", not_accessed_df["size_average"].cast(DoubleType()))
        total_size = not_accessed_df.groupBy().sum('size_average').rdd.map(lambda x: x[0]).collect()[0] or 0
        print('Size of data not accessed for last %d month(s): %s' % (month, bytes_to_readable(total_size)))

    ctx.stop()
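# The bytes_to_readable helper used in the reports above is defined elsewhere
# in the codebase. A minimal sketch, assuming base-1024 units and a one-decimal
# format (the real helper's formatting may differ):
def bytes_to_readable_sketch(size):
    "Hypothetical helper: render a byte count as a human readable string"
    size = float(size or 0)
    for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
        if size < 1024.0:
            return '%3.1f%s' % (size, unit)
        size /= 1024.0
    return '%3.1f%s' % (size, 'EB')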
def run(fout, hdir, date, yarn=None, verbose=None):
    """
    Main function to run pyspark job.
    """
    if not date:
        raise Exception("No date is provided")

    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read WMArchive avro data;
    # construct here the hdfs path and pass an empty string as a date
    rdd = avro_rdd(ctx, sqlContext, hdfs_path(hdir, date), date='', verbose=verbose)

    def getdata(row):
        """
        Helper function to extract useful data from WMArchive records.
        You may adjust it to your needs. The given row is a dict object.
        """
        meta = row.get('meta_data', {})
        out = {'host': meta.get('host', ''), 'task': row.get('task', '')}
        for step in row['steps']:
            if step['name'].lower().startswith('cmsrun'):
                site = step.get('site', '')
                output = step.get('output', [])
                perf = step.get('performance', {})
                cpu = perf.get('cpu', {})
                mem = perf.get('memory', {})
                storage = perf.get('storage', {})
                out['ncores'] = cpu['NumberOfStreams']
                out['nthreads'] = cpu['NumberOfThreads']
                out['site'] = site
                out['jobCPU'] = cpu['TotalJobCPU']
                out['jobTime'] = cpu['TotalJobTime']
                out['evtCPU'] = cpu['TotalEventCPU']
                out['evtThroughput'] = cpu['EventThroughput']
                if output:
                    output = output[0]
                    out['appName'] = output.get('applicationName', '')
                    out['appVer'] = output.get('applicationVersion', '')
                    out['globalTag'] = output.get('globalTag', '')
                    out['era'] = output.get('acquisitionEra', '')
                else:
                    out['appName'] = ''
                    out['appVer'] = ''
                    out['globalTag'] = ''
                    out['era'] = ''
                break
        return out

    out = rdd.map(lambda r: getdata(r))
    if verbose:
        print(out.take(1))  # out here is an RDD object

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        # output will be saved as-is, in this case out is an RDD which
        # contains json records, therefore the output will be records
        # coming out of the getdata helper function above
        out.saveAsTextFile(fout)

    ctx.stop()
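# For reference, a sketch of the record shape getdata() expects: the field names
# follow the reads performed above, while all concrete values below are made up
# for illustration and do not come from real WMArchive data.
EXAMPLE_WMARCHIVE_RECORD = {
    'meta_data': {'host': 'some.host.example'},
    'task': '/some/task/path',
    'steps': [{
        'name': 'cmsRun1',
        'site': 'T2_XX_Example',
        'output': [{'applicationName': 'cmsRun', 'applicationVersion': 'CMSSW_X_Y_Z',
                    'globalTag': 'SOME_TAG', 'acquisitionEra': 'SomeEra'}],
        'performance': {
            'cpu': {'NumberOfStreams': 1, 'NumberOfThreads': 4,
                    'TotalJobCPU': 100.0, 'TotalJobTime': 120.0,
                    'TotalEventCPU': 90.0, 'EventThroughput': 5.0},
            'memory': {}, 'storage': {},
        },
    }],
}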
def run(fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    quiet_logs(ctx)
    sqlContext = HiveContext(ctx)
    sqlContext.setConf("spark.sql.files.ignoreCorruptFiles", "true")
    sqlContext.sql("set spark.sql.files.ignoreCorruptFiles=true")

    # date, site, dataset, size, replica_date, groupid
    schema = StructType([
        StructField("date", StringType(), True),
        StructField("site", StringType(), True),
        StructField("dataset", StringType(), True),
        StructField("size", DoubleType(), True),
        StructField("replica_date", StringType(), True),
        StructField("groupid", StringType(), True)
    ])

    df = sqlContext.read.format('com.databricks.spark.csv')\
        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
        .load('hdfs:///cms/phedex/*/*/*/part-*', schema=schema)
    # .load('hdfs:///cms/phedex/2017/03/*/part-00000', schema=schema)

    # Remove all tape sites
    is_tape = lambda site: site.endswith('_MSS') | site.endswith('_Buffer') | site.endswith('_Export')
    df = df.where(is_tape(df.site) == False)

    extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2].split('-')[0])
    extract_tier_udf = udf(lambda dataset: dataset.split('/')[3])
    date_to_timestamp_udf = udf(lambda date: time.mktime(datetime.datetime.strptime(date, "%Y%m%d").timetuple()))
    timestamp_to_date_udf = udf(lambda timestamp: datetime.datetime.fromtimestamp(float(timestamp)).strftime('%Y%m%d'))
    days_delta_udf = udf(lambda t1, t2: (datetime.datetime.fromtimestamp(float(t1)) - datetime.datetime.fromtimestamp(float(t2))).days + 1)
    count_udf = udf(lambda list: len(list))

    df = df.withColumn('campaign', extract_campaign_udf(df.dataset))
    df = df.withColumn('tier', extract_tier_udf(df.dataset))
    df = df.withColumn('date_min', date_to_timestamp_udf(df.date))
    df = df.withColumn('date_max', date_to_timestamp_udf(df.date))
    df = df.withColumn('size_average', df.size)

    df = df.groupBy(['campaign', 'tier'])\
        .agg({'date_min': 'min', 'date_max': 'max', 'date': 'collect_set', 'size_average': 'avg', 'size': 'max'})\
        .withColumnRenamed('min(date_min)', 'date_min')\
        .withColumnRenamed('max(date_max)', 'date_max')\
        .withColumnRenamed('collect_set(date)', 'days_count')\
        .withColumnRenamed('avg(size_average)', 'size_average')\
        .withColumnRenamed('max(size)', 'size_max')

    df = df.withColumn('period_days', days_delta_udf(df.date_max, df.date_min))\
        .withColumn('days_count', count_udf(df.days_count))\
        .withColumn('date_min', timestamp_to_date_udf(df.date_min))\
        .withColumn('date_max', timestamp_to_date_udf(df.date_max))

    df = df.withColumn('existence_in_period', df.days_count / df.period_days)
    df = df.withColumn('average_size_in_period', df.size_average * df.existence_in_period)

    # campaign, tier, date_max, date_min, days_count, size_max, size_average,
    # period_days, existence_in_period, average_size_in_period

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        df.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
def run(fout, date, yarn=None, verbose=None, patterns=None, antipatterns=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    quiet_logs(ctx)
    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read Phedex and DBS tables
    tables = {}
    tables.update(phedex_tables(sqlContext, verbose=verbose, fromdate=fromdate, todate=todate))
    phedex = tables['phedex_df']

    instances = ['GLOBAL', 'PHYS01', 'PHYS02', 'PHYS03']
    for instance in instances:
        dbs_dict = dbs_tables(sqlContext, inst=instance, verbose=verbose)
        for key, val in dbs_dict.items():
            new_key = '%s_%s' % (key, instance)
            tables[new_key] = val
    daf = reduce(lambda a, b: a.unionAll(b), [tables['daf_%s' % x] for x in instances])
    ddf = reduce(lambda a, b: a.unionAll(b), [tables['ddf_%s' % x] for x in instances])
    fdf = reduce(lambda a, b: a.unionAll(b), [tables['fdf_%s' % x] for x in instances])

    dbs_fdf_cols = ['f_dataset_id', 'f_file_size']
    dbs_ddf_cols = ['d_dataset_id', 'd_dataset', 'd_dataset_access_type_id']
    dbs_daf_cols = ['dataset_access_type_id', 'dataset_access_type']

    fdf_df = fdf.select(dbs_fdf_cols)
    ddf_df = ddf.select(dbs_ddf_cols)
    daf_df = daf.select(dbs_daf_cols)

    # Aggregate by campaign and find total PhEDEx and DBS size of each campaign
    extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2])

    # dataset, size, dataset_access_type_id
    dbs_df = fdf_df.join(ddf_df, fdf_df.f_dataset_id == ddf_df.d_dataset_id)\
        .drop('f_dataset_id')\
        .drop('d_dataset_id')\
        .withColumnRenamed('d_dataset', 'dataset')\
        .withColumnRenamed('f_file_size', 'size')\
        .withColumnRenamed('d_dataset_access_type_id', 'dataset_access_type_id')

    # dataset, size, dataset_access_type
    dbs_df = dbs_df.join(daf_df, dbs_df.dataset_access_type_id == daf_df.dataset_access_type_id)\
        .drop(dbs_df.dataset_access_type_id)\
        .drop(daf_df.dataset_access_type_id)

    # campaign, dbs_size
    dbs_df = dbs_df.where(dbs_df.dataset_access_type == 'VALID')\
        .withColumn('campaign', extract_campaign_udf(dbs_df.dataset))\
        .groupBy(['campaign'])\
        .agg({'size': 'sum'})\
        .withColumnRenamed('sum(size)', 'dbs_size')\
        .drop('dataset')

    # campaign, phedex_size
    phedex_cols = ['dataset_name', 'block_bytes']
    phedex_df = phedex.select(phedex_cols)
    phedex_df = phedex_df.withColumn('campaign', extract_campaign_udf(phedex_df.dataset_name))\
        .groupBy(['campaign'])\
        .agg({'block_bytes': 'sum'})\
        .withColumnRenamed('sum(block_bytes)', 'phedex_size')

    # campaign, dbs_size, phedex_size
    dbs_phedex_df = dbs_df.join(phedex_df, dbs_df.campaign == phedex_df.campaign)\
        .drop(dbs_df.campaign)

    print('DISTINCT DBS AND PHEDEX CAMPAIGN COUNT:')
    print(dbs_phedex_df.select('campaign').distinct().count())

    # Select campaign - site pairs and their sizes (from PhEDEx)
    # campaign, site, size
    phedex_cols = ['dataset_name', 'node_name', 'block_bytes']
    campaign_site_df = phedex.select(phedex_cols)
    campaign_site_df = campaign_site_df.withColumn('campaign', extract_campaign_udf(campaign_site_df.dataset_name))\
        .groupBy(['campaign', 'node_name'])\
        .agg({'block_bytes': 'sum'})\
        .withColumnRenamed('sum(block_bytes)', 'size')\
        .withColumnRenamed('node_name', 'site')

    # Aggregate data for site - campaign count table
    # site, count
    site_campaign_count_df = campaign_site_df.groupBy(['site'])\
        .agg(countDistinct('campaign'))\
        .withColumnRenamed('count(campaign)', 'campaign_count')\
        .orderBy('campaign_count', ascending=False)\
        .limit(LIMIT)

    # Find the two most significant sites for each campaign
    columns_before_pivot = campaign_site_df.columns
    result = campaign_site_df.groupBy(['campaign'])\
        .pivot('site')\
        .sum('size')\
        .na.fill(0)
    columns_after_pivot = result.columns
    sites_columns = [x for x in columns_after_pivot if x not in columns_before_pivot]

    number_of_sites_udf = udf(lambda row: len([x for x in row if x != 0]), IntegerType())
    mss_udf = udf(get_mss, LongType())
    second_mss_udf = udf(get_second_mss, LongType())
    mss_name_udf = udf(lambda row: get_mss_name(row, sites_columns), StringType())
    second_mss_name_udf = udf(lambda row: get_second_mss_name(row, sites_columns), StringType())

    result = result.withColumn('sites', number_of_sites_udf(struct([result[x] for x in sites_columns])))\
        .withColumn('mss', mss_udf(struct([result[x] for x in sites_columns])))\
        .withColumn('mss_name', mss_name_udf(struct([result[x] for x in sites_columns])))\
        .withColumn('second_mss', second_mss_udf(struct([result[x] for x in sites_columns])))\
        .withColumn('second_mss_name', second_mss_name_udf(struct([result[x] for x in sites_columns])))

    # campaign, phedex_size, dbs_size, mss, mss_name, second_mss, second_mss_name, sites
    result = result.join(dbs_phedex_df, result.campaign == dbs_phedex_df.campaign)\
        .drop(result.campaign)

    sorted_by_phedex = result.orderBy(result.phedex_size, ascending=False).limit(LIMIT)
    sorted_by_dbs = result.orderBy(result.dbs_size, ascending=False).limit(LIMIT)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        sorted_by_phedex.write.format("com.databricks.spark.csv")\
            .option("header", "true").save('%s/phedex' % fout)
        sorted_by_dbs.write.format("com.databricks.spark.csv")\
            .option("header", "true").save('%s/dbs' % fout)
        site_campaign_count_df.write.format("com.databricks.spark.csv")\
            .option("header", "true").save('%s/site_campaign_count' % fout)

    ctx.stop()
def run(fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    quiet_logs(ctx)
    sqlContext = HiveContext(ctx)
    sqlContext.setConf("spark.sql.files.ignoreCorruptFiles", "true")
    sqlContext.sql("set spark.sql.files.ignoreCorruptFiles=true")

    # date, site, dataset, size, replica_date, groupid
    schema = StructType([
        StructField("date", StringType(), True),
        StructField("site", StringType(), True),
        StructField("dataset", StringType(), True),
        StructField("size", DoubleType(), True),
        StructField("replica_date", StringType(), True),
        StructField("groupid", StringType(), True)
    ])

    df = sqlContext.read.format('com.databricks.spark.csv')\
        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
        .load(PHEDEX_HDFS_URL, schema=schema)

    # Remove all tape sites
    is_tape = lambda site: site.endswith('_MSS') | site.endswith('_Buffer') | site.endswith('_Export')
    df = df.where(is_tape(df.site) == False)

    # Remove all non VALID datasets
    remove_invalid_datasets(df, sqlContext, verbose)

    # Get accesses data frame
    accesses_df = get_dataset_access_dates(sqlContext)

    # extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2].split('-')[0])
    extract_tier_udf = udf(lambda dataset: dataset.split('/')[3])
    days_delta_udf = udf(lambda t1, t2: (datetime.datetime.fromtimestamp(float(t1)) - datetime.datetime.fromtimestamp(float(t2))).days + 1)
    count_udf = udf(lambda list: len(list))
    get_t1_size = udf(lambda size, site: size if site.startswith('T1') else 0)
    get_t2_size = udf(lambda size, site: size if site.startswith('T2') else 0)
    get_t3_size = udf(lambda size, site: size if site.startswith('T3') else 0)

    df = df.withColumn('campaign', get_extract_campaign_udf()(df.dataset))\
        .withColumn('tier', extract_tier_udf(df.dataset))\
        .withColumn('date_min', get_date_to_timestamp_udf()(df.date))\
        .withColumn('date_max', get_date_to_timestamp_udf()(df.date))\
        .withColumn('size_average', df.size)\
        .withColumn('t1_size', get_t1_size(df.size, df.site))\
        .withColumn('t2_size', get_t2_size(df.size, df.site))\
        .withColumn('t3_size', get_t3_size(df.size, df.site))

    df = df.groupBy(['campaign', 'tier'])\
        .agg({'date_min': 'min', 'date_max': 'max', 'date': 'collect_set', 'size_average': 'avg',
              'size': 'max', 't1_size': 'avg', 't2_size': 'avg', 't3_size': 'avg'})\
        .withColumnRenamed('min(date_min)', 'date_min')\
        .withColumnRenamed('max(date_max)', 'date_max')\
        .withColumnRenamed('collect_set(date)', 'days_count')\
        .withColumnRenamed('avg(size_average)', 'size_average')\
        .withColumnRenamed('max(size)', 'size_max')\
        .withColumnRenamed('avg(t1_size)', 't1_size')\
        .withColumnRenamed('avg(t2_size)', 't2_size')\
        .withColumnRenamed('avg(t3_size)', 't3_size')

    df = df.withColumn('period_days', days_delta_udf(df.date_max, df.date_min))\
        .withColumn('days_count', count_udf(df.days_count))\
        .withColumn('date_min', get_timestamp_to_date_udf()(df.date_min))\
        .withColumn('date_max', get_timestamp_to_date_udf()(df.date_max))

    df = df.withColumn('existence_in_period', df.days_count / df.period_days)
    df = df.withColumn('average_size_in_period', df.size_average * df.existence_in_period)
    df.show(100, truncate=False)

    # campaign, tier, date_max, date_min, days_count, size_max, size_average, period_days,
    # existence_in_period, average_size_in_period, t1_size, t2_size, t3_size, last_access_date
    df = df.join(accesses_df, 'campaign')
    df.show(100, truncate=False)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        df.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
def get_spark_session(yarn=True, verbose=False):
    """
    Get or create the spark context and session.
    """
    sc = spark_context("cms-eos-dataset", yarn, verbose)
    return SparkSession.builder.config(conf=sc._conf).getOrCreate()
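# Example usage of get_spark_session (a sketch: the input path is a placeholder
# and a local, non-YARN test run is assumed):
def example_session_usage():
    "Obtain a session, read a JSON dataset and inspect its schema"
    spark = get_spark_session(yarn=False, verbose=True)
    df = spark.read.json('hdfs:///path/to/data')  # placeholder path
    df.printSchema()
    spark.stop()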
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    print("Input arguments: %s" % opts)

    start_time = time.time()
    verbose = opts.verbose
    yarn = opts.yarn
    inst = opts.inst
    date = opts.date
    fout = opts.fout
    aaa_hdir = opts.aaa_hdir

    if inst.lower() in ['global', 'phys01', 'phys02', 'phys03']:
        inst = inst.upper()
    else:
        raise Exception('Unsupported DBS instance "%s"' % inst)

    # Create spark context
    ctx = spark_context('cms', yarn, verbose)
    quiet_logs(ctx)

    # Create SQL context to be used for SQL queries
    sql_context = SQLContext(ctx)

    # Initialize DBS tables
    dbs_tables(sql_context, inst=inst, verbose=verbose, tables=['fdf', 'bdf', 'ddf'])

    # Initialize PhEDEx table to be used in file_block_site table
    phedex_tables(sql_context, verbose=verbose)

    # Register UDFs to be used with SQL queries
    sql_context.udf.register("clean_site_name", clean_site_name)
    sql_context.udf.register("tier_from_site_name", tier_from_site_name)
    sql_context.udf.register("dn2uuid", dn2uuid)
    sql_context.udf.register("parse_app", parse_app)
    sql_context.udf.register("stream4app", stream4app)
    sql_context.udf.register("parse_dn", parse_dn)

    f_b_s_start_time = time.time()
    # Create temp table with file name, block name, site name and site from PhEDEx
    create_file_block_site_table(ctx, sql_context, verbose)
    f_b_s_elapsed_time = elapsed_time(f_b_s_start_time)

    cmssw_start_time = time.time()
    aggregated_cmssw_df = run_agg_cmssw(date, ctx, sql_context, verbose)
    cmssw_elapsed_time = elapsed_time(cmssw_start_time)

    aaa_start_time = time.time()
    if len(aaa_hdir) > 0:
        aggregated_aaa_df = run_agg_aaa(date, ctx, sql_context, aaa_hdir, verbose)
    else:
        aggregated_aaa_df = run_agg_aaa(date, ctx, sql_context, verbose=verbose)
    aaa_elapsed_time = elapsed_time(aaa_start_time)

    eos_start_time = time.time()
    aggregated_eos_df = run_agg_eos(date, ctx, sql_context, verbose)
    eos_elapsed_time = elapsed_time(eos_start_time)

    jm_start_time = time.time()
    aggregated_jm_df = run_agg_jm(date, ctx, sql_context, verbose)
    jm_elapsed_time = elapsed_time(jm_start_time)

    if verbose:
        print('Will union outputs from all streams to a single dataframe')
    # Schema for output is:
    # site name, dataset name, number of accesses, distinct users, stream
    all_df = aggregated_cmssw_df.unionAll(aggregated_aaa_df)
    all_df = all_df.unionAll(aggregated_eos_df)
    all_df = all_df.unionAll(aggregated_jm_df)
    all_df = all_df.sort(desc("nacc"))

    if verbose:
        print('Done joining all outputs to a single dataframe')

    fout = fout + "/" + short_date_string(date)

    # output_dataframe(fout + "/Aggregated/CMSSW/" + short_date_string(date), aggregated_cmssw_df, verbose)
    # output_dataframe(fout + "/Aggregated/AAA/" + short_date_string(date), aggregated_aaa_df, verbose)
    # output_dataframe(fout + "/Aggregated/EOS/" + short_date_string(date), aggregated_eos_df, verbose)
    # output_dataframe(fout + "/Aggregated/JobMonitoring/" + short_date_string(date), aggregated_jm_df, verbose)
    output_dataframe(fout, all_df, verbose)

    if verbose:
        cmssw_df_size = aggregated_cmssw_df.count()
        aaa_df_size = aggregated_aaa_df.count()
        eos_df_size = aggregated_eos_df.count()
        jm_df_size = aggregated_jm_df.count()
        all_df_size = all_df.count()

        print('CMSSW:')
        aggregated_cmssw_df.show(10)
        aggregated_cmssw_df.printSchema()

        print('AAA:')
        aggregated_aaa_df.show(10)
        aggregated_aaa_df.printSchema()

        print('EOS:')
        aggregated_eos_df.show(10)
        aggregated_eos_df.printSchema()

        print('JobMonitoring:')
        aggregated_jm_df.show(10)
        aggregated_jm_df.printSchema()

        print('Aggregated all:')
        all_df.show(10)
        all_df.printSchema()

        print('Output record count:')
        print('Output record count CMSSW         : %s' % cmssw_df_size)
        print('Output record count AAA           : %s' % aaa_df_size)
        print('Output record count EOS           : %s' % eos_df_size)
        print('Output record count JobMonitoring : %s' % jm_df_size)
        print('Output record count Total         : %s' % all_df_size)

    ctx.stop()

    print('Start time                 : %s' % time.strftime('%Y-%m-%d %H:%M:%S GMT', time.gmtime(start_time)))
    print('End time                   : %s' % time.strftime('%Y-%m-%d %H:%M:%S GMT', time.gmtime(time.time())))
    print('Total elapsed time         : %s' % elapsed_time(start_time))
    print('FileBlockSite elapsed time : %s' % f_b_s_elapsed_time)
    print('AAA elapsed time           : %s' % aaa_elapsed_time)
    print('CMSSW elapsed time         : %s' % cmssw_elapsed_time)
    print('EOS elapsed time           : %s' % eos_elapsed_time)
    print('JobMonitoring elapsed time : %s' % jm_elapsed_time)
def run(date, fout, yarn=None, verbose=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define spark context, the main object that allows us to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read ASO and FTS tables
    date = aso_date(date)
    tables = {}
    tables.update(fts_tables(sqlContext, date=date, verbose=verbose))
    fts_df = tables['fts_df']  # fts table
    print_rows(fts_df, 'fts_df', verbose)
    tables.update(aso_tables(sqlContext, verbose=verbose))
    aso_df = tables['aso_df']  # aso table
    print_rows(aso_df, 'aso_df', verbose)

    fts = fts_df.select(['job_metadata.issuer', 'job_id', 'src_url',
                         't_final_transfer_state', 'tr_timestamp_start',
                         'tr_timestamp_complete'])
    fts = fts.filter("issuer = 'ASO'")
    fts_udf = udf(lambda x: x.split("/")[-1], StringType())
    fts = fts.withColumn("filename", fts_udf(fts.src_url))
    fts = fts.withColumn("fts_duration", (fts.tr_timestamp_complete - fts.tr_timestamp_start) * 1. / 1000)

    aso = aso_df.select(['tm_source_lfn', 'tm_fts_id', 'tm_jobid', 'tm_type',
                         'tm_last_update', 'tm_start_time', 'tm_transfer_state',
                         'tm_source', 'tm_destination', 'tm_transfer_retry_count',
                         'tm_publish']).withColumnRenamed('tm_fts_id', 'job_id')
    aso_udf = udf(lambda x: x.split("/")[-1], StringType())
    aso = aso.withColumn("filename", aso_udf(aso.tm_source_lfn))
    aso = aso.filter((col("tm_transfer_state") == 3) | (col("tm_transfer_state") == 2))
    aso = aso.filter((aso.tm_transfer_state.isNotNull()))
    aso = aso.filter((aso.job_id.isNotNull()))

    new_df = fts.join(aso, on=['filename', 'job_id'], how='left_outer')

    new_df = new_df.groupby(['job_id', 'tm_transfer_state', 'tm_publish', 'tm_transfer_retry_count'])\
        .agg(
            count(lit(1)).alias('Num Of Records'),
            mean(aso.tm_last_update - aso.tm_start_time).alias('Aso duration'),
            mean(new_df.tr_timestamp_start * 1. / 1000 - new_df.tm_start_time).alias('Aso delay start'),
            mean(new_df.tm_last_update - new_df.tr_timestamp_complete * 1. / 1000).alias('Aso delay'),
            mean(new_df.fts_duration).alias('fts duration'),
        )
    # keep table around
    new_df.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        new_df.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
def run(fout, date, yarn=None, verbose=None, patterns=None, antipatterns=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    # define spark context, it's main object which allow to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    quiet_logs(ctx)
    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(phedex_tables(sqlContext, verbose=verbose, fromdate=fromdate, todate=todate))
    phedex = tables['phedex_df']
    daf = tables['daf']
    ddf = tables['ddf']
    fdf = tables['fdf']

    # DBS
    dbs_fdf_cols = ['f_dataset_id', 'f_file_size']
    dbs_ddf_cols = ['d_dataset_id', 'd_dataset', 'd_dataset_access_type_id']
    dbs_daf_cols = ['dataset_access_type_id', 'dataset_access_type']
    fdf_df = fdf.select(dbs_fdf_cols)
    ddf_df = ddf.select(dbs_ddf_cols)
    daf_df = daf.select(dbs_daf_cols)

    # dataset, dbs_size, dataset_access_type_id
    dbs_df = fdf_df.join(ddf_df, fdf_df.f_dataset_id == ddf_df.d_dataset_id)\
                   .drop('f_dataset_id')\
                   .drop('d_dataset_id')\
                   .withColumnRenamed('d_dataset', 'dataset')\
                   .withColumnRenamed('f_file_size', 'size')\
                   .withColumnRenamed('d_dataset_access_type_id', 'dataset_access_type_id')

    # dataset, size, dataset_access_type
    dbs_df = dbs_df.join(daf_df, dbs_df.dataset_access_type_id == daf_df.dataset_access_type_id)\
                   .drop(dbs_df.dataset_access_type_id)\
                   .drop(daf_df.dataset_access_type_id)

    # dataset, dbs_size
    dbs_df = dbs_df.where(dbs_df.dataset_access_type == 'VALID')\
                   .groupBy('dataset')\
                   .agg({'size':'sum'})\
                   .withColumnRenamed('sum(size)', 'dbs_size')

    # PhEDEx
    size_on_disk_udf = udf(lambda site, size: 0 if site.endswith(('_MSS', '_Buffer', '_Export')) else size)

    # dataset, size, site
    phedex_cols = ['dataset_name', 'block_bytes', 'node_name']
    phedex_df = phedex.select(phedex_cols)\
                      .withColumnRenamed('dataset_name', 'dataset')\
                      .withColumnRenamed('block_bytes', 'size')\
                      .withColumnRenamed('node_name', 'site')

    # dataset, phedex_size, size_on_disk
    phedex_df = phedex_df.withColumn('size_on_disk', size_on_disk_udf(phedex_df.site, phedex_df.size))\
                         .groupBy('dataset')\
                         .agg({'size':'sum', 'size_on_disk': 'sum'})\
                         .withColumnRenamed('sum(size)', 'phedex_size')\
                         .withColumnRenamed('sum(size_on_disk)', 'size_on_disk')

    # dataset, dbs_size, phedex_size, size_on_disk
    result = phedex_df.join(dbs_df, phedex_df.dataset == dbs_df.dataset)\
                      .drop(dbs_df.dataset)

    extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2])
    extract_tier_udf = udf(lambda dataset: dataset.split('/')[3])

    # campaign, tier, dbs_size, phedex_size, size_on_disk
    result = result.withColumn('campaign', extract_campaign_udf(result.dataset))\
                   .withColumn('tier', extract_tier_udf(result.dataset))\
                   .drop('dataset')\
                   .groupBy(['campaign', 'tier'])\
                   .agg({'dbs_size':'sum', 'phedex_size': 'sum', 'size_on_disk': 'sum'})\
                   .withColumnRenamed('sum(dbs_size)', 'dbs_size')\
                   .withColumnRenamed('sum(phedex_size)', 'phedex_size')\
                   .withColumnRenamed('sum(size_on_disk)', 'size_on_disk')

    # campaign, tier, dbs_size, phedex_size, size_on_disk
    result = result.withColumn('sum_size', result.dbs_size + result.phedex_size)
    result = result.orderBy(result.sum_size, ascending=False)\
                   .drop('sum_size')\
                   .limit(LIMIT)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        result.write.format("com.databricks.spark.csv")\
              .option("header", "true").save(fout)

    ctx.stop()
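# The size_on_disk_udf above returns Spark's default UDF type (string), so the later sum()
# relies on implicit casting. Below is a minimal sketch of the same "zero out tape/buffer
# sites" rule expressed with when/otherwise on native numeric columns; the site names and
# sizes here are hypothetical sample values, not PhEDEx data.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

def size_on_disk_sketch():
    "Count sites ending in _MSS/_Buffer/_Export as 0 bytes on disk, then aggregate"
    spark = SparkSession.builder.appName('size_on_disk_sketch').getOrCreate()
    df = spark.createDataFrame(
        [('T1_US_FNAL_MSS', 100), ('T2_CH_CERN', 50)], ['site', 'size'])
    df = df.withColumn(
        'size_on_disk',
        when(col('site').rlike('_(MSS|Buffer|Export)$'), 0).otherwise(col('size')))
    # total size vs. total size actually resident on disk
    df.groupBy().sum('size', 'size_on_disk').show()
    spark.stop()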
def run(fout, yarn=None, verbose=None, patterns=None, antipatterns=None, inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    # define spark context, it's main object which allow to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(phedex_tables(sqlContext, verbose=verbose))
    phedex_df = tables['phedex_df']
    daf = tables['daf']
    ddf = tables['ddf']
    bdf = tables['bdf']
    fdf = tables['fdf']
    aef = tables['aef']
    pef = tables['pef']
    mcf = tables['mcf']
    ocf = tables['ocf']
    rvf = tables['rvf']
    print("### ddf from main", ddf)

    # aggregate phedex info into dataframe
    phedex_cols = ['node_name', 'dataset_name', 'dataset_is_open', 'block_bytes', 'replica_time_create']
    newpdf = phedex_df.select(phedex_cols).groupBy(['node_name', 'dataset_name', 'dataset_is_open'])\
        .agg({'block_bytes':'sum', 'replica_time_create':'max'})\
        .withColumnRenamed('sum(block_bytes)', 'pbr_size')\
        .withColumnRenamed('max(replica_time_create)', 'max_replica_time')
    newpdf.registerTempTable('newpdf')
    # print_rows(newpdf, 'newpdf', verbose)
    # newpdf.persist(StorageLevel.MEMORY_AND_DISK)

    # join tables
    cols = ['*'] # to select all fields from table
    cols = ['d_dataset_id', 'd_dataset', 'd_creation_date', 'd_is_dataset_valid',
            'f_event_count', 'f_file_size', 'dataset_access_type',
            'acquisition_era_name', 'processing_version']

    # join tables
    stmt = 'SELECT %s FROM ddf JOIN fdf on ddf.d_dataset_id = fdf.f_dataset_id JOIN daf ON ddf.d_dataset_access_type_id = daf.dataset_access_type_id JOIN aef ON ddf.d_acquisition_era_id = aef.acquisition_era_id JOIN pef ON ddf.d_processing_era_id = pef.processing_era_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)
    # print_rows(joins, 'joins', verbose)

    # keep joins table around
    # joins.persist(StorageLevel.MEMORY_AND_DISK)

    # construct conditions
    cond = 'dataset_access_type = "VALID" AND d_is_dataset_valid = 1'
    fjoin = joins.where(cond).distinct().select(cols)

    # at this step we have fjoin table with rows such as
    # Row(d_dataset_id=9413359, d_dataset=u'/SingleMu/CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000/DQM',
    #     d_creation_date=1406060166.0, d_is_dataset_valid=1, f_event_count=5318, f_file_size=21132638.0,
    #     dataset_access_type=u'DELETED', acquisition_era_name=u'CMSSW_7_1_0_pre9', processing_version=u'6000')

    newdf = fjoin\
        .groupBy(['d_dataset', 'd_dataset_id', 'dataset_access_type', 'acquisition_era_name', 'processing_version'])\
        .agg({'f_event_count':'sum', 'f_file_size':'sum', 'd_creation_date':'max'})\
        .withColumnRenamed('sum(f_event_count)', 'evts')\
        .withColumnRenamed('sum(f_file_size)', 'size')\
        .withColumnRenamed('max(d_creation_date)', 'date')

    # at this point we have newdf dataframe with our collected stats for every dataset
    # let's join it with release info
    newdf.registerTempTable('newdf')
    cols = ['d_dataset_id', 'd_dataset', 'evts', 'size', 'date', 'dataset_access_type',
            'acquisition_era_name', 'processing_version', 'r_release_version']
    stmt = 'SELECT %s FROM newdf JOIN mcf ON newdf.d_dataset_id = mcf.mc_dataset_id JOIN ocf ON mcf.mc_output_mod_config_id = ocf.oc_output_mod_config_id JOIN rvf ON ocf.oc_release_version_id = rvf.r_release_version_id' % ','.join(cols)
    agg_dbs_df = sqlContext.sql(stmt)
    agg_dbs_df.registerTempTable('agg_dbs_df')
    # print_rows(agg_dbs_df, 'agg_dbs_df', verbose)

    # keep agg_dbs_df table around
    # agg_dbs_df.persist(StorageLevel.MEMORY_AND_DISK)

    # join dbs and phedex tables
    # cols = ['d_dataset_id','d_dataset','evts','size','date','dataset_access_type','acquisition_era_name','processing_version','r_release_version','dataset_name','node_name','pbr_size','dataset_is_open','max_replica_time']
    cols = ['d_dataset', 'evts', 'size', 'date', 'dataset_access_type', 'acquisition_era_name',
            'r_release_version', 'node_name', 'pbr_size', 'dataset_is_open', 'max_replica_time']
    stmt = 'SELECT %s FROM agg_dbs_df JOIN newpdf ON agg_dbs_df.d_dataset = newpdf.dataset_name' % ','.join(cols)
    finaldf = sqlContext.sql(stmt)

    # keep finaldf table around
    # finaldf.persist(StorageLevel.MEMORY_AND_DISK)
    # print_rows(finaldf, stmt, verbose)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        ndf = split_dataset(finaldf, 'd_dataset')
        ndf.write.format("com.databricks.spark.csv")\
           .option("header", "true").save(fout)

    ctx.stop()
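# split_dataset() is a helper defined elsewhere in this codebase. The sketch below is a
# rough, hypothetical approximation of what such a split can look like, assuming the usual
# /primary/processed/tier dataset naming convention used by the campaign/tier extraction
# above; it is not the project's actual implementation.
from pyspark.sql import SparkSession
from pyspark.sql.functions import split

def split_dataset_sketch():
    "Split a d_dataset column into primary dataset, processed dataset and data tier"
    spark = SparkSession.builder.appName('split_dataset_sketch').getOrCreate()
    df = spark.createDataFrame(
        [('/SingleMu/CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000/DQM',)],
        ['d_dataset'])
    parts = split(df.d_dataset, '/')
    # '/prim/proc/tier'.split('/') -> ['', 'prim', 'proc', 'tier'], hence indices 1..3
    df = df.withColumn('primds', parts.getItem(1))\
           .withColumn('procds', parts.getItem(2))\
           .withColumn('tier', parts.getItem(3))
    df.show(truncate=False)
    spark.stop()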