def run_eos(date, fout, ctx, sql_context, verbose=False):
    if verbose:
        print('Starting EOS part')

    # Create fout by adding stream name and date paths
    fout = fout + "/EOS/" + short_date_string(date)
    # Convert date
    date = short_date_string(date)
    # Create EOS tables in sql_context
    eos_tables(sql_context, date=date, verbose=verbose)

    if verbose:
        print('Will build query for EOS and DBS tables')

    # Attribute coverage ("+" means available in this stream):
    # - file name       +
    # - file size       +
    # - primds          +
    # - procds          +
    # - tier            +
    # - site name
    # - file replicas
    # - user dn         +
    # - start/end time  +
    # - read bytes
    # - cpu/wc values
    # - source: eos     +

    # EOS columns
    # The same timestamp is used for both start and end times
    eos_cols = ['file_lfn AS file_name',
                'user_dn',
                '"eos" AS source',
                'timestamp AS start_time',
                'timestamp AS end_time']

    # DBS columns
    ddf_cols = ['d_dataset']
    fdf_cols = ['f_file_size AS file_size']

    # Concatenate the lists of column names (i.e. use all columns from each list)
    cols = eos_cols + ddf_cols + fdf_cols

    # Build a query with "cols" columns. Join DDF, FDF and EOS tables
    query = ("SELECT %s FROM ddf "
             "JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id "
             "JOIN eos_df ON fdf.f_logical_file_name = eos_df.file_lfn") % ','.join(cols)

    result = run_query(query, sql_context, verbose)

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset(result, 'd_dataset')

    output_dataframe(fout, result, verbose)

    if verbose:
        print('Finished EOS part')
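
# The run_query helper used above is defined elsewhere in this module. A
# minimal sketch of what it is expected to do (illustrative only; the real
# helper may additionally time the query or pretty-print results when verbose):
def run_query_sketch(query, sql_context, verbose=False):
    """Execute an SQL query in the given sql_context and return the dataframe."""
    if verbose:
        print('Executing query: %s' % query)
    return sql_context.sql(query)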
def run_aaa(date, fout, ctx, sql_context, verbose=False):
    if verbose:
        print('Starting AAA part')

    # Create fout by adding stream name and date paths
    fout = fout + "/AAA/" + short_date_string(date)
    # Convert date
    date = short_date_string(date)
    # Create AAA tables in sql_context
    aaa_tables(sql_context, date=date, verbose=verbose)

    if verbose:
        print('Will build query for AAA and DBS tables')

    # Attribute coverage ("+" means available in this stream):
    # - file name       +
    # - file size       +
    # - primds          +
    # - procds          +
    # - tier            +
    # - site name
    # - file replicas
    # - user dn         +
    # - start/end time  +
    # - read bytes      +
    # - cpu/wc values
    # - source: xrootd  +

    # AAA columns
    aaa_cols = ['file_lfn AS file_name',
                'file_size',
                'user_dn',
                'start_time',
                'end_time',
                'read_bytes',
                '"xrootd" AS source']

    # DBS columns
    ddf_cols = ['d_dataset']

    # Concatenate the lists of column names (i.e. use all columns from each list)
    cols = aaa_cols + ddf_cols

    # Build a query with "cols" columns. Join DDF, FDF and AAA tables
    query = ("SELECT %s FROM ddf "
             "JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id "
             "JOIN aaa_df ON fdf.f_logical_file_name = aaa_df.file_lfn") % ','.join(cols)

    result = run_query(query, sql_context, verbose)

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset(result, 'd_dataset')

    output_dataframe(fout, result, verbose)

    if verbose:
        print('Finished AAA part')
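
# split_dataset is a shared helper; below is a minimal sketch of its expected
# behavior (an illustrative re-implementation, not necessarily the one this
# module uses): split a dataset path "/primds/procds/tier" into three columns.
from pyspark.sql.functions import col, split

def split_dataset_sketch(df, dcol):
    """Replace df[dcol] of the form "/primds/procds/tier" with three columns."""
    ndf = df.withColumn('primds', split(col(dcol), '/').getItem(1))\
            .withColumn('procds', split(col(dcol), '/').getItem(2))\
            .withColumn('tier', split(col(dcol), '/').getItem(3))
    return ndf.drop(dcol)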
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run a PySpark job. It requires a schema file, an HDFS
    directory with data, and an optional script with mapper/reducer functions.
    """
    # Define the Spark context; it is the main object which allows us to
    # communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    ddf = tables['ddf']  # dataset table
    fdf = tables['fdf']  # file table

    if verbose:
        for row in ddf.head(1):
            print("### ddf row", row)

    # read EOS avro rdd
    date = eos_date(date)
    tables.update(eos_tables(sqlContext, date=date, verbose=verbose))
    eos_df = tables['eos_df']  # EOS table

    if verbose:
        for row in eos_df.head(1):
            print("### eos_df row", row)

    # merge DBS and EOS data
    cols = ['d_dataset', 'd_dataset_id', 'f_logical_file_name', 'file_lfn']
    stmt = 'SELECT %s FROM ddf JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id JOIN eos_df ON fdf.f_logical_file_name = eos_df.file_lfn' % ','.join(cols)
    joins = sqlContext.sql(stmt)
    print_rows(joins, stmt, verbose)

    # perform aggregation
    fjoin = joins.groupBy(['d_dataset'])\
            .agg({'file_lfn': 'count'})\
            .withColumnRenamed('count(file_lfn)', 'count')\
            .withColumnRenamed('d_dataset', 'dataset')\
            .withColumn('date', lit(eos_date_unix(date)))\
            .withColumn('count_type', lit('eos'))

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write results back to HDFS; the fout parameter defines the area on HDFS,
    # either an absolute path or an area under /user/USERNAME
    if fout:
        ndf = split_dataset(fjoin, 'dataset')
        ndf.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
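
# A quick sanity check one might run after the job above: read the CSV written
# to fout back into a dataframe (the fout value here is hypothetical; assumes
# the same Spark/Hive context setup as in run()):
def read_back_sketch(sqlContext, fout='hdfs:///path/to/eos/output'):
    """Read the CSV output of run() back and show a few rows."""
    df = sqlContext.read.format('com.databricks.spark.csv')\
        .option('header', 'true').load(fout)
    df.show(5)
    return df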
def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run a PySpark job. It requires a schema file, an HDFS
    directory with data, and an optional script with mapper/reducer functions.
    """
    # Define the Spark context; it is the main object which allows us to
    # communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and JobMonitoring tables
    tables = {}

    # read JobMonitoring avro rdd
    date = jm_date(date)
    jm_df = jm_tables(ctx, sqlContext, date=date, verbose=verbose)

    # DBS tables
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    ddf = tables['ddf']  # dataset table
    fdf = tables['fdf']  # file table

    # merge DBS and JobMonitoring data
    cols = ['d_dataset', 'd_dataset_id', 'f_logical_file_name', 'FileName',
            'FileType', 'Type', 'SiteName', 'WrapWC', 'WrapCPU',
            'JobExecExitCode']
    stmt = 'SELECT %s FROM ddf JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id JOIN jm_df ON fdf.f_logical_file_name = jm_df.FileName' % ','.join(cols)
    joins = sqlContext.sql(stmt)
    print_rows(joins, stmt, verbose)

    # perform aggregation
    fjoin = joins.groupBy(['SiteName', 'JobExecExitCode', 'FileType', 'Type', 'd_dataset'])\
            .agg({'WrapWC': 'sum', 'WrapCPU': 'sum', 'JobExecExitCode': 'count', 'FileType': 'count', 'Type': 'count'})\
            .withColumnRenamed('sum(WrapWC)', 'tot_wc')\
            .withColumnRenamed('sum(WrapCPU)', 'tot_cpu')\
            .withColumnRenamed('count(JobExecExitCode)', 'ecode_count')\
            .withColumnRenamed('count(FileType)', 'file_type_count')\
            .withColumnRenamed('count(Type)', 'type_count')\
            .withColumnRenamed('d_dataset', 'dataset')\
            .withColumn('date', lit(jm_date_unix(date)))\
            .withColumn('count_type', lit('jm'))

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write results back to HDFS; the fout parameter defines the area on HDFS,
    # either an absolute path or an area under /user/USERNAME
    if fout:
        ndf = split_dataset(fjoin, 'dataset')
        ndf.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
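
# For readers unfamiliar with the dict-style agg above: an equivalent
# formulation with explicit column functions and aliases (a sketch; it yields
# the same columns without the withColumnRenamed chain):
from pyspark.sql import functions as F

def jm_aggregate_sketch(joins, date):
    """Aggregate the DBS+JobMonitoring join with explicit aliases."""
    return joins.groupBy(['SiteName', 'JobExecExitCode', 'FileType', 'Type', 'd_dataset'])\
        .agg(F.sum('WrapWC').alias('tot_wc'),
             F.sum('WrapCPU').alias('tot_cpu'),
             F.count('JobExecExitCode').alias('ecode_count'),
             F.count('FileType').alias('file_type_count'),
             F.count('Type').alias('type_count'))\
        .withColumnRenamed('d_dataset', 'dataset')\
        .withColumn('date', lit(jm_date_unix(date)))\
        .withColumn('count_type', lit('jm'))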
def run(fout, yarn=None, verbose=None, patterns=None, antipatterns=None, inst='GLOBAL'):
    """
    Main function to run a PySpark job. It requires a schema file, an HDFS
    directory with data, and an optional script with mapper/reducer functions.
    """
    # Define the Spark context; it is the main object which allows us to
    # communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and PhEDEx tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(phedex_tables(sqlContext, verbose=verbose))
    phedex_df = tables['phedex_df']
    daf = tables['daf']
    ddf = tables['ddf']
    bdf = tables['bdf']
    fdf = tables['fdf']
    aef = tables['aef']
    pef = tables['pef']
    mcf = tables['mcf']
    ocf = tables['ocf']
    rvf = tables['rvf']
    print("### ddf from main", ddf)

    # aggregate PhEDEx info into a dataframe
    phedex_cols = ['node_name', 'dataset_name', 'dataset_is_open',
                   'block_bytes', 'replica_time_create']
    newpdf = phedex_df.select(phedex_cols)\
            .groupBy(['node_name', 'dataset_name', 'dataset_is_open'])\
            .agg({'block_bytes': 'sum', 'replica_time_create': 'max'})\
            .withColumnRenamed('sum(block_bytes)', 'pbr_size')\
            .withColumnRenamed('max(replica_time_create)', 'max_replica_time')
    newpdf.registerTempTable('newpdf')
    # print_rows(newpdf, 'newpdf', verbose)
    # newpdf.persist(StorageLevel.MEMORY_AND_DISK)

    # columns to select; use cols = ['*'] to select all fields from the tables
    cols = ['d_dataset_id', 'd_dataset', 'd_creation_date', 'd_is_dataset_valid',
            'f_event_count', 'f_file_size', 'dataset_access_type',
            'acquisition_era_name', 'processing_version']

    # join tables
    stmt = 'SELECT %s FROM ddf JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id JOIN daf ON ddf.d_dataset_access_type_id = daf.dataset_access_type_id JOIN aef ON ddf.d_acquisition_era_id = aef.acquisition_era_id JOIN pef ON ddf.d_processing_era_id = pef.processing_era_id' % ','.join(cols)
    print(stmt)
    joins = sqlContext.sql(stmt)
    # print_rows(joins, 'joins', verbose)
    # keep joins table around
    # joins.persist(StorageLevel.MEMORY_AND_DISK)

    # construct conditions to keep only valid datasets
    cond = 'dataset_access_type = "VALID" AND d_is_dataset_valid = 1'
    fjoin = joins.where(cond).distinct().select(cols)

    # at this step the fjoin table contains rows such as
    # Row(d_dataset_id=9413359, d_dataset=u'/SingleMu/CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000/DQM', d_creation_date=1406060166.0, d_is_dataset_valid=1, f_event_count=5318, f_file_size=21132638.0, dataset_access_type=u'DELETED', acquisition_era_name=u'CMSSW_7_1_0_pre9', processing_version=u'6000')
    newdf = fjoin\
            .groupBy(['d_dataset', 'd_dataset_id', 'dataset_access_type', 'acquisition_era_name', 'processing_version'])\
            .agg({'f_event_count': 'sum', 'f_file_size': 'sum', 'd_creation_date': 'max'})\
            .withColumnRenamed('sum(f_event_count)', 'evts')\
            .withColumnRenamed('sum(f_file_size)', 'size')\
            .withColumnRenamed('max(d_creation_date)', 'date')

    # at this point newdf holds the collected stats for every dataset;
    # join it with the release info
    newdf.registerTempTable('newdf')
    cols = ['d_dataset_id', 'd_dataset', 'evts', 'size', 'date',
            'dataset_access_type', 'acquisition_era_name',
            'processing_version', 'r_release_version']
    stmt = 'SELECT %s FROM newdf JOIN mcf ON newdf.d_dataset_id = mcf.mc_dataset_id JOIN ocf ON mcf.mc_output_mod_config_id = ocf.oc_output_mod_config_id JOIN rvf ON ocf.oc_release_version_id = rvf.r_release_version_id' % ','.join(cols)
    agg_dbs_df = sqlContext.sql(stmt)
    agg_dbs_df.registerTempTable('agg_dbs_df')
    # print_rows(agg_dbs_df, 'agg_dbs_df', verbose)
    # keep agg_dbs_df table around
    # agg_dbs_df.persist(StorageLevel.MEMORY_AND_DISK)

    # join DBS and PhEDEx tables
    # cols = ['d_dataset_id', 'd_dataset', 'evts', 'size', 'date', 'dataset_access_type', 'acquisition_era_name', 'processing_version', 'r_release_version', 'dataset_name', 'node_name', 'pbr_size', 'dataset_is_open', 'max_replica_time']
    cols = ['d_dataset', 'evts', 'size', 'date', 'dataset_access_type',
            'acquisition_era_name', 'r_release_version', 'node_name',
            'pbr_size', 'dataset_is_open', 'max_replica_time']
    stmt = 'SELECT %s FROM agg_dbs_df JOIN newpdf ON agg_dbs_df.d_dataset = newpdf.dataset_name' % ','.join(cols)
    finaldf = sqlContext.sql(stmt)
    # keep finaldf table around
    # finaldf.persist(StorageLevel.MEMORY_AND_DISK)
    # print_rows(finaldf, stmt, verbose)

    # write results back to HDFS; the fout parameter defines the area on HDFS,
    # either an absolute path or an area under /user/USERNAME
    if fout:
        ndf = split_dataset(finaldf, 'd_dataset')
        ndf.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
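
# The string condition used above can also be expressed with the DataFrame
# API; a sketch with identical semantics, shown only to clarify the filter:
from pyspark.sql.functions import col

def valid_datasets_sketch(joins):
    """Keep only valid datasets, as cond does above."""
    return joins.where((col('dataset_access_type') == 'VALID') &
                       (col('d_is_dataset_valid') == 1)).distinct()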
def run_jm(date, fout, ctx, sql_context, verbose=False):
    if verbose:
        print('Starting JobMonitoring part')

    # Create fout by adding stream name and date paths
    fout = fout + "/CRAB/" + short_date_string(date)
    # Convert date
    date = long_date_string(date)
    # Create JobMonitoring tables in sql_context
    jm_tables(ctx, sql_context, date=date, verbose=verbose)

    if verbose:
        print('Will build query for JM and DBS tables')

    # Attribute coverage ("+" means available in this stream):
    # - file name       +
    # - file size       +
    # - primds          +
    # - procds          +
    # - tier            +
    # - site name       +
    # - file replicas
    # - user dn
    # - start/end time  +
    # - read bytes
    # - cpu/wc values   +
    # - source: crab    +

    # JobMonitoring (CRAB) columns
    # WrapCPU is used for the cpu value; records also carry ExeCPU
    jm_cols = ['FileName AS file_name',
               'SiteName AS site_name',
               'WrapWC AS wc',
               'WrapCPU AS cpu',
               'StartedRunningTimeStamp AS start_time',
               'FinishedTimeStamp AS end_time',
               '"crab" AS source']

    # DBS columns
    ddf_cols = ['d_dataset']
    fdf_cols = ['f_file_size AS file_size']

    # Concatenate the lists of column names (i.e. use all columns from each list)
    cols = jm_cols + ddf_cols + fdf_cols

    # Build a query with "cols" columns. Join DDF, FDF and JobMonitoring tables
    query = ("SELECT %s FROM ddf "
             "JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id "
             "JOIN jm_df ON fdf.f_logical_file_name = jm_df.FileName") % ','.join(cols)

    result = run_query(query, sql_context, verbose)

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset(result, 'd_dataset')

    output_dataframe(fout, result, verbose)

    if verbose:
        print('Finished JobMonitoring part')
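
# output_dataframe is another shared helper; a hedged sketch of what it likely
# does, modeled on the CSV writes in the run() functions above (illustrative;
# the real helper may add more options or checks):
def output_dataframe_sketch(fout, df, verbose=False):
    """Write a dataframe to HDFS as CSV with a header row."""
    if fout:
        df.write.format('com.databricks.spark.csv')\
            .option('header', 'true').save(fout)
        if verbose:
            print('Wrote dataframe to %s' % fout)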
def run_cmssw(date, fout, ctx, sql_context, verbose=False):
    if verbose:
        print('Starting CMSSW part')

    # Create fout by adding stream name and date paths
    fout = fout + "/CMSSW/" + short_date_string(date)
    # Convert date
    date = long_date_string(date)
    # Create CMSSW tables in sql_context
    cmssw_tables(ctx, sql_context, date=date, verbose=verbose)

    if verbose:
        print('Will build query for CMSSW and DBS tables')

    # Attribute coverage ("+" means available in this stream):
    # - file name       +
    # - file size       +
    # - primds          +
    # - procds          +
    # - tier            +
    # - site name       +
    # - file replicas
    # - user dn         +
    # - start/end time  +
    # - read bytes      +
    # - cpu/wc values
    # - source: cmssw   +

    # CMSSW columns
    cmssw_cols = ['FILE_LFN AS file_name',
                  'FILE_SIZE AS file_size',
                  'SITE_NAME AS site_name',
                  'user_dn',
                  'START_TIME AS start_time',
                  'END_TIME AS end_time',
                  'READ_BYTES AS read_bytes',
                  '"cmssw" AS source']

    # DBS columns
    ddf_cols = ['d_dataset']

    # Concatenate the lists of column names (i.e. use all columns from each list)
    cols = cmssw_cols + ddf_cols

    # Build a query with "cols" columns. Join DDF, FDF and CMSSW tables
    query = ("SELECT %s FROM ddf "
             "JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id "
             "JOIN cmssw_df ON fdf.f_logical_file_name = cmssw_df.FILE_LFN") % ','.join(cols)

    result = run_query(query, sql_context, verbose)

    if verbose:
        print('Query done. Will split "dataset" column')

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset(result, 'd_dataset')

    output_dataframe(fout, result, verbose)

    if verbose:
        print('Finished CMSSW part')
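
# Illustrative driver tying the four stream helpers together (hypothetical
# output path and date; the "YYYYMMDD" date format is an assumption based on
# the short_date_string usage above). All four helpers share one Spark
# session, which is why ctx and sql_context are parameters rather than being
# created per stream:
def example_all_streams(date='20170126', fout='hdfs:///cms/users/jdoe/agg'):
    ctx = spark_context('cms', None, True)
    sql_context = HiveContext(ctx)
    for stream in (run_eos, run_aaa, run_jm, run_cmssw):
        stream(date, fout, ctx, sql_context, verbose=True)
    ctx.stop()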