def run_agg_jm(date, ctx, sql_context, verbose=False):
    """
    Runs aggregation for JobMonitoring stream for a certain date.
    Function produces a dataframe that contains site name, dataset name,
    number of accesses, distinct users and stream.
    The combined dataframe is later sorted by nacc in main().
    """
    print('Starting JobMonitoring part')

    # Make a UNIX timestamp from date
    unix_date = short_date_to_unix(short_date_string(date))

    # Convert date
    date = long_date_string(date)

    # Create JobMonitoring tables in sql_context
    jm_df = jm_tables(ctx, sql_context, date=date, verbose=verbose)

    if verbose:
        print('Found %s records in JobMonitoring stream' % jm_df['jm_df'].count())

    # - site name                  +
    # - dataset name               +
    # - app                        +
    # - uid                        +
    # - dn                         +
    # - number of accesses (nacc)  +
    # - distinct users             +
    # - stream: crab               +
    # - timestamp                  +
    # - site tier                  +
    # - cpu time                   +

    cols = ['SiteName AS site_name',
            'dataset_name',
            'count(dataset_name) AS nacc',
            'count(distinct(UserId)) AS distinct_users',
            '\"crab\" AS stream',
            '%s AS timestamp' % unix_date,
            'first(tier_from_site_name(SiteName)) AS site_tier',
            'SUM(WrapCPU) AS cpu_time']

    # Build a query with "cols" columns
    # query = ("SELECT %s FROM jm_df "
    #          "JOIN f_b_s_df ON f_b_s_df.file_name = jm_df.FileName "
    #          "GROUP BY jm_df.SiteName, dataset_name") \
    #         % ','.join(cols)

    cols = ['SiteName AS site_name',
            'dataset_name',
            'stream4app(jm_df.SubmissionTool) AS app',
            'dn2uuid(GridName) AS uid',
            'parse_dn(GridName) AS dn',
            '\"crab\" AS stream',
            '%s AS timestamp' % unix_date,
            'WrapCPU AS cpu',
            'WrapWC AS wc']

    query = "SELECT %s FROM jm_df " \
            "JOIN f_b_s_df ON f_b_s_df.file_name = jm_df.FileName " \
            % ','.join(cols)

    cols = ['dn',
            'dataset_name',
            'site_name',
            'app',
            'first(uid) AS uid',
            'first(stream) AS stream',
            'first(timestamp) AS timestamp',
            'count(dataset_name) AS nacc',
            'count(dn) AS distinct_users',
            'first(tier_from_site_name(site_name)) AS site_tier',
            'SUM(cpu) AS cpu_time',
            'SUM(wc) AS wc_time']

    query = "SELECT %s FROM (%s) QUERY1 GROUP BY dn, dataset_name, site_name, app" \
            % (','.join(cols), query)

    result = run_query(query, sql_context, verbose)
    # result = result.sort(desc("nacc"))

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset_col(result, 'dataset_name')

    if verbose:
        print('Finished JobMonitoring part (output is %s records)' % result.count())
    else:
        print('Finished JobMonitoring part')

    return result
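# For reference, a minimal sketch of what the tier_from_site_name UDF (imported
# elsewhere in this module) is assumed to do. CMS site names follow the
# "T<tier>_<country>_<name>" convention, e.g. "T2_CH_CERN", so the tier is the
# leading underscore-separated token. Illustration only, not the production UDF.
def tier_from_site_name_sketch(site_name):
    """Return the tier token of a CMS site name, e.g. 'T1_US_FNAL' -> 'T1'."""
    if not site_name:
        return ''
    return site_name.split('_')[0]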
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    print("Input arguments: %s" % opts)

    start_time = time.time()
    verbose = opts.verbose
    yarn = opts.yarn
    inst = opts.inst
    date = opts.date
    fout = opts.fout
    aaa_hdir = opts.aaa_hdir

    if inst.lower() in ['global', 'phys01', 'phys02', 'phys03']:
        inst = inst.upper()
    else:
        raise Exception('Unsupported DBS instance "%s"' % inst)

    # Create spark context
    ctx = spark_context('cms', yarn, verbose)
    quiet_logs(ctx)

    # Create SQL context to be used for SQL queries
    sql_context = SQLContext(ctx)

    # Initialize DBS tables
    dbs_tables(sql_context, inst=inst, verbose=verbose, tables=['fdf', 'bdf', 'ddf'])

    # Initialize PhEDEx table to be used in file_block_site table
    phedex_tables(sql_context, verbose=verbose)

    # Register UDFs to be used with SQL queries
    sql_context.udf.register("clean_site_name", clean_site_name)
    sql_context.udf.register("tier_from_site_name", tier_from_site_name)
    sql_context.udf.register("dn2uuid", dn2uuid)
    sql_context.udf.register("parse_app", parse_app)
    sql_context.udf.register("stream4app", stream4app)
    sql_context.udf.register("parse_dn", parse_dn)

    f_b_s_start_time = time.time()
    # Create temp table with file name, block name, site name and site from PhEDEx
    create_file_block_site_table(ctx, sql_context, verbose)
    f_b_s_elapsed_time = elapsed_time(f_b_s_start_time)

    cmssw_start_time = time.time()
    aggregated_cmssw_df = run_agg_cmssw(date, ctx, sql_context, verbose)
    cmssw_elapsed_time = elapsed_time(cmssw_start_time)

    aaa_start_time = time.time()
    if len(aaa_hdir) > 0:
        aggregated_aaa_df = run_agg_aaa(date, ctx, sql_context, aaa_hdir, verbose)
    else:
        aggregated_aaa_df = run_agg_aaa(date, ctx, sql_context, verbose=verbose)
    aaa_elapsed_time = elapsed_time(aaa_start_time)

    eos_start_time = time.time()
    aggregated_eos_df = run_agg_eos(date, ctx, sql_context, verbose)
    eos_elapsed_time = elapsed_time(eos_start_time)

    jm_start_time = time.time()
    aggregated_jm_df = run_agg_jm(date, ctx, sql_context, verbose)
    jm_elapsed_time = elapsed_time(jm_start_time)

    if verbose:
        print('Will union outputs from all streams to a single dataframe')

    # All streams share the same output schema:
    # dn, dataset name (later split into primds/procds/tier), site name, app,
    # uid, stream, timestamp, number of accesses (nacc), distinct users,
    # site tier, cpu time, wc time
    all_df = aggregated_cmssw_df.unionAll(aggregated_aaa_df)
    all_df = all_df.unionAll(aggregated_eos_df)
    all_df = all_df.unionAll(aggregated_jm_df)
    all_df = all_df.sort(desc("nacc"))

    if verbose:
        print('Done joining all outputs to a single dataframe')

    fout = fout + "/" + short_date_string(date)

    # output_dataframe(fout + "/Aggregated/CMSSW/" + short_date_string(date), aggregated_cmssw_df, verbose)
    # output_dataframe(fout + "/Aggregated/AAA/" + short_date_string(date), aggregated_aaa_df, verbose)
    # output_dataframe(fout + "/Aggregated/EOS/" + short_date_string(date), aggregated_eos_df, verbose)
    # output_dataframe(fout + "/Aggregated/JobMonitoring/" + short_date_string(date), aggregated_jm_df, verbose)

    output_dataframe(fout, all_df, verbose)

    if verbose:
        cmssw_df_size = aggregated_cmssw_df.count()
        aaa_df_size = aggregated_aaa_df.count()
        eos_df_size = aggregated_eos_df.count()
        jm_df_size = aggregated_jm_df.count()
        all_df_size = all_df.count()

        print('CMSSW:')
        aggregated_cmssw_df.show(10)
        aggregated_cmssw_df.printSchema()

        print('AAA:')
        aggregated_aaa_df.show(10)
        aggregated_aaa_df.printSchema()

        print('EOS:')
        aggregated_eos_df.show(10)
        aggregated_eos_df.printSchema()

        print('JobMonitoring:')
        aggregated_jm_df.show(10)
        aggregated_jm_df.printSchema()

        print('Aggregated all:')
        all_df.show(10)
        all_df.printSchema()

        print('Output record count:')
        print('Output record count CMSSW         : %s' % cmssw_df_size)
        print('Output record count AAA           : %s' % aaa_df_size)
        print('Output record count EOS           : %s' % eos_df_size)
        print('Output record count JobMonitoring : %s' % jm_df_size)
        print('Output record count Total         : %s' % all_df_size)

    ctx.stop()

    print('Start time                 : %s' % time.strftime('%Y-%m-%d %H:%M:%S GMT', time.gmtime(start_time)))
    print('End time                   : %s' % time.strftime('%Y-%m-%d %H:%M:%S GMT', time.gmtime(time.time())))
    print('Total elapsed time         : %s' % elapsed_time(start_time))
    print('FileBlockSite elapsed time : %s' % f_b_s_elapsed_time)
    print('AAA elapsed time           : %s' % aaa_elapsed_time)
    print('CMSSW elapsed time         : %s' % cmssw_elapsed_time)
    print('EOS elapsed time           : %s' % eos_elapsed_time)
    print('JobMonitoring elapsed time : %s' % jm_elapsed_time)
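# Hedged sketch of the dn2uuid UDF registered in main(): it is assumed to map a
# grid certificate DN to a stable, anonymized identifier. A name-based UUID is
# one plausible implementation; the real UDF (imported elsewhere) may differ.
def dn2uuid_sketch(dn):
    """Derive a deterministic UUID string from a user DN (illustration only)."""
    import uuid
    if not dn:
        return ''
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, dn))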
def run_agg_aaa(date, ctx, sql_context,
                hdir='hdfs:///project/monitoring/archive/xrootd/enr/gled',
                verbose=False):
    """
    Runs aggregation for AAA stream for a certain date.
    Function produces a dataframe that contains site name, dataset name,
    number of accesses, distinct users and stream.
    Data is taken from /project/monitoring/archive/xrootd/enr/gled rather than
    the default location (raw instead of enr) because enr records carry
    src_experiment_site, which is used as site_name.
    The combined dataframe is later sorted by nacc in main().
    """
    print('Starting AAA part')

    # Make a UNIX timestamp from date
    unix_date = short_date_to_unix(short_date_string(date))

    # Convert date
    date = short_date_string(date)

    # Create AAA tables in sql_context
    aaa_df = aaa_tables_enr(sql_context, hdir=hdir, date=date, verbose=verbose)

    if verbose:
        print('Found %s records in AAA stream' % aaa_df['aaa_df'].count())

    # - site name                  +
    # - dataset name               +
    # - app                        +
    # - uid                        +
    # - dn                         +
    # - number of accesses (nacc)  +
    # - distinct users             +
    # - stream: aaa                +
    # - timestamp                  +
    # - site tier                  +
    # - cpu time                   -1

    cols = ['src_experiment_site AS site_name',
            'dataset_name',
            'count(dataset_name) AS nacc',
            'count(distinct(aaa_df.user_dn)) AS distinct_users',
            '\"aaa\" AS stream',
            '%s AS timestamp' % unix_date,
            'first(tier_from_site_name(src_experiment_site)) AS site_tier',
            '-1 AS cpu_time']

    # Build a query with "cols" columns
    # query = ("SELECT %s FROM aaa_df "
    #          "JOIN f_b_s_df ON f_b_s_df.file_name = aaa_df.file_lfn "
    #          "GROUP BY src_experiment_site, dataset_name") \
    #         % ','.join(cols)

    cols = ['src_experiment_site AS site_name',
            'dataset_name',
            '\"xrootd\" AS app',
            'dn2uuid(aaa_df.user_dn) AS uid',
            'parse_dn(aaa_df.user_dn) AS dn',
            '\"aaa\" AS stream',
            '%s AS timestamp' % unix_date,
            '-1 AS cpu']

    query = "SELECT %s FROM aaa_df " \
            "JOIN f_b_s_df ON f_b_s_df.file_name = aaa_df.file_lfn " \
            % ','.join(cols)

    cols = ['dn',
            'dataset_name',
            'site_name',
            'app',
            'first(uid) AS uid',
            'first(stream) AS stream',
            'first(timestamp) AS timestamp',
            'count(dataset_name) AS nacc',
            'count(dn) AS distinct_users',
            'first(tier_from_site_name(site_name)) AS site_tier',
            '-1 AS cpu_time',
            '-1 AS wc_time']

    query = "SELECT %s FROM (%s) QUERY1 GROUP BY dn, dataset_name, site_name, app" \
            % (','.join(cols), query)

    result = run_query(query, sql_context, verbose)
    # result = result.sort(desc("nacc"))

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset_col(result, 'dataset_name')

    if verbose:
        print('Finished AAA part (output is %s records)' % result.count())
    else:
        print('Finished AAA part')

    return result
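# A hedged sketch of what the split_dataset_col helper (imported elsewhere) is
# assumed to do: CMS dataset names have the form "/<primds>/<procds>/<tier>",
# so the column can be split on "/" into three columns. Illustration only; the
# production helper may differ in details such as column order or naming.
def split_dataset_col_sketch(df, colname):
    """Split '/<primds>/<procds>/<tier>' in `colname` into primds/procds/tier."""
    from pyspark.sql.functions import split, col
    parts = split(col(colname), '/')  # '/a/b/c' splits to ['', 'a', 'b', 'c']
    return (df.withColumn('primds', parts.getItem(1))
              .withColumn('procds', parts.getItem(2))
              .withColumn('tier', parts.getItem(3))
              .drop(colname))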
def run_agg_cmssw(date, ctx, sql_context, verbose=False):
    """
    Runs aggregation for CMSSW stream for a certain date.
    Function produces a dataframe that contains site name, dataset name,
    number of accesses, distinct users and stream.
    The combined dataframe is later sorted by nacc in main().
    """
    print('Starting CMSSW part')

    # Make a UNIX timestamp from date
    unix_date = short_date_to_unix(short_date_string(date))

    # Convert date
    date = long_date_string(date)

    # Create CMSSW tables in sql_context
    cmssw_df = cmssw_tables(ctx, sql_context, date=date, verbose=verbose)

    if verbose:
        print('Found %s records in CMSSW stream' % cmssw_df['cmssw_df'].count())

    # - site name                  +
    # - dataset name               +
    # - app                        +
    # - uid                        +
    # - dn                         +
    # - number of accesses (nacc)  +
    # - distinct users             +
    # - stream: cmssw              +
    # - timestamp                  +
    # - site tier                  +
    # - cpu time                   -1

    cols = ['cmssw_df.SITE_NAME AS site_name',
            'dataset_name',
            'count(dataset_name) AS nacc',
            'count(distinct(USER_DN)) AS distinct_users',
            '\"cmssw\" AS stream',
            '%s AS timestamp' % unix_date,
            'first(tier_from_site_name(cmssw_df.SITE_NAME)) AS site_tier',
            '-1 AS cpu_time']

    # Build a query with "cols" columns
    # query = ("SELECT %s FROM cmssw_df "
    #          "JOIN f_b_s_df ON f_b_s_df.file_name = cmssw_df.FILE_LFN "
    #          "GROUP BY cmssw_df.SITE_NAME, dataset_name") \
    #         % ','.join(cols)

    cols = ['cmssw_df.SITE_NAME AS site_name',
            'dataset_name',
            'parse_app(cmssw_df.APP_INFO) AS app',
            'dn2uuid(cmssw_df.USER_DN) AS uid',
            'parse_dn(cmssw_df.USER_DN) AS dn',
            'stream4app(cmssw_df.APP_INFO) AS stream',
            '%s AS timestamp' % unix_date,
            '-1 AS cpu']

    query = "SELECT %s FROM cmssw_df " \
            "JOIN f_b_s_df ON f_b_s_df.file_name = cmssw_df.FILE_LFN " \
            % ','.join(cols)

    cols = ['dn',
            'dataset_name',
            'site_name',
            'app',
            'first(uid) AS uid',
            'first(stream) AS stream',
            'first(timestamp) AS timestamp',
            'count(dataset_name) AS nacc',
            'count(dn) AS distinct_users',
            'first(tier_from_site_name(site_name)) AS site_tier',
            '-1 AS cpu_time',
            '-1 AS wc_time']

    query = "SELECT %s FROM (%s) QUERY1 GROUP BY dn, dataset_name, site_name, app" \
            % (','.join(cols), query)

    result = run_query(query, sql_context, verbose)
    # result = result.sort(desc("nacc"))

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset_col(result, 'dataset_name')

    if verbose:
        print('Finished CMSSW part (output is %s records)' % result.count())
    else:
        print('Finished CMSSW part')

    return result
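# Hedged sketch of the run_query helper's assumed behaviour (the real one is
# imported elsewhere): execute the SQL statement against the temp tables
# registered in sql_context, optionally echoing the query when verbose.
def run_query_sketch(query, sql_context, verbose=False):
    """Execute `query` via the SQL context and return the resulting dataframe."""
    if verbose:
        print(query)
    return sql_context.sql(query)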
def run_agg_eos(date, ctx, sql_context, verbose=False):
    """
    Runs aggregation for EOS stream for a certain date.
    Function produces a dataframe that contains site name, dataset name,
    number of accesses, distinct users and stream.
    Site name is taken from the f_b_s_df table, which is joined by file name.
    The combined dataframe is later sorted by nacc in main().
    """
    print('Starting EOS part')

    # Make a UNIX timestamp from date
    unix_date = short_date_to_unix(short_date_string(date))

    # Convert date
    date = short_date_string(date)

    # Create EOS tables in sql_context
    eos_df = eos_tables(sql_context, date=date, verbose=verbose)

    if verbose:
        print('Found %s records in EOS stream' % eos_df['eos_df'].count())

    # - site name                  +
    # - dataset name               +
    # - app                        +
    # - uid                        +
    # - dn                         +
    # - number of accesses (nacc)  +
    # - distinct users             +
    # - stream: eos                +
    # - timestamp                  +
    # - site tier                  +
    # - cpu time                   -1

    cols = ['site_name',
            'dataset_name',
            'count(dataset_name) AS nacc',
            'count(distinct(eos_df.user_dn)) AS distinct_users',
            '\"eos\" AS stream',
            '%s AS timestamp' % unix_date,
            'first(tier_from_site_name(site_name)) AS site_tier',
            '-1 AS cpu_time']

    # Build a query with "cols" columns
    # query = ("SELECT %s FROM eos_df "
    #          "JOIN f_b_s_df ON f_b_s_df.file_name = eos_df.file_lfn "
    #          "GROUP BY site_name, dataset_name") \
    #         % ','.join(cols)

    cols = ['site_name',
            'dataset_name',
            'parse_app(eos_df.application) AS app',
            'dn2uuid(eos_df.user_dn) AS uid',
            'parse_dn(eos_df.user_dn) AS dn',
            '\"eos\" AS stream',
            '%s AS timestamp' % unix_date,
            '-1 AS cpu']

    query = "SELECT %s FROM eos_df " \
            "JOIN f_b_s_df ON f_b_s_df.file_name = eos_df.file_lfn " \
            % ','.join(cols)

    cols = ['dn',
            'dataset_name',
            'site_name',
            'app',
            'first(uid) AS uid',
            'first(stream) AS stream',
            'first(timestamp) AS timestamp',
            'count(dataset_name) AS nacc',
            'count(dn) AS distinct_users',
            'first(tier_from_site_name(site_name)) AS site_tier',
            '-1 AS cpu_time',
            '-1 AS wc_time']

    query = "SELECT %s FROM (%s) QUERY1 GROUP BY dn, dataset_name, site_name, app" \
            % (','.join(cols), query)

    result = run_query(query, sql_context, verbose)
    # result = result.sort(desc("nacc"))

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset_col(result, 'dataset_name')

    if verbose:
        print('Finished EOS part (output is %s records)' % result.count())
    else:
        print('Finished EOS part')

    return result
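# Standard script entry point, placed after all run_agg_* definitions so that
# main() can resolve them. (Assumed: the original module ends with an
# equivalent guard for use with spark-submit.)
if __name__ == '__main__':
    main()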