# pyspark imports required by the functions below; helper functions such as
# spark_context, jm_tables, dbs_tables, run_query, split_dataset,
# split_dataset_col, output_dataframe and the date utilities are assumed to be
# provided by this package's own modules.
from pyspark import StorageLevel
from pyspark.sql import HiveContext
from pyspark.sql.functions import lit


def run(date, fout, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run the pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    # define the spark context; it is the main object used to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and JobMonitoring tables
    tables = {}

    # read the JobMonitoring avro rdd
    date = jm_date(date)
    jm_df = jm_tables(ctx, sqlContext, date=date, verbose=verbose)

    # DBS tables
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    ddf = tables['ddf']  # dataset table
    fdf = tables['fdf']  # file table

    # merge DBS and JobMonitoring data
    cols = ['d_dataset', 'd_dataset_id', 'f_logical_file_name', 'FileName',
            'FileType', 'Type', 'SiteName', 'WrapWC', 'WrapCPU', 'JobExecExitCode']
    stmt = ('SELECT %s FROM ddf '
            'JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id '
            'JOIN jm_df ON fdf.f_logical_file_name = jm_df.FileName') % ','.join(cols)
    joins = sqlContext.sql(stmt)
    print_rows(joins, stmt, verbose)

    # perform aggregation
    fjoin = joins.groupBy(['SiteName', 'JobExecExitCode', 'FileType', 'Type', 'd_dataset'])\
            .agg({'WrapWC': 'sum', 'WrapCPU': 'sum', 'JobExecExitCode': 'count', 'FileType': 'count', 'Type': 'count'})\
            .withColumnRenamed('sum(WrapWC)', 'tot_wc')\
            .withColumnRenamed('sum(WrapCPU)', 'tot_cpu')\
            .withColumnRenamed('count(JobExecExitCode)', 'ecode_count')\
            .withColumnRenamed('count(FileType)', 'file_type_count')\
            .withColumnRenamed('count(Type)', 'type_count')\
            .withColumnRenamed('d_dataset', 'dataset')\
            .withColumn('date', lit(jm_date_unix(date)))\
            .withColumn('count_type', lit('jm'))

    # keep the table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write results back to HDFS; the fout parameter defines the output area on HDFS,
    # either an absolute path or an area under /user/USERNAME
    if fout:
        ndf = split_dataset(fjoin, 'dataset')
        ndf.write.format("com.databricks.spark.csv")\
            .option("header", "true").save(fout)

    ctx.stop()
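
# --- Illustrative sketch, not part of the job --------------------------------
# A minimal, self-contained example of the groupBy/agg/withColumnRenamed
# pattern used in run() above, applied to toy in-memory data. All rows and the
# reduced column set here are hypothetical; in the real job the columns come
# from the DBS/JobMonitoring join. Assumes the Spark 2+ SparkSession API.
def _aggregation_sketch():
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName('agg_sketch').getOrCreate()
    rows = [('T1_US_FNAL', 0, '/a/b/RAW', 10.0, 8.0),
            ('T1_US_FNAL', 0, '/a/b/RAW', 20.0, 15.0),
            ('T2_CH_CERN', 1, '/c/d/AOD', 5.0, 4.0)]
    df = spark.createDataFrame(
        rows, ['SiteName', 'JobExecExitCode', 'd_dataset', 'WrapWC', 'WrapCPU'])
    # dict-style agg produces columns named e.g. "sum(WrapWC)", which are then renamed
    agg = df.groupBy(['SiteName', 'JobExecExitCode', 'd_dataset'])\
            .agg({'WrapWC': 'sum', 'WrapCPU': 'sum', 'JobExecExitCode': 'count'})\
            .withColumnRenamed('sum(WrapWC)', 'tot_wc')\
            .withColumnRenamed('sum(WrapCPU)', 'tot_cpu')\
            .withColumnRenamed('count(JobExecExitCode)', 'ecode_count')\
            .withColumn('count_type', lit('jm'))
    agg.show()
    spark.stop()
# ------------------------------------------------------------------------------
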
def run_agg_jm(date, ctx, sql_context, verbose=False):
    """
    Runs the aggregation for the JobMonitoring stream for a given date.
    Produces a dataframe that contains site name, dataset name, number of
    accesses (nacc), distinct users and stream.
    """
    print('Starting JobMonitoring part')

    # Make a UNIX timestamp from the date
    unix_date = short_date_to_unix(short_date_string(date))

    # Convert the date
    date = long_date_string(date)

    # Create JobMonitoring tables in sql_context
    jm_df = jm_tables(ctx, sql_context, date=date, verbose=verbose)

    if verbose:
        print('Found %s records in JobMonitoring stream' % jm_df['jm_df'].count())

    # Columns to produce ("+" marks those covered below):
    # - site name                 +
    # - dataset name              +
    # - app                       +
    # - uid                       +
    # - dn                        +
    # - number of accesses (nacc) +
    # - distinct users            +
    # - stream: crab              +
    # - timestamp                 +
    # - site tier                 +
    # - cpu time                  +

    # Initial single-step column list; superseded by the two-step query built below.
    cols = ['SiteName AS site_name',
            'dataset_name',
            'count(dataset_name) AS nacc',
            'count(distinct(UserId)) AS distinct_users',
            '\"crab\" AS stream',
            '%s AS timestamp' % unix_date,
            'first(tier_from_site_name(SiteName)) AS site_tier',
            'SUM(WrapCPU) AS cpu_time']

    # Build a query with "cols" columns
    # query = ("SELECT %s FROM jm_df "
    #          "JOIN f_b_s_df ON f_b_s_df.file_name = jm_df.FileName "
    #          "GROUP BY jm_df.SiteName, dataset_name")

    # Inner query columns: per-record values derived from the JM join
    cols = ['SiteName AS site_name',
            'dataset_name',
            'stream4app(jm_df.SubmissionTool) AS app',
            'dn2uuid(GridName) AS uid',
            'parse_dn(GridName) AS dn',
            '\"crab\" AS stream',
            '%s AS timestamp' % unix_date,
            'WrapCPU AS cpu',
            'WrapWC AS wc']

    query = "SELECT %s FROM jm_df "\
            "JOIN f_b_s_df ON f_b_s_df.file_name = jm_df.FileName " \
            % ','.join(cols)

    # Outer query columns: aggregate the inner query by dn, dataset, site and app
    cols = ['dn',
            'dataset_name',
            'site_name',
            'app',
            'first(uid) AS uid',
            'first(stream) AS stream',
            'first(timestamp) AS timestamp',
            'count(dataset_name) AS nacc',
            'count(dn) AS distinct_users',
            'first(tier_from_site_name(site_name)) AS site_tier',
            'SUM(cpu) AS cpu_time',
            'SUM(wc) AS wc_time']

    query = "SELECT %s FROM (%s) QUERY1 GROUP BY dn, dataset_name, site_name, app" \
            % (','.join(cols), query)

    result = run_query(query, sql_context, verbose)
    # result = result.sort(desc("nacc"))

    # Split the "dataset_name" column into "primds", "procds" and "tier"
    result = split_dataset_col(result, 'dataset_name')

    if verbose:
        print('Finished JobMonitoring part (output is %s records)' % result.count())
    else:
        print('Finished JobMonitoring part')

    return result
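
# --- Illustrative sketch, not part of the job --------------------------------
# A toy demonstration of the two-step SQL pattern used in run_agg_jm(): an
# inner SELECT derives per-row columns (optionally through registered UDFs such
# as tier_from_site_name), and an outer SELECT groups the subquery (aliased
# QUERY1). The table data and the stand-in UDF below are hypothetical, and the
# sketch assumes the Spark 2+ SparkSession API rather than HiveContext.
def _nested_query_sketch():
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName('nested_query_sketch').getOrCreate()
    rows = [('T1_US_FNAL', '/a/b/RAW', 12.0),
            ('T1_US_FNAL', '/a/b/RAW', 7.0),
            ('T2_CH_CERN', '/c/d/AOD', 3.0)]
    spark.createDataFrame(rows, ['SiteName', 'dataset_name', 'WrapCPU'])\
         .createOrReplaceTempView('jm_df')
    # hypothetical stand-in for tier_from_site_name: take the "T1"/"T2" prefix
    spark.udf.register('tier_from_site_name', lambda s: s.split('_')[0])
    inner = ("SELECT SiteName AS site_name, dataset_name, WrapCPU AS cpu "
             "FROM jm_df")
    outer = ("SELECT site_name, dataset_name, "
             "count(dataset_name) AS nacc, "
             "first(tier_from_site_name(site_name)) AS site_tier, "
             "SUM(cpu) AS cpu_time "
             "FROM (%s) QUERY1 GROUP BY site_name, dataset_name") % inner
    spark.sql(outer).show()
    spark.stop()
# ------------------------------------------------------------------------------
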
def run_jm(date, fout, ctx, sql_context, verbose=False):
    if verbose:
        print('Starting JobMonitoring part')

    # Create fout by adding stream name and date paths
    fout = fout + "/CRAB/" + short_date_string(date)

    # Convert date
    date = long_date_string(date)

    # Create JobMonitoring tables in sql_context
    jm_tables(ctx, sql_context, date=date, verbose=verbose)

    if verbose:
        print('Will build query for JM and DBS tables')

    # - file name         +
    # - file size         +
    # - primds            +
    # - procds            +
    # - tier              +
    # - site name         +
    # - file replicas
    # - user dn
    # - start/end time    +
    # - read bytes
    # - cpu/wc values     +
    # - source: crab      +

    # JobMonitoring (CRAB) columns
    # For cpu value WrapCPU is used. Records also have ExeCPU.
    jm_cols = ['FileName AS file_name',
               'SiteName AS site_name',
               'WrapWC AS wc',
               'WrapCPU AS cpu',
               'StartedRunningTimeStamp AS start_time',
               'FinishedTimeStamp AS end_time',
               '"crab" AS source']

    # DBS columns
    ddf_cols = ['d_dataset']
    fdf_cols = ['f_file_size AS file_size']

    # Concatenate arrays with column names (i.e. use all column names from arrays)
    cols = jm_cols + ddf_cols + fdf_cols

    # Build a query with "cols" columns. Join DDF, FDF and JobMonitoring tables
    query = ("SELECT %s FROM ddf "
             "JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id "
             "JOIN jm_df ON fdf.f_logical_file_name = jm_df.FileName") % ','.join(cols)

    result = run_query(query, sql_context, verbose)

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset(result, 'd_dataset')

    output_dataframe(fout, result, verbose)

    if verbose:
        print('Finished JobMonitoring part')
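
# --- Illustrative sketch, not part of the job --------------------------------
# split_dataset()/split_dataset_col() are helpers provided by this package; the
# function below is only a minimal stand-alone approximation of the idea,
# assuming CMS dataset names follow the usual "/<primds>/<procds>/<tier>"
# convention. The sample dataset name and function name are hypothetical, and
# the sketch assumes the Spark 2+ SparkSession API.
def _split_dataset_sketch():
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import split, col
    spark = SparkSession.builder.appName('split_dataset_sketch').getOrCreate()
    df = spark.createDataFrame([('/SingleMuon/Run2017B-v1/AOD',)], ['d_dataset'])
    parts = split(col('d_dataset'), '/')
    # element 0 is the empty string before the leading "/", so parts start at 1
    ndf = df.withColumn('primds', parts.getItem(1))\
            .withColumn('procds', parts.getItem(2))\
            .withColumn('tier', parts.getItem(3))\
            .drop('d_dataset')
    ndf.show()
    spark.stop()
# ------------------------------------------------------------------------------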