def fetch_classads_nanoaod(hdfs_path):
    """Fetch HDFS ClassAds records from a particular path"""
    # DBS dataset info
    csvreader = spark.read.format("com.databricks.spark.csv").option(
        "nullValue", "null").option("mode", "FAILFAST")
    # Path where the DBS input files are
    basepath = "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current"
    # Load the DBS DATASETS table so that the job's DESIRED_CMSDataset can be
    # matched to its DBS dataset record
    dbs_datasets = csvreader.schema(
        schemas.schema_datasets()).load(basepath + "/DATASETS")

    # Read input file
    jobreports = spark.read.json(hdfs_path)

    # The following regexps describe what is in the cache
    regexp1 = "/*/Run2016.*-03Feb2017.*/NANOAOD"
    regexp2 = "/*/RunIISummer16MiniAODv2-PUMoriond17_80X_.*/NANOAODSIM"
    regexp3 = "/*/.*-31Mar2018.*/NANOAOD"
    regexp4 = "/*/.*RunIIFall17NanoAODv2.*/NANOAODSIM"

    # Desired sites
    sites = ["T2_US_UCSD", "T2_US_Caltech", "T3_US_UCR"]

    df = (jobreports
          # Join the DBS dataset table with the job reports
          .join(dbs_datasets, col('data.DESIRED_CMSDataset') == col('d_dataset'))
          # Require datasets from the cache
          .filter(
              col('d_dataset').rlike(regexp1) |
              col('d_dataset').rlike(regexp2) |
              col('d_dataset').rlike(regexp3) |
              col('d_dataset').rlike(regexp4)
          )
          # Require jobs at UCSD, Caltech, or UCR
          .filter(col('data.CMSSite').isin(sites))
          # Require CMS jobs
          .filter(col('data.VO') == "cms")
          # Require analysis jobs
          .filter(col('data.Type') == 'analysis')
          # Require completed jobs
          .filter(col('data.Status') == 'Completed')
          # There are other utility CRAB jobs we don't want to read
          .filter(col('data.JobUniverse') == 5)
          # Select columns to save
          .select(
              col('data.CRAB_Workflow').alias('workflow_id'),
              col('data.CRAB_Id').alias('crab_id'),
              col('data.CRAB_Retry').alias('num_retries'),
              col('data.ScheddName').alias('schedd_name'),
              col('data.CRAB_UserHN').alias('user_hn'),
              col('data.CoreHr').alias('walltime'),
              col('data.CpuTimeHr').alias('cpu_time'),
              col('data.ExitCode').alias('exit_code'),
              col('data.RequestCpus').alias('num_cpus'),
              col('data.ChirpCMSSWReadBytes').alias('read_bytes')
          ))
    print("[script] Fetched {}".format(hdfs_path))
    return df
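# A minimal usage sketch (not part of the original function): it assumes the
# module-level `spark` session and the `schemas` helper referenced above already
# exist; the input and output paths below are hypothetical placeholders.
classads_path = "/project/monitoring/archive/condor/raw/metric/2018/06/*/*.json.gz"  # hypothetical
nanoaod_jobs = fetch_classads_nanoaod(classads_path)
nanoaod_jobs.write.mode("overwrite").parquet(
    "hdfs://analytix/user/<user>/classads_nanoaod.parquet")  # hypothetical output path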
def __init__(self, out):
    self.out = out
    conf = SparkConf().setMaster("yarn").setAppName("CMS Working Set")
    sc = SparkContext(conf=conf)
    self.spark = SparkSession(sc)
    avroreader = self.spark.read.format("com.databricks.spark.avro")
    csvreader = self.spark.read.format("com.databricks.spark.csv").option(
        "nullValue", "null").option("mode", "FAILFAST")
    # The PhEDEx block-replica snapshot path must exist on HDFS for this date;
    # alternatively, assign the path directly.
    phedex_path = ("/project/awg/cms/phedex/block-replicas-snapshots/csv/time=" +
                   (date.today() - timedelta(days=2)).strftime("%Y-%m-%d") +
                   "_*/part-m-00000")
    self.phedex_block_replicas = csvreader.schema(
        schemas.schema_phedex()).load(phedex_path)
    self.dbs_files = csvreader.schema(schemas.schema_files()).load(
        "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/new/FILES/part-m-00000")
    self.dbs_blocks = csvreader.schema(schemas.schema_blocks()).load(
        "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/new/BLOCKS/part-m-00000")
    self.dbs_datasets = csvreader.schema(schemas.schema_datasets()).load(
        "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/new/DATASETS/part-m-00000")
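# The constructor above presumes the usual PySpark and datetime imports plus the
# project-local `schemas` module; a minimal sketch of what the top of the file
# would need (the module name `schemas` is taken from the calls above):
from datetime import date, timedelta
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import schemas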
def fileMismatch(args):
    conf = SparkConf().setMaster("yarn").setAppName("CMS Working Set")
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)
    print("Initiated spark session on yarn, web URL: http://ithdp1101.cern.ch:8088/proxy/%s"
          % sc.applicationId)

    avroreader = spark.read.format("com.databricks.spark.avro")
    csvreader = spark.read.format("com.databricks.spark.csv").option(
        "nullValue", "null").option("mode", "FAILFAST")
    dbs_files = csvreader.schema(schemas.schema_files()).load(
        "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/FILES/part-m-00000")
    dbs_datasets = csvreader.schema(schemas.schema_datasets()).load(
        "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/DATASETS/part-m-00000")

    # Cutoff: keep only files modified within the last `args.days` days.
    # The epoch is truncated to its first 10 characters (whole seconds).
    current = time.time()
    past_n_days = args.days
    delta_t = current - past_n_days * 60 * 60 * 24
    delta_t = str(delta_t)[:10]

    if args.out_path:
        mismatch_df = (
            dbs_files
            # Invalid files only
            .filter(col('f_is_file_valid') == '0')
            # Modified within the time window
            .filter(col('f_last_modification_date') > delta_t)
            .join(dbs_datasets, col('f_dataset_id') == col('d_dataset_id'))
            # Keep only the two desired dataset access types (ids 1 and 41)
            .filter((col('d_dataset_access_type_id') == '1') |
                    (col('d_dataset_access_type_id') == '41'))
            .filter(col('f_logical_file_name').isNotNull())
            # Exclude files last modified by these two users
            .where(~(dbs_files.f_last_modified_by.contains('dmielaik') |
                     dbs_files.f_last_modified_by.contains('ogarzonm')))
            .select('d_dataset', 'f_last_modified_by', 'f_logical_file_name')
            .distinct())

        # Write the list of mismatched LFNs as a single CSV file
        (mismatch_df.select('f_logical_file_name', 'f_last_modified_by')
            .repartition(1).write.format("com.databricks.spark.csv")
            .option("header", "true").save(args.out_path))

        # Summary: number of extra LFNs per dataset
        mismatch_df.groupby('d_dataset').agg(
            fn.count(fn.col("f_logical_file_name")).alias('extra_lfn_phedex')).show()
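# A hedged sketch of a command-line entry point for fileMismatch(); the flag names
# are assumptions inferred from the `args.days` and `args.out_path` attributes used above.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        description="List invalid DBS files modified within a recent time window")
    parser.add_argument("--days", type=int, default=7, help="look-back window in days")
    parser.add_argument("--out_path", help="HDFS path for the CSV output")
    fileMismatch(parser.parse_args())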
inputfile = ("/project/monitoring/archive/condor/raw/metric/" +
             year + "/" + month + "/*/*.json.gz")
outputfile = ("hdfs://analytix/user/ddavila/model/data_tier_days_" +
              year + month + ".parquet")

print("===========================================================")
print("reading: " + inputfile)
print("writing: " + outputfile)
print("===========================================================")

conf = SparkConf().setMaster("yarn").setAppName("CMS Working Set")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

# Get information from DBS about dataset IDs and data tiers
csvreader = spark.read.format("com.databricks.spark.csv").option(
    "nullValue", "null").option("mode", "FAILFAST")
dbs_datasets = csvreader.schema(schemas.schema_datasets()).load(
    "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/DATASETS/part-m-00000")
dbs_data_tiers = csvreader.schema(schemas.schema_data_tiers()).load(
    "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/DATA_TIERS/part-m-00000")

# Reduced schema for the HTCondor ClassAds JSON records: only the fields needed downstream
schema = types.StructType([
    types.StructField(
        "data",
        types.StructType([
            types.StructField("Status", types.StringType(), True),
            types.StructField("Type", types.StringType(), True),
            types.StructField("JobUniverse", types.StringType(), True),
            types.StructField("DESIRED_CMSDataset", types.StringType(), True),
            types.StructField("RecordTime", types.LongType(), True),
        ]), False),
])
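# The snippet above uses `year` and `month` without defining them and stops after
# building the reduced schema. A hedged sketch of the likely next step: read the
# ClassAds JSON with that schema and keep completed analysis jobs. The filter values
# mirror those in fetch_classads_nanoaod and are assumptions here; `col` is assumed
# imported from pyspark.sql.functions as in the other scripts.
jobreports = spark.read.schema(schema).json(inputfile)
completed_analysis = (jobreports
                      .filter(col("data.Status") == "Completed")
                      .filter(col("data.Type") == "analysis")
                      .filter(col("data.JobUniverse") == "5"))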
def run(args):
    conf = SparkConf().setMaster("yarn").setAppName("CMS Working Set")
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)
    print("Initiated spark session on yarn, web URL: http://ithdp1101.cern.ch:8088/proxy/%s"
          % sc.applicationId)

    csvreader = (spark.read.format("csv")
                 .option("nullValue", "null")
                 .option("mode", "FAILFAST"))
    dbs_files = csvreader.schema(schemas.schema_files()).load(
        "/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/FILES/part-m-00000")
    dbs_datasets = (
        csvreader.schema(schemas.schema_datasets())
        .load("/project/awg/cms/CMS_DBS3_PROD_GLOBAL/current/DATASETS/part-m-00000")
        .withColumn(
            "input_campaign",
            fn.regexp_extract(
                col("d_dataset"),
                r"^/[^/]*/((?:HI|PA|PN|XeXe|)Run201\d\w-[^-]+|CMSSW_\d+|[^-]+)[^/]*/",
                1,
            ),
        ))

    if args.source == "classads":
        working_set_day = (
            get_df_condor(spark, args.dates)
            .withColumn("day", col("timestamp") - col("timestamp") % fn.lit(86400))
            .join(dbs_datasets, col("dataset_name") == col("d_dataset"))
            .groupBy("day", "input_campaign", "d_data_tier_id")
            .agg(fn.collect_set("d_dataset_id").alias("working_set")))
        working_set_day.write.parquet(args.out)
    elif args.source == "cmssw":
        working_set_day = (
            get_df_cmssw(spark, args.dates)
            .withColumn("day", col("timestamp") - col("timestamp") % fn.lit(86400))
            .join(dbs_files, col("file_lfn") == col("f_logical_file_name"))
            .join(dbs_datasets, col("f_dataset_id") == col("d_dataset_id"))
            .groupBy("day", "input_campaign", "d_data_tier_id", "site_name", "is_crab")
            .agg(fn.collect_set("f_block_id").alias("working_set_blocks")))
        working_set_day.write.parquet(args.out)
    elif args.source == "xrootd":
        working_set_day = (
            get_df_xrootd(spark, args.dates)
            .withColumn("day", col("timestamp") - col("timestamp") % fn.lit(86400))
            .join(dbs_files, col("file_lfn") == col("f_logical_file_name"))
            .join(dbs_datasets, col("f_dataset_id") == col("d_dataset_id"))
            .groupBy("day", "input_campaign", "d_data_tier_id", "client_domain")
            .agg(fn.collect_set("f_block_id").alias("working_set_blocks")))
        working_set_day.write.parquet(args.out)
    elif args.source == "fwjr":
        working_set_day = (
            get_df_wmarchive(spark, args.dates)
            .withColumn("day", col("timestamp") - col("timestamp") % fn.lit(86400))
            .join(dbs_files, col("file_lfn") == col("f_logical_file_name"))
            .join(dbs_datasets, col("f_dataset_id") == col("d_dataset_id"))
            .groupBy("day", "input_campaign", "d_data_tier_id", "site_name")
            .agg(fn.collect_set("f_block_id").alias("working_set_blocks")))
        working_set_day.write.parquet(args.out)
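# A hedged sketch of the command-line wrapper for run(); the flag names are inferred
# from the `args.source`, `args.dates`, and `args.out` attributes used above.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Aggregate the CMS working set per day")
    parser.add_argument("--source", choices=["classads", "cmssw", "xrootd", "fwjr"],
                        help="monitoring data source to aggregate")
    parser.add_argument("--dates", help="date range to process")
    parser.add_argument("--out", help="output parquet path")
    run(parser.parse_args())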