# Standard-library imports used by the snippets below. Framework helpers
# (tools, dcachetools, log, read_file, build_configs, submit) and the CRAB
# client (CRABClient.UserUtilities, getUsernameFromSiteDB) come from the
# surrounding analysis code and are assumed to be importable.
import datetime
import fnmatch
import os
import re
import shlex
import string
import tempfile
from multiprocessing import Process


def get_filenames(args):
    base_path, sample = args[0], args[1]

    filename_replacements = {
        "srm://grid-srm.physik.rwth-aachen.de:8443/srm/managerv2?SFN=/pnfs/physik.rwth-aachen.de/cms/store/user/":
        "root://grid-vo-cms.physik.rwth-aachen.de:1094//store/user/"
    }

    filenames_per_sample_per_pipeline = {}

    stdout, stderr = tools.subprocessCall(
        shlex.split("gfal-ls " + os.path.join(base_path, sample)))
    filenames = [
        filename for filename in stdout.decode().strip().split("\n")
        if (("SvfitCache" in filename) and filename.endswith(".root"))
    ]
    if len(filenames) > 0:
        filenames = [
            os.path.join(base_path, sample, filename) for filename in filenames
        ]
        for filename in filenames:
            for src, dst in filename_replacements.items():
                filename = filename.replace(src, dst)
            pipeline = re.search(r"SvfitCache(?P<pipeline>.*)\d+\.root",
                                 filename).groupdict()["pipeline"]
            filenames_per_sample_per_pipeline.setdefault(
                sample, {}).setdefault(pipeline, []).append(filename)

    return filenames_per_sample_per_pipeline
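
# Usage sketch (not part of the original source): get_filenames takes both
# arguments packed into one list so it can be mapped over by
# tools.parallelize, as done in submission() below. The base path and sample
# name here are hypothetical placeholders.
def _example_get_filenames():
    base_path = "srm://grid-srm.physik.rwth-aachen.de:8443/srm/managerv2?SFN=/pnfs/physik.rwth-aachen.de/cms/store/user/<username>/"
    sample = "SomeSample"
    result = get_filenames([base_path, sample])
    for sample_name, filenames_per_pipeline in result.items():
        for pipeline, filenames in filenames_per_pipeline.items():
            log.info("%s / %s: %d cache files" % (sample_name, pipeline, len(filenames)))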
def list_of_files(path, recursive=False, gfal_ls_args=""):
	if (not "*" in path) and (not recursive):
		return [path]
	
	splitted_path = re.split("/(?:(?=[^/]*\*))", path, maxsplit=1)
	command = "gfal-ls {gfal_ls_args} {path}".format(gfal_ls_args=gfal_ls_args, path=splitted_path[0])
	log.debug(command)
	stdout, stderr = tools.subprocessCall(shlex.split(command))
	stdout = stdout.strip()
	if stdout == "":
		return []
	if stdout == path:
		return [stdout]
	
	new_paths = [os.path.join(splitted_path[0], item) for item in stdout.strip().split("\n")]
	if len(splitted_path) > 1:
		splitted_path_with_wildcard = re.split("(\*[^/]*/)", path, maxsplit=1)
		path_to_match = "".join(splitted_path_with_wildcard[:2]).rstrip("/")
		rest = "".join(splitted_path_with_wildcard[2:])
		new_paths = [os.path.join(path, rest).rstrip("/") for path in new_paths if fnmatch.fnmatch(path, path_to_match)]
	
	results = []
	for new_path in new_paths:
		results.extend(list_of_files(path=new_path, recursive=recursive, gfal_ls_args=gfal_ls_args))
	return sorted(results)
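
# Usage sketch (not part of the original source): list_of_files expands
# wildcard patterns one directory level per gfal-ls call and recurses into
# the matches. The pattern below is a hypothetical placeholder.
def _example_list_of_files():
    pattern = "srm://grid-srm.physik.rwth-aachen.de:8443/srm/managerv2?SFN=/pnfs/physik.rwth-aachen.de/cms/store/user/<username>/Svfit/*/SvfitCache_*.tar"
    for matched_path in list_of_files(pattern):
        log.info(matched_path)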
def submission(base_paths, n_processes=1):

    # retrieve and prepare input files
    filenames_per_sample_per_pipeline = {}
    for base_path in base_paths:
        stdout_directories, stderr_directories = tools.subprocessCall(
            shlex.split("gfal-ls " + base_path))
        tmp_filenames_per_sample_per_pipeline = tools.parallelize(
            get_filenames,
            [[base_path, sample]
             for sample in stdout_directories.decode().strip().split("\n")],
            n_processes=n_processes,
            description="Retrieving inputs")
        for item in tmp_filenames_per_sample_per_pipeline:
            for sample, filenames_per_pipeline in item.items():
                for pipeline, tmp_filenames in filenames_per_pipeline.items():
                    filenames_per_sample_per_pipeline.setdefault(
                        sample, {}).setdefault(pipeline, []).extend(tmp_filenames)
    configs, jobfiles = build_configs(filenames_per_sample_per_pipeline)

    # submit tasks
    submit_args = []
    for config, jobfile in zip(configs, jobfiles):
        submit_args.append([config, jobfile])
    tools.parallelize(submit,
                      submit_args,
                      n_processes=1,
                      description="Submitting crab tasks")
def _download_untar(args):
    tar_file = args[0]
    output_dir = args[1]
    # download the tarball into a temporary directory before unpacking
    downloaded_tar_file = os.path.join(tempfile.mkdtemp(), os.path.basename(tar_file))
    tools.subprocessCall(shlex.split("gfal-copy {tar_file} {downloaded_tar_file}".format(tar_file=tar_file, downloaded_tar_file=downloaded_tar_file)))
    tools.subprocessCall(shlex.split("tar -x -f {downloaded_tar_file} -C {output_dir} --overwrite".format(downloaded_tar_file=downloaded_tar_file, output_dir=output_dir)))
    # clean up the temporary download directory
    tools.subprocessCall(shlex.split("rm -rf {temp_dir}".format(temp_dir=os.path.dirname(downloaded_tar_file))))
def get_filenames(args):
    base_path, sample = args[0], args[1]

    filenames_per_sample_per_pipeline = {}

    stdout, stderr = tools.subprocessCall(shlex.split("gfal-ls " + os.path.join(base_path, sample)))
    filenames = [filename for filename in stdout.decode().strip().split("\n") if (("SvfitCache" in filename) and filename.endswith(".root"))]
    if len(filenames) > 0:
        filenames = [os.path.join(base_path, sample, filename) for filename in filenames]
        for filename in filenames:
            # convert local/dcap/srm paths into XRootD URLs
            filename = dcachetools.xrd2xrd(dcachetools.srm2xrd(dcachetools.dcap2xrd(dcachetools.local2xrd(filename))))
            pipeline = re.search(r"SvfitCache(?P<pipeline>.*)\d+\.root", filename).groupdict()["pipeline"]
            filenames_per_sample_per_pipeline.setdefault(sample, {}).setdefault(pipeline, []).append(filename)

    return filenames_per_sample_per_pipeline
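
# Note (not part of the original source): the chained dcachetools helpers
# above convert local, dcap and srm paths into XRootD URLs. A minimal
# stand-in for the RWTH SRM prefix, mirroring the filename_replacements
# mapping of the first get_filenames variant:
def _srm2xrd_sketch(filename):
    return filename.replace(
        "srm://grid-srm.physik.rwth-aachen.de:8443/srm/managerv2?SFN=/pnfs/physik.rwth-aachen.de/cms/store/user/",
        "root://grid-vo-cms.physik.rwth-aachen.de:1094//store/user/")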
def _get_crab_outputs(args):
    crab_dir = args[0]
    jobids = args[1]
    command = "crab getoutput --dump --jobids {jobids} -d {crab_dir}".format(crab_dir=crab_dir, jobids=jobids)
    log.debug(command)
    stdout, stderr = tools.subprocessCall(shlex.split(command))
    files = re.findall(r"PFN:\s*(?P<path>.*)\s", stdout.decode())

    # the PFNs reported by crab are not returned directly; instead the first
    # one is generalized into a wildcard pattern covering all jobs, which is
    # then listed on the storage element
    search_pattern = re.sub(r"SvfitCache_[0-9]*\.tar", "SvfitCache_*.tar", files[0])
    while True:
        new_search_pattern = re.sub("/[0-9_]+/", "/*/", search_pattern)
        if search_pattern == new_search_pattern:
            break
        search_pattern = new_search_pattern
    files = dcachetools.list_of_files(path=search_pattern, recursive=False, gfal_ls_args="")
    return files
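
# Usage sketch (not part of the original source): crab_dir is a CRAB project
# directory and jobids is the string handed to "crab getoutput --jobids".
# Both values below are hypothetical placeholders.
def _example_get_crab_outputs():
    tar_files = _get_crab_outputs(["crab_projects/crab_Svfit_sample", "1-100"])
    log.info("found %d SvfitCache tar files" % len(tar_files))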
def submission(base_path):

    today = datetime.date.today().strftime("%Y-%m-%d")
    max_n_files_per_task = 8000
    filename_replacements = {
        "srm://grid-srm.physik.rwth-aachen.de:8443/srm/managerv2?SFN=/pnfs/physik.rwth-aachen.de/cms/store/user/": "root://grid-vo-cms.physik.rwth-aachen.de:1094//store/user/"
    }

    # retrieve and prepare input files
    stdout_directories, stderr_directories = tools.subprocessCall(shlex.split("gfal-ls " + base_path))
    for sample in stdout_directories.decode().strip().split("\n"):
        stdout_files, stderr_files = tools.subprocessCall(shlex.split("gfal-ls " + os.path.join(base_path, sample)))
        filenames = [filename for filename in stdout_files.decode().strip().split("\n") if (("SvfitCache" in filename) and filename.endswith(".root"))]
        if len(filenames) > 0:
            filenames = [os.path.join(base_path, sample, filename) for filename in filenames]
            pipelines_filenames = {}
            for filename in filenames:
                for src, dst in filename_replacements.items():
                    filename = filename.replace(src, dst)
                pipeline = re.search(r"SvfitCache(?P<pipeline>.*)\d+\.root", filename).groupdict()["pipeline"]
                pipelines_filenames.setdefault(pipeline, []).append(filename)

            for pipeline, filenames in pipelines_filenames.items():
                filenames_chunks = [filenames[index:index + max_n_files_per_task] for index in range(0, len(filenames), max_n_files_per_task)]
                for index, filenames_chunk in enumerate(filenames_chunks):

                    # create job scripts
                    jobfile_name = "svfit_%s_%s_%s_%d.sh" % (today, sample, pipeline, index)
                    with open(jobfile_name, "w+") as jobfile:
                        jobfile.write(read_file(os.path.expandvars("$CMSSW_BASE/src/HiggsAnalysis/KITHiggsToTauTau/data/templates/crab_userjob_prefix.sh")))

                        svfit_code = string.Template(read_file(os.path.expandvars("$CMSSW_BASE/src/HiggsAnalysis/KITHiggsToTauTau/data/templates/crab_svfit.sh")))
                        jobfile.write(svfit_code.safe_substitute(
                                input_files="\n".join("arr[%d,0]=%s" % (i + 1, f) for i, f in enumerate(filenames_chunk)),
                                cwd=os.getcwd()
                        ))

                    # crab configuration
                    config = CRABClient.UserUtilities.config()
                    config.General.workArea = os.path.abspath(os.path.expandvars("$ARTUS_WORK_BASE/../svfit_caches/%s/" % (today)))
                    config.General.transferOutputs = True
                    config.General.transferLogs = True
                    config.General.requestName = ("%s_%s_%d" % (sample, pipeline, index))[:100]
                    log.info("Job name: " + config.General.requestName)
                    config.Data.outputPrimaryDataset = "Svfit"
                    config.Data.splitting = "EventBased"
                    config.Data.unitsPerJob = 1
                    config.Data.totalUnits = len(filenames_chunk)
                    config.Data.publication = False
                    config.Data.outputDatasetTag = config.General.requestName
                    config.Data.outLFNDirBase = "/store/user/%s/higgs-kit/Svfit/%s/" % (getUsernameFromSiteDB(), today)
                    log.info("Output directory: " + config.Data.outLFNDirBase)

                    config.User.voGroup = "dcms"

                    config.JobType.pluginName = "PrivateMC"
                    config.JobType.psetName = os.environ["CMSSW_BASE"] + "/src/CombineHarvester/CombineTools/scripts/do_nothing_cfg.py"
                    # config.JobType.inputFiles = ["Kappa/lib/libKappa.so", os.environ["CMSSW_BASE"]+"/bin/"+os.environ["SCRAM_ARCH"]+"/ComputeSvfit", jobfile_name]
                    config.JobType.inputFiles = [os.path.expandvars("$CMSSW_BASE/bin/$SCRAM_ARCH/ComputeSvfit"), jobfile_name]
                    config.JobType.allowUndistributedCMSSW = True
                    config.JobType.scriptExe = jobfile_name
                    config.JobType.outputFiles = ["SvfitCache.tar"]

                    config.Site.storageSite = "T2_DE_DESY"
                    # config.Site.blacklist = ["T3_US_PuertoRico", "T2_ES_CIEMAT", "T2_DE_RWTH", "T3_US_Colorado", "T2_BR_UERJ", "T2_ES_IFCA", "T2_RU_JINR", "T2_UA_KIPT", "T2_EE_Estonia", "T2_FR_GRIF_LLR", "T2_CH_CERN", "T2_FR_GRIF_LLR", "T3_IT_Bologna", "T2_US_Nebraska", "T2_US_Nebraska", "T3_TW_NTU_HEP", "T2_US_Caltech", "T3_US_Cornell", "T2_IT_Legnaro", "T2_HU_Budapest", "T2_IT_Pisa", "T2_US_Florida", "T2_IT_Bari", "T2_FR_GRIF_IRFU", "T2_IT_Rome", "T2_FR_GRIF_IRFU", "T2_CH_CSCS", "T3_TW_NCU"]

                    # run each submission in a separate process
                    p = Process(target=submit, args=(config,))
                    p.start()
                    p.join()

                    os.remove(jobfile_name)
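
# Usage sketch (not part of the original source): a minimal driver for the
# submission(base_path) variant above; the base path is a hypothetical
# placeholder.
def _example_submission_single_path():
    submission("srm://grid-srm.physik.rwth-aachen.de:8443/srm/managerv2?SFN=/pnfs/physik.rwth-aachen.de/cms/store/user/<username>/")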