Example #1
 def cat_job(self, jobids, jobinfo, print_stderr=None, store=False):
     """ print standard output of a given job"""
     dir_name = self.get_stdout_dir_name(
         self.get_local_dir_name(jobinfo["runcard"], jobinfo["runfolder"]))
     # jobids = length 1 for SLURM jobs - just take the only element here
     jobid = jobids[0]
     output = []
     if jobinfo["jobtype"] == "Production" or "Socket" in jobinfo["jobtype"]:
         for subjobno in range(1, int(jobinfo["no_runs"]) + 1):
             stdoutfile = os.path.join(
                 dir_name, "slurm-{0}_{1}.out".format(jobid, subjobno))
             if print_stderr:
                 stdoutfile = stdoutfile.replace(".out", ".err")
             cmd = ["cat", stdoutfile]
             if not store:
                 util.spCall(cmd)
             else:
                 output.append(
                     util.getOutputCall(cmd,
                                        suppress_errors=True,
                                        include_return_code=False))
     else:
         stdoutfile = os.path.join(dir_name, F"slurm-{jobid}.out")
         if print_stderr:
             stdoutfile = stdoutfile.replace(".out", ".err")
         cmd = ["cat", stdoutfile]
         if not store:
             util.spCall(cmd)
         else:
             output.append(
                 util.getOutputCall(cmd,
                                    suppress_errors=True,
                                    include_return_code=False))
     if store:
         return output
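A hypothetical invocation of the SLURM cat_job above, to make the assumed jobinfo keys explicit; the backend object, job id and dictionary values are invented for illustration:

# Hypothetical usage sketch; the jobinfo keys are inferred from the
# snippet above and the values are made up.
jobinfo = {
    "jobtype": "Production",
    "runcard": "runcard.run",
    "runfolder": "my_run",
    "no_runs": "10",
}
# Prints slurm-<jobid>_1.out ... slurm-<jobid>_10.out to the terminal.
backend.cat_job(["123456"], jobinfo, print_stderr=False, store=False)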
Example #2
    def cat_log_job(self, jobids, jobinfo, *args, **kwargs):
        import re
        import glob
        run_dir = self.get_local_dir_name(jobinfo["runcard"],
                                          jobinfo["runfolder"])
        log_files = [i for i in os.listdir(run_dir) if i.endswith(".log")]

        if jobinfo["iseed"] is None:
            jobinfo["iseed"] = 1
        expected_seeds = set(
            range(int(jobinfo["iseed"]),
                  int(jobinfo["iseed"]) + int(jobinfo["no_runs"])))

        logseed_regex = re.compile(r"\.s([0-9]+)\.[^.]+$")
        logseeds_in_dir = set()
        for logname in glob.glob('{0}/*.log'.format(run_dir)):
            match = logseed_regex.search(logname)
            if match is not None:  # skip log files without a .s<seed>. tag
                logseeds_in_dir.add(int(match.group(1)))
        seeds_to_print = logseeds_in_dir.union(expected_seeds)

        cat_logs = []
        for log_file in log_files:
            for seed in seeds_to_print:
                if F".s{seed}." in log_file:
                    cat_logs.append(log_file)
                    seeds_to_print.remove(seed)
                    break

        for log in cat_logs:
            cmd = ["cat", os.path.join(run_dir, log)]
            util.spCall(cmd)
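A minimal, self-contained check of the seed regex used above; the file names are invented for illustration:

import re

# Same pattern as in cat_log_job: capture the seed in names like "RUN.s100.log"
logseed_regex = re.compile(r"\.s([0-9]+)\.[^.]+$")

for name in ["RUN.s100.log", "RUN.s101.log", "RUN.log"]:
    match = logseed_regex.search(name)
    print(name, "->", int(match.group(1)) if match else None)
# RUN.s100.log -> 100
# RUN.s101.log -> 101
# RUN.log -> None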
Example #3
def run_test(args, runcard):
    # header.debug_level = 99999

    if args.runArc:
        from pyHepGrid.src.runArcjob import testWrapper
    elif args.runArcProduction:
        from pyHepGrid.src.runArcjob import testWrapperProduction as testWrapper
    elif args.runDirac:
        from pyHepGrid.src.runDiracjob import testWrapper
    elif args.runSlurm:
        from pyHepGrid.src.runSlurmjob import testWrapper
    elif args.runSlurmProduction:
        from pyHepGrid.src.runSlurmjob import testWrapperProduction \
            as testWrapper
    else:
        raise Exception("Choose what you want to test -(A/B/D/E/F)")

    rncards, dCards = util.expandCard(runcard)

    # if args.runSlurm:
    #     header.runfile = header.SLURMSCRIPTDEFAULT
    # if args.runSlurmProduction:
    #     header.runfile = header.SLURMSCRIPTDEFAULT_PRODUCTION

    setup()

    for r in rncards:
        nnlojob_args = testWrapper(r, dCards).replace("\"", "").split()
        runfile = os.path.basename(header.runfile)
        util.spCall(["chmod", "+x", runfile])
        util.spCall(["./{0}".format(runfile)] + nnlojob_args)
Example #4
 def clean_job(self, jobids):
     """ remove the sandbox of a given job (including its stdout!) from
     the arc storage """
     self._press_yes_to_continue(
         "  \033[93m WARNING:\033[0m You are about to clean the job!")
     for jobid in jobids:
         cmd = [self.cmd_clean, "-j", header.arcbase, jobid.strip()]
         util.spCall(cmd)
Example #5
    def kill_job(self, jobids, jobinfo):
        header.logger.debug("%s %s", jobids, jobinfo)
        if len(jobids) == 0:
            header.logger.critical(
                "No jobids stored associated with this database entry, "
                "therefore nothing to kill.")

        for jobid in jobids:
            util.spCall(["scancel", str(jobid)])
Example #6
 def status_job(self, jobids, verbose=False):
     """ print the current status of a given job """
     cmd = [self.cmd_stat, "-j", header.arcbase]
     jobids = [jobid.strip() for jobid in jobids]
     if len(jobids) == 0:
         header.logger.critical("No jobs selected")
     cmd = cmd + jobids
     if verbose:
         cmd += ["-l"]
     util.spCall(cmd)
Example #7
 def cat_job(self, jobids, jobinfo, print_stderr=None, store=False):
     """ print standard output of a given job"""
     out = []
     for jobid in jobids:
         cmd = [self.cmd_print, "-j", header.arcbase, jobid.strip()]
         if print_stderr:
             cmd += ["-e"]
         if not store:
             util.spCall(cmd)
         else:
             out.append(util.getOutputCall(cmd, include_return_code=False))
     if store:
         return out
Example #8
    def kill_job(self, jobids, jobinfo):
        """ kill all jobs associated with this run """
        self._press_yes_to_continue(
            "  \033[93m WARNING:\033[0m You are about to kill all jobs for "
            "this run!")

        if len(jobids) == 0:
            header.logger.critical(
                "No jobids stored associated with this database entry, "
                "therefore nothing to kill.")

        cmd = [self.cmd_kill] + jobids
        util.spCall(cmd)
Example #9
 def status_job(self, jobids, verbose=False):
     """ print the current status of a given job """
     # for jobid in jobids:
     #     cmd = [self.cmd_stat, "-j", header.arcbase, jobid.strip()]
     #     if verbose:
     #         cmd += ["-l"]
     #     util.spCall(cmd)
     cmd = [self.cmd_stat, "-j", header.arcbase]
     print(header.arcbase)
     jobids = [jobid.strip() for jobid in jobids]
     cmd = cmd + jobids
     if verbose:
         cmd += ["-l"]
     util.spCall(cmd)
Example #10
 def cat_log_job(self, jobids, jobinfo):
     """Sometimes the std output doesn't get updated
     but we can choose to access the logs themselves"""
     output_folder = ["file:///tmp/"]
     cmd_base = ["arccp", "-i"]
     cmd_str = "cat /tmp/"
     for jobid in jobids:
         files = util.getOutputCall(["arcls", jobid]).split()
         logfiles = [i for i in files if i.endswith(".log")]
         for logfile in logfiles:
             cmd = cmd_base + [os.path.join(jobid, logfile)] + output_folder
             output = util.getOutputCall(cmd).split()
             for text in output:
                 if ".log" in text:
                     util.spCall((cmd_str + text).split())
Example #11
    def _do_extract_outputData(self, tarfile):
        """ Multithread wrapper used in get_data_production
            for untaring files
        """
        # It assumes log and dat folder are already there
        if not os.path.isfile(tarfile):
            logger.info("{0} not found".format(tarfile))
            return -1

        out_dict = {".log": "log/", ".dat": "dat/"}
        self.tarw.extract_extension_to(tarfile, out_dict)

        util.spCall(["rm", tarfile])

        return 0
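A sketch of how a wrapper like this might be fanned out over worker threads; _multirun's real signature is not shown on this page, so the pool below is only an assumption:

from multiprocessing.pool import ThreadPool

# Illustrative only: extract a batch of downloaded tarballs in parallel.
# `backend` is a hypothetical instance of the class defining the method.
tarfiles = ["output-100.tar.gz", "output-101.tar.gz"]
with ThreadPool(4) as pool:
    results = pool.map(backend._do_extract_outputData, tarfiles)
# Each call returns 0 on success or -1 if the tarball was missing.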
Example #12
    def kill_job(self, jobids, jobinfo):
        """ kills given job """
        self._press_yes_to_continue(
            "  \033[93m WARNING:\033[0m You are about to kill the job!")

        if len(jobids) == 0:
            header.logger.critical(
                "No jobids stored associated with this database entry, "
                "therefore nothing to kill.")

        # Kill in groups of 150 for speeeed
        for jobid_set in util.batch_gen(jobids, 150):
            stripped_set = [i.strip() for i in jobid_set]
            cmd = [self.cmd_kill, "-j", header.arcbase] + stripped_set
            header.logger.debug("job_kill batch length:{0}".format(
                len(stripped_set)))
            util.spCall(cmd)
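util.batch_gen itself is not shown on this page; judging from the call site it yields successive fixed-size chunks, so a minimal sketch might look like this:

def batch_gen(iterable, batch_size):
    """Yield lists of at most batch_size consecutive items.

    A guess at util.batch_gen based on how it is used above;
    the real implementation may differ.
    """
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # emit the final, possibly short, batch
        yield batch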
Example #13
 def bring_current_warmup(self, db_id):
     """ Sometimes we want to retrieve the warmup before the job finishes """
     cmd_base = ["gfal-copy", "-v"]
     fields = ["pathfolder", "runfolder", "jobid"]
     data = self.dbase.list_data(self.table, fields, db_id)[0]
     runfolder = data["runfolder"]
     finfolder = data["pathfolder"] + "/" + runfolder + "/"
     if header.finalisation_script is not None:
         finfolder = header.default_runfolder
     jobids = data["jobid"].split()
     output_folder = ["file://" + finfolder]
     for jobid in jobids:
         cmd = cmd_base + [jobid + "/*.y*"] + output_folder
         util.spCall(cmd)
         cmd = cmd_base + [jobid + "/*.log"] + output_folder
         util.spCall(cmd)
     print("Warmup stored at {0}".format(finfolder))
Example #14
 def _get_data_warmup(self, db_id):
     """
     Given a database entry, retrieve its data from the warmup folder to the
     folder defined in said database entry For arc jobs stdoutput will be
     downloaded in said folder as well
     """
     # Retrieve data from database
     from pyHepGrid.src.header import arcbase, grid_warmup_dir
     fields = ["runcard", "runfolder", "jobid", "pathfolder"]
     data = self.dbase.list_data(self.table, fields, db_id)[0]
     runfolder = data["runfolder"]
     finfolder = data["pathfolder"] + "/" + runfolder
     runcard = data["runcard"]
     jobids = data["jobid"].split()
     util.spCall(["mkdir", "-p", finfolder])
     logger.info("Retrieving ARC output into " + finfolder)
     try:
         # Retrieve ARC standard output for every job of this run
         for jobid in jobids:
             logger.info(jobid)
             cmd = [self.cmd_get, "-j", arcbase, jobid.strip()]
             output = util.getOutputCall(cmd, include_return_code=False)
             outputfol = output.split("Results stored at: ")[1].rstrip()
             outputfolder = outputfol.split("\n")[0]
             if outputfolder == "" or (len(outputfolder.split(" ")) > 1):
                 logger.info("Running mv and rm command is not safe here")
                 logger.info("Found blank spaces in the output folder")
                 logger.info(
                     "Nothing will be moved to the warmup global folder")
             else:
                 destination = finfolder + "/" + "arc_out_" + runcard + \
                     outputfolder
                 util.spCall(["mv", outputfolder, destination])
                 # util.spCall(["rm", "-rf", outputfolder])
     except Exception:
         logger.info("Couldn't find job output in the ARC server")
         logger.info("jobid: " + jobid)
         logger.info("Run arcstat to check the state of the job")
         logger.info("Trying to retrieve data from grid storage anyway")
     # Retrieve warmup from the grid storage warmup folder
     wname = self.warmup_name(runcard, runfolder)
     self.gridw.bring(wname, grid_warmup_dir, finfolder + "/" + wname)
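The "Results stored at: " parsing above can be demonstrated in isolation; the sample string below is invented and only mimics the line the code relies on:

# Invented sample output; the real arcget message format may differ.
output = "Results stored at: /tmp/arc/abc123\nJobs processed: 1"
outputfol = output.split("Results stored at: ")[1].rstrip()
outputfolder = outputfol.split("\n")[0]
print(outputfolder)  # -> /tmp/arc/abc123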
Example #15
 def update_stdout(self):
     """ retrieves stdout of all running jobs and store the current state
     into its correspondent folder
     """
     fields = ["rowid", "jobid", "pathfolder", "runfolder"]
     dictC = self._db_list(fields)
     for job in dictC:
         # Retrieve data from database
         jobid = str(job['jobid'])
         rfold = str(job['runfolder'])
         pfold = str(job['pathfolder']) + "/" + rfold
         flnam = pfold + "/stdout"
         # Create target folder if it doesn't exist
         if not os.path.exists(pfold):
             os.makedirs(pfold)
         cmd = self.cmd_print + ' ' + jobid.strip()
         # It seems script is the only right way to save data with arc
         stripcm = ['script', '-c', cmd, '-a', 'tmpscript.txt']
         mvcmd = ['mv', 'tmpscript.txt', flnam]
         util.spCall(stripcm)
         util.spCall(mvcmd)
Example #16
    def init_production(self, provided_warmup=None, continue_warmup=False,
                        local=False):
        """ Initialises a production run. If a warmup file is provided
        retrieval step is skipped
        Steps are:
            0 - Retrieve warmup from the grid/local
            1 - tar up executable, runcard and necessary files
            2 - sent it to the grid storage
        """
        from shutil import copy
        import tempfile
        from pyHepGrid.src.header import runcardDir as runFol
        from pyHepGrid.src.header import executable_exe, executable_src_dir, logger

        if local:
            self.init_local_production(provided_warmup=provided_warmup)
            return

        rncards, dCards = util.expandCard()
        path_to_exe_full = self._exe_fullpath(executable_src_dir, executable_exe)

        origdir = os.path.abspath(os.getcwd())
        tmpdir = tempfile.mkdtemp()

        # if provided warmup is a relative path, ensure we have the full path
        # before we change to the tmp directory
        if provided_warmup:
            if provided_warmup[0] != "/":
                provided_warmup = "{0}/{1}".format(origdir, provided_warmup)

        os.chdir(tmpdir)
        logger.debug("Temporary directory: {0}".format(tmpdir))

        if not os.path.isfile(path_to_exe_full):
            logger.critical("Could not find executable at {0}".format(path_to_exe_full))
        copy(path_to_exe_full, os.getcwd())
        files = [executable_exe]
        for idx, i in enumerate(rncards):
            logger.info("Initialising {0} [{1}/{2}]".format(i, idx+1, len(rncards)))
            local = False
            # Check whether warmup/production is active in the runcard
            runcard_file = os.path.join(runFol, i)
            runcard_obj = PROGRAMruncard(runcard_file, logger=logger,
                                         use_cvmfs=header.use_cvmfs_lhapdf,
                                         cvmfs_loc=header.cvmfs_lhapdf_location)
            multichannel = self.check_runcard_multichannel(runcard_obj)
            self._check_production(runcard_obj)
            rname = dCards[i]
            tarfile = i + rname + ".tar.gz"
            copy(os.path.join(runFol, i), os.getcwd())
            if provided_warmup:
                match, local = self._get_local_warmup_name(
                    runcard_obj.warmup_filename(), provided_warmup)
                warmupFiles = [match]
            elif header.provided_warmup_dir:
                match, local = self._get_local_warmup_name(
                    runcard_obj.warmup_filename(), header.provided_warmup_dir)
                warmupFiles = [match]
            else:
                logger.info("Retrieving warmup file from grid")
                warmupFiles = self._bring_warmup_files(
                    i, rname, shell=True, multichannel=multichannel)
            self.tarw.tarFiles(files + [i] + warmupFiles, tarfile)
            if self.gridw.checkForThis(tarfile, header.grid_input_dir):
                logger.info("Removing old version of {0} from Grid "
                            "Storage".format(tarfile))
                self.gridw.delete(tarfile, header.grid_input_dir)
            logger.info("Sending {0} to GFAL {1}/".format(
                tarfile, header.grid_input_dir))
            self.gridw.send(tarfile, header.grid_input_dir, shell=True)
            if local:
                util.spCall(["rm", i, tarfile])
            else:
                util.spCall(["rm", i, tarfile] + warmupFiles)
        os.remove(executable_exe)
        os.chdir(origdir)
Example #17
    def _get_data_production(self, db_id):
        """ Given a database entry, retrieve its data from
        the output folder to the folder defined in said db entry
        """
        logger.info("You are going to download all folders corresponding to this runcard from grid output")
        logger.info("Make sure all runs are finished using the -s or -S options!")
        fields       = ["runfolder", "runcard", "jobid", "pathfolder", "iseed"]
        data         = self.dbase.list_data(self.table, fields, db_id)[0]
        self.rcard   = data["runcard"]
        self.rfolder = data["runfolder"]
        pathfolderTp = data["pathfolder"]
        initial_seed = data["iseed"]
        pathfolder   = util.sanitiseGeneratedPath(pathfolderTp, self.rfolder)
        jobids       = data["jobid"].split(" ")
        if initial_seed == "None":
            initial_seed = self.bSeed
        else:
            initial_seed = int(initial_seed)
        # Only compute the final seed once initial_seed is a valid integer
        finalSeed    = int(initial_seed) + len(jobids)
        while True:
            firstName = self.output_name(self.rcard, self.rfolder, initial_seed)
            finalName = self.output_name(self.rcard, self.rfolder, finalSeed)
            logger.info("The starting filename is {}".format(firstName))
            logger.info("The final filename is {}".format(finalName))
            yn = self._press_yes_to_continue("If you are ok with this, "
                                             "press y", fallback=-1)
            if yn == 0:
                break
            initial_seed = int(
                input("Please, introduce the starting seed (ex: 400): "))
            finalSeed = int(
                input("Please, introduce the final seed (ex: 460): "))
        try:
            os.makedirs(self.rfolder)
        except OSError as err:
            if err.errno == 17:
                logger.info("Tried to create folder %s in this directory".format(self.rfolder))
                logger.info("to no avail. We are going to assume the directory was already there")
                self._press_yes_to_continue("", "Folder {} already exists".format(self.rfolder))
            else:
                raise
        os.chdir(self.rfolder)
        # mkdir -p behaviour: don't fail if the directories already exist
        os.makedirs("log", exist_ok=True)
        os.makedirs("dat", exist_ok=True)
        seeds = range(initial_seed, finalSeed)
        # If we only act on a subrange of jobids (i.e. the ones which are
        # done) choose only those seeds
        if self.act_only_on_done:
            old_status = self._get_old_status(db_id)
            if old_status:
                new_seed = []
                for seed, stat in zip(seeds, old_status):
                    if stat == self.cDONE:
                        new_seed.append(seed)
                seeds = new_seed

        from pyHepGrid.src.header import finalise_no_cores as n_threads
        # Check which of the seeds actually produced some data
        all_remote = self.output_name_array(self.rcard, self.rfolder, seeds)
        all_output = self.gridw.get_dir_contents(header.grid_output_dir).split()
        remote_tarfiles = list(set(all_remote) & set(all_output))
        logger.info("Found data for {0} of the {1} seeds.".format(len(remote_tarfiles), len(seeds)))

        # Download said data
        tarfiles = self._multirun(self._do_get_data, remote_tarfiles,
                                  n_threads, use_counter=True)
        tarfiles = list(filter(None, tarfiles))
        logger.info("Downloaded {0} files, extracting...".format(
            len(tarfiles)))

        # Extract some information from the first tarfile
        for tarfile in tarfiles:
            if self._extract_output_warmup_data(tarfile):
                break

        # Extract all
        self._multirun(self._do_extract_outputData, tarfiles, n_threads)
        os.chdir("..")
        logger.info("Everything saved at {0}".format(pathfolder))
        util.spCall(["mv", self.rfolder, pathfolder])
Example #18
 def renew_proxy(self, jobids):
     """ renew proxy for a given job """
     for jobid in jobids:
         cmd = [self.cmd_renew, jobid.strip()]
         util.spCall(cmd)
Example #19
 def cat_job(self, jobids, jobinfo, print_stderr=None):
     print("Printing the last 20 lines of the last job")
     jobid = jobids[-1]
     cmd = [self.cmd_print, jobid.strip()]
     util.spCall(cmd)
Example #20
 def do_status_job(self, jobid):
     """ multiproc wrapper for status_job """
     cmd = [self.cmd_stat, jobid]
     util.spCall(cmd, suppress_errors=True)
     return 0
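Given the docstring, do_status_job is presumably mapped over many job ids from a pool; a hedged sketch (ids and pool size invented, backend hypothetical):

from concurrent.futures import ThreadPoolExecutor

job_ids = ["1001", "1002", "1003"]  # invented ids
with ThreadPoolExecutor(max_workers=4) as pool:
    # Each worker prints the status of one job via the scheduler CLI.
    list(pool.map(backend.do_status_job, job_ids))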
Example #21
    def _bring_warmup_files(self, runcard, rname, shell=False,
                            check_only=False, multichannel=False):
        """ Download the warmup file for a run to local directory
        extracts Vegas grid and log file and returns a list with their names

        check_only flag doesn't error out if the warmup doesn't exist, instead just returns
        and empty list for later use [intended for checkwarmup mode so multiple warmups can
        be checked consecutively.
        """

        from pyHepGrid.src.header import grid_warmup_dir, logger
        gridFiles = []
        suppress_errors = False
        if check_only:
            suppress_errors = True
        ## First bring the warmup .tar.gz
        outnm = self.warmup_name(runcard, rname)
        logger.debug("Warmup GFAL name: {0}".format(outnm))
        tmpnm = "tmp.tar.gz"
        logger.debug("local tmp tar name: {0}".format(tmpnm))
        success = self.gridw.bring(outnm, grid_warmup_dir, tmpnm, shell=shell,
                                   suppress_errors=suppress_errors)
        # Validate the pulled tarball and update the success flag
        success = self.__check_pulled_warmup(success, tmpnm,
                                             warmup_extensions)

        if not success and not check_only:
            if self._press_yes_to_continue(
                    "Grid files failed to copy. "
                    "Try backups from individual sockets?") == 0:
                backup_dir = os.path.join(grid_warmup_dir,
                                          outnm.replace(".tar.gz", ""))
                backups = self.gridw.get_dir_contents(backup_dir)
                if len(backups) == 0:
                    logger.critical("No backups found. Did the warmup complete successfully?")
                else:
                    backup_files = backups.split()
                    for idx, backup in enumerate(backup_files):
                        logger.info("Attempting backup {1} [{0}]".format(
                            idx + 1, backup))
                        success = self.gridw.bring(backup, backup_dir, tmpnm,
                                                   shell=shell, force=True)
                        # Validate this backup and update the success flag
                        success = self.__check_pulled_warmup(
                            success, tmpnm, warmup_extensions)
                        if success:
                            break

        if not success:
            logger.critical("Grid files failed to copy. Did the warmup complete successfully?")
        else:
            logger.info("Grid files copied ok.")

        ## Now extract only the Vegas grid files and log file
        # Copy the list: += on the shared list would mutate warmup_extensions
        gridp = list(warmup_extensions)
        gridp += [i + "_channel" for i in gridp]
        extractFiles = self.tarw.extract_extensions(
            tmpnm, gridp + [".log", ".txt", "channels"])
        try:
            gridFiles = [i for i in extractFiles if ".log" not in i]
            logfile = [i for i in extractFiles if ".log" in i][0]
        except IndexError:
            if not check_only:
                logger.critical("Logfile not found. Did the warmup complete successfully?")
            else:
                return []

        if multichannel and not any("channels" in i for i in gridFiles):
            logger.critical("No multichannel warmup found, but multichannel "
                            "is set in the runcard.")
        elif multichannel:
            logger.info("Multichannel warmup files found.")
        if gridFiles == [] and not check_only:  # No grid files found
            logger.critical("Grid files not found in warmup tarfile. "
                            "Did the warmup complete successfully?")
        elif gridFiles == []:
            return []

        ## Tag log file as -warmup
        newlog = logfile + "-warmup"
        os.rename(logfile, newlog)
        # Remove temporary tar files
        os.remove(tmpnm)
        gridFiles.append(newlog)
        # Make sure access to the file is correct!
        for i in gridFiles:
            util.spCall(["chmod", "a+wrx", i])
        return gridFiles