def test_exists(self):
    io_parser(self.existingFolder, varname="existingFolder",
              isfile=False, exists=True)
    with pytest.raises(SPYIOError):
        io_parser(self.existingFolder, varname="existingFolder",
                  isfile=False, exists=False)
    io_parser(self.nonExistingFolder, varname="nonExistingFolder", exists=False)
    with pytest.raises(SPYIOError):
        io_parser(self.nonExistingFolder, varname="nonExistingFolder", exists=True)
def esi_cluster_setup(partition="8GBS", n_jobs=2, mem_per_job=None,
                      timeout=180, interactive=True, start_client=True,
                      **kwargs):
    """
    Start a distributed Dask cluster of parallel processing workers using SLURM
    (or local multi-processing)

    Parameters
    ----------
    partition : str
        Name of SLURM partition/queue to use
    n_jobs : int
        Number of jobs to spawn
    mem_per_job : None or str
        Memory booking for each job. Can be specified either in megabytes
        (e.g., ``mem_per_job = "1500MB"``) or gigabytes (e.g., ``mem_per_job = "2GB"``).
        If `mem_per_job` is `None`, an attempt is made to infer a sane default
        value from the chosen queue, e.g., for ``partition = "8GBS"``
        `mem_per_job` is automatically set to the allowed maximum of `'8GB'`.
        However, even in queues with guaranteed memory bookings, it is possible
        to allocate less memory than the allowed maximum per job to spawn
        numerous low-memory jobs. See Examples for details.
    timeout : int
        Number of seconds to wait for requested jobs to start up.
    interactive : bool
        If `True`, user input is required in case not all jobs could be started
        in the provided waiting period (determined by `timeout`). If
        `interactive` is `False` and the jobs could not be started within
        `timeout` seconds, a `TimeoutError` is raised.
    start_client : bool
        If `True`, a distributed computing client is launched and attached to
        the workers. If `start_client` is `False`, only a distributed computing
        cluster is started to which compute-clients can connect.
    **kwargs : dict
        Additional keyword arguments can be used to control job-submission details.

    Returns
    -------
    proc : object
        A distributed computing client (if ``start_client = True``) or
        a distributed computing cluster (otherwise).

    Examples
    --------
    The following command launches 10 SLURM jobs with 2 gigabytes memory each
    in the `8GBS` partition

    >>> spy.esi_cluster_setup(n_jobs=10, partition="8GBS", mem_per_job="2GB")

    If you want to access properties of the created distributed computing client,
    assign an explicit return quantity, i.e.,

    >>> client = spy.esi_cluster_setup(n_jobs=10, partition="8GBS", mem_per_job="2GB")

    The underlying distributed computing cluster can be accessed using

    >>> client.cluster

    Notes
    -----
    Syncopy's parallel computing engine relies on the concurrent processing
    library `Dask <https://docs.dask.org/en/latest/>`_. Thus, the distributed
    computing clients used by Syncopy are in fact instances of
    :class:`dask.distributed.Client`. This function specifically acts as a
    wrapper for :class:`dask_jobqueue.SLURMCluster`. Users familiar with Dask
    in general, and its distributed scheduler and cluster objects in particular,
    may leverage Dask's entire API to fine-tune parallel processing jobs to
    their liking (if wanted).

    See also
    --------
    cluster_cleanup : remove dangling parallel processing job-clusters
    """

    # For later reference: dynamically fetch name of current function
    funcName = "Syncopy <{}>".format(inspect.currentframe().f_code.co_name)

    # Be optimistic: prepare success message
    successMsg = "{name:s} Cluster dashboard accessible at {dash:s}"

    # Retrieve all partitions currently available in SLURM
    out, err = subprocess.Popen("sinfo -h -o %P",
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                text=True, shell=True).communicate()
    if len(err) > 0:

        # SLURM is not installed, either allocate `LocalCluster` or just leave
        if "sinfo: not found" in err:
            if interactive:
                msg = "{name:s} SLURM does not seem to be installed on this machine " +\
                      "({host:s}). Do you want to start a local multi-processing " +\
                      "computing client instead? "
                startLocal = user_yesno(msg.format(name=funcName, host=socket.gethostname()),
                                        default="no")
            else:
                startLocal = True
            if startLocal:
                client = Client()
                successMsg = "{name:s} Local parallel computing client ready. \n" + successMsg
                print(successMsg.format(name=funcName, dash=client.cluster.dashboard_link))
                if start_client:
                    return client
                return client.cluster
            return

        # SLURM is installed, but something's wrong
        msg = "Cannot access SLURM queuing system from node {node:s}. " +\
              "Original error message below:\n{error:s}"
        raise SPYIOError(msg.format(node=socket.gethostname(), error=err))
    options = out.split()

    # Make sure we're in a valid partition (exclude IT partitions from output message)
    if partition not in options:
        valid = list(set(options).difference(["DEV", "PPC"]))
        raise SPYValueError(legal="'" + "or '".join(opt + "' " for opt in valid),
                            varname="partition", actual=partition)

    # Parse job count
    try:
        scalar_parser(n_jobs, varname="n_jobs", ntype="int_like", lims=[1, np.inf])
    except Exception as exc:
        raise exc

    # Get requested memory per job
    if mem_per_job is not None:
        if not isinstance(mem_per_job, str):
            raise SPYTypeError(mem_per_job, varname="mem_per_job", expected="string")
        if not any(szstr in mem_per_job for szstr in ["MB", "GB"]):
            lgl = "string representation of requested memory (e.g., '8GB', '12000MB')"
            raise SPYValueError(legal=lgl, varname="mem_per_job", actual=mem_per_job)

    # Query memory limit of chosen partition and ensure that `mem_per_job` is
    # set for partitions w/o limit
    idx = partition.find("GB")
    if idx > 0:
        mem_lim = int(partition[:idx]) * 1000
    else:
        if partition == "PREPO":
            mem_lim = 16000
        else:
            if mem_per_job is None:
                lgl = "explicit memory amount as required by partition '{}'"
                raise SPYValueError(legal=lgl.format(partition),
                                    varname="mem_per_job", actual=mem_per_job)
            mem_lim = np.inf

    # Consolidate requested memory with chosen partition (or assign default memory)
    if mem_per_job is None:
        mem_per_job = str(mem_lim) + "MB"
    else:
        if "MB" in mem_per_job:
            mem_req = int(mem_per_job[:mem_per_job.find("MB")])
        else:
            mem_req = int(round(float(mem_per_job[:mem_per_job.find("GB")]) * 1000))
        if mem_req > mem_lim:
            # Note: `mem_lim` is kept in megabytes, hence the "MB" suffix below
            msg = "`mem_per_job` exceeds limit of {lim:d}MB for partition {par:s}. " +\
                  "Capping memory at partition limit. "
            SPYWarning(msg.format(lim=mem_lim, par=partition))
            mem_per_job = str(int(mem_lim)) + "MB"

    # Parse requested timeout period
    try:
        scalar_parser(timeout, varname="timeout", ntype="int_like", lims=[1, np.inf])
    except Exception as exc:
        raise exc

    # Determine if cluster allocation is happening interactively
    if not isinstance(interactive, bool):
        raise SPYTypeError(interactive, varname="interactive", expected="bool")

    # Determine if a dask client was requested
    if not isinstance(start_client, bool):
        raise SPYTypeError(start_client, varname="start_client", expected="bool")

    # Set/get "hidden" kwargs
    workers_per_job = kwargs.get("workers_per_job", 1)
    try:
        scalar_parser(workers_per_job, varname="workers_per_job",
                      ntype="int_like", lims=[1, 8])
    except Exception as exc:
        raise exc

    n_cores = kwargs.get("n_cores", 1)
    try:
        scalar_parser(n_cores, varname="n_cores",
                      ntype="int_like", lims=[1, np.inf])
    except Exception as exc:
        raise exc

    slurm_wdir = kwargs.get("slurmWorkingDirectory", None)
    if slurm_wdir is None:
        usr = getpass.getuser()
        slurm_wdir = "/mnt/hpx/slurm/{usr:s}/{usr:s}_{date:s}"
        slurm_wdir = slurm_wdir.format(usr=usr,
                                       date=datetime.now().strftime('%Y%m%d-%H%M%S'))
        os.makedirs(slurm_wdir, exist_ok=True)
    else:
        try:
            io_parser(slurm_wdir, varname="slurmWorkingDirectory", isfile=False)
        except Exception as exc:
            raise exc

    # Hotfix for upgraded cluster-nodes: point to correct Python executable if working from /home
    pyExec = sys.executable
    if sys.executable.startswith("/home"):
        pyExec = "/mnt/gs" + sys.executable

    # Create `SLURMCluster` object using provided parameters
    out_files = os.path.join(slurm_wdir, "slurm-%j.out")
    cluster = SLURMCluster(cores=n_cores, memory=mem_per_job, processes=workers_per_job,
                           local_directory=slurm_wdir, queue=partition,
                           name="spyswarm", python=pyExec,
                           header_skip=["-t", "--mem"],
                           job_extra=["--output={}".format(out_files)])
                           # interface="asdf", # interface is set via `psutil.net_if_addrs()`
                           # job_extra=["--hint=nomultithread",
                           #            "--threads-per-core=1"]

    # Compute total no. of workers and up-scale cluster accordingly
    total_workers = n_jobs * workers_per_job
    cluster.scale(total_workers)

    # Fire up waiting routine to avoid premature cluster setups
    if _cluster_waiter(cluster, funcName, total_workers, timeout, interactive):
        return

    # Kill a zombie cluster in non-interactive mode
    if not interactive and _count_running_workers(cluster) == 0:
        cluster.close()
        err = "SLURM jobs could not be started within given time-out " +\
              "interval of {0:d} seconds"
        raise TimeoutError(err.format(timeout))

    # Highlight how to connect to dask performance monitor
    print(successMsg.format(name=funcName, dash=cluster.dashboard_link))

    # If client was requested, return that instead of the created cluster
    if start_client:
        return Client(cluster)
    return cluster
def test_ext(self):
    with tempfile.NamedTemporaryFile(suffix='a7f3.lfp') as f:
        io_parser(f.name, ext=['lfp', 'mua'], exists=True)
        io_parser(f.name, ext='lfp', exists=True)
        with pytest.raises(SPYValueError):
            io_parser(f.name, ext='mua', exists=True)
def test_isfile(self):
    with tempfile.NamedTemporaryFile() as f:
        io_parser(f.name, isfile=True, exists=True)
        with pytest.raises(SPYValueError):
            io_parser(f.name, isfile=False, exists=True)
def test_none(self):
    with pytest.raises(SPYTypeError):
        io_parser(None)
def _load(filename, checksum, mode, out):
    """ Local helper """

    fileInfo = filename_parser(filename)
    hdfFile = os.path.join(fileInfo["folder"], fileInfo["filename"])
    jsonFile = hdfFile + FILE_EXT["info"]

    try:
        _ = io_parser(hdfFile, varname="hdfFile", isfile=True, exists=True)
        _ = io_parser(jsonFile, varname="jsonFile", isfile=True, exists=True)
    except Exception as exc:
        raise exc

    with open(jsonFile, "r") as file:
        jsonDict = json.load(file)

    if "dataclass" not in jsonDict.keys():
        raise SPYError("Info file {} does not contain a dataclass field".format(jsonFile))

    if hasattr(spd, jsonDict["dataclass"]):
        dataclass = getattr(spd, jsonDict["dataclass"])
    else:
        raise SPYError("Unknown data class {dclass}".format(dclass=jsonDict["dataclass"]))

    requiredFields = tuple(startInfoDict.keys()) + dataclass._infoFileProperties
    for key in requiredFields:
        if key not in jsonDict.keys():
            raise SPYError("Required field {field} for {cls} not in {file}".format(
                field=key, cls=dataclass.__name__, file=jsonFile))

    # If `_hdr` is an empty list, set it to `None` to not confuse meta-functions
    hdr = jsonDict.get("_hdr")
    if isinstance(hdr, (list, np.ndarray)):
        if len(hdr) == 0:
            jsonDict["_hdr"] = None

    # FIXME: add version comparison (syncopy.__version__ vs jsonDict["_version"])

    # If wanted, perform checksum matching
    if checksum:
        hsh_msg = "hash = {hsh:s}"
        hsh = hash_file(hdfFile)
        if hsh != jsonDict["file_checksum"]:
            raise SPYValueError(legal=hsh_msg.format(hsh=jsonDict["file_checksum"]),
                                varname=os.path.basename(hdfFile),
                                actual=hsh_msg.format(hsh=hsh))

    # Parsing is done, create new or check provided object
    if out is not None:
        try:
            data_parser(out, varname="out", writable=True,
                        dataclass=jsonDict["dataclass"])
        except Exception as exc:
            raise exc
        new_out = False
    else:
        out = dataclass()
        new_out = True

    # First and foremost, assign dimensional information
    dimord = jsonDict.pop("dimord")
    out.dimord = dimord

    # Access data on disk (error checking is done by setters)
    out.mode = mode
    for datasetProperty in out._hdfFileDatasetProperties:
        setattr(out, datasetProperty, h5py.File(hdfFile, mode="r")[datasetProperty])

    # Abuse ``definetrial`` to set trial-related props
    trialdef = h5py.File(hdfFile, mode="r")["trialdefinition"][()]
    out.definetrial(trialdef)

    # Assign metadata
    for key in [prop for prop in dataclass._infoFileProperties if prop != "dimord"]:
        setattr(out, key, jsonDict[key])

    # Write `cfg` entries
    thisMethod = sys._getframe().f_code.co_name.replace("_", "")
    out.cfg = {"method": thisMethod, "files": [hdfFile, jsonFile]}

    # Write log-entry
    msg = "Read files v. {ver:s} ".format(ver=jsonDict["_version"])
    msg += "{hdf:s}\n\t" + (len(msg) + len(thisMethod) + 2) * " " + "{json:s}"
    out.log = msg.format(hdf=hdfFile, json=jsonFile)

    # Happy breakdown
    return out if new_out else None
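# `_load` pairs an HDF5 payload with a JSON info file sitting next to it. The
# sketch below (not part of Syncopy) merely peeks at that JSON companion before
# loading; the field names checked are the ones `_load` itself reads
# ("dataclass", "file_checksum", "_version", "dimord"), the ".info" extension and
# the helper name are assumptions for illustration only.
#
# import json
#
# def peek_info(hdf_path, info_ext=".info"):
#     """Return the parsed JSON info dict accompanying `hdf_path` (illustrative)."""
#     json_path = hdf_path + info_ext   # `_load` builds this via FILE_EXT["info"]
#     with open(json_path, "r") as fp:
#         info = json.load(fp)
#     # Fields `_load` relies on; the full required set also includes the
#     # dataclass-specific `_infoFileProperties`
#     for field in ("dataclass", "file_checksum", "_version", "dimord"):
#         if field not in info:
#             raise KeyError("missing required field '{}'".format(field))
#     return info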