Example #1
    def test_exists(self):
        io_parser(self.existingFolder,
                  varname="existingFolder",
                  isfile=False,
                  exists=True)
        with pytest.raises(SPYIOError):
            io_parser(self.existingFolder,
                      varname="existingFolder",
                      isfile=False,
                      exists=False)

        io_parser(self.nonExistingFolder,
                  varname="nonExistingFolder",
                  exists=False)

        with pytest.raises(SPYIOError):
            io_parser(self.nonExistingFolder,
                      varname="nonExistingFolder",
                      exists=True)
Example #2
def esi_cluster_setup(partition="8GBS", n_jobs=2, mem_per_job=None,
                      timeout=180, interactive=True, start_client=True,
                      **kwargs):
    """
    Start a distributed Dask cluster of parallel processing workers using SLURM 
    (or local multi-processing)
    
    Parameters
    ----------
    partition : str
        Name of SLURM partition/queue to use
    n_jobs : int
        Number of jobs to spawn
    mem_per_job : None or str
        Memory booking for each job. Can be specified either in megabytes 
        (e.g., ``mem_per_job = "1500MB"``) or gigabytes (e.g., ``mem_per_job = "2GB"``). 
        If `mem_per_job` is `None`, a sane default value is inferred from the
        chosen queue, e.g., for ``partition = "8GBS"`` `mem_per_job` is 
        automatically set to the allowed maximum of `'8GB'`. However, even in
        queues with guaranteed memory bookings, it is possible to allocate less
        memory than the allowed maximum per job so as to spawn numerous low-memory 
        jobs. See Examples for details. 
    timeout : int
        Number of seconds to wait for requested jobs to start up. 
    interactive : bool
        If `True`, the user is queried for input in case not all requested jobs 
        could be started within the waiting period set by `timeout`. 
        If `interactive` is `False` and the jobs could not be started
        within `timeout` seconds, a `TimeoutError` is raised. 
    start_client : bool
        If `True`, a distributed computing client is launched and attached to
        the workers. If `start_client` is `False`, only a distributed 
        computing cluster is started to which compute-clients can connect. 
    **kwargs : dict
        Additional keyword arguments can be used to control job-submission details. 
        
    Returns
    -------
    proc : object
        A distributed computing client (if ``start_client = True``) or 
        a distributed computing cluster (otherwise). 

    Examples
    --------
    The following command launches 10 SLURM jobs with 2 gigabytes memory each 
    in the `8GBS` partition
    
    >>> spy.esi_cluster_setup(n_jobs=10, partition="8GBS", mem_per_job="2GB") 
    
    If you want to access properties of the created distributed computing client, 
    assign an explicit return quantity, i.e., 
    
    >>> client = spy.esi_cluster_setup(n_jobs=10, partition="8GBS", mem_per_job="2GB") 
    
    The underlying distributed computing cluster can be accessed using
    
    >>> client.cluster
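
    Additional job-submission details can be tuned via ``**kwargs``. As handled 
    in the function body below, the keywords ``workers_per_job``, ``n_cores`` and 
    ``slurmWorkingDirectory`` are currently recognized. For instance, 

    >>> spy.esi_cluster_setup(n_jobs=4, partition="8GBS", workers_per_job=2, n_cores=2)

    launches 4 SLURM jobs, each providing two cores shared by two Dask worker 
    processes. 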
    
    Notes
    -----
    Syncopy's parallel computing engine relies on the concurrent processing library
    `Dask <https://docs.dask.org/en/latest/>`_. Thus, the distributed computing
    clients used by Syncopy are in fact instances of :class:`dask.distributed.Client`. 
    This function specifically acts as a wrapper for :class:`dask_jobqueue.SLURMCluster`. 
    Users familiar with Dask in general, and its distributed scheduler and cluster 
    objects in particular, may leverage Dask's entire API to fine-tune parallel 
    processing jobs to their liking. 
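
    As an illustration (any method of :class:`dask.distributed.Client` applies here, 
    this is merely one possibility), the workers currently attached to the client 
    can be counted via 

    >>> client = spy.esi_cluster_setup(n_jobs=10, partition="8GBS")
    >>> len(client.scheduler_info()["workers"])   # no. of workers that are up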
    
    See also
    --------
    cluster_cleanup : remove dangling parallel processing job-clusters
    """
    
    # For later reference: dynamically fetch name of current function
    funcName = "Syncopy <{}>".format(inspect.currentframe().f_code.co_name)
    
    # Be optimistic: prepare success message
    successMsg = "{name:s} Cluster dashboard accessible at {dash:s}"

    # Retrieve all partitions currently available in SLURM
    out, err = subprocess.Popen("sinfo -h -o %P",
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                text=True, shell=True).communicate()
    if len(err) > 0:
        
        # SLURM is not installed, either allocate `LocalCluster` or just leave
        if "sinfo: not found" in err:
            if interactive:
                msg = "{name:s} SLURM does not seem to be installed on this machine " +\
                    "({host:s}). Do you want to start a local multi-processing " +\
                    "computing client instead? "
                startLocal = user_yesno(msg.format(name=funcName, host=socket.gethostname()), 
                                        default="no")
            else:
                startLocal = True
            if startLocal:
                client = Client()
                successMsg = "{name:s} Local parallel computing client ready. \n" + successMsg
                print(successMsg.format(name=funcName, dash=client.cluster.dashboard_link))
                if start_client:
                    return client
                return client.cluster
            return 

        # SLURM is installed, but something's wrong        
        msg = "SLURM queuing system from node {node:s}. " +\
              "Original error message below:\n{error:s}"
        raise SPYIOError(msg.format(node=socket.gethostname(), error=err))
    options = out.split()

    # Make sure we're in a valid partition (exclude IT partitions from output message)
    if partition not in options:
        valid = list(set(options).difference(["DEV", "PPC"]))
        raise SPYValueError(legal="'" + "or '".join(opt + "' " for opt in valid),
                            varname="partition", actual=partition)

    # Parse job count
    try:
        scalar_parser(n_jobs, varname="n_jobs", ntype="int_like", lims=[1, np.inf])
    except Exception as exc:
        raise exc

    # Get requested memory per job
    if mem_per_job is not None:
        if not isinstance(mem_per_job, str):
            raise SPYTypeError(mem_per_job, varname="mem_per_job", expected="string")
        if not any(szstr in mem_per_job for szstr in ["MB", "GB"]):
            lgl = "string representation of requested memory (e.g., '8GB', '12000MB')"
            raise SPYValueError(legal=lgl, varname="mem_per_job", actual=mem_per_job)

    # Query memory limit of chosen partition and ensure that `mem_per_job` is
    # set for partitions w/o limit
    idx = partition.find("GB")
    if idx > 0:
        mem_lim = int(partition[:idx]) * 1000
    else:
        if partition == "PREPO":
            mem_lim = 16000
        else:
            if mem_per_job is None:
                lgl = "explicit memory amount as required by partition '{}'"
                raise SPYValueError(legal=lgl.format(partition),
                                    varname="mem_per_job", actual=mem_per_job)
            mem_lim = np.inf

    # Consolidate requested memory with chosen partition (or assign default memory)
    if mem_per_job is None:
        mem_per_job = str(mem_lim) + "MB"
    else:
        if "MB" in mem_per_job:
            mem_req = int(mem_per_job[:mem_per_job.find("MB")])
        else:
            mem_req = int(round(float(mem_per_job[:mem_per_job.find("GB")]) * 1000))
        if mem_req > mem_lim:
            msg = "`mem_per_job` exceeds limit of {lim:d}MB for partition {par:s}. " +\
                "Capping memory at partition limit. "
            SPYWarning(msg.format(lim=mem_lim, par=partition))
            mem_per_job = str(int(mem_lim)) + "MB"

    # Parse requested timeout period
    try:
        scalar_parser(timeout, varname="timeout", ntype="int_like", lims=[1, np.inf])
    except Exception as exc:
        raise exc

    # Determine if cluster allocation is happening interactively
    if not isinstance(interactive, bool):
        raise SPYTypeError(interactive, varname="interactive", expected="bool")

    # Determine if a dask client was requested
    if not isinstance(start_client, bool):
        raise SPYTypeError(start_client, varname="start_client", expected="bool")

    # Set/get "hidden" kwargs
    workers_per_job = kwargs.get("workers_per_job", 1)
    try:
        scalar_parser(workers_per_job, varname="workers_per_job",
                      ntype="int_like", lims=[1, 8])
    except Exception as exc:
        raise exc

    n_cores = kwargs.get("n_cores", 1)
    try:
        scalar_parser(n_cores, varname="n_cores",
                      ntype="int_like", lims=[1, np.inf])
    except Exception as exc:
        raise exc

    slurm_wdir = kwargs.get("slurmWorkingDirectory", None)
    if slurm_wdir is None:
        usr = getpass.getuser()
        slurm_wdir = "/mnt/hpx/slurm/{usr:s}/{usr:s}_{date:s}"
        slurm_wdir = slurm_wdir.format(usr=usr,
                                       date=datetime.now().strftime('%Y%m%d-%H%M%S'))
        os.makedirs(slurm_wdir, exist_ok=True)
    else:
        try:
            io_parser(slurm_wdir, varname="slurmWorkingDirectory", isfile=False)
        except Exception as exc:
            raise exc
        
    # Hotfix for upgraded cluster-nodes: point to correct Python executable if working from /home
    pyExec = sys.executable
    if sys.executable.startswith("/home"):
        pyExec = "/mnt/gs" + sys.executable
        
    # Create `SLURMCluster` object using provided parameters
    out_files = os.path.join(slurm_wdir, "slurm-%j.out")
    cluster = SLURMCluster(cores=n_cores,
                           memory=mem_per_job,
                           processes=workers_per_job,
                           local_directory=slurm_wdir,
                           queue=partition,
                           name="spyswarm",
                           python=pyExec,
                           header_skip=["-t", "--mem"],
                           job_extra=["--output={}".format(out_files)])
                           # interface="asdf", # interface is set via `psutil.net_if_addrs()`
                           # job_extra=["--hint=nomultithread",
                           #            "--threads-per-core=1"]
                           
    # Compute total no. of workers and up-scale cluster accordingly
    total_workers = n_jobs * workers_per_job
    cluster.scale(total_workers)

    # Fire up waiting routine to avoid premature cluster setups
    if _cluster_waiter(cluster, funcName, total_workers, timeout, interactive):
        return
    
    # Kill a zombie cluster in non-interactive mode
    if not interactive and _count_running_workers(cluster) == 0:
        cluster.close()
        err = "SLURM jobs could not be started within given time-out " +\
              "interval of {0:d} seconds"
        raise TimeoutError(err.format(timeout))
    
    # Highlight how to connect to dask performance monitor
    print(successMsg.format(name=funcName, dash=cluster.dashboard_link))

    # If client was requested, return that instead of the created cluster
    if start_client:
        return Client(cluster)
    return cluster
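
The helpers `_cluster_waiter` and `_count_running_workers` used above are Syncopy-internal and not part of this example. A minimal sketch of the latter, relying only on Dask's standard cluster interface (an illustration, not Syncopy's actual implementation), could look as follows:

def _count_running_workers(cluster):
    # NOTE: sketch only - Syncopy's real helper may differ. Workers that have
    # successfully registered with the scheduler appear in the cluster's
    # `scheduler_info` dictionary (standard Dask behavior).
    return len(cluster.scheduler_info.get("workers", {}))
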
Example #3
    def test_ext(self):
        with tempfile.NamedTemporaryFile(suffix='a7f3.lfp') as f:
            io_parser(f.name, ext=['lfp', 'mua'], exists=True)
            io_parser(f.name, ext='lfp', exists=True)
            with pytest.raises(SPYValueError):
                io_parser(f.name, ext='mua', exists=True)
Example #4
    def test_isfile(self):
        with tempfile.NamedTemporaryFile() as f:
            io_parser(f.name, isfile=True, exists=True)
            with pytest.raises(SPYValueError):
                io_parser(f.name, isfile=False, exists=True)
Example #5
    def test_none(self):
        with pytest.raises(SPYTypeError):
            io_parser(None)
Example #6
def _load(filename, checksum, mode, out):
    """
    Local helper
    """

    fileInfo = filename_parser(filename)
    hdfFile = os.path.join(fileInfo["folder"], fileInfo["filename"])
    jsonFile = hdfFile + FILE_EXT["info"]

    try:
        _ = io_parser(hdfFile, varname="hdfFile", isfile=True, exists=True)
        _ = io_parser(jsonFile, varname="jsonFile", isfile=True, exists=True)
    except Exception as exc:
        raise exc

    with open(jsonFile, "r") as file:
        jsonDict = json.load(file)

    if "dataclass" not in jsonDict.keys():
        raise SPYError(
            "Info file {} does not contain a dataclass field".format(jsonFile))

    if hasattr(spd, jsonDict["dataclass"]):
        dataclass = getattr(spd, jsonDict["dataclass"])
    else:
        raise SPYError("Unknown data class {class}".format(
            jsonDict["dataclass"]))

    requiredFields = tuple(
        startInfoDict.keys()) + dataclass._infoFileProperties

    for key in requiredFields:
        if key not in jsonDict.keys():
            raise SPYError(
                "Required field {field} for {cls} not in {file}".format(
                    field=key, cls=dataclass.__name__, file=jsonFile))

    # If `_hdr` is an empty list, set it to `None` to not confuse meta-functions
    hdr = jsonDict.get("_hdr")
    if isinstance(hdr, (list, np.ndarray)):
        if len(hdr) == 0:
            jsonDict["_hdr"] = None

    # FIXME: add version comparison (syncopy.__version__ vs jsonDict["_version"])

    # If wanted, perform checksum matching
    if checksum:
        hsh_msg = "hash = {hsh:s}"
        hsh = hash_file(hdfFile)
        if hsh != jsonDict["file_checksum"]:
            raise SPYValueError(
                legal=hsh_msg.format(hsh=jsonDict["file_checksum"]),
                varname=os.path.basename(hdfFile),
                actual=hsh_msg.format(hsh=hsh))

    # Parsing is done, create new or check provided object
    if out is not None:
        try:
            data_parser(out,
                        varname="out",
                        writable=True,
                        dataclass=jsonDict["dataclass"])
        except Exception as exc:
            raise exc
        new_out = False
    else:
        out = dataclass()
        new_out = True

    # First and foremost, assign dimensional information
    dimord = jsonDict.pop("dimord")
    out.dimord = dimord

    # Access data on disk (error checking is done by setters)
    out.mode = mode
    for datasetProperty in out._hdfFileDatasetProperties:
        setattr(out, datasetProperty,
                h5py.File(hdfFile, mode="r")[datasetProperty])

    # Abuse ``definetrial`` to set trial-related props
    trialdef = h5py.File(hdfFile, mode="r")["trialdefinition"][()]
    out.definetrial(trialdef)

    # Assign metadata
    for key in [
            prop for prop in dataclass._infoFileProperties if prop != "dimord"
    ]:
        setattr(out, key, jsonDict[key])

    # Write `cfg` entries
    thisMethod = sys._getframe().f_code.co_name.replace("_", "")
    out.cfg = {"method": thisMethod, "files": [hdfFile, jsonFile]}

    # Write log-entry
    msg = "Read files v. {ver:s} ".format(ver=jsonDict["_version"])
    msg += "{hdf:s}\n\t" + (len(msg) + len(thisMethod) + 2) * " " + "{json:s}"
    out.log = msg.format(hdf=hdfFile, json=jsonFile)

    # Happy breakdown
    return out if new_out else None
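
The checksum comparison above relies on Syncopy's `hash_file` utility, which is not shown in this example. A minimal sketch of such a routine, assuming a plain hex digest over the raw file contents (an illustration, not Syncopy's actual implementation), could look as follows:

import hashlib

def hash_file(fname, blocksize=65536):
    # NOTE: sketch only - the hashing algorithm used by Syncopy may differ.
    # Read the file in chunks so large HDF5 containers need not fit in memory.
    hsh = hashlib.sha1()
    with open(fname, "rb") as fle:
        for chunk in iter(lambda: fle.read(blocksize), b""):
            hsh.update(chunk)
    return hsh.hexdigest()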