def receive(self, sock):
    """Receive a length-prefixed message on ``sock``.

    Args:
        sock: The socket to read from.

    Returns:
        The unpickled message if Spark is available, otherwise the raw bytes.
    """
    msg = None
    data = b""
    recv_done = False
    recv_len = -1
    while not recv_done:
        buf = sock.recv(BUFSIZE)
        if buf is None or len(buf) == 0:
            raise Exception("socket closed")
        if recv_len == -1:
            # First chunk: the 4-byte big-endian prefix carries the payload length.
            recv_len = struct.unpack(">I", buf[:4])[0]
            data += buf[4:]
            recv_len -= len(data)
        else:
            data += buf
            recv_len -= len(buf)
        recv_done = recv_len == 0
    if conf.is_spark_available():
        msg = cloudpickle.loads(data)
        return msg
    else:
        return data
def find_spark():
    """Returns the active SparkSession, if Spark is available.

    Returns:
        SparkSession, or None if Spark is not available.
    """
    if mc.is_spark_available():
        return SparkSession.builder.getOrCreate()
    else:
        return None
def __init__(
    self,
    num_trials: int,
    optimizer: Union[str, AbstractOptimizer],
    searchspace: Searchspace,
    optimization_key: str = "Metric",
    direction: str = "max",
    es_interval: int = 1,
    es_min: int = 10,
    es_policy: Union[str, AbstractEarlyStop] = "median",
    name: str = "HPOptimization",
    description: str = "",
    hb_interval: int = 1,
    model: Union[
        tf.keras.Model, Type[torch.nn.Module], List[Type[torch.nn.Module]]
    ] = None,
    dataset: List[
        Optional[Union[str, tf.data.Dataset, torch.utils.data.Dataset]]
    ] = None,
):
    """Initializes HP optimization experiment parameters.

    :param num_trials: Controls how many separate runs are conducted during the hp search.
    :param optimizer: Optimizer type for searching the hp searchspace.
    :param searchspace: A Searchspace object configuring the names, types and ranges of hps.
    :param optimization_key: Name of the metric to use for hp search evaluation.
    :param direction: Direction of optimization.
    :param es_interval: Early stopping polling frequency during an experiment run.
    :param es_min: Minimum number of experiments to conduct before starting the early stopping
        mechanism. Useful to establish a baseline for performance estimates.
    :param es_policy: Early stopping policy which formulates a rule for triggering aborts.
    :param name: Experiment name.
    :param description: A description of the experiment.
    :param hb_interval: Heartbeat interval with which the server is polling.
    :param model: The class of the model to be used in the training function.
    :param dataset: A list of dataset paths as strings, or a list of tf.data.Dataset or
        torch.utils.data.Dataset objects. These datasets represent the ones you are going to
        use in the training function. For example, if you have 2 datasets for training and
        testing, pass an array with [train_set, test_set] and extract them in the training
        function. If you want to load the set inside the training function, this can be
        disregarded.
    """
    super().__init__(name, description, hb_interval)
    if not mc.is_spark_available():
        raise NotImplementedError(
            "Hyperparameter Optimization can run only on a Spark kernel."
        )
    if not num_trials > 0:
        raise ValueError("Number of trials should be greater than zero!")
    self.num_trials = num_trials
    self.optimizer = optimizer
    self.optimization_key = optimization_key
    self.searchspace = searchspace
    self.direction = direction
    self.es_policy = es_policy
    self.es_interval = es_interval
    self.es_min = es_min
    self.model = model
    self.dataset = dataset
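# --- Minimal usage sketch (illustrative, not part of the library code) ---
# Assumes the class above is exported as maggy.config.HPOptimization, that a
# Searchspace is built from name=(type, bounds) keyword pairs, and that
# `train_fn` is a user-defined training function; must run on a Spark kernel.
#
#   from maggy import experiment, Searchspace
#   from maggy.config import HPOptimization
#
#   sp = Searchspace(kernel=("INTEGER", [2, 8]), dropout=("DOUBLE", [0.01, 0.5]))
#
#   config = HPOptimization(
#       num_trials=10,
#       optimizer="randomsearch",
#       searchspace=sp,
#       direction="max",
#       es_policy="median",
#       name="hp_demo",
#   )
#
#   result = experiment.lagom(train_fn=train_fn, config=config)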
def __init__(
    self,
    module: Union[Type[torch.nn.Module], List[Type[torch.nn.Module]]],
    dataset: List[Optional[Union[str, torch.utils.data.Dataset]]] = None,
    hparams: dict = None,
    backend: str = "torch",
    mixed_precision: bool = False,
    zero_lvl: int = 0,
    deepspeed_config: dict = None,
    name: str = "torchDist",
    hb_interval: int = 1,
    description: str = "",
):
    """Initializes PyTorch distributed training parameters.

    :param module: A PyTorch module class or list of PyTorch module classes. Note that this
        has to be the class itself, not an instance.
    :param dataset: A list of dataset paths as strings, or a list of torch.utils.data.Dataset
        objects. These datasets represent the ones you are going to use in the training
        function. For example, if you have 2 datasets for training and testing, pass an array
        with [train_set, test_set] and extract them in the training function. If you want to
        load the set inside the training function, this can be disregarded.
    :param hparams: Hyperparameters that should be used during model initialization. Primarily
        used to give an interface for hp optimization.
    :param backend: The backend framework used for training. Note that `deepspeed` needs
        syntax changes to a normal PyTorch script!
    :param mixed_precision: Used to control the use of mixed precision training in `torch`
        backend mode with model sharding (`zero_lvl` 3).
    :param zero_lvl: Sets the ZeRO optimization stages for `torch`. Note: When using
        `deepspeed` backend, overwrites the `deepspeed_config` zero level!
    :param deepspeed_config: A dictionary that represents a valid deepspeed ZeRO optimizer
        config. For information on the config, see https://www.deepspeed.ai/docs/config-json/.
    :param name: Experiment name.
    :param hb_interval: Heartbeat interval with which the server is polling.
    :param description: A description of the experiment.
    """
    super().__init__(name, description, hb_interval)
    mc.initialize()
    if not mc.is_spark_available():
        raise NotImplementedError(
            "Torch Distributed Training can run only on a Spark kernel."
        )
    self.module = module
    self.dataset = dataset
    if backend not in self.BACKENDS:
        raise ValueError(
            "Backend {} not supported by Maggy. Supported types are: {}".format(
                backend, self.BACKENDS
            )
        )
    self.backend = backend
    self.mixed_precision = mixed_precision
    self.hparams = hparams if hparams else {}
    self.zero_lvl = zero_lvl
    self.ds_config = deepspeed_config
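# --- Minimal usage sketch (illustrative, not part of the library code) ---
# Assumes the class above is exported as maggy.config.TorchDistributedConfig and
# that MyModel is a user-defined torch.nn.Module subclass (passed as the class
# itself, not an instance); must run on a Spark kernel.
#
#   import torch
#   from maggy import experiment
#   from maggy.config import TorchDistributedConfig
#
#   class MyModel(torch.nn.Module):
#       def __init__(self, hidden=32):
#           super().__init__()
#           self.net = torch.nn.Sequential(
#               torch.nn.Linear(10, hidden), torch.nn.ReLU(), torch.nn.Linear(hidden, 1)
#           )
#
#       def forward(self, x):
#           return self.net(x)
#
#   config = TorchDistributedConfig(
#       module=MyModel,               # the class, not MyModel()
#       hparams={"hidden": 64},
#       backend="torch",
#       zero_lvl=0,
#       name="torch_dist_demo",
#   )
#
#   result = experiment.lagom(train_fn=train_fn, config=config)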
def get_partition_attempt_id():
    """Returns the partitionId and attemptNumber of the task context, when invoked
    on a Spark executor.

    PartitionId is the ID of the RDD partition that is computed by this task. The first
    task attempt will be assigned attemptNumber = 0, and subsequent attempts will have
    increasing attempt numbers. Outside of Spark, both values default to 0.

    Returns:
        partitionId, attemptNumber
    """
    if mc.is_spark_available():
        task_context = TaskContext.get()
        return task_context.partitionId(), task_context.attemptNumber()
    else:
        return 0, 0
def send(self, sock, msg):
    """Send ``msg`` to destination ``sock``, prefixed with its length.

    Args:
        sock: The destination socket.
        msg: The message to send; pickled with cloudpickle if Spark is available.
    """
    if conf.is_spark_available():
        data = cloudpickle.dumps(msg)
    else:
        data = msg
    buf = struct.pack(">I", len(data)) + data
    sock.sendall(buf)
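# --- Standalone sketch of the wire format (illustrative, not library code) ---
# send()/receive() above frame each message as a 4-byte big-endian length prefix
# followed by the payload bytes. This self-contained example mirrors that framing
# over a local socket pair with plain bytes (no Spark, no cloudpickle).
#
#   import socket
#   import struct
#
#   left, right = socket.socketpair()
#
#   payload = b"hello maggy"
#   left.sendall(struct.pack(">I", len(payload)) + payload)
#
#   # Read the 4-byte prefix, then keep reading until the payload is complete.
#   header = b""
#   while len(header) < 4:
#       header += right.recv(4 - len(header))
#   (length,) = struct.unpack(">I", header)
#   received = b""
#   while len(received) < length:
#       chunk = right.recv(1024)
#       if not chunk:
#           raise Exception("socket closed")
#       received += chunk
#
#   assert received == payload
#   left.close()
#   right.close()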
def __init__(
    self,
    ablation_study: AblationStudy,
    ablator: Union[str, AbstractAblator] = "loco",
    direction: str = "max",
    name: str = "ablationStudy",
    description: str = "",
    hb_interval: int = 1,
    model: tf.keras.Model = None,
    dataset: List[Union[str, tf.data.Dataset]] = None,
):
    """Initializes ablation study experiment parameters.

    :param ablation_study: Ablation study object that defines the entry point into the
        experiment.
    :param ablator: An instance of `AbstractAblator` or a supported ablator name that controls
        the manner in which parts of the model are ablated.
    :param direction: Optimization direction to evaluate the experiments.
    :param name: Experiment name.
    :param description: A description of the experiment.
    :param hb_interval: Heartbeat interval with which the server is polling.
    :param model: The class of the model to be used in the training function.
    :param dataset: A list of dataset paths as strings, or a list of tf.data.Dataset objects.
        These datasets represent the ones you are going to use in the training function. For
        example, if you have 2 datasets for training and testing, pass an array with
        [train_set, test_set] and extract them in the training function. If you want to load
        the set inside the training function, this can be disregarded.
    """
    super().__init__(name, description, hb_interval)
    mc.initialize()
    if not mc.is_spark_available():
        raise NotImplementedError("Ablation Study can run only on a Spark kernel.")
    self.ablator = ablator
    self.ablation_study = ablation_study
    self.direction = direction
    self.model = model
    self.dataset = dataset
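# --- Minimal usage sketch (illustrative, not part of the library code) ---
# Assumes the class above is exported as maggy.config.AblationConfig, and that
# AblationStudy is constructed with a training dataset name/version and a label
# column and exposes feature/layer include() helpers, as in the Maggy ablation
# examples; the dataset, feature, and layer names are hypothetical.
#
#   from maggy import experiment
#   from maggy.ablation import AblationStudy
#   from maggy.config import AblationConfig
#
#   ablation_study = AblationStudy("titanic_train_dataset", 1, label_name="survived")
#   ablation_study.features.include("pclass", "fare")    # ablate these input features
#   ablation_study.model.layers.include("my_dense_two")  # ablate this model layer
#
#   config = AblationConfig(
#       ablation_study=ablation_study,
#       ablator="loco",
#       direction="max",
#       name="ablation_demo",
#   )
#
#   result = experiment.lagom(train_fn=train_fn, config=config)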
def lagom(train_fn: Callable, config: LagomConfig = None) -> dict:
    """Entry point for a Maggy experiment. This function passes the parameters to the
    lagom function of either the pyspark or the python backend, depending on which
    kernel is available.

    **lagom** is a Swedish word meaning "just the right amount".

    :param train_fn: User defined experiment containing the model training.
    :param config: An experiment configuration. For more information, see config.

    :returns: The experiment results as a dict.
    """
    from maggy.experiment import experiment_python
    from maggy.experiment import experiment_pyspark
    from maggy.core import config as maggyconfig

    if config is None:
        config = BaseConfig(
            name="maggy_experiment",
            description="experiment without config object",
            hb_interval=1,
        )
    if maggyconfig.is_spark_available():
        return experiment_pyspark.lagom(train_fn, config)
    else:
        return experiment_python.lagom(train_fn, config)
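# --- Minimal usage sketch (illustrative, not part of the library code) ---
# Assumes lagom() is exposed as maggy.experiment.lagom and that a zero-argument
# training function returning a metric is sufficient for a base experiment run
# without a config object; the metric value here is a placeholder.
#
#   from maggy import experiment
#
#   def train_fn():
#       # ... build model, train, evaluate ...
#       accuracy = 0.9  # placeholder metric
#       return accuracy
#
#   # Without a config object, a default BaseConfig is created internally.
#   result = experiment.lagom(train_fn=train_fn)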
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import json
import os
import time
from pickle import PicklingError
from typing import Callable, Type

from maggy import util
from maggy.core import config as mc
from maggy.core.environment.singleton import EnvSing
from maggy.core.rpc import Server

if mc.is_spark_available():
    from maggy.core.experiment_driver.spark_driver import Driver
else:
    from maggy.core.experiment_driver.python_driver import Driver
from maggy.core.executors.base_executor import base_executor_fn
from maggy.config import BaseConfig


class BaseDriver(Driver):
    """Driver for base experiments. Registers the workers on an RPC server, ensures
    proper configuration and logging, and accumulates final results.
    """

    def __init__(self, config: BaseConfig, app_id: int, run_id: int):
        """Initializes the server, but does not start it yet.
#

from __future__ import annotations

import secrets
import select
import socket
import struct
import threading
import time
import typing
from typing import Any

import maggy.core.config as conf

if conf.is_spark_available():
    from pyspark import cloudpickle
from maggy.core.environment.singleton import EnvSing
from maggy.config import TfDistributedConfig
from maggy.trial import Trial

if typing.TYPE_CHECKING:  # Avoid circular import error.
    from maggy.core.experiment_driver import Driver

BUFSIZE = 1024 * 2
MAX_RETRIES = 3
SERVER_HOST_PORT = None


class Reservations(object):