Example No. 1
    def receive(self, sock):
        """
        Receive a message on ``sock``

        Args:
            sock:

        Returns:

        """
        msg = None
        data = b""
        recv_done = False
        recv_len = -1
        while not recv_done:
            buf = sock.recv(BUFSIZE)
            if buf is None or len(buf) == 0:
                raise Exception("socket closed")
            if recv_len == -1:
                # First chunk: a 4-byte big-endian length prefix precedes the payload.
                recv_len = struct.unpack(">I", buf[:4])[0]
                data += buf[4:]
                recv_len -= len(data)
            else:
                data += buf
                recv_len -= len(buf)
            recv_done = recv_len == 0

        if conf.is_spark_available():
            msg = cloudpickle.loads(data)
            return msg
        else:
            return data
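
A short, self-contained illustration of the framing this loop decodes: every message carries a 4-byte big-endian length prefix, followed by the payload (the payload bytes below are purely illustrative).

import struct

payload = b"hello"
framed = struct.pack(">I", len(payload)) + payload   # what the peer puts on the wire
(length,) = struct.unpack(">I", framed[:4])          # what the loop above recovers first
print(length, framed[4:])                            # 5 b'hello'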
Example No. 2
def find_spark():
    """
    Returns: SparkSession
    """
    if mc.is_spark_available():
        return SparkSession.builder.getOrCreate()
    else:
        return None
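
A minimal usage sketch (assuming find_spark is importable from the surrounding module): guard any Spark-specific work on the returned session.

spark = find_spark()
if spark is not None:
    print(spark.range(10).count())   # trivial Spark job as a smoke test
else:
    print("No Spark kernel available; falling back to plain Python.")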
Example No. 3
    def __init__(
        self,
        num_trials: int,
        optimizer: Union[str, AbstractOptimizer],
        searchspace: Searchspace,
        optimization_key: str = "Metric",
        direction: str = "max",
        es_interval: int = 1,
        es_min: int = 10,
        es_policy: Union[str, AbstractEarlyStop] = "median",
        name: str = "HPOptimization",
        description: str = "",
        hb_interval: int = 1,
        model: Union[tf.keras.Model, Type[torch.nn.Module],
                     List[Type[torch.nn.Module]]] = None,
        dataset: List[Optional[Union[str, tf.data.Dataset,
                                     torch.utils.data.Dataset]]] = None,
    ):
        """Initializes HP optimization experiment parameters.

        :param num_trials: Controls how many separate runs are conducted during the hp search.
        :param optimizer: Optimizer type for searching the hp searchspace.
        :param searchspace: A Searchspace object configuring the names, types and ranges of hps.
        :param optimization_key: Name of the metric to use for hp search evaluation.
        :param direction: Direction of optimization.
        :param es_interval: Early stopping polling frequency during an experiment run.
        :param es_min: Minimum number of experiments to conduct before starting the early stopping
            mechanism. Useful to establish a baseline for performance estimates.
        :param es_policy: Early stopping policy which formulates a rule for triggering aborts.
        :param name: Experiment name.
        :param description: A description of the experiment.
        :param hb_interval: Heartbeat interval with which the server is polling.
        :param model: The class of the model to be used in the training function.
        :param dataset: A list of dataset paths (strings), ``tf.data.Dataset`` objects, or
            ``torch.utils.data.Dataset`` objects to be used in the training function. For
            example, with separate training and test sets, pass ``[train_set, test_set]`` and
            unpack them in the training function. If the datasets are loaded inside the
            training function, this argument can be omitted.
        """
        super().__init__(name, description, hb_interval)
        if not mc.is_spark_available():
            raise NotImplementedError(
                "Hyperparameter Optimization can run only on a Spark kernel.")
        if not num_trials > 0:
            raise ValueError("Number of trials should be greater than zero!")
        self.num_trials = num_trials
        self.optimizer = optimizer
        self.optimization_key = optimization_key
        self.searchspace = searchspace
        self.direction = direction
        self.es_policy = es_policy
        self.es_interval = es_interval
        self.es_min = es_min
        self.model = model
        self.dataset = dataset
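
A hypothetical usage sketch. The class name HPConfig is a placeholder for the class whose __init__ is shown above, and the Searchspace argument format and optimizer name are assumptions; only the constructor parameters themselves come from the snippet.

from maggy import Searchspace          # assumed import path

def train_fn(learning_rate):
    # user-defined training logic; should report the metric named by optimization_key
    return 0.0

sp = Searchspace(learning_rate=("DOUBLE", [1e-4, 1e-1]))   # assumed (type, range) format

config = HPConfig(                     # placeholder name for the config class defined above
    num_trials=20,
    optimizer="randomsearch",          # assumed built-in optimizer name
    searchspace=sp,
    direction="max",
    es_min=10,
)
result = lagom(train_fn=train_fn, config=config)           # lagom from Example No. 8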
Example No. 4
    def __init__(
        self,
        module: Union[Type[torch.nn.Module], List[Type[torch.nn.Module]]],
        dataset: List[Optional[Union[str, torch.utils.data.Dataset]]] = None,
        hparams: dict = None,
        backend: str = "torch",
        mixed_precision: bool = False,
        zero_lvl: int = 0,
        deepspeed_config: dict = None,
        name: str = "torchDist",
        hb_interval: int = 1,
        description: str = "",
    ):
        """Initializes PyTorch distributed training parameters.

        :param module: A PyTorch module class or list of PyTorch module classes.
            Note that this has to be the class itself, not an instance.
        :param dataset: A list of dataset paths (strings) or ``torch.utils.data.Dataset``
            objects to be used in the training function. For example, with separate training
            and test sets, pass ``[train_set, test_set]`` and unpack them in the training
            function. If the datasets are loaded inside the training function, this argument
            can be omitted.
        :param hparams: Hyperparameters that should be used during model initialization. Primarily
            used to give an interface for hp optimization.
        :param backend: The backend framework used for training. Note that `deepspeed` needs syntax
            changes to a normal PyTorch script!
        :param mixed_precision: Used to control the use of mixed precision training in `torch`
            backend mode with model sharding (`zero_lvl` 3).
        :param zero_lvl: Sets the ZeRO optimization stages for `torch`. Note: When using `deepspeed`
            backend, overwrites `deepspeed_config` zero level!
        :param deepspeed_config: A dictionary that represents a valid deepspeed ZeRO optimizer
            config. For information on the config, see https://www.deepspeed.ai/docs/config-json/.
        :param name: Experiment name.
        :param hb_interval: Heartbeat interval with which the server is polling.
        :param description: A description of the experiment.
        """
        super().__init__(name, description, hb_interval)
        mc.initialize()
        if not mc.is_spark_available():
            raise NotImplementedError(
                "Torch Distributed Training can run only on a Spark kernel.")
        self.module = module
        self.dataset = dataset
        if backend not in self.BACKENDS:
            raise ValueError("Backend {} not supported by Maggy. "
                             "Supported types are: {}".format(backend, self.BACKENDS))
        self.backend = backend
        self.mixed_precision = mixed_precision
        self.hparams = hparams if hparams else {}
        self.zero_lvl = zero_lvl
        self.ds_config = deepspeed_config
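
A hypothetical usage sketch. The class name TorchDistributedConfig is inferred from the TfDistributedConfig import in Example No. 10 and may differ; the dataset paths are illustrative.

import torch

class MyModel(torch.nn.Module):
    def __init__(self, hidden=32):
        super().__init__()
        self.layer = torch.nn.Linear(8, hidden)

    def forward(self, x):
        return self.layer(x)

config = TorchDistributedConfig(       # placeholder/inferred class name
    module=MyModel,                    # the class itself, not an instance
    dataset=["train_data_path", "test_data_path"],   # illustrative paths or Dataset objects
    hparams={"hidden": 64},            # forwarded to MyModel.__init__
    backend="torch",
)
# The config would then be passed to lagom() together with a training function
# (see Example No. 8).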
Example No. 5
def get_partition_attempt_id():
    """Returns partitionId and attemptNumber of the task context, when invoked
    on a spark executor.
    PartitionId is ID of the RDD partition that is computed by this task.
    The first task attempt will be assigned attemptNumber = 0, and subsequent
    attempts will have increasing attempt numbers.
    Returns:
        partitionId, attemptNumber -- [description]
    """
    if mc.is_spark_available():
        task_context = TaskContext.get()
        return task_context.partitionId(), task_context.attemptNumber()
    else:
        return 0, 0
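
A small usage sketch: the pair is handy for building per-task identifiers, e.g. for log file names (the naming scheme below is illustrative, not Maggy's own).

partition_id, attempt = get_partition_attempt_id()
task_tag = "task_{}_attempt_{}".format(partition_id, attempt)
print(task_tag)   # "task_0_attempt_0" when no Spark task context exists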
Example No. 6
    def send(self, sock, msg):
        """
        Send ``msg`` to destination ``sock``.

        Args:
            sock:
            msg:

        Returns:

        """
        if conf.is_spark_available():
            data = cloudpickle.dumps(msg)
        else:
            data = msg
        buf = struct.pack(">I", len(data)) + data
        sock.sendall(buf)
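
A self-contained round trip of the same length-prefixed protocol, using plain pickle instead of cloudpickle so it runs without Spark (function names here are illustrative, not Maggy's):

import pickle
import socket
import struct

def send_msg(sock, msg):
    data = pickle.dumps(msg)
    sock.sendall(struct.pack(">I", len(data)) + data)

def recv_msg(sock):
    (length,) = struct.unpack(">I", sock.recv(4))   # simplified: assumes the prefix arrives whole
    data = b""
    while len(data) < length:
        data += sock.recv(length - len(data))
    return pickle.loads(data)

left, right = socket.socketpair()
send_msg(left, {"type": "METRIC", "value": 0.93})
print(recv_msg(right))
left.close()
right.close()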
Example No. 7
    def __init__(
        self,
        ablation_study: AblationStudy,
        ablator: Union[str, AbstractAblator] = "loco",
        direction: str = "max",
        name: str = "ablationStudy",
        description: str = "",
        hb_interval: int = 1,
        model: tf.keras.Model = None,
        dataset: List[Union[str, tf.data.Dataset]] = None,
    ):
        """Initializes ablation study experiment parameters.

        :param ablation_study: Ablation study object that defines the entry point into the
            experiment.
        :param ablator: An instance of `AbstractAblator` or a supported ablator name that controls
            the manner in which parts of the model are ablated.
        :param direction: Optimization direction to evaluate the experiments.
        :param name: Experiment name.
        :param description: A description of the experiment.
        :param hb_interval: Heartbeat interval with which the server is polling.
        :param model: The class of the model to be used in the training function.
        :param dataset: A list of dataset paths (strings) or ``tf.data.Dataset`` objects to be
            used in the training function. For example, with separate training and test sets,
            pass ``[train_set, test_set]`` and unpack them in the training function. If the
            datasets are loaded inside the training function, this argument can be omitted.
        """
        super().__init__(name, description, hb_interval)
        mc.initialize()
        if not mc.is_spark_available():
            raise NotImplementedError("Ablation Study can run only on a Spark kernel.")
        self.ablator = ablator
        self.ablation_study = ablation_study
        self.direction = direction
        self.model = model
        self.dataset = dataset
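
A hypothetical usage sketch. AblationConfig is a placeholder name for the class whose __init__ is shown above, and the AblationStudy constructor arguments and fluent include() call are assumptions about Maggy's ablation API.

from maggy.ablation import AblationStudy   # assumed import path

def train_fn():
    # user-defined training logic for each ablation trial
    return 0.0

study = AblationStudy("train_dataset", label_name="label")   # argument names assumed
study.features.include("feature_a", "feature_b")             # assumed fluent API

config = AblationConfig(    # placeholder name for the config class defined above
    ablation_study=study,
    ablator="loco",         # leave-one-component-out, the default shown above
    direction="max",
)
result = lagom(train_fn=train_fn, config=config)   # lagom from Example No. 8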
Example No. 8
def lagom(train_fn: Callable, config: LagomConfig = None) -> dict:
    """Entry point for Maggy experiment, this function passes the parameters to the lagom function
    depending whether the kernel is pyspark or python.
    **lagom** is a Swedish word meaning "just the right amount".

    :param train_fn: User defined experiment containing the model training.
    :param config: An experiment configuration. For more information, see config.

    :returns: The experiment results as a dict.
    """
    from maggy.experiment import experiment_python
    from maggy.experiment import experiment_pyspark
    from maggy.core import config as maggyconfig

    if config is None:
        config = BaseConfig(
            name="maggy_experiment",
            description="experiment without config object",
            hb_interval=1,
        )
    if maggyconfig.is_spark_available():
        return experiment_pyspark.lagom(train_fn, config)
    else:
        return experiment_python.lagom(train_fn, config)
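
A minimal usage sketch with an explicit BaseConfig (the import path is taken from Example No. 9; train_fn and the config values are illustrative):

from maggy.config import BaseConfig

def train_fn():
    # user-defined training logic; runs on the Python or PySpark kernel
    return 0.0

result = lagom(
    train_fn,
    BaseConfig(name="demo", description="minimal example", hb_interval=1),
)
print(result)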
Example No. 9
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
#
import json
import os
import time
from pickle import PicklingError
from typing import Callable, Type

from maggy import util
from maggy.core import config as mc
from maggy.core.environment.singleton import EnvSing
from maggy.core.rpc import Server

if mc.is_spark_available():
    from maggy.core.experiment_driver.spark_driver import Driver
else:
    from maggy.core.experiment_driver.python_driver import Driver
from maggy.core.executors.base_executor import base_executor_fn
from maggy.config import BaseConfig


class BaseDriver(Driver):
    """Driver for base experiments.

    Registers the workers on an RPC server, ensures proper configuration and
    logging, and accumulates final results.
    """
    def __init__(self, config: BaseConfig, app_id: int, run_id: int):
        """Initializes the server, but does not start it yet.
Example No. 10
#

from __future__ import annotations

import secrets
import select
import socket
import struct
import threading
import time
import typing
from typing import Any

import maggy.core.config as conf

if conf.is_spark_available():
    from pyspark import cloudpickle

from maggy.core.environment.singleton import EnvSing
from maggy.config import TfDistributedConfig
from maggy.trial import Trial

if typing.TYPE_CHECKING:  # Avoid circular import error.
    from maggy.core.experiment_driver import Driver

BUFSIZE = 1024 * 2
MAX_RETRIES = 3
SERVER_HOST_PORT = None


class Reservations(object):