Example #1
0
    def connect_host(self, server_sock, server_host_port, exp_driver):
        """Bind the server socket and put it into listening mode.

        When no address is supplied, the socket is bound to an OS-assigned
        port, and the driver registers itself with the Hopsworks REST API
        (best effort — failures are logged, never raised). When an address
        is supplied, the socket is simply bound to it and no registration
        is attempted.

        :param server_sock: Socket to bind and listen on.
        :param server_host_port: Optional ``(host, port)`` tuple; falsy to
            auto-assign a port and register with Hopsworks.
        :param exp_driver: Experiment driver providing ``_secret`` and a
            ``log`` method.
        :returns: Tuple of the listening socket and its ``(host, port)``.
        """
        if server_host_port:
            server_sock.bind(server_host_port)
        else:
            server_sock.bind(("", 0))
            # hostname may not be resolvable but IP address probably will be
            host = self.get_ip_address()
            port = server_sock.getsockname()[1]
            server_host_port = (host, port)
            # register this driver with Hopsworks
            spark = util.find_spark()
            if spark is None:
                app_id = self.APP_ID
                util.set_app_id(app_id)
            else:
                app_id = str(spark.sparkContext.applicationId)

            consts = self.get_constants()
            slash = consts.DELIMITERS.SLASH_DELIMITER
            # "" as the first element yields the leading slash.
            resource_url = slash.join(
                ["", consts.REST_CONFIG.HOPSWORKS_REST_RESOURCE,
                 "maggy", "drivers"])
            payload = json.dumps({
                "hostIp": host,
                "port": port,
                "appId": app_id,
                "secret": exp_driver._secret,
            })
            headers = {
                consts.HTTP_CONFIG.HTTP_CONTENT_TYPE:
                consts.HTTP_CONFIG.HTTP_APPLICATION_JSON
            }

            try:
                response = self.send_request(
                    consts.HTTP_CONFIG.HTTP_POST,
                    resource_url,
                    data=payload,
                    headers=headers)

                # Anything outside the 2xx family counts as failure.
                if (response.status_code // 100) != 2:
                    print("No connection to Hopsworks for logging.")
                    exp_driver.log("No connection to Hopsworks for logging.")
            except Exception as err:
                print("Connection failed to Hopsworks. No logging.")
                exp_driver.log(err)
                exp_driver.log("Connection failed to Hopsworks. No logging.")

        server_sock.listen(10)

        return server_sock, server_host_port
Example #2
0
def lagom(train_fn: Callable, config: LagomConfig) -> dict:
    """Launches a maggy experiment, which depending on 'config' can either
    be a hyperparameter optimization, an ablation study experiment or distributed
    training. Given a search space, objective and a model training procedure `train_fn`
    (black-box function), an experiment is the whole process of finding the
    best hyperparameter combination in the search space, optimizing the
    black-box function. Currently maggy supports random search and a median
    stopping rule.
    **lagom** is a Swedish word meaning "just the right amount".

    :param train_fn: User defined experiment containing the model training.
    :param config: An experiment configuration. For more information, see config.

    :raises RuntimeError: If another experiment is already running.

    :returns: The experiment results as a dict.
    """
    global APP_ID
    global RUNNING
    global RUN_ID
    # Fail fast BEFORE entering the try/finally. Previously this check raised
    # inside the try block, so the finally clause reset RUNNING, bumped RUN_ID
    # and cleared the Spark job group — corrupting the state of the experiment
    # that was actually in flight.
    if RUNNING:
        raise RuntimeError("An experiment is currently running.")
    job_start = time.time()
    try:
        RUNNING = True
        spark_context = util.find_spark().sparkContext
        APP_ID = str(spark_context.applicationId)
        APP_ID, RUN_ID = util.register_environment(APP_ID, RUN_ID)
        EnvSing.get_instance().set_app_id(APP_ID)
        driver = lagom_driver(config, APP_ID, RUN_ID)
        return driver.run_experiment(train_fn, config)
    except:  # noqa: E722
        # Bare except is deliberate: report the failure duration, then
        # re-raise whatever went wrong (including KeyboardInterrupt).
        _exception_handler(
            util.seconds_to_milliseconds(time.time() - job_start))
        raise
    finally:
        # Clean up spark jobs
        RUN_ID += 1
        RUNNING = False
        util.find_spark().sparkContext.setJobGroup("", "")
Example #3
0
    def __init__(self, config: LagomConfig, app_id: int, run_id: int):
        """Sets up the RPC server, message queue and logs.

        :param config: Experiment config.
        :param app_id: Maggy application ID.
        :param run_id: Maggy run ID.
        """
        global DRIVER_SECRET
        self.config = config
        self.app_id = app_id
        self.run_id = run_id
        self.name = config.name
        self.description = config.description
        self.spark_context = util.find_spark().sparkContext
        self.num_executors = util.num_executors(self.spark_context)
        self.hb_interval = config.hb_interval
        self.server_addr = None
        self.job_start = None
        # Reuse the module-wide secret if one exists, otherwise mint one.
        if not DRIVER_SECRET:
            DRIVER_SECRET = self._generate_secret(self.SECRET_BYTES)
        self._secret = DRIVER_SECRET
        # Logging related initialization
        self._message_q = queue.Queue()
        self.message_callbacks = {}
        self._register_msg_callbacks()
        self.worker_done = False
        self.executor_logs = ""
        self.log_lock = threading.RLock()
        env = EnvSing.get_instance()
        self.log_dir = env.get_logdir(app_id, run_id)
        log_file = self.log_dir + "/maggy.log"
        # Open File desc for HDFS to log; create the file if it is missing.
        if not env.exists(log_file):
            env.dump("", log_file)
        self.log_file_handle = env.open_file(log_file, flags="w")
        self.exception = None
        self.result = None
        self.result_dict = {}
        self.main_metric_key = None
0
 def get_ip_address(self):
     """Return the Spark driver host IP as reported by the Spark config."""
     spark_conf = util.find_spark().sparkContext._conf
     return spark_conf.get("spark.driver.host")