def connect_host(self, server_sock, server_host_port, exp_driver):
    if not server_host_port:
        server_sock.bind(("", 0))
        # hostname may not be resolvable but IP address probably will be
        host = self.get_ip_address()
        port = server_sock.getsockname()[1]
        server_host_port = (host, port)

        # register this driver with Hopsworks
        sp = util.find_spark()
        if sp is not None:
            sc = sp.sparkContext
            app_id = str(sc.applicationId)
        else:
            app_id = self.APP_ID

        util.set_app_id(app_id)
        hopscons = self.get_constants()
        method = hopscons.HTTP_CONFIG.HTTP_POST
        resource_url = (
            hopscons.DELIMITERS.SLASH_DELIMITER
            + hopscons.REST_CONFIG.HOPSWORKS_REST_RESOURCE
            + hopscons.DELIMITERS.SLASH_DELIMITER
            + "maggy"
            + hopscons.DELIMITERS.SLASH_DELIMITER
            + "drivers"
        )
        json_contents = {
            "hostIp": host,
            "port": port,
            "appId": app_id,
            "secret": exp_driver._secret,
        }
        json_embeddable = json.dumps(json_contents)
        headers = {
            hopscons.HTTP_CONFIG.HTTP_CONTENT_TYPE: hopscons.HTTP_CONFIG.HTTP_APPLICATION_JSON
        }

        try:
            response = self.send_request(
                method, resource_url, data=json_embeddable, headers=headers
            )
            if (response.status_code // 100) != 2:
                print("No connection to Hopsworks for logging.")
                exp_driver.log("No connection to Hopsworks for logging.")
        except Exception as e:
            print("Connection failed to Hopsworks. No logging.")
            exp_driver.log(e)
            exp_driver.log("Connection failed to Hopsworks. No logging.")
    else:
        server_sock.bind(server_host_port)
    server_sock.listen(10)

    return server_sock, server_host_port
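# Usage sketch (illustrative, not part of the original module): how
# connect_host() might be driven during server startup. Every name below
# except connect_host() itself is an assumption for illustration; `env`
# stands in for the environment object exposing connect_host(), and
# `exp_driver` for a driver exposing `_secret` and `log()`.
import socket

def _start_driver_server(env, exp_driver):
    # Create a TCP socket; connect_host() binds it to an ephemeral port
    # (when no address is passed), registers the driver with Hopsworks and
    # returns the listening socket together with its (host, port) address.
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    return env.connect_host(sock, None, exp_driver)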
def lagom(train_fn: Callable, config: LagomConfig) -> dict:
    """Launches a maggy experiment, which depending on `config` can be a
    hyperparameter optimization, an ablation study experiment or
    distributed training. Given a search space, an objective and a model
    training procedure `train_fn` (black-box function), an experiment is
    the whole process of finding the best hyperparameter combination in
    the search space, optimizing the black-box function. Currently maggy
    supports random search and a median stopping rule.

    **lagom** is a Swedish word meaning "just the right amount".

    :param train_fn: User defined experiment containing the model training.
    :param config: An experiment configuration. For more information, see
        config.

    :returns: The experiment results as a dict.
    """
    global APP_ID
    global RUNNING
    global RUN_ID
    job_start = time.time()
    try:
        if RUNNING:
            raise RuntimeError("An experiment is currently running.")
        RUNNING = True
        spark_context = util.find_spark().sparkContext
        APP_ID = str(spark_context.applicationId)
        APP_ID, RUN_ID = util.register_environment(APP_ID, RUN_ID)
        EnvSing.get_instance().set_app_id(APP_ID)
        driver = lagom_driver(config, APP_ID, RUN_ID)
        return driver.run_experiment(train_fn, config)
    except:  # noqa: E722
        _exception_handler(util.seconds_to_milliseconds(time.time() - job_start))
        raise
    finally:
        # Clean up spark jobs
        RUN_ID += 1
        RUNNING = False
        util.find_spark().sparkContext.setJobGroup("", "")
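# Usage sketch (illustrative): how a caller might launch an experiment with
# lagom(). The training-function signature and the configuration class
# shown here are assumptions for illustration; consult the maggy
# documentation for the exact API.
def example_train_fn(reporter=None, **hparams):
    # User-defined model training; maggy supplies sampled hyperparameters
    # and (for optimization runs) a reporter object (assumed signature).
    accuracy = 0.9  # placeholder for a real training loop
    return accuracy  # the returned metric drives the optimizer

# Assumed call pattern (config class name and arguments are assumptions):
# config = OptimizationConfig(num_trials=10, searchspace=sp, name="demo")
# results = lagom(example_train_fn, config)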
def __init__(self, config: LagomConfig, app_id: int, run_id: int):
    """Sets up the RPC server, message queue and logs.

    :param config: Experiment config.
    :param app_id: Maggy application ID.
    :param run_id: Maggy run ID.
    """
    global DRIVER_SECRET
    self.config = config
    self.app_id = app_id
    self.run_id = run_id
    self.name = config.name
    self.description = config.description
    self.spark_context = util.find_spark().sparkContext
    self.num_executors = util.num_executors(self.spark_context)
    self.hb_interval = config.hb_interval
    self.server_addr = None
    self.job_start = None
    DRIVER_SECRET = (
        DRIVER_SECRET if DRIVER_SECRET else self._generate_secret(self.SECRET_BYTES)
    )
    self._secret = DRIVER_SECRET
    # Logging related initialization
    self._message_q = queue.Queue()
    self.message_callbacks = {}
    self._register_msg_callbacks()
    self.worker_done = False
    self.executor_logs = ""
    self.log_lock = threading.RLock()
    self.log_dir = EnvSing.get_instance().get_logdir(app_id, run_id)
    log_file = self.log_dir + "/maggy.log"
    # Open a file descriptor for the HDFS log, creating the file if needed
    if not EnvSing.get_instance().exists(log_file):
        EnvSing.get_instance().dump("", log_file)
    self.log_file_handle = EnvSing.get_instance().open_file(log_file, flags="w")
    self.exception = None
    self.result = None
    self.result_dict = {}
    self.main_metric_key = None
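# Design note (illustrative sketch, not part of the original source): the
# driver pairs `_message_q` with `message_callbacks` so that messages from
# workers are consumed on the driver and dispatched by message type. A
# minimal version of that dispatch pattern, assuming messages are dicts
# carrying a "type" key:
import queue

def _drain_messages(message_q: queue.Queue, callbacks: dict) -> None:
    # Pop messages until the queue stays empty, routing each one to the
    # callback registered for its type; unknown types are ignored.
    while True:
        try:
            msg = message_q.get(timeout=1)
        except queue.Empty:
            return
        handler = callbacks.get(msg.get("type"))
        if handler is not None:
            handler(msg)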
def get_ip_address(self):
    """Returns the driver's IP address as configured in Spark
    (`spark.driver.host`)."""
    sc = util.find_spark().sparkContext
    return sc._conf.get("spark.driver.host")
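# Fallback sketch (illustrative, not part of the original module): outside
# of Spark, where spark.driver.host is unavailable, a routable local IP can
# be approximated by "connecting" a UDP socket to a public address and
# reading the local endpoint; a UDP connect() sends no packets.
import socket

def _guess_ip_address() -> str:
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.connect(("8.8.8.8", 80))  # any routable address works; nothing is sent
        return s.getsockname()[0]
    finally:
        s.close()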