def _load_topology_details(self):
    """Find this client's entry and coordinates in the configured grid.

    The nodes returned by ``self._all_nodes_on_grid()`` are treated as a
    square grid stored as a flat list: for a node at list index ``i`` in a
    grid of side ``dim``, ``x = i % dim`` and ``y = i // dim``.

    Returns:
        A tuple ``(node_count, (x, y), node)`` where ``node_count`` is the
        total number of grid nodes, ``(x, y)`` is the position of the
        local node and ``node`` is its configuration entry.

    Raises:
        Exception: If the number of nodes is neither one nor a square
            number, or if zero or more than one node matches the local
            host and ``ClientEnvironment.port``.
    """
    grid_nodes = self._all_nodes_on_grid()
    node_count = len(grid_nodes)

    if node_count != 1 and not is_square(node_count):
        raise Exception(
            'Provide either one client node, or a square number of cells (to create a square grid).'
        )

    own_port = ClientEnvironment.port
    candidates = [
        entry for entry in grid_nodes
        if is_local_host(entry['address']) and int(entry['port']) == own_port
    ]

    # Exactly one configured node may correspond to this process.
    if len(candidates) > 1:
        raise Exception('Too many clients match the condition.')
    if not candidates:
        raise Exception(
            'This host is not specified as client in the configuration file.'
        )

    # Translate the flat list index of the matching node into grid
    # coordinates.
    own_node = candidates[0]
    side = int(round(sqrt(node_count)))
    row, column = divmod(grid_nodes.index(own_node), side)
    return node_count, (column, row), own_node
def run(self):
    """Run the master: resolve clients, start experiments, await the end.

    Steps, in order:

    1. Force auto-discovery when the ``DOCKER`` environment variable is
       the string ``'True'``.
    2. Obtain the client list, either by auto-discovery or from the
       configuration after expanding port ranges.
    3. Terminate if the accessible clients do not form a non-empty square
       grid.
    4. Seed the random number generators, start the experiments and the
       heartbeat thread, and block until the heartbeat thread finishes.
    5. Gather results on success and terminate with return code ``0``,
       otherwise terminate with return code ``-1``.
    """
    running_in_docker = os.environ.get('DOCKER', False) == 'True'
    if running_in_docker:
        self._logger.info('Detected Docker environment, enforcing auto-discover.')
        self.cc.settings['general']['distribution']['auto_discover'] = True

    auto_discover = self.cc.settings['general']['distribution']['auto_discover']
    if not auto_discover:
        # Expand port ranges to multiple client entries
        self.expand_clients()
        clients = self.cc.settings['general']['distribution']['client_nodes']
    else:
        self._logger.info('Running in auto-discover mode. Detecting clients...')
        clients = self._load_available_clients()
        self.cc.settings['general']['distribution']['client_nodes'] = clients
        self._logger.info('Detected {} clients ({})'.format(len(clients), clients))

    accessible_clients = self._accessible_clients(clients)
    accessible_count = len(accessible_clients)
    if not (accessible_count != 0 and is_square(accessible_count)):
        self._logger.critical(
            '{} clients found, but Lipizzaner currently only supports square grids.'
            .format(accessible_count)
        )
        self._terminate(stop_clients=False)

    # NOTE: a check that terminated the run when fewer clients were
    # accessible than configured used to be active here; it is currently
    # disabled.

    # It is not possible to obtain reproducible results for large grids due
    # to the nature of asynchronous training, but the seed is still set here
    # to minimize variance.
    seed = self.cc.settings['general']['seed']
    cuda_setting = self.cc.settings['trainer']['params']['score']['cuda']
    set_random_seed(seed, cuda_setting)
    self._logger.info("Seed used in master: {}".format(seed))

    self.heartbeat_event = Event()
    master_node = self.cc.settings['general']['distribution']['master_node']
    self.heartbeat_thread = Heartbeat(
        self.heartbeat_event,
        master_node['exit_clients_on_disconnect'],
    )

    signal.signal(signal.SIGINT, self._sigint)
    self._start_experiments()
    self.heartbeat_thread.start()
    self.heartbeat_thread.join()

    # When this is reached, the heartbeat thread has stopped. This happens
    # either when the experiments are done or when they were terminated.
    succeeded = self.heartbeat_thread.success
    if succeeded:
        self._gather_results()
    self._terminate(stop_clients=False, return_code=0 if succeeded else -1)
def run(self):
    """Run the master: resolve clients, start experiments, await the end.

    Steps, in order:

    1. Force auto-discovery when the ``DOCKER`` environment variable is
       the string ``'True'``.
    2. Obtain the client list, either by auto-discovery or from the
       configuration after expanding port ranges.
    3. Terminate if the accessible clients do not form a non-empty square
       grid, or if some configured clients are not accessible.
    4. Start the experiments and the heartbeat thread, and block until the
       heartbeat thread finishes.
    5. Gather results on success and terminate with return code ``0``,
       otherwise terminate with return code ``-1``.
    """
    # Local import so this method does not depend on a file-level import.
    from collections import Counter

    if os.environ.get('DOCKER', False) == 'True':
        self._logger.info('Detected Docker environment, enforcing auto-discover.')
        self.cc.settings['general']['distribution']['auto_discover'] = True

    if self.cc.settings['general']['distribution']['auto_discover']:
        self._logger.info('Running in auto-discover mode. Detecting clients...')
        clients = self._load_available_clients()
        self.cc.settings['general']['distribution']['client_nodes'] = clients
        self._logger.info('Detected {} clients ({})'.format(len(clients), clients))
    else:
        # Expand port ranges to multiple client entries
        self.expand_clients()
        clients = self.cc.settings['general']['distribution']['client_nodes']

    accessible_clients = self._accessible_clients(clients)
    if len(accessible_clients) == 0 or not is_square(len(accessible_clients)):
        self._logger.critical(
            '{} clients found, but Lipizzaner currently only supports square grids.'
            .format(len(accessible_clients))
        )
        self._terminate(stop_clients=False)

    if len(accessible_clients) != len(clients):
        # Report the addresses that are configured but NOT accessible.
        # (Intersecting the two address sets would list the reachable
        # clients instead.) A multiset difference is used because expanding
        # port ranges can yield several client entries sharing one address,
        # and a plain set difference would miss an address where only some
        # of them are unreachable.
        missing = (Counter(c['address'] for c in clients)
                   - Counter(c['address'] for c in accessible_clients))
        non_accessible = set(missing)
        self._logger.critical(
            'Client with address {} is either busy or not accessible.'.format(non_accessible)
        )
        self._terminate(stop_clients=False)

    self.heartbeat_event = Event()
    self.heartbeat_thread = Heartbeat(
        self.heartbeat_event,
        self.cc.settings['general']['distribution']['master_node'][
            'exit_clients_on_disconnect'],
    )

    signal.signal(signal.SIGINT, self._sigint)
    self._start_experiments()
    self.heartbeat_thread.start()
    self.heartbeat_thread.join()

    # When this is reached, the heartbeat thread has stopped. This happens
    # either when the experiments are done or when they were terminated.
    if self.heartbeat_thread.success:
        self._gather_results()
        self._terminate(stop_clients=False, return_code=0)
    else:
        self._terminate(stop_clients=False, return_code=-1)