Beispiel #1
0
    def _load_topology_details(self):
        """Find this client's entry and position among the configured grid nodes.

        The node list comes from ``self._all_nodes_on_grid()``. A node counts
        as "this client" when ``is_local_host`` accepts its ``'address'`` and
        its ``'port'`` (converted to ``int``) equals ``ClientEnvironment.port``.

        Returns:
            A tuple ``(node_count, (x, y), node)`` where ``node_count`` is the
            number of configured nodes, ``node`` is the matching entry and
            ``(x, y)`` is derived from the entry's list index ``i`` and the
            grid side length ``dim`` as ``x = i % dim`` and ``y = i // dim``.

        Raises:
            Exception: if the node count is neither 1 nor a square number,
                if more than one node matches this host, or if none does.
        """
        grid_nodes = self._all_nodes_on_grid()
        node_count = len(grid_nodes)

        if node_count != 1 and not is_square(node_count):
            raise Exception(
                'Provide either one client node, or a square number of cells (to create a square grid).'
            )

        own_port = ClientEnvironment.port
        candidates = []
        for entry in grid_nodes:
            if is_local_host(entry['address']) and int(entry['port']) == own_port:
                candidates.append(entry)

        if not candidates:
            raise Exception(
                'This host is not specified as client in the configuration file.'
            )
        if len(candidates) > 1:
            raise Exception('Too many clients match the condition.')

        # Exactly one match: translate its list index into grid coordinates.
        own_node = candidates[0]
        side = int(round(sqrt(node_count)))
        y, x = divmod(grid_nodes.index(own_node), side)
        return node_count, (x, y), own_node
Beispiel #2
0
    def run(self):
        """Resolve the client list, seed the RNG, and drive the experiments.

        Sequence:
          1. If the ``DOCKER`` environment variable equals ``'True'``, the
             ``auto_discover`` setting is forced on.
          2. With auto-discover enabled, clients come from
             ``_load_available_clients()`` and are stored in the settings;
             otherwise ``expand_clients()`` is called and the configured
             ``client_nodes`` are used.
          3. ``_terminate`` is called when the number of accessible clients
             is zero or not a square number.
          4. The random seed is set, the heartbeat thread is created, the
             SIGINT handler is installed, the experiments are started and the
             heartbeat thread is started and joined.
          5. Results are gathered only when the heartbeat reports success;
             ``_terminate`` is called with return code 0 on success, -1
             otherwise.
        """
        running_in_docker = os.environ.get('DOCKER') == 'True'
        if running_in_docker:
            self._logger.info('Detected Docker environment, enforcing auto-discover.')
            self.cc.settings['general']['distribution']['auto_discover'] = True

        if not self.cc.settings['general']['distribution']['auto_discover']:
            # Expand port ranges to multiple client entries
            self.expand_clients()
            configured = self.cc.settings['general']['distribution']['client_nodes']
        else:
            self._logger.info('Running in auto-discover mode. Detecting clients...')
            configured = self._load_available_clients()
            self.cc.settings['general']['distribution']['client_nodes'] = configured
            self._logger.info('Detected {} clients ({})'.format(len(configured), configured))

        reachable = self._accessible_clients(configured)
        reachable_count = len(reachable)

        # Only a non-empty, square number of clients can form the grid.
        if not (reachable_count != 0 and is_square(reachable_count)):
            self._logger.critical('{} clients found, but Lipizzaner currently only supports square grids.'
                                  .format(reachable_count))
            self._terminate(stop_clients=False)

        # NOTE: the check below used to be active and is intentionally left
        # disabled here (kept for reference only).
        # if len(accessible_clients) != len(clients):
        #     non_accessible = set([c['address'] for c in accessible_clients]) & \
        #                      set([c['address'] for c in clients])
        #     self._logger.critical('Client with address {} is either busy or not accessible.'.format(non_accessible))
        #     self._terminate(stop_clients=False)

        # Results on large grids are not reproducible because training is
        # asynchronous; the seed is still set to keep the variance down.
        seed = self.cc.settings['general']['seed']
        use_cuda = self.cc.settings['trainer']['params']['score']['cuda']
        set_random_seed(seed, use_cuda)
        self._logger.info("Seed used in master: {}".format(self.cc.settings['general']['seed']))

        self.heartbeat_event = Event()
        master_settings = self.cc.settings['general']['distribution']['master_node']
        self.heartbeat_thread = Heartbeat(self.heartbeat_event,
                                          master_settings['exit_clients_on_disconnect'])

        signal.signal(signal.SIGINT, self._sigint)
        self._start_experiments()
        self.heartbeat_thread.start()

        # Block until the heartbeat thread stops, which happens either when
        # the experiments are done or when they were terminated.
        self.heartbeat_thread.join()

        finished_ok = self.heartbeat_thread.success
        if finished_ok:
            self._gather_results()
        self._terminate(stop_clients=False, return_code=0 if finished_ok else -1)
Beispiel #3
0
    def run(self):
        """Resolve the client list, validate it, and drive the experiments.

        Sequence:
          1. If the ``DOCKER`` environment variable equals ``'True'``, the
             ``auto_discover`` setting is forced on.
          2. With auto-discover enabled, clients come from
             ``_load_available_clients()`` and are stored in the settings;
             otherwise ``expand_clients()`` is called and the configured
             ``client_nodes`` are used.
          3. ``_terminate`` is called when the number of accessible clients
             is zero or not a square number, and also when some configured
             clients are not accessible.
          4. The heartbeat thread is created, the SIGINT handler is
             installed, the experiments are started and the heartbeat thread
             is started and joined.
          5. Results are gathered only when the heartbeat reports success;
             ``_terminate`` is called with return code 0 on success, -1
             otherwise.
        """
        if os.environ.get('DOCKER', False) == 'True':
            self._logger.info('Detected Docker environment, enforcing auto-discover.')
            self.cc.settings['general']['distribution']['auto_discover'] = True

        if self.cc.settings['general']['distribution']['auto_discover']:
            self._logger.info('Running in auto-discover mode. Detecting clients...')
            clients = self._load_available_clients()
            self.cc.settings['general']['distribution']['client_nodes'] = clients
            self._logger.info('Detected {} clients ({})'.format(len(clients), clients))
        else:
            # Expand port ranges to multiple client entries
            self.expand_clients()
            clients = self.cc.settings['general']['distribution']['client_nodes']
        accessible_clients = self._accessible_clients(clients)

        # NOTE(review): the code below keeps running after _terminate(), so
        # _terminate presumably ends the process -- confirm.
        if len(accessible_clients) == 0 or not is_square(len(accessible_clients)):
            self._logger.critical('{} clients found, but Lipizzaner currently only supports square grids.'
                                  .format(len(accessible_clients)))
            self._terminate(stop_clients=False)

        if len(accessible_clients) != len(clients):
            # Report the configured addresses that are missing from the
            # accessible ones. This must be a set difference: the previous
            # intersection (&) listed the clients that *were* accessible.
            # NOTE(review): comparing by address only cannot single out one
            # port of a host that runs several clients -- consider including
            # the port if entries always carry one.
            non_accessible = {c['address'] for c in clients} - \
                             {c['address'] for c in accessible_clients}
            self._logger.critical('Client with address {} is either busy or not accessible.'.format(non_accessible))
            self._terminate(stop_clients=False)

        self.heartbeat_event = Event()
        self.heartbeat_thread = Heartbeat(self.heartbeat_event,
                                          self.cc.settings['general']['distribution']['master_node'][
                                              'exit_clients_on_disconnect'])

        signal.signal(signal.SIGINT, self._sigint)
        self._start_experiments()
        self.heartbeat_thread.start()

        self.heartbeat_thread.join()

        # When this is reached, the heartbeat thread has stopped.
        # This either happens when the experiments are done, or if they were terminated
        if self.heartbeat_thread.success:
            self._gather_results()
            self._terminate(stop_clients=False, return_code=0)
        else:
            self._terminate(stop_clients=False, return_code=-1)