Example #1
0
    def run(self):
        """Run the master: determine the clients, start the experiments and wait for the heartbeat.

        Steps, in order: resolve the client list (auto-discovery or the
        configured nodes with port ranges expanded), probe which clients are
        accessible, abort unless their count forms a square grid, seed the
        random generators, start the experiments and block until the heartbeat
        thread stops. Every exit path goes through ``self._terminate``.
        """
        distribution = self.cc.settings['general']['distribution']

        # DOCKER=True in the environment overrides the configured discovery mode.
        if os.environ.get('DOCKER') == 'True':
            self._logger.info('Detected Docker environment, enforcing auto-discover.')
            distribution['auto_discover'] = True

        if not distribution['auto_discover']:
            # Expand port ranges to multiple client entries
            self.expand_clients()
            clients = distribution['client_nodes']
        else:
            self._logger.info('Running in auto-discover mode. Detecting clients...')
            clients = self._load_available_clients()
            distribution['client_nodes'] = clients
            self._logger.info('Detected {} clients ({})'.format(len(clients), clients))

        accessible_clients = self._accessible_clients(clients)
        n_accessible = len(accessible_clients)

        # Zero clients is rejected without consulting is_square.
        if not (n_accessible != 0 and is_square(n_accessible)):
            self._logger.critical('{} clients found, but Lipizzaner currently only supports square grids.'
                                  .format(n_accessible))
            self._terminate(stop_clients=False)

        # NOTE: this variant does not abort when only a subset of the listed clients is
        # accessible; 'client_nodes' keeps the full list determined above.

        # It is not possible to obtain reproducible result for large grid due to nature of asynchronous training
        # But still set seed here to minimize variance
        seed = self.cc.settings['general']['seed']
        set_random_seed(seed, self.cc.settings['trainer']['params']['score']['cuda'])
        self._logger.info("Seed used in master: {}".format(seed))

        self.heartbeat_event = Event()
        exit_on_disconnect = distribution['master_node']['exit_clients_on_disconnect']
        self.heartbeat_thread = Heartbeat(self.heartbeat_event, exit_on_disconnect)

        signal.signal(signal.SIGINT, self._sigint)
        self._start_experiments()
        self.heartbeat_thread.start()

        # Blocks until the heartbeat thread has stopped, which happens either when
        # the experiments are done or when they were terminated.
        self.heartbeat_thread.join()

        succeeded = self.heartbeat_thread.success
        if succeeded:
            self._gather_results()
        self._terminate(stop_clients=False, return_code=0 if succeeded else -1)
Example #2
0
    def run(self):
        """Run the master: determine the clients, start the experiments and wait for the heartbeat.

        The client list comes from auto-discovery (forced when the environment
        variable DOCKER equals 'True') or from the configured ``client_nodes``
        after port ranges have been expanded. The run is aborted through
        ``self._terminate`` when the number of accessible clients is zero or
        not a square number, or when any listed client is busy/unreachable.
        Afterwards the experiments are started and the method blocks until the
        heartbeat thread stops; results are gathered only on success.
        """
        distribution = self.cc.settings['general']['distribution']

        if os.environ.get('DOCKER') == 'True':
            self._logger.info('Detected Docker environment, enforcing auto-discover.')
            distribution['auto_discover'] = True

        if distribution['auto_discover']:
            self._logger.info('Running in auto-discover mode. Detecting clients...')
            clients = self._load_available_clients()
            distribution['client_nodes'] = clients
            self._logger.info('Detected {} clients ({})'.format(len(clients), clients))
        else:
            # Expand port ranges to multiple client entries
            self.expand_clients()
            clients = distribution['client_nodes']
        accessible_clients = self._accessible_clients(clients)

        if not accessible_clients or not is_square(len(accessible_clients)):
            self._logger.critical('{} clients found, but Lipizzaner currently only supports square grids.'
                                  .format(len(accessible_clients)))
            self._terminate(stop_clients=False)

        if len(accessible_clients) != len(clients):
            # Report the clients that are listed but did NOT pass the accessibility probe.
            # (A set intersection here would name the reachable ones instead.) The port is
            # included because several clients may share one address after range expansion.
            non_accessible = sorted('{}:{}'.format(c['address'], c['port'])
                                    for c in clients if c not in accessible_clients)
            self._logger.critical('Client with address {} is either busy or not accessible.'.format(non_accessible))
            self._terminate(stop_clients=False)

        self.heartbeat_event = Event()
        self.heartbeat_thread = Heartbeat(self.heartbeat_event,
                                          distribution['master_node']['exit_clients_on_disconnect'])

        signal.signal(signal.SIGINT, self._sigint)
        self._start_experiments()
        self.heartbeat_thread.start()

        self.heartbeat_thread.join()

        # When this is reached, the heartbeat thread has stopped.
        # This either happens when the experiments are done, or if they were terminated
        if self.heartbeat_thread.success:
            self._gather_results()
            self._terminate(stop_clients=False, return_code=0)
        else:
            self._terminate(stop_clients=False, return_code=-1)
Example #3
0
class LipizzanerMaster:
    """Master node of a distributed Lipizzaner run.

    The master resolves the list of client nodes, starts the experiment on
    each of them over HTTP, waits for the heartbeat thread to finish and then
    collects the resulting networks, sample images and (optionally) scores.
    """

    _logger = logging.getLogger(__name__)

    # Seconds to wait for a client's /status reply before the client is
    # treated as not accessible (a request without timeout can block forever).
    _STATUS_TIMEOUT = 10

    def __init__(self):
        self.cc = ConfigurationContainer.instance()
        self.heartbeat_event = None
        self.heartbeat_thread = None
        self.experiment_id = None

    def run(self):
        """Resolve the clients, start the experiments and wait for them to end.

        The run is aborted through ``_terminate`` (which exits the process)
        when no accessible client is found, when the number of accessible
        clients is not a square number, or when a listed client is busy or
        unreachable. On a successful heartbeat the results are gathered and
        the process exits with code 0, otherwise with -1.
        """
        distribution = self.cc.settings['general']['distribution']

        # DOCKER=True in the environment overrides the configured discovery mode.
        if os.environ.get('DOCKER') == 'True':
            self._logger.info(
                'Detected Docker environment, enforcing auto-discover.')
            distribution['auto_discover'] = True

        if distribution['auto_discover']:
            self._logger.info(
                'Running in auto-discover mode. Detecting clients...')
            clients = self._load_available_clients()
            distribution['client_nodes'] = clients
            self._logger.info('Detected {} clients ({})'.format(
                len(clients), clients))
        else:
            # Expand port ranges to multiple client entries
            self.expand_clients()
            clients = distribution['client_nodes']
        accessible_clients = self._accessible_clients(clients)

        if not accessible_clients or not is_square(len(accessible_clients)):
            self._logger.critical(
                '{} clients found, but Lipizzaner currently only supports square grids.'
                .format(len(accessible_clients)))
            self._terminate(stop_clients=False)

        if len(accessible_clients) != len(clients):
            # Name the clients that failed the probe, not the reachable ones.
            non_accessible = self._inaccessible_clients(
                clients, accessible_clients)
            self._logger.critical(
                'Client with address {} is either busy or not accessible.'.
                format(non_accessible))
            self._terminate(stop_clients=False)

        # It is not possible to obtain reproducible result for large grid due to nature of asynchronous training
        # But still set seed here to minimize variance
        seed = self.cc.settings['general']['seed']
        set_random_seed(seed,
                        self.cc.settings['trainer']['params']['score']['cuda'])
        self._logger.info("Seed used in master: {}".format(seed))

        self.heartbeat_event = Event()
        self.heartbeat_thread = Heartbeat(
            self.heartbeat_event,
            distribution['master_node']['exit_clients_on_disconnect'])

        signal.signal(signal.SIGINT, self._sigint)
        self._start_experiments()
        self.heartbeat_thread.start()

        self.heartbeat_thread.join()

        # When this is reached, the heartbeat thread has stopped.
        # This either happens when the experiments are done, or if they were terminated
        if self.heartbeat_thread.success:
            self._gather_results()
            self._terminate(stop_clients=False, return_code=0)
        else:
            self._terminate(stop_clients=False, return_code=-1)

    @staticmethod
    def _inaccessible_clients(clients, accessible_clients):
        """Return sorted 'address:port' labels of clients absent from accessible_clients.

        Clients are compared by (address, port) because several clients may
        share one address once port ranges have been expanded.
        """
        reachable = {(c['address'], c['port']) for c in accessible_clients}
        return sorted('{}:{}'.format(c['address'], c['port'])
                      for c in clients
                      if (c['address'], c['port']) not in reachable)

    def _sigint(self, signal, frame):
        """SIGINT handler: terminate with the default arguments of ``_terminate``."""
        self._terminate()

    def _accessible_clients(self, clients):
        """Return the clients whose /status endpoint answers 200 and reports not busy.

        The probe is best effort: any error while contacting a client or
        decoding its answer simply excludes that client from the result.
        """
        accessible_clients = []
        for client in clients:
            address = 'http://{}:{}/status'.format(client['address'],
                                                   client['port'])
            try:
                resp = requests.get(address, timeout=self._STATUS_TIMEOUT)
                # Explicit checks instead of assert statements, which are
                # removed when Python runs with -O.
                if resp.status_code == 200 and not resp.json()['busy']:
                    accessible_clients.append(client)
                else:
                    self._logger.debug(
                        'Client {} is busy or returned status code {}'.format(
                            address, resp.status_code))
            except Exception as ex:
                # Unreachable host, timeout, malformed answer, ...: skip the client.
                self._logger.debug('Client {} is not accessible: {}'.format(
                    address, ex))
        return accessible_clients

    def _load_available_clients(self):
        """Probe every address from ``get_network_devices()`` on port 5000.

        Returns the accessible clients sorted by their address string. When
        the environment variable SWARM equals 'True', the first entry of that
        sorted list is dropped.
        """
        possible_clients = [{'address': ip, 'port': 5000}
                            for ip in get_network_devices()]

        accessible_clients = sorted(self._accessible_clients(possible_clients),
                                    key=lambda x: x['address'])

        # Docker swarm specific: lowest address is overlay network address, remove it
        if os.environ.get('SWARM') == 'True' and accessible_clients:
            self._logger.info('Removing client 0')
            del accessible_clients[0]

        return accessible_clients

    def _start_experiments(self):
        """POST the settings to the /experiments endpoint of every client node.

        Records the start time in the settings and, when DB logging is
        enabled, creates an experiment and stores its ID in the settings
        before they are sent. A client that cannot be contacted or does not
        answer with status code 200 leads to ``_terminate()``.
        """
        self.cc.settings['general']['distribution'][
            'start_time'] = time.strftime('%Y-%m-%d_%H-%M-%S')

        # If DB logging is enabled, create a new experiment and attach its ID to settings for clients
        db_logger = DbLogger()
        if db_logger.is_enabled:
            self.experiment_id = db_logger.create_experiment(self.cc.settings)
            self.cc.settings['general']['logging'][
                'experiment_id'] = self.experiment_id

        for client in self.cc.settings['general']['distribution'][
                'client_nodes']:
            address = 'http://{}:{}/experiments'.format(
                client['address'], client['port'])
            try:
                resp = requests.post(address, json=self.cc.settings)
            except requests.exceptions.RequestException as err:
                # Connection problems must take the same cleanup path as a bad status code.
                failure = err
            else:
                failure = None if resp.status_code == 200 else resp.text

            if failure is None:
                self._logger.info(
                    'Successfully started experiment on {}'.format(address))
            else:
                self._logger.critical(
                    'Could not start experiment on {}: {}'.format(
                        address, failure))
                self._terminate()

    def _terminate(self, stop_clients=True, return_code=-1):
        """Stop the heartbeat, optionally stop the clients, then exit the process.

        :param stop_clients: when true, running experiments on the clients are stopped.
        :param return_code: exit code of the process.
        :raises SystemExit: always; the DB experiment (if any) is finished first.
        """
        try:
            if self.heartbeat_thread:
                self._logger.info('Stopping heartbeat...')
                self.heartbeat_thread.stopped.set()
                try:
                    self.heartbeat_thread.join()
                except RuntimeError:
                    # The thread was never started (e.g. starting the experiments
                    # failed before run() reached start()). Nothing to wait for;
                    # continue so that the clients below are still stopped.
                    pass

            if stop_clients:
                self._logger.info('Stopping clients...')
                node_client = NodeClient(None)
                node_client.stop_running_experiments()
        finally:
            db_logger = DbLogger()
            if db_logger.is_enabled and self.experiment_id is not None:
                db_logger.finish_experiment(self.experiment_id)

            # Equivalent to exit(return_code) without depending on the builtin
            # that the site module injects (missing with -S or in frozen builds).
            raise SystemExit(return_code)

    def _gather_results(self):
        """Collect the results from all client nodes and store them below the output directory.

        For each node the generator and discriminator state dicts are saved,
        the mixture weights are appended to ``mixture.yml``, sample images are
        written and, if ``settings['master']['calculate_score']`` is set, a
        score is calculated. An error for one node is logged and does not stop
        the processing of the remaining nodes.
        """
        self._logger.info('Collecting results from clients...')

        # Initialize node client
        dataloader = self.cc.create_instance(
            self.cc.settings['dataloader']['dataset_name'])
        network_factory = self.cc.create_instance(
            self.cc.settings['network']['name'], dataloader.n_input_neurons)
        node_client = NodeClient(network_factory)
        db_logger = DbLogger()

        # NOTE(review): 120 is presumably a timeout in seconds - confirm against NodeClient.
        results = node_client.gather_results(
            self.cc.settings['general']['distribution']['client_nodes'], 120)

        scores = []
        for (node, generator_pop, discriminator_pop, weights_generator,
             weights_discriminator) in results:
            node_name = '{}:{}'.format(node['address'], node['port'])
            try:
                output_dir = self.get_and_create_output_dir(node)

                for generator in generator_pop.individuals:
                    source = generator.source.replace(':', '-')
                    filename = '{}{}.pkl'.format(GENERATOR_PREFIX, source)
                    # Save under exactly the name that mixture.yml refers to.
                    torch.save(generator.genome.net.state_dict(),
                               os.path.join(output_dir, filename))

                    with open(os.path.join(output_dir, 'mixture.yml'),
                              "a") as file:
                        file.write('{}: {}\n'.format(
                            filename, weights_generator[generator.source]))

                for discriminator in discriminator_pop.individuals:
                    source = discriminator.source.replace(':', '-')
                    filename = '{}{}.pkl'.format(DISCRIMINATOR_PREFIX, source)
                    torch.save(discriminator.genome.net.state_dict(),
                               os.path.join(output_dir, filename))

                # Save images
                dataset = MixedGeneratorDataset(
                    generator_pop, weights_generator,
                    self.cc.settings['master']['score_sample_size'],
                    self.cc.settings['trainer']
                    ['mixture_generator_samples_mode'])
                image_paths = self.save_samples(dataset, output_dir,
                                                dataloader)
                self._logger.info(
                    'Saved mixture result images of client {} to target directory {}.'
                    .format(node_name, output_dir))

                # Calculate inception or FID score
                # -inf is what gets stored in the DB when no score is calculated.
                score = float('-inf')
                if self.cc.settings['master']['calculate_score']:
                    calc = ScoreCalculatorFactory.create()
                    self._logger.info('Score calculator: {}'.format(
                        type(calc).__name__))
                    self._logger.info(
                        'Calculating score score of {}. Depending on the type, this may take very long.'
                        .format(node_name))

                    score = calc.calculate(dataset)
                    self._logger.info(
                        'Node {} with weights {} yielded a score of {}'.format(
                            node_name, weights_generator, score))
                    scores.append((node, score))

                if db_logger.is_enabled and self.experiment_id is not None:
                    db_logger.add_experiment_results(self.experiment_id,
                                                     node_name, image_paths,
                                                     score)
            except Exception as ex:
                self._logger.error(
                    'An error occured while trying to gather results from {}: {}'
                    .format(node_name, ex))
                traceback.print_exc()

        if self.cc.settings['master']['calculate_score'] and scores:
            # The best node is the last element after sorting; the sort direction
            # is taken from the calculator's is_reversed flag.
            best_node = sorted(
                scores,
                key=lambda x: x[1],
                reverse=ScoreCalculatorFactory.create().is_reversed)[-1]
            self._logger.info('Best result: {}:{} = {}'.format(
                best_node[0]['address'], best_node[0]['port'], best_node[1]))

    def get_and_create_output_dir(self, node):
        """Create (if necessary) and return ``<output_dir>/master/<start_time>/<address>-<port>``."""
        directory = os.path.join(
            self.cc.output_dir, 'master',
            self.cc.settings['general']['distribution']['start_time'],
            '{}-{}'.format(node['address'], node['port']))
        os.makedirs(directory, exist_ok=True)
        return directory

    def save_samples(self,
                     dataset,
                     output_dir,
                     image_specific_loader,
                     n_images=10,
                     batch_size=100):
        """Write batches of ``dataset`` as images into ``output_dir``.

        One image file ``mixture-<k>.<image_format>`` is written per batch of
        ``batch_size`` samples, stopping after ``n_images`` files.

        :return: list with the paths of the written files.
        """
        image_format = self.cc.settings['general']['logging']['image_format']
        dataloader = torch.utils.data.DataLoader(dataset,
                                                 batch_size=batch_size)
        loaded = image_specific_loader.load()
        # The shape does not depend on the batch, so it is determined once.
        shape = loaded.dataset.train_data.shape if hasattr(
            loaded.dataset, 'train_data') else None
        paths = []

        for i, data in enumerate(dataloader):
            path = os.path.join(output_dir,
                                'mixture-{}.{}'.format(i + 1, image_format))
            image_specific_loader.save_images(Variable(data), shape, path)
            paths.append(path)

            if i + 1 == n_images:
                break

        return paths

    def expand_clients(self):
        """Replace client entries whose port is a 'start-end' string by one entry per port.

        Entries without a port range keep their relative order and come first;
        the expanded entries (inclusive range, integer ports) are appended
        after them. The configured list is left untouched when it contains no
        port range.

        :raises ValueError: if a range is not of the form '<digits>-<digits>'.
        """
        distribution = self.cc.settings['general']['distribution']
        clients = distribution['client_nodes']
        clients_to_expand = [
            c for c in clients
            if isinstance(c['port'], str) and '-' in c['port']
        ]
        if not clients_to_expand:
            return

        expanded = [c for c in clients if c not in clients_to_expand]
        for client in clients_to_expand:
            bounds = client['port'].split('-')
            if len(bounds) != 2 or not all(b.isdigit() for b in bounds):
                raise ValueError(
                    'Configuration for client {} has incorrect format.'.
                    format(client))

            first, last = int(bounds[0]), int(bounds[1])
            expanded.extend({'address': client['address'], 'port': port}
                            for port in range(first, last + 1))
        distribution['client_nodes'] = expanded