class Heartbeat(Thread):
    """Background thread that periodically polls the status of all clients.

    After the thread ends on its own, ``success`` is ``True`` when every
    client that is alive and busy reported ``finished``, and ``False`` when
    the run was aborted because a client was not alive or not busy. It stays
    ``None`` if the thread was stopped through the ``stopped`` event first.
    """

    def __init__(self, event, kill_clients_on_disconnect):
        """Create the heartbeat thread.

        :param event: object whose ``wait(timeout)`` is polled in ``run``; a
            truthy return value ends the loop (presumably a
            ``threading.Event`` -- confirm against the caller).
        :param kill_clients_on_disconnect: when truthy, a client that is not
            alive or not busy aborts the whole experiment.
        """
        super().__init__()
        self.kill_clients_on_disconnect = kill_clients_on_disconnect
        self.stopped = event
        # None = no verdict yet, True = all finished, False = aborted.
        self.success = None
        self.node_client = NodeClient(None)

    def run(self):
        """Poll the client statuses until stopped, finished or aborted."""
        while not self.stopped.wait(HEARTBEAT_FREQUENCY_SEC):
            client_statuses = self.node_client.get_client_statuses()
            # A client counts as dead when it is not alive OR not busy; the
            # alive list is the exact complement of that condition.
            dead_clients = [c for c in client_statuses
                            if not c['alive'] or not c['busy']]
            alive_clients = [c for c in client_statuses
                             if c['alive'] and c['busy']]

            if dead_clients and self.kill_clients_on_disconnect:
                # Separate the addresses with ', ' so that individual
                # hosts/IPs stay readable in the log message.
                printable_names = ', '.join(c['address'] for c in dead_clients)
                _logger.critical('Heartbeat: One or more clients ({}) are not alive anymore; '
                                 'exiting others as well.'.format(printable_names))

                # NOTE(review): the dead clients are passed along, presumably
                # so they are skipped when stopping -- confirm in NodeClient.
                self.node_client.stop_running_experiments(dead_clients)
                self.success = False
                return
            elif all(c['finished'] for c in alive_clients):
                _logger.info('Heartbeat: All clients finished their experiments.')
                self.success = True
                return
def __init__(self):
    """Wire up configuration, node client, grid topology and mixture weights.

    Both the generator and the discriminator mixture weights are always
    initialised here.
    """
    config = ConfigurationContainer.instance()
    self.cc = config
    self.concurrent_populations = ConcurrentPopulations.instance()

    # The network factory needs the input size reported by the dataloader.
    dataset_name = config.settings['dataloader']['dataset_name']
    dataloader = config.create_instance(dataset_name)
    network_name = config.settings['network']['name']
    network_factory = config.create_instance(network_name,
                                             dataloader.n_input_neurons)
    self.node_client = NodeClient(network_factory)

    # Topology first: cell number and neighbours depend on it.
    topology = self._load_topology_details()
    self.grid_size, self.grid_position, self.local_node = topology
    self.cell_number = self._load_cell_number()
    self.neighbours = self._adjacent_cells()
    self.all_nodes = self.neighbours + [self.local_node]

    self.mixture_weights_generators = self._init_mixture_weights()
    self.mixture_weights_discriminators = self._init_mixture_weights()
def _terminate(self, stop_clients=True, return_code=-1):
    """Shut down the heartbeat and (optionally) the clients, then exit.

    This method never returns normally: it always ends by raising
    ``SystemExit`` with ``return_code``.

    :param stop_clients: when truthy, ask the clients to stop their running
        experiments.
    :param return_code: exit status handed to ``sys.exit``.
    """
    # Local import keeps this block self-contained. ``sys.exit`` is used
    # instead of the ``exit`` helper, which the ``site`` module provides
    # for interactive use only.
    import sys

    try:
        if self.heartbeat_thread:
            self._logger.info('Stopping heartbeat...')
            self.heartbeat_thread.stopped.set()
            self.heartbeat_thread.join()

        if stop_clients:
            self._logger.info('Stopping clients...')
            node_client = NodeClient(None)
            node_client.stop_running_experiments()
    except Exception as ex:
        # The exit below would otherwise replace this error without a trace.
        self._logger.error('An error occurred during shutdown: {}'.format(ex))
    finally:
        # Mark the experiment as finished even if the shutdown steps failed.
        db_logger = DbLogger()
        if db_logger.is_enabled and self.experiment_id is not None:
            db_logger.finish_experiment(self.experiment_id)

        sys.exit(return_code)
def __init__(self):
    """Wire up configuration, node client, grid topology and mixture weights.

    Discriminator mixture weights are only created for the
    ``with_disc_mixture_wgan`` and ``with_disc_mixture_gan`` trainers; for
    every other trainer the attribute is set to ``None``.
    """
    config = ConfigurationContainer.instance()
    self.cc = config
    self.concurrent_populations = ConcurrentPopulations.instance()

    dataset_name = config.settings['dataloader']['dataset_name']
    dataloader = config.create_instance(dataset_name)
    network_name = config.settings['network']['name']
    network_factory = config.create_instance(network_name,
                                             dataloader.n_input_neurons)

    # TRACE: The node client handles communication with the APIs of the
    # other clients.
    self.node_client = NodeClient(network_factory)

    topology = self._load_topology_details()
    self.grid_size, self.grid_position, self.local_node = topology
    self.cell_number = self._load_cell_number()
    self.neighbours = self._adjacent_cells()
    self.all_nodes = self.neighbours + [self.local_node]

    # TRACE: Initial weights are generated for every node in all_nodes, as
    # 1 / number of nodes in all_nodes.
    self.mixture_weights_generators = self._init_mixture_weights()

    trainer_name = config.settings['trainer']['name']
    uses_disc_mixture = trainer_name in ('with_disc_mixture_wgan',
                                         'with_disc_mixture_gan')
    self.mixture_weights_discriminators = (
        self._init_mixture_weights() if uses_disc_mixture else None)
def _gather_results(self):
    """Collect the final populations from all clients, save and score them.

    For every result returned by the node client this method:

    * saves each generator's and discriminator's ``state_dict`` into the
      node's output directory,
    * appends ``<generator file name>: <mixture weight>`` to ``mixture.yml``
      in that directory,
    * saves sample images drawn from the generator mixture,
    * optionally calculates a score and stores it through the DB logger.

    Errors for one node are logged and do not stop the remaining nodes.
    """
    self._logger.info('Collecting results from clients...')

    # Initialize node client
    dataloader = self.cc.create_instance(
        self.cc.settings['dataloader']['dataset_name'])
    network_factory = self.cc.create_instance(
        self.cc.settings['network']['name'], dataloader.n_input_neurons)
    node_client = NodeClient(network_factory)
    db_logger = DbLogger()

    # NOTE(review): 120 is presumably a timeout in seconds -- confirm
    # against NodeClient.gather_results.
    results = node_client.gather_results(
        self.cc.settings['general']['distribution']['client_nodes'], 120)

    scores = []
    for (node, generator_pop, discriminator_pop, weights_generator,
         weights_discriminator) in results:
        node_name = '{}:{}'.format(node['address'], node['port'])
        try:
            output_dir = self.get_and_create_output_dir(node)

            for generator in generator_pop.individuals:
                source = generator.source.replace(':', '-')
                filename = '{}{}.pkl'.format(GENERATOR_PREFIX, source)
                # Save under the very name that is recorded in mixture.yml,
                # so the two cannot drift apart if the prefix changes.
                torch.save(generator.genome.net.state_dict(),
                           os.path.join(output_dir, filename))

                with open(os.path.join(output_dir, 'mixture.yml'), "a") as file:
                    file.write('{}: {}\n'.format(
                        filename, weights_generator[generator.source]))

            for discriminator in discriminator_pop.individuals:
                source = discriminator.source.replace(':', '-')
                filename = '{}{}.pkl'.format(DISCRIMINATOR_PREFIX, source)
                torch.save(discriminator.genome.net.state_dict(),
                           os.path.join(output_dir, filename))

            # Save images
            dataset = MixedGeneratorDataset(
                generator_pop, weights_generator,
                self.cc.settings['master']['score_sample_size'],
                self.cc.settings['trainer']['mixture_generator_samples_mode'])
            image_paths = self.save_samples(dataset, output_dir, dataloader)
            self._logger.info(
                'Saved mixture result images of client {} to target directory {}.'
                .format(node_name, output_dir))

            # Calculate inception or FID score
            score = float('-inf')
            if self.cc.settings['master']['calculate_score']:
                calc = ScoreCalculatorFactory.create()
                self._logger.info('Score calculator: {}'.format(
                    type(calc).__name__))
                self._logger.info(
                    'Calculating score score of {}. Depending on the type, this may take very long.'
                    .format(node_name))

                score = calc.calculate(dataset)
                self._logger.info(
                    'Node {} with weights {} yielded a score of {}'.format(
                        node_name, weights_generator, score))
                scores.append((node, score))

            if db_logger.is_enabled and self.experiment_id is not None:
                db_logger.add_experiment_results(self.experiment_id,
                                                 node_name, image_paths,
                                                 score)
        except Exception as ex:
            # Best effort: keep collecting from the remaining nodes.
            self._logger.error(
                'An error occured while trying to gather results from {}: {}'
                .format(node_name, ex))
            traceback.print_exc()

    if self.cc.settings['master']['calculate_score'] and scores:
        # The last element after sorting is reported as best: the largest
        # score, or the smallest one when the calculator is reversed.
        best_node = sorted(
            scores,
            key=lambda x: x[1],
            reverse=ScoreCalculatorFactory.create().is_reversed)[-1]
        self._logger.info('Best result: {}:{} = {}'.format(
            best_node[0]['address'], best_node[0]['port'], best_node[1]))
class Neighbourhood:
    """The local client's cell on the client grid, together with its neighbours.

    The configured client nodes are laid out row by row on a square grid:
    the node at list index ``idx`` sits at ``x = idx % dim`` and
    ``y = idx // dim``. The grid wraps around at its borders, so every cell
    has a left, upper, right and lower neighbour (duplicates are collapsed
    on small grids).
    """

    def __init__(self):
        """Load the topology from the configuration and set up the weights.

        Discriminator mixture weights are only created for the
        ``with_disc_mixture_wgan`` and ``with_disc_mixture_gan`` trainers;
        otherwise the attribute is ``None``.
        """
        self.cc = ConfigurationContainer.instance()
        self.concurrent_populations = ConcurrentPopulations.instance()

        dataloader = self.cc.create_instance(
            self.cc.settings['dataloader']['dataset_name'])
        network_factory = self.cc.create_instance(
            self.cc.settings['network']['name'], dataloader.n_input_neurons)
        self.node_client = NodeClient(network_factory)

        self.grid_size, self.grid_position, self.local_node = \
            self._load_topology_details()
        self.cell_number = self._load_cell_number()
        self.neighbours = self._adjacent_cells()
        self.all_nodes = self.neighbours + [self.local_node]

        self.mixture_weights_generators = self._init_mixture_weights()
        if self.cc.settings['trainer']['name'] in ('with_disc_mixture_wgan',
                                                   'with_disc_mixture_gan'):
            self.mixture_weights_discriminators = self._init_mixture_weights()
        else:
            self.mixture_weights_discriminators = None

    @property
    def local_generators(self):
        """Local generator population, with each individual's source set."""
        # Return local individuals for now, possibility to split up gens and discs later
        return self._set_source(self.concurrent_populations.generator)

    @property
    def local_discriminators(self):
        """Local discriminator population, with each individual's source set."""
        # Return local individuals for now, possibility to split up gens and discs later
        return self._set_source(self.concurrent_populations.discriminator)

    @property
    def all_generators(self):
        """Population of all neighbour generators plus the local ones."""
        neighbour_individuals = self.node_client.get_all_generators(
            self.neighbours)
        local_population = self.local_generators

        return Population(
            individuals=neighbour_individuals + local_population.individuals,
            default_fitness=local_population.default_fitness,
            population_type=TYPE_GENERATOR)

    @property
    def best_generators(self):
        """Population of the neighbours' best generators plus the local best.

        The local best is the individual with the smallest ``fitness``.
        """
        best_neighbour_individuals = self.node_client.get_best_generators(
            self.neighbours)
        local_population = self.local_generators
        # min() returns the first minimal element, exactly like sorted()[0].
        best_local_individual = min(local_population.individuals,
                                    key=lambda x: x.fitness)

        return Population(
            individuals=best_neighbour_individuals + [best_local_individual],
            default_fitness=local_population.default_fitness,
            population_type=TYPE_GENERATOR)

    @property
    def all_discriminators(self):
        """Population of all neighbour discriminators plus the local ones."""
        neighbour_individuals = self.node_client.get_all_discriminators(
            self.neighbours)
        local_population = self.local_discriminators

        return Population(
            individuals=neighbour_individuals + local_population.individuals,
            default_fitness=local_population.default_fitness,
            population_type=TYPE_DISCRIMINATOR)

    @property
    def all_generator_parameters(self):
        """Encoded parameters of the local generators, then the neighbours'."""
        neighbour_generators = self.node_client.load_generators_from_api(
            self.neighbours)
        local_parameters = [
            i.genome.encoded_parameters
            for i in self.local_generators.individuals
        ]
        return local_parameters + [
            n['parameters'] for n in neighbour_generators
        ]

    @property
    def all_discriminator_parameters(self):
        """Encoded parameters of the local discriminators, then the neighbours'."""
        neighbour_discriminators = \
            self.node_client.load_discriminators_from_api(self.neighbours)
        local_parameters = [
            i.genome.encoded_parameters
            for i in self.local_discriminators.individuals
        ]
        return local_parameters + [
            n['parameters'] for n in neighbour_discriminators
        ]

    @property
    def best_generator_parameters(self):
        """Best generators as loaded from the API of neighbours and local node."""
        return self.node_client.load_best_generators_from_api(
            self.neighbours + [self.local_node])

    @property
    def best_discriminator_parameters(self):
        """Best discriminators as loaded from the API of neighbours and local node."""
        return self.node_client.load_best_discriminators_from_api(
            self.neighbours + [self.local_node])

    def _load_topology_details(self):
        """Locate the local node on the grid.

        :return: tuple ``(number of client nodes, (x, y), local node dict)``.
        :raises Exception: if the number of client nodes is neither one nor a
            square number, or if not exactly one configured node matches the
            local host and port.
        """
        client_nodes = self._all_nodes_on_grid()

        if len(client_nodes) != 1 and not is_square(len(client_nodes)):
            raise Exception(
                'Provide either one client node, or a square number of cells (to create a square grid).'
            )

        local_port = ClientEnvironment.port
        matching_nodes = [
            node for node in client_nodes
            if is_local_host(node['address'])
            and int(node['port']) == local_port
        ]

        if len(matching_nodes) == 1:
            dim = int(round(sqrt(len(client_nodes))))
            idx = client_nodes.index(matching_nodes[0])
            # Row-major layout: x is the column, y is the row.
            x = idx % dim
            y = idx // dim
            return len(client_nodes), (x, y), matching_nodes[0]
        else:
            raise Exception(
                'This host is not specified as client in the configuration file, '
                'or too many clients match the condition.')

    def _load_cell_number(self):
        """Return the row-major index of the local cell (``y * dim + x``)."""
        x, y = self.grid_position
        # Same rounded square root as everywhere else in this class.
        dim = int(round(sqrt(self.grid_size)))
        return y * dim + x

    def _adjacent_cells(self):
        """Return the node dicts of the cells adjacent to the local cell.

        The left, upper, right and lower cells are used, wrapping around at
        the grid borders. Duplicates (possible on small grids) are removed
        and the result is ordered by position in the configured node list.
        A grid with a single cell has no neighbours.
        """
        if self.grid_size == 1:
            return []

        # _all_nodes_on_grid already attaches the 'id' key to every node.
        nodes = self._all_nodes_on_grid()
        dim = int(round(sqrt(len(nodes))))
        x, y = self.grid_position

        # Wrap out-of-range coordinates around; the set removes duplicates.
        wrapped = {((x + dx) % dim, (y + dy) % dim)
                   for dx, dy in ((-1, 0), (0, -1), (1, 0), (0, 1))}

        # Nodes are laid out row-major (index = y * dim + x), so the row is
        # selected by y and the column by x.
        cell_indices = sorted(ny * dim + nx for nx, ny in wrapped)
        return [nodes[i] for i in cell_indices]

    def _all_nodes_on_grid(self):
        """Return the configured client nodes, each tagged with an ``id``.

        The ``id`` (``address:port``) is written into the node dicts of the
        configuration in place.
        """
        nodes = self.cc.settings['general']['distribution']['client_nodes']
        for node in nodes:
            node['id'] = '{}:{}'.format(node['address'], node['port'])
        return nodes

    def _set_source(self, population):
        """Mark every individual of ``population`` as coming from the local node.

        The population is modified in place and returned.
        """
        for individual in population.individuals:
            individual.source = '{}:{}'.format(self.local_node['address'],
                                               self.local_node['port'])
        return population

    def _init_mixture_weights(self):
        """Return equal mixture weights (``1 / number of nodes``) per node id.

        :return: ``OrderedDict`` mapping node id to weight, in the order of
            ``self.all_nodes``.
        """
        node_ids = [node['id'] for node in self.all_nodes]
        default_weight = 1 / len(node_ids)
        # Warning: Order preservation is relied upon because further code
        # converts the mixture weights to a list. The OrderedDict is built
        # from pairs (not from an intermediate plain dict) so that the order
        # is guaranteed by OrderedDict itself.
        return OrderedDict((n_id, default_weight) for n_id in node_ids)
def __init__(self, event, kill_clients_on_disconnect):
    """Initialise the thread state.

    :param event: stored as ``stopped``.
    :param kill_clients_on_disconnect: stored unchanged on the instance.
    """
    Thread.__init__(self)

    # No result is known until the thread has run.
    self.success = None
    self.stopped = event
    self.kill_clients_on_disconnect = kill_clients_on_disconnect

    # No network factory is handed to this client.
    self.node_client = NodeClient(None)