def __init__(self, dimension_sizes=(-1, -1, 10), initial_size=50, steps=1000, allow_cpu=True,
             mutation_rate=0.05, copy_mutation_rate=0, replace_mutation_rate=0, zone_mutation_rate=0,
             crossover_rate=0.4, selection='random', tournament_size=10,
             benchmarking_function=None, benchmarking_steps=0, benchmark_before_selection=False,
             benchmarking_n_keep=None, benchmarking_time_threshold=None,
             include_trivial_solutions=True, show_score_plot=False, plot_axes=(0, 2),
             plot_animation=False, animation_fps=1, archive_log_period=None, **kwargs):
    super().__init__(**kwargs)
    # Stored as a list because optimize() overwrites -1 entries with values inferred
    # from the device graph; the tuple default would not support item assignment.
    self.dimension_sizes = list(dimension_sizes)
    self.initial_size = initial_size
    self.steps = steps
    self.allow_cpu = allow_cpu
    self.mutation_rate = mutation_rate
    self.copy_mutation_rate = copy_mutation_rate
    self.replace_mutation_rate = replace_mutation_rate
    self.zone_mutation_rate = zone_mutation_rate
    self.crossover_rate = crossover_rate
    self.selection = selection
    self.tournament_size = tournament_size
    self.include_trivial_solutions = include_trivial_solutions
    self.benchmarking_steps = benchmarking_steps
    self.benchmark_before_selection = benchmark_before_selection
    self.benchmarking_n_keep = benchmarking_n_keep
    self.benchmarking_time_threshold = benchmarking_time_threshold
    self.benchmarking_function = benchmarking_function
    self.plot_axes = plot_axes
    self.show_score_plot = show_score_plot
    self.plot_animation = plot_animation
    self.animation_fps = animation_fps
    self.archive_log_period = archive_log_period

    if self.archive_log_period is not None:
        os.makedirs(os.path.join(get_log_dir(), 'archive_logs'), exist_ok=True)

    self.worker_pool = None
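# Usage sketch (assumption: this constructor belongs to MapElitesOptimizer, per the
# optimizer registry in optimize_with_config below; the values are illustrative, not
# tuned recommendations):
#
# optimizer = MapElitesOptimizer(
#     dimension_sizes=(-1, -1, 10),  # -1 entries are inferred from the device graph
#     steps=2000,
#     selection='tournament',
#     tournament_size=10,
#     archive_log_period=100,
# )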
def optimize(self, net_string, device_graph):
    n_devices = len(device_graph.devices)
    groups = self.create_colocation_groups(get_flattened_layer_names(net_string))

    with open(os.path.join(get_log_dir(), 'time_history.csv'), 'w') as f:
        f.write('generation, time\n')

    i = 0
    with tqdm(total=self.steps, disable=not self.verbose) as t:
        def eval_function(x):
            nonlocal i
            t.update(1)
            # dual_annealing searches a continuous space; round each coordinate to
            # the nearest device index before evaluating the placement.
            new_placement = [int(round(g)) for g in x]
            score = self.evaluate_placement(
                apply_placement(net_string, new_placement, groups), device_graph)
            i += 1
            return score

        def callback(x, score, context):
            if self.verbose:
                log(f'[{i + 1}/{self.steps}] Found new minimum: {score:.2f}ms')
            with open(os.path.join(get_log_dir(), 'time_history.csv'), 'a') as f:
                f.write(f'{i + 1}, {score}\n')

        result = scipy.optimize.dual_annealing(eval_function,
                                               [(0, n_devices - 1)] * len(groups),
                                               no_local_search=True, maxfun=self.steps,
                                               callback=callback)

    placement = [int(round(g)) for g in result.x]
    if self.verbose:
        log(f'Best found placement: {placement}')

    solution = json.dumps(apply_placement(net_string, placement, groups), indent=4)
    with open(os.path.join(get_log_dir(), 'sa_solution.json'), 'w') as f:
        f.write(solution)

    return solution
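# Usage sketch (assumption: this method belongs to ScipySimulatedAnnealingOptimizer,
# per the 'scipy_sa' registry entry below; `steps` caps the number of evaluations
# passed to dual_annealing as maxfun):
#
# optimizer = ScipySimulatedAnnealingOptimizer(steps=1000)
# solution_json = optimizer.optimize(net_string, device_graph)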
def run_experiment(lg_dir):
    set_log_dir(lg_dir)
    optimize_with_config(config_path)
    convert_to_placement(os.path.join(get_log_dir(), 'checkpoints'),
                         os.path.join(get_log_dir(), 'checkpoints', 'placements'))
    benchmark_all_placements(os.path.join(get_log_dir(), 'checkpoints', 'placements'),
                             os.path.join(get_log_dir(), 'batch_times.csv'),
                             model_type, batches=BATCHES, drop_batches=1, format='long')
def plot_results(sim_path, real_path):
    sim_results = pd.read_csv(sim_path, names=['generation', 'time'])
    sim_results['category'] = 'Simulated'
    real_results = pd.read_csv(real_path, names=['generation', 'time'])
    real_results['category'] = 'Benchmarked'
    all_results = pd.concat([sim_results, real_results], axis=0)

    cmap = sns.cubehelix_palette(2, start=.5, rot=-.75, light=0.5, reverse=True)
    sns.lineplot(x='generation', y='time', hue='category', style='category',
                 data=all_results, palette=cmap)
    plt.legend(['Simulated', 'Benchmarked'])
    plt.xlabel('Generation')
    plt.ylabel('Batch execution time (ms)')
    plt.tight_layout()
    plt.savefig(os.path.join(get_log_dir(), 'sim_real_comp.pdf'))
    plt.show()
    plt.close()
def optimize(self, net_string, device_graph):
    n_devices = len(device_graph.devices)
    groups = self.create_colocation_groups(get_flattened_layer_names(net_string))

    placement = [randint(0, n_devices - 1) for n in range(len(groups))]  # [0] * len(groups)
    score = self.evaluate_placement(apply_placement(net_string, placement, groups),
                                    device_graph)

    if self.score_save_period:
        with open(os.path.join(get_log_dir(), 'time_history.csv'), 'w') as f:
            f.write('step, time\n')

    for i in tqdm(range(self.steps), disable=not self.verbose):
        # Propose a neighbour by reassigning a single random group to a random device.
        new_placement = placement[:]
        new_placement[randint(0, len(new_placement) - 1)] = randint(0, n_devices - 1)
        new_score = self.evaluate_placement(
            apply_placement(net_string, new_placement, groups), device_graph)

        if self.verbose and (i + 1) % self.verbose == 0:
            log(f'[{i + 1}/{self.steps}] Best run time: {score:,.2f}ms')

        if self.score_save_period and i % self.score_save_period == 0:
            with open(os.path.join(get_log_dir(), 'time_history.csv'), 'a') as f:
                f.write(f'{i + 1}, {score}\n')

        # A score of -1 marks an infeasible placement. Improvements are always
        # accepted; worsening moves are accepted with a sigmoid probability that
        # shrinks as the temperature schedule cools.
        if new_score != -1:
            if new_score < score or score == -1 \
                    or random() < expit((score - new_score) / self.temp(i)):
                score = new_score
                placement = new_placement

    solution = json.dumps(apply_placement(net_string, placement, groups), indent=4)
    with open(os.path.join(get_log_dir(), 'sa_solution.json'), 'w') as f:
        f.write(solution)

    return solution
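# Minimal sketch of a temperature schedule compatible with self.temp(i) above.
# optimize_with_config below resolves `temp_schedule` config entries through a
# `temp_schedules` registry; the name and defaults here are hypothetical:
#
# def exponential_decay(t0=50.0, decay=0.995):
#     # Temperature at step i is t0 * decay**i. As it cools, the acceptance
#     # probability expit((score - new_score) / temp(i)) for a worsening move
#     # falls towards zero.
#     return lambda i: t0 * decay ** i
#
# SimulatedAnnealingOptimizer presumably receives this via its temp_schedule argument.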
def clear_processor_log():
    with open(os.path.join(get_log_dir(), 'processor_util.csv'), 'w') as f:
        headers = ['timestamp', 'step']
        for gpu in range(len(GPUtil.getGPUs())):
            headers.append(f'gpu:{gpu}')
        for cpu in range(psutil.cpu_count()):
            headers.append(f'cpu:{cpu}')
        f.write(f'{",".join(headers)}\n')
def update_processor_log(step='null'):
    log_file = os.path.join(get_log_dir(), 'processor_util.csv')
    if not os.path.exists(log_file):
        clear_processor_log()

    with open(log_file, 'a') as f:
        log_line = [datetime.now().isoformat(), step]
        for gpu in GPUtil.getGPUs():
            log_line.append(f'{gpu.load * 100:.2f}')
        for cpu_util in psutil.cpu_percent(percpu=True):
            log_line.append(f'{cpu_util:.2f}')
        f.write(f'{",".join(map(str, log_line))}\n')
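# Usage sketch: sample processor utilisation once per optimization step, tagging each
# row with the step number so it can be joined against time_history.csv (`steps` is
# illustrative):
#
# clear_processor_log()
# for step in range(steps):
#     ...  # perform one optimization step
#     update_processor_log(step=step)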
def optimize_with_config(config_path=None, config=None, verbose=True, set_log_dir=False):
    assert config_path or config, 'Either a config path or a config dictionary must be provided'
    assert config is None or isinstance(config, dict), 'config must be a dictionary'

    if config_path:
        with open(config_path) as f:
            config = json.load(f)

    device_graph_path = config['device_graph_path']
    net_path = config['net_path']

    log_dir = config.get('log_dir', '')
    if log_dir and set_log_dir:
        exprimo.set_log_dir(log_dir)

    if verbose:
        log('\n\n\n')
        log('=' * 100)
        log('EXPRIMO OPTIMIZATION'.rjust(60))
        log('=' * 100)
        log()
        if config_path:
            log(f'Using config path {config_path}')
        else:
            log('Using config provided as dictionary')

    args = config.get('optimizer_args', {})
    batches = args.get('batches', 1)
    pipeline_batches = args.get('pipeline_batches', 1)
    args['batches'] = batches
    args['pipeline_batches'] = pipeline_batches

    if 'benchmarking_function' in args and isinstance(args['benchmarking_function'], dict):
        args['benchmarking_function'] = create_benchmark_function(**args['benchmarking_function'])

    comp_penalty = args.get('simulator_comp_penalty', 1.0)
    comm_penalty = args.get('simulator_comm_penalty', 1.0)

    optimizers = {
        'random_hill_climber': RandomHillClimbingOptimizer,
        'hill_climber': HillClimbingOptimizer,
        'linear_search': LinearSearchOptimizer,
        'simulated_annealing': SimulatedAnnealingOptimizer,
        'sa': SimulatedAnnealingOptimizer,
        'scipy_sa': ScipySimulatedAnnealingOptimizer,
        'scipy_simulated_annealing': ScipySimulatedAnnealingOptimizer,
        'genetic_algorithm': GAOptimizer,
        'ga': GAOptimizer,
        'pso': ParticleSwarmOptimizer,
        'particle_swarm': ParticleSwarmOptimizer,
        'map_elites': MapElitesOptimizer,
        'map-elites': MapElitesOptimizer
    }

    # A temp_schedule given as [name, arg1, arg2, ...] is resolved through the
    # temp_schedules registry; use .get() so configs without a schedule do not
    # raise a KeyError.
    if config['optimizer'] in ['sa', 'simulated_annealing'] \
            and isinstance(args.get('temp_schedule'), list):
        tp = args['temp_schedule']
        args['temp_schedule'] = temp_schedules[tp[0]](*tp[1:])

    optimizer = optimizers[config['optimizer']](**args)

    device_graph = DeviceGraph.load_from_file(device_graph_path)
    with open(net_path) as f:
        net_string = f.read()

    if verbose:
        log(f'Optimizing {net_path} on {device_graph_path} using {optimizer}')
        log(args)
        log()

    best_net = optimizer.optimize(net_string, device_graph)
    net_dict = json.loads(best_net)

    graph = ComputationGraph()
    graph.load_from_string(best_net)
    simulator = Simulator(graph, device_graph)
    simulated_execution_time, events = simulator.simulate(
        batch_size=128,
        print_memory_usage=config.get('print_memory_usage', False),
        print_event_trace=config.get('print_event_trace', False),
        return_event_trace=True, batches=batches, pipeline_batches=pipeline_batches,
        comm_penalization=comm_penalty, comp_penalization=comp_penalty)

    if config.get('plot_event_trace', True):
        save_path = os.path.join(exprimo.get_log_dir(), 'event_trace.pdf')
        plot_event_trace(events, simulator, save_path=save_path)

    if verbose:
        log('\n')
        # print(f'Best discovered configuration: {[layer["device"] for layer in net_dict["layers"].values()]}')
        log(f'Simulated execution time: {simulated_execution_time:.2f}ms')

    if config.get('benchmark_solution', False) and args.get('benchmarking_function', None):
        device_assignment = get_device_assignment(net_dict)
        time = args['benchmarking_function'](device_assignment)
        log(f'Benchmarked execution time: {time:.2f}ms')

    return best_net, simulated_execution_time
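# Example config (hedged): the keys are exactly those read by optimize_with_config
# above; paths and values are illustrative placeholders.
#
# config = {
#     'device_graph_path': 'device_graphs/example.json',
#     'net_path': 'nets/example.json',
#     'log_dir': 'logs/example_run',
#     'optimizer': 'map_elites',
#     'optimizer_args': {
#         'steps': 1000,
#         'batches': 1,
#         'pipeline_batches': 1,
#     },
#     'plot_event_trace': True,
#     'benchmark_solution': False,
# }
# best_net, sim_time = optimize_with_config(config=config, set_log_dir=True)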
def run_n_times(n):
    for i in tqdm(range(n)):
        run_experiment(lg_dir=os.path.join(log_dir, f'{i:03}'))
        plot_results(os.path.join(get_log_dir(), 'checkpoints', 'scores.csv'),
                     os.path.join(get_log_dir(), 'batch_times.csv'))
def plot_scatter(all_results, lg_dir, plot_regression=True):
    # NOTE: this header is reconstructed from the body's free variables
    # (all_results, lg_dir, plot_regression); the original signature was lost.
    x, y = all_results['time_simulated'], all_results['time_benchmarked']
    plt.scatter(x, y)

    if plot_regression:
        x_min = np.min(x)
        x_max = np.max(x)
        x1 = np.arange(x_min, x_max, (x_max - x_min) / 1000)
        m, b = np.polyfit(x, y, 1)
        plt.plot(x1, m * x1 + b, c='orange', ls='--')

    corr = np.corrcoef(x, y)
    print(f'Pearson coefficient: R = {corr[0][1]}')

    plt.xlabel('Simulated batch time (ms)')
    plt.ylabel('Benchmarked batch time (ms)')
    plt.tight_layout()
    plt.savefig(os.path.join(lg_dir, 'scatter_plot.pdf'))
    plt.show()


if __name__ == '__main__':
    if repeats == 1:
        # run_experiment takes the log directory as lg_dir (fixes the earlier
        # log_dir= keyword, which does not exist in the signature).
        run_experiment(lg_dir=log_dir)
        plot_results(os.path.join(get_log_dir(), 'checkpoints', 'scores.csv'),
                     os.path.join(get_log_dir(), 'batch_times.csv'))
    else:
        run_n_times(repeats)
def optimize(self, net_string, device_graph, return_full_archive=False):
    if self.n_threads > 1:
        self.worker_pool = Pool(self.n_threads)

    n_devices = len(device_graph.devices)
    groups = self.create_colocation_groups(get_flattened_layer_names(net_string))

    # Archive dimensions given as -1 are inferred from the problem instance.
    if self.dimension_sizes[0] == -1:
        self.dimension_sizes[0] = n_devices
    if self.dimension_sizes[1] == -1:
        self.dimension_sizes[1] = n_devices
    if self.dimension_sizes[2] == -1:
        comp_graph = ComputationGraph()
        comp_graph.load_from_string(net_string)
        _, max_jumps = comp_graph.get_number_of_jumps(return_max_jumps=True)
        self.dimension_sizes[2] = max_jumps

    archive_scores = np.empty(self.dimension_sizes)
    archive_scores[:] = np.nan
    archive_individuals = np.zeros(list(self.dimension_sizes) + [len(groups)], dtype=int)

    def evaluate(individual):
        return _evaluate(individual, net_string, groups, device_graph, self.dimension_sizes,
                         self.pipeline_batches, self.batches, self.simulator_comp_penalty,
                         self.simulator_comm_penalty, self.device_memory_utilization)

    def mutate(individual):
        new_individual = []

        if random.random() < self.replace_mutation_rate:
            # Replace every occurrence of one in-use device with another in-use device.
            devices_present = list(set(individual))
            i1 = random.choice(devices_present)
            i2 = random.choice(devices_present)
            new_individual = [i2 if i == i1 else i for i in individual]
        elif random.random() < self.zone_mutation_rate:
            # Reassign a contiguous zone of geometrically distributed length to one device.
            split1 = random.randint(0, len(individual) - 1)
            split2 = split1 + min(np.random.geometric(0.2), len(individual) - split1)
            dev = random.randint(0 if self.allow_cpu else 1, n_devices - 1)
            new_individual = individual[:split1] + [dev] * (split2 - split1) + individual[split2:]
        else:
            # Gene-wise mutation: copy the previous gene or draw a random device.
            for i, gene in enumerate(individual):
                if random.random() < self.copy_mutation_rate and i > 0:
                    new_individual.append(individual[i - 1])
                elif random.random() < self.mutation_rate:
                    if self.allow_cpu:
                        new_individual.append(random.randint(0, n_devices - 1))
                    else:
                        new_individual.append(random.randint(1, n_devices - 1))
                else:
                    new_individual.append(gene)
        return new_individual

    def crossover(parent1, parent2):
        crossover_point = random.randint(1, len(parent1) - 1)
        return parent1[:crossover_point] + parent2[crossover_point:]

    def create_candidates(n, create_random=False, create_trivial=False,
                          selectable_candidates=None):
        if n <= 0:
            return []

        candidates = []
        if create_trivial:
            # One all-on-one-device placement per GPU (and for the CPU if allowed).
            candidates.extend([[i] * len(groups) for i in range(1, n_devices)])
            n -= n_devices - 1
            if self.allow_cpu:
                candidates.append([0] * len(groups))
                n -= 1

        if create_random:
            while len(candidates) < n:
                candidates.append(generate_random_placement(len(groups), n_devices,
                                                            allow_device_0=self.allow_cpu))
        else:
            selectable_indices = np.argwhere(np.isfinite(archive_scores))
            # selectable_indices = sorted(selectable_indices, key=lambda x: -archive_scores[x[0], x[1], x[2]])

            while len(candidates) < n:
                c = []
                if selectable_candidates:
                    for _ in range(1 + int(random.random() < self.crossover_rate)):
                        c.append(random.choice(selectable_candidates))
                else:
                    if self.selection == 'random':
                        for _ in range(1 + int(random.random() < self.crossover_rate)):
                            idx = random.choice(selectable_indices)
                            c.append(archive_individuals[idx[0], idx[1], idx[2], :].tolist())
                    elif self.selection == 'tournament':
                        idx = []
                        t = min(self.tournament_size, len(selectable_indices))
                        while len(idx) < 1 + int(random.random() < self.crossover_rate):
                            competitors = random.sample(selectable_indices.tolist(), t)
                            winner = max(competitors,
                                         key=lambda x: archive_scores[x[0], x[1], x[2]])
                            idx.append(winner)
                        for i in idx:
                            c.append(archive_individuals[i[0], i[1], i[2], :].tolist())

                # Two selected parents are recombined; a single parent is used as-is.
                if len(c) == 2:
                    candidate = crossover(*c)
                else:
                    candidate = c[0]

                candidate = mutate(candidate)
                candidates.append(candidate)

        return candidates

    def create_description(individual):
        # Map an individual to its archive niche: (most common device, number of
        # used devices, number of inter-device jumps), scaled to the grid sizes.
        c = Counter(individual)
        device_mode = c.most_common(1)[0][0]
        device_mode = round((device_mode / len(device_graph.devices)) * self.dimension_sizes[0])

        used_devices = round(((len(set(individual)) - 1) / (len(device_graph.devices)))
                             * self.dimension_sizes[1])

        comp_graph_dict = apply_placement(net_string, individual, groups)
        comp_graph = ComputationGraph()
        comp_graph.load_from_string(json.dumps(comp_graph_dict))
        num_jumps, max_jumps = comp_graph.get_number_of_jumps(return_max_jumps=True)
        num_jumps = round((num_jumps / max_jumps) * (self.dimension_sizes[2] - 1))

        return device_mode, used_devices, num_jumps

    def benchmark(individual, benchmarking_function):
        device_assignment = get_device_assignment(apply_placement(net_string, individual,
                                                                  groups))
        time, memory_overflow = benchmarking_function(device_assignment,
                                                      return_memory_overflow=True)
        description = create_description(individual)

        # Time is set to -1 if memory overflows - but we check with memory_overflow instead
        time = max(time, 0)
        if memory_overflow == -1:
            memory_overflow = 1
        if memory_overflow > 0:
            # Penalize overflowing placements heavily instead of discarding them.
            time += memory_overflow * 10 ** 9

        return 1 / time, description, individual

    def reevaluate_archive(benchmarking_function=None, n_keep=None, time_threshold=None):
        indices = list(np.argwhere(np.isfinite(archive_scores)))

        if time_threshold:
            indices = [i for i in indices
                       if archive_scores[i[0], i[1], i[2]] >= 1 / time_threshold]

        if n_keep:
            indices = sorted(indices, key=lambda i: -archive_scores[i[0], i[1], i[2]])
            indices = indices[:n_keep]

        assert len(indices), 'No solutions fulfill the specified requirements'

        archive_scores[:] = np.nan

        if self.verbose:
            if n_keep:
                log(f'Reevaluating {n_keep} best individuals in archive '
                    f'(and throwing away the rest)')
            else:
                log('Reevaluating all individuals in archive')
            if time_threshold:
                log(f'Time threshold: {time_threshold}ms')

        for i in tqdm(indices, disable=not self.verbose):
            individual = archive_individuals[i[0], i[1], i[2], :].tolist()
            if benchmarking_function:
                archive_scores[i[0], i[1], i[2]] = benchmark(individual,
                                                             benchmarking_function)[0]
            else:
                archive_scores[i[0], i[1], i[2]] = evaluate(individual)[0]

    def log_archive(file_name):
        indices = list(np.argwhere(np.isfinite(archive_scores)))
        indices = sorted(indices, key=lambda i: -archive_scores[i[0], i[1], i[2]])

        with open(os.path.join(get_log_dir(), 'archive_logs', file_name), 'w') as f:
            f.write('niche; time; placement\n')
            for i in indices:
                niche = tuple(i)
                time = 1 / archive_scores[i[0], i[1], i[2]]
                placement = archive_individuals[i[0], i[1], i[2]].tolist()
                f.write(f'{niche}; {time}; {placement}\n')

    def run_optimization(steps, benchmarking_function=None, start_step=0):
        nonlocal archive_individuals, archive_scores

        if self.verbose:
            if benchmarking_function:
                log('Optimizing with benchmarking...')
            else:
                log('Optimizing with simulator...')

        # Benchmarking is sequential; simulation evaluates n_threads candidates per step.
        step_size = 1 if benchmarking_function else self.n_threads
        for i in tqdm(range(0, steps, step_size), disable=not self.verbose):
            init_number = min(max(0, self.initial_size - i), self.n_threads)

            if self.include_trivial_solutions and i == 0:
                candidates = create_candidates(init_number, create_trivial=True,
                                               create_random=True)
            else:
                candidates = create_candidates(init_number, create_random=True)

            if init_number > 0:
                candidates += create_candidates(self.n_threads - init_number,
                                                selectable_candidates=candidates[:])
            else:
                candidates += create_candidates(self.n_threads - init_number)

            if benchmarking_function:
                eval_results = [benchmark(candidates[0], benchmarking_function)]
            elif self.n_threads == 1:
                eval_results = [evaluate(candidates[0])]
            else:
                fn_args = zip(((create_description(c), c) for c in candidates),
                              repeat(net_string), repeat(groups), repeat(device_graph),
                              repeat(self.pipeline_batches), repeat(self.batches),
                              repeat(self.simulator_comp_penalty),
                              repeat(self.simulator_comm_penalty),
                              repeat(self.device_memory_utilization))
                eval_results = self.worker_pool.starmap(_evaluate, fn_args)

            for result in eval_results:
                score, description, individual = result

                # Replace the niche elite if the new individual scores higher
                # (scores are reciprocal batch times, so higher is better).
                previous_elite_score = archive_scores[description[0], description[1],
                                                      description[2]]
                if np.isnan(previous_elite_score) or previous_elite_score < score:
                    archive_scores[description[0], description[1], description[2]] = score
                    archive_individuals[description[0], description[1],
                                        description[2], :] = individual

            if self.verbose and (i + 1) % self.verbose < step_size:
                best_time = 1 / np.nanmax(archive_scores)
                log(f'[{i + 1}/{steps}] Best time: {best_time:.4f}ms')

            if self.score_save_period and (i % self.score_save_period == 0
                                           or steps - i < step_size):
                best_time = 1 / np.nanmax(archive_scores)
                with open(os.path.join(get_log_dir(), 'time_history.csv'), 'a') as f:
                    f.write(f'{i + start_step + 1}, {best_time}\n')

            if self.archive_log_period and (i + 1) % self.archive_log_period < step_size:
                log_archive(f'step_{i + start_step + 1:06}.csv')

    if self.score_save_period:
        with open(os.path.join(get_log_dir(), 'time_history.csv'), 'w') as f:
            f.write('step, time\n')

    run_optimization(self.steps)

    if self.worker_pool:
        self.worker_pool.close()

    if self.archive_log_period is not None:
        log_archive('1_simulation_finished.csv')

    if self.benchmarking_steps > 0 or self.benchmark_before_selection:
        reevaluate_archive(self.benchmarking_function, n_keep=self.benchmarking_n_keep,
                           time_threshold=self.benchmarking_time_threshold)
        if self.archive_log_period is not None:
            log_archive('2_reevaluated.csv')

    if self.benchmarking_steps > 0:
        run_optimization(self.benchmarking_steps, self.benchmarking_function, self.steps)
        if self.archive_log_period is not None:
            # Guarded like the other archive logs: the archive_logs directory only
            # exists when archive_log_period is set (see __init__).
            log_archive('3_benchmarking_finished.csv')

    if self.show_score_plot:
        if self.verbose:
            log('Plotting archive scores...', end='')
        graph = ComputationGraph()
        graph.load_from_string(net_string)
        _, max_jumps = graph.get_number_of_jumps(return_max_jumps=True)
        plot_map_elites_archive(archive_scores, n_devices, max_jumps, self.plot_axes,
                                save_path=os.path.join(get_log_dir(), 'archive_plot.pdf'))
        if self.verbose:
            log('Done')

    if self.plot_animation:
        if not self.archive_log_period and self.verbose:
            log('self.plot_animation was set to True, but archive logging was not enabled. '
                'Skipping animation plot.')
        else:
            if self.verbose:
                log('Plotting archive animation...', end='')
            paths = glob(os.path.join(get_log_dir(), 'archive_logs', 'step_*.csv'))
            # max_jumps is otherwise only computed in the score-plot branch above,
            # so recompute it here to avoid a NameError when show_score_plot is False.
            graph = ComputationGraph()
            graph.load_from_string(net_string)
            _, max_jumps = graph.get_number_of_jumps(return_max_jumps=True)
            plot_archive_animation(paths,
                                   (os.path.join(get_log_dir(), 'archive_animation.mp4'),
                                    os.path.join(get_log_dir(), 'archive_animation.gif')),
                                   self.dimension_sizes, n_devices=n_devices,
                                   max_jumps=max_jumps, axes=self.plot_axes,
                                   fps=self.animation_fps)
            if self.verbose:
                log('Done')

    if return_full_archive:
        return archive_scores, archive_individuals

    best_index = np.nanargmax(archive_scores)
    best_individual = archive_individuals.reshape((-1, len(groups)))[best_index]

    if self.verbose:
        log(f'Best individual: {best_individual.tolist()}')

    solution = json.dumps(apply_placement(net_string, best_individual.tolist(), groups),
                          indent=4)
    with open(os.path.join(get_log_dir(), 'me_solution.json'), 'w') as f:
        f.write(solution)

    return solution
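# Usage sketch: request the full archive instead of the single best placement
# (`optimizer` stands for a constructed MapElitesOptimizer, as above):
#
# scores, individuals = optimizer.optimize(net_string, device_graph,
#                                          return_full_archive=True)
# best = individuals.reshape((-1, individuals.shape[-1]))[np.nanargmax(scores)]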