import os
from itertools import product

import matplotlib.pyplot as plt
import pandas as pd
from sortedcontainers import SortedSet


class EvaluationHelper:
    """Analyse performance of runs in experiment_dir."""

    def __init__(self, runs, user=None, zs=None):
        """Read given runs."""
        self.runs = SortedSet(runs)
        self.perfs, self.confs = self.read_runs(runs)
        if zs is None:
            self.zs = ['z', 'rollout', 'z_sup', 'z_dyn']
        else:
            self.zs = zs
        self.add_size()
        self.color_cycle = plt.rcParams["axes.prop_cycle"].by_key()["color"]
        # rollout errors are read lazily as a property, see below
        self._rollouts = None
        frames = ['pred', 'recon', 'total']
        self.supairvised_groups = (
            list(product(['vin_true'], frames, [True, False]))
            + list(product(['vin_sup', 'sup_true'], frames, [False])))
        # for printing of gif_wall location
        if user is None:
            self.user = '******'
        else:
            self.user = user

    def read_runs(self, runs):
        """Read performance metrics and configs for runs."""
        perfs = {}
        confs = {}
        for run in runs:
            try:
                perfs[run] = pd.read_csv(os.path.join(
                    run, 'performance.csv')).set_index('step')
                perfs[run]['type'] = perfs[run]['type'].str.strip()
                confs[run] = pd.read_csv(os.path.join(
                    run, 'config.txt')).set_index('setting name')
            except Exception as e:
                print(e)
                print("Could not read run {}.".format(run))
                print("Deleting from self.runs")
                self.runs = self.runs - {run}
        return perfs, confs

    def update_runs(self, extras):
        """Read extra runs from a different experiment_dir.

        Use to compare against previous experiments.
        """
        extra_perfs, extra_confs = self.read_runs(extras)
        self.perfs.update(extra_perfs)
        self.confs.update(extra_confs)
        self.runs = self.runs.union(extras)
        self.add_size(extras)

    def add_size(self, runs=None):
        """Add size and duration of performance.csv to config.

        This adds the 'progress' of a run.
        """
        if runs is None:
            runs = list(self.runs)
        for run in runs:
            perf = self.perfs[run]
            run_size = perf.index.max()
            self.confs[run] = self.confs[run].append(
                pd.Series(data={' setting value': run_size}, name='max_step'))
            # time in hours
            duration = perf.time.max() / 60 / 60
            self.confs[run] = self.confs[run].append(
                pd.Series(data={' setting value': duration}, name='duration'))

    def show_differences(self, runs=None):
        """Show differences of configs between runs.

        Immediately see which config values differ from the default for this
        experiment dir.
        """
        if runs is None:
            runs = self.runs
        configs = pd.DataFrame()
        # stack all configs and keep only the settings that differ between runs
        for run in runs:
            conf = self.confs[run]
            tmp = conf.T
            tmp['run'] = run
            tmp = tmp.set_index('run')
            configs = configs.append(tmp, sort=False)
        differences = configs.T.apply(
            lambda x: not all(i == x[0] for i in x), axis=1)
        tmp = configs.T[differences].T
        pd.set_option('display.max_columns', 500)
        tmp = tmp.sort_values('run')
        return tmp

    def gif_wall(self, runs=None, target='rollout_00.gif', file='gif_wall.html'):
        """Create an html file which displays the target gif for all runs."""
        if runs is None:
            runs = list(self.runs)
        gif_path = os.path.join('gifs', target)
        gif_html = """
        <div class="img-frame-cap">
          <img src="{}" width="180" height="195">
          <div class="caption"> {} </div>
        </div>
        """
        style = """
        <style>
        body {background-color: #000000;}
        .img-frame-cap {
            width:200px; height:200px; background:#333;
            padding:18px 18px 2px 18px; border:1px solid #666;
            display:inline-block;}
        .caption {
            text-align:center; margin-top:4px;
            font-size:12px; color: #FFFFFF;}
        </style>
        """
        skeleton = [
            '<HTML>',
            '<HEAD> <TITLE>Gif WALL</TITLE> </HEAD>',
            style,
            '<BODY>',
            '</BODY>',
            '</HTML>']
        skeleton = [i + '\n' for i in skeleton]
        with open(os.path.join('test', file), 'w') as f:
            f.write(skeleton[0])
            f.write(skeleton[1])
            f.write(skeleton[2])
            f.write(skeleton[3])
            for run in runs:
                location = os.path.join('../', run, gif_path)
                f.write(gif_html.format(location, location))
            f.write(skeleton[-2])
            f.write(skeleton[-1])
        # print url for easy access
        print(
            'file:///Users/{}/Documents/remote/thesis/code/'.format(self.user)
            + '/'.join(os.getcwd().split('/')[-3:])
            + '/test/' + file)

    def perf_plot(self, perf, z, ax, col='error', label=None, rol=1, color=None):
        """For a given run and z, plot a performance column.

        Note that perf = self.perfs[run].
        """
        tmp = perf.loc[perf.type == z, col]
        if 'roll' in z:
            tmp = tmp.groupby('step').mean()
        if label is None:
            label = z
        tmp.rolling(rol).mean().plot(label=label, alpha=0.5, ax=ax, c=color)
        return tmp

    def plot_error(self, run, perf, col, rol=1):
        """For a given run and col, plot errors for all zs."""
        fig, ax = plt.subplots(figsize=(12, 10))
        plt.title(run + " " + self.confs[run].T.description.values[0])
        for z in self.zs:
            try:
                self.perf_plot(perf, z, plt.gca(), col, rol=rol)
            except Exception as e:
                print(run, e)
        plt.legend()
        return fig, ax

    def compare_errors(self, runs=None, rol=1, col='error', colors=None):
        """For a given col, plot errors for all runs and zs."""
        if runs is None:
            runs = self.runs
        fig, axs = plt.subplots(4, 1, figsize=(15, 10), sharex=True)
        for run in runs:
            if colors is not None:
                c = colors[1][run]
            else:
                c = None
            perf = self.perfs[run]
            for z, ax in zip(self.zs, axs):
                try:
                    self.perf_plot(
                        perf, z, ax, col=col, label=run, rol=rol, color=c)
                except Exception as e:
                    print(run, e)
        axs[1].legend(bbox_to_anchor=(1.05, 1.05))
        if colors is not None:
            # annotate which highlighter value corresponds to which colour
            for i, (value, color) in enumerate(colors[0].items()):
                axs[-1].text(
                    0.9, 0.9 - 0.1 * i, str(value),
                    horizontalalignment='right', verticalalignment='top',
                    transform=axs[-1].transAxes, color=color)
        return fig, axs

    def speed_comparison(self, runs=None):
        """Compare speeds of runs."""
        if runs is None:
            runs = self.runs
        fig, ax = plt.subplots(1, 1, figsize=(10, 10))
        for run in runs:
            perf = self.perfs[run]
            if not perf.size > 0:
                continue
            try:
                perf.loc[perf.type == 'z'].time.plot(label=run, alpha=.5, ax=ax)
            except Exception as e:
                print(e)
                print('Not possible for run {}'.format(run))
                perf.time.plot(label=run, alpha=0.5, ax=ax)
        plt.legend(bbox_to_anchor=(1.05, 1.05))
        plt.show()
        return fig, ax

    def compare_runs(self, col='elbo', description=None, legend=False,
                     runs=None, colors=None, **kwargs):
        """For a given col, compare values over all runs in one plot.

        Unlike the previous methods, this is only used for metrics which do not
        differ between zs, such as ['elbo', 'bg', 'patch', ...].
        """
        fig, ax = plt.subplots(1, figsize=(12, 6))
        plt.title(col)
        if runs is None:
            runs = self.runs
        for run in runs:
            perf = self.perfs[run]
            # only plot if description matches
            if description is not None:
                if not all(self.confs[run].T.description == description):
                    continue
            # ignore empty runs
            try:
                # if multiple types are present, select any but rollout
                if 'type' in perf.columns:
                    z = list(set(perf.type.unique()) - {'rollout'})
                    z = [i for i in z if 'roll' not in i]
                    z = z[0]
                    df_plot = perf[perf.type == z]
                else:
                    df_plot = perf
                if colors is not None:
                    c = colors[1][run]
                else:
                    c = None
                df_plot.plot(y=col, ax=ax, label=run, alpha=0.9,
                             legend=legend, color=c, **kwargs)
                if legend is True:
                    plt.legend(bbox_to_anchor=(1.05, 1.05))
            except Exception as e:
                print(e)
                print(run)
        if colors is not None:
            for i, (value, color) in enumerate(colors[0].items()):
                ax.text(
                    0.9, 0.9 - 0.1 * i, str(value),
                    horizontalalignment='right', verticalalignment='top',
                    transform=ax.transAxes, color=color)
        return fig, ax

    def get_highlighting(self, highlighter, condition=None, runs=None):
        """Color runs depending on config.

        All methods with a colors argument take the color map returned by this
        function to color runs depending on their value of one config setting.

        Example: For highlighter='num_epochs', all runs are colored by their
        number of epochs: same color for the same num_epochs, and each distinct
        num_epochs value is assigned a different color.

        Returns:
            colors (list): [color_map, mapping], where color_map (dict) maps
                unique highlighter values to colors, and mapping (dict) maps
                runs to colors, depending on the value the highlighter takes in
                their conf. Pass colors as argument to other functions.
        """
        if runs is None:
            runs = self.runs
        # first, apply condition to all runs
        possible_values = set()
        realised_values = dict()
        for run in runs:
            try:
                conf = self.confs[run].T[highlighter].values[0]
            except Exception as e:
                print(e)
                print("Did not find conf '{}' for '{}', set to None.".format(
                    highlighter, run))
                conf = None
            if condition is not None:
                # check more than just the content of the column
                conf = condition(conf)
            possible_values.update({conf})
            realised_values.update({run: conf})
        color_map = dict(zip(list(possible_values), self.color_cycle))
        mapping = {r: color_map[v] for r, v in realised_values.items()}
        colors = [color_map, mapping]
        return colors
    def aggregate_comparisons(self, description=None, with_z=False):
        """Plot compare_runs for common metrics."""
        compare_runs = self.compare_runs
        try:
            compare_runs('elbo', legend=True, description=description)
            compare_runs('bg', description=description)
            compare_runs('patch', description=description)
            compare_runs('overlap', description=description)
            compare_runs('log_q', description=description)
            if with_z:
                compare_runs('scale_x', description=description)
                compare_runs('scale_y', description=description)
                compare_runs('error', description=description)
                compare_runs('std_error', description=description)
        except Exception as e:
            print(e)

    def plot_single_rollout(self, run, col='x_errors'):
        """Plot rollout error evolution for a single run."""
        if col is None:
            # fall back to x_errors (see rollouts property)
            col = 'x_errors'
        error = self.rollouts[col][run]
        fig, ax = plt.subplots(1, 1)
        error.T.plot(label='', legend=None, color='gray', alpha=0.1, ax=ax)
        error.mean(axis=0).plot(ax=ax)
        # avg error over the first (red) and last (green) five epochs
        error.iloc[-5:-1].mean(axis=0).plot(c='g')
        error.iloc[0:5].mean(axis=0).plot(c='r')
        # start of inference
        # inference = int(self.confs[run].T.num_visible.values[0])\
        #     - int(self.confs[run].T.skip.values[0])
        # plt.plot([inference, inference], [0, 0.3])
        ax.set_title(run)
        return fig, ax

    def plot_all_rollouts(self, runs=None, col=None):
        """Plot rollout error evolutions for all runs."""
        if runs is None:
            runs = self.runs
        figs, axs = [], []
        for run in list(runs):
            try:
                f, a = self.plot_single_rollout(run, col)
                figs.append(f)
                axs.append(a)
            except Exception as e:
                print(e)
                print('Not available for run {}'.format(run))
        return figs, axs

    def plot_compare_rollouts(self, runs=None, which='x_errors'):
        """Plot final rollout errors for all runs."""
        if runs is None:
            runs = self.runs
        fig, ax = plt.subplots(1, 1, figsize=(15, 10))
        errors = self.rollouts[which]
        for run in list(runs):
            try:
                error = errors[run]
                error.iloc[-1].plot(label=run)
            except Exception as e:
                print(e)
                print('Not available for run {}'.format(run))
        ax.legend(bbox_to_anchor=(1.05, 1.05))
        ax.set_title('avg over last 5 epochs')
        return fig, ax

    @property
    def rollouts(self):
        """Load rollout errors as a lazily cached property."""
        # return cached version if already loaded
        if self._rollouts is not None:
            return self._rollouts
        # otherwise, load rollout errors from csv
        v_errors = {}
        x_errors = {}
        v_errors_std = {}
        x_errors_std = {}
        x_errors_sup = {}
        x_errors_std_sup = {}
        for run in list(self.runs):
            try:
                x_errors[run] = pd.read_csv(
                    os.path.join(run, 'error.csv'), header=None)
                x_errors_std[run] = pd.read_csv(
                    os.path.join(run, 'std_error.csv'), header=None)
            except Exception as e:
                x_errors[run] = None
                x_errors_std[run] = None
                print(e)
                print('x_error not available for run {}'.format(run))
            try:
                v_errors[run] = pd.read_csv(
                    os.path.join(run, 'v_error.csv'), header=None)
                v_errors_std[run] = pd.read_csv(
                    os.path.join(run, 'std_v_error.csv'), header=None)
            except Exception as e:
                v_errors[run] = None
                v_errors_std[run] = None
                print(e)
                print('v_error not available for run {}'.format(run))
            # Load supervised errors only if the model was supairvised.
            # On read errors or for non-supairvised runs, store None.
            try:
                if self.confs[run].T.supairvised.values[0] != 'True':
                    raise ValueError('run is not supairvised')
                x_errors_sup[run] = pd.read_csv(
                    os.path.join(run, 'error_sup.csv'), header=None)
                x_errors_std_sup[run] = pd.read_csv(
                    os.path.join(run, 'std_error_sup.csv'), header=None)
            except Exception:
                x_errors_sup[run] = None
                x_errors_std_sup[run] = None
        self._rollouts = dict()
        self._rollouts['v_errors'] = v_errors
        self._rollouts['x_errors'] = x_errors
        self._rollouts['v_errors_std'] = v_errors_std
        self._rollouts['x_errors_std'] = x_errors_std
        self._rollouts['x_errors_sup'] = x_errors_sup
        self._rollouts['x_errors_std_sup'] = x_errors_std_sup
        return self._rollouts
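    # Minimal access sketch for the rollouts cache built above ('run000' is a
    # hypothetical run name; values are DataFrames, or None if the csv was
    # missing):
    #
    #   x_err = helper.rollouts['x_errors']['run000']
    #   if x_err is not None:
    #       x_err.iloc[-1].plot()   # final rollout error over frames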
    @staticmethod
    def run_fmt(x, with_under=False):
        """Format an integer run index as a run-string."""
        return 'run{:03d}'.format(x) if not with_under else 'run_{:03d}'.format(x)

    def descr2folder(self, run):
        """Find a run folder given its config description.

        If there are multiple matches, return the first only.
        """
        for folder_run, conf in self.confs.items():
            if conf.T.description.values[0][-3:] == run[-3:]:
                return folder_run
        print('Run {} cannot be found'.format(run))
        return -1

    def folder2descr(self, run):
        """Given a run folder, return its config description."""
        return self.confs[run].T.description.values[0]

    def plot_supairvised_errors(self, runs=None):
        """Plot errors for supairvised models.

        Some notes on the displayed columns:
        - vin_true pred False/True is the most interesting error. The prediction
          error is not always measured against true labels; for use_supair=True
          it is measured as model against supair values.
        - vin_sup pred False/True is also interesting: how good is the vin
          prediction w.r.t. the supair values. (There should be no big
          difference between False/True (train/test) here. If there is, we are
          overfitting on the training set.)
        - sup_true recon/pred/total False/True should all be roughly the same,
          because supair is pretrained and should perform the same no matter
          which images it is applied to.
        - vin_sup recon is always 0, because vin does not encode supair states.
        - vin_true recon is equal to sup_true recon for the same reason.
        """
        if runs is None:
            runs = self.runs
        groups_dict = dict(zip(self.supairvised_groups, range(12)))
        fig, axs = plt.subplots(12, 1, figsize=(15, 25), sharex=True)
        for name, i in groups_dict.items():
            axs[i].set_title(name)
        for run in runs:
            perf = self.perfs[run]
            # automatically skip all non-supairvised runs
            try:
                supairvised = str(
                    self.confs[run].T.supairvised.values[0]).strip() == 'True'
                if not supairvised:
                    continue
            except Exception:
                continue
            groups = perf.groupby(['type', 'frame', 'test'])
            for name, group in groups:
                i = groups_dict[name]
                group.error.plot(label=run, ax=axs[i])
        axs[0].legend(bbox_to_anchor=(1.05, 1.05))
        plt.tight_layout()
        return fig, axs

    def plot_relevant_supairvised_errors(self, runs=None):
        """Plot only the relevant errors for supairvised models.

        A lot of the information from plot_supairvised_errors is redundant.
        Only plot the most relevant columns to avoid confusion.
        """
        if runs is None:
            runs = self.runs
        relevant_cols = [
            ('vin_true', 'pred', True),
            ('vin_true', 'pred', False),
            ('vin_sup', 'pred', False),
        ]
        groups_dict = dict(zip(relevant_cols, range(len(relevant_cols))))
        fig, axs = plt.subplots(
            len(relevant_cols), 1, figsize=(15, 15), sharex=True)
        for name, i in groups_dict.items():
            axs[i].set_title(name)
        for run in runs:
            perf = self.perfs[run]
            # automatically skip all non-supairvised runs
            try:
                supairvised = str(
                    self.confs[run].T.supairvised.values[0]).strip() == 'True'
                if not supairvised:
                    continue
            except Exception:
                continue
            groups = perf.groupby(['type', 'frame', 'test'])
            for name, group in groups:
                try:
                    i = groups_dict[name]
                    group.error.plot(label=run, ax=axs[i])
                except Exception:
                    continue
        axs[0].legend(bbox_to_anchor=(1.05, 1.05))
        plt.tight_layout()
        return fig, axs
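    # For orientation: the panels in the two plots above are indexed by the
    # (type, frame, test) triples from self.supairvised_groups, e.g.
    # ('vin_true', 'pred', True) is the test-set prediction error against true
    # labels, mirroring perf.groupby(['type', 'frame', 'test']).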
    def compare_rollouts_with_supairvised(self, runs=None):
        """Add supairvised models to the rollout error plots."""
        if runs is None:
            runs = self.runs
        fig, ax = plt.subplots(figsize=(16, 10))
        for run in runs:
            perf = self.perfs[run]
            # figure out whether the run was supairvised or not
            supairvised = str(
                self.confs[run].T.supairvised.values[0]).strip() == 'True'
            if supairvised:
                cond = (
                    (perf.type == 'vin_true')
                    & (perf.frame == 'pred')
                    & (perf.test == True))
                perf = perf.loc[cond]
            else:
                perf = perf.loc[perf.type == 'rollout']
            perf.groupby('step').error.mean().plot(label=run, ax=ax)
        ax.legend(bbox_to_anchor=(1.05, 1.05))
        plt.tight_layout()
        plt.show()
        return fig, ax
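# Illustrative usage sketch (run folder names and the 'num_epochs' config key
# are hypothetical; the directory layout follows read_runs above):
#
#   helper = EvaluationHelper(['run000', 'run001', 'run002'])
#   helper.show_differences()                     # configs that differ per run
#   colors = helper.get_highlighting('num_epochs')
#   helper.compare_errors(col='error', rol=10, colors=colors)
#   helper.gif_wall(target='rollout_00.gif')      # writes test/gif_wall.html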
import unittest


class SortedSetUnionTest(unittest.TestCase):
    """Sanity check for the SortedSet.union behaviour used in update_runs."""

    def test_union(self):
        s = SortedSet({1, 2, 3})
        t = [2, 3, 4]
        self.assertEqual(s.union(t), SortedSet({1, 2, 3, 4}))
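# A minimal runnable sketch, assuming this file sits inside an experiment
# directory whose run folders match 'run*' and contain performance.csv and
# config.txt (the layout is an assumption, not enforced by the class).
if __name__ == '__main__':
    import glob

    run_dirs = sorted(glob.glob('run*'))
    if run_dirs:
        helper = EvaluationHelper(run_dirs)
        print(helper.show_differences())
        helper.compare_errors(col='error', rol=10)
        plt.show()
    else:
        print('No run directories found next to this script.')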