def infer(self, model_path, save_path):
    # save_path is accepted for interface compatibility but unused here
    loss = []
    self.create_dataloader()
    # Load the checkpoint; map to CPU when no GPU is available
    if not torch.cuda.is_available():
        state_dict = torch.load(model_path, map_location=torch.device('cpu'))
    else:
        state_dict = torch.load(model_path)
    self.model.load_state_dict(state_dict['model_state_dict'])
    self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
    self.model.eval()
    with torch.no_grad():
        for i, batch in enumerate(self.dataloader):
            if i % 10 == 0 and i > 0:
                print('{}: Inferred on {} events'.format(
                    get_time(), i * self.config['val_batch_size']))
            x = batch[0].to(self.device).float()
            y = batch[1].to(self.device).float()
            events = batch[2].to(self.device)
            y_hat = self.model(x)
            loss.append(self.loss(y_hat, y))
            self.on_test_step(y_hat, events)
    self.on_test_end()
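# For reference, a checkpoint in the format infer() expects could be written
# like this. This is a minimal sketch; the project's actual save logic lives
# in the Saver class (save_model_state), which is not shown here.
import torch

def save_checkpoint(model, optimizer, path):
    # Store model and optimizer states under the keys infer() loads
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, path)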
def early_stopping(self, epoch, epoch_val_loss, model_state_dict,
                   optimizer_state_dict):
    epoch_val_loss = round(epoch_val_loss.item(), 3)
    # Save whenever the validation loss beats the best value seen so far
    if epoch == 0 or epoch_val_loss < min(self.epoch_val_loss):
        self.save_model_state(epoch, model_state_dict, optimizer_state_dict)
        self.early_stopping_counter = 0
        print('{}: best model yet, saving'.format(get_time()))
    else:
        self.early_stopping_counter += 1
        print("{}: model didn't improve for {} epoch(s)".format(
            get_time(), self.early_stopping_counter))
    self.epoch_val_loss.append(epoch_val_loss)
    return self.early_stopping_counter >= self.config['patience']
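# A minimal standalone sketch of the same patience rule, useful for sanity
# checking: stop once the validation loss has failed to improve on the best
# value seen so far for `patience` consecutive epochs. The loss sequences
# below are made up for illustration.
def should_stop(losses, patience):
    best = float('inf')
    counter = 0
    for loss in losses:
        if loss < best:
            best = loss
            counter = 0
        else:
            counter += 1
        if counter >= patience:
            return True
    return False

assert should_stop([1.0, 0.9, 0.95, 0.96, 0.97], patience=3) is True
assert should_stop([1.0, 0.9, 0.8, 0.7, 0.6], patience=3) is False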
def fit(self):
    self.create_dataloaders()
    for self.epoch in range(self.config['num_epochs']):
        self.reporter.on_epoch_start()
        self.train_epoch()
        epoch_val_loss = self.epoch_validation()
        make_early_stop = self.saver.early_stopping(
            self.epoch,
            epoch_val_loss,
            self.model.state_dict(),
            self.optimizer.state_dict())
        self.reporter.on_epoch_end()
        if make_early_stop:
            print('{}: early stopping activated'.format(get_time()))
            break
def on_intermediate_training_end(self):
    self.avg_train_loss = torch.stack(self.train_loss).mean()
    self.training_end_timestamp = datetime.now()
    self.train_time_delta = (
        self.training_end_timestamp
        - self.training_start_timestamp).total_seconds()
    log_text = ('''
    {}: Iteration {} / epoch {}
    Train loss: {:.3f} / {} batches / {:.1f} events/s
    '''.format(
        get_time(),
        self.iteration,
        self.current_epoch,
        self.avg_train_loss,
        len(self.train_loss),
        len(self.train_loss) * self.config['batch_size']
        / self.train_time_delta))
    print(log_text)
    return self.avg_train_loss
def on_epoch_validation_end(self):
    self.val_end_timestamp = datetime.now()
    self.val_time_delta = (
        self.val_end_timestamp - self.val_start_timestamp).total_seconds()
    self.iteration += 1
    avg_val_loss = torch.stack(self.val_loss).mean()
    log_text = ('''
    {}: Epoch validation / epoch {}
    Val loss: {:.3f} / {} batches / {:.1f} events/s
    '''.format(
        get_time(),
        self.current_epoch,
        avg_val_loss,
        len(self.val_loss),
        len(self.val_loss) * self.config['val_batch_size']
        / self.val_time_delta))
    print(log_text)
    # Reset per-epoch accumulators and flags for the next epoch
    self.train_loss = []
    self.training_step = 0
    self.val_loss = []
    self.val_step = 0
    self.first_train = True
    self.first_val = True
    return avg_val_loss
import json

import wandb

from src.modules.inferer import Inferer

# get_project_root, get_time and Bunch are project utilities imported elsewhere
RUN_NAME = 'lemon-akita'
ROOT = get_project_root()
DOWNLOAD_FOLDER = ROOT.joinpath('mains/downloads').joinpath(RUN_NAME)
DOWNLOAD_FOLDER.mkdir(parents=True, exist_ok=True)

api = wandb.Api()
runs = api.runs('ehrhorn/cubeflow')
# Look up the run id for the named run
for run in runs:
    if run.name == RUN_NAME:
        run_id = run.id
run = api.run('ehrhorn/cubeflow/' + run_id)

print('{}: Downloading run data'.format(get_time()))
# Fetch the saved model, source files and config for the run
for file in run.files():
    if (file.name == 'model.pt'
            or file.name.split('.')[-1] == 'py'
            or file.name == 'cnn.json'):
        if file.name != 'code/mains/cnn.py':
            file.download(replace=True, root=str(DOWNLOAD_FOLDER))
        if file.name != 'model.pt':
            model_file_name = file.name.split('/')[-1].split('.')[0]

JSON_FILE = DOWNLOAD_FOLDER.joinpath('cnn.json')
with open(str(JSON_FILE), 'r') as config_file:
    config_dict = json.load(config_file)
config = Bunch(config_dict)
config.save_train_dists = False
config.wandb = False
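# `Bunch` is assumed here to be a thin wrapper giving attribute access to a
# dict (as in the `bunch` PyPI package). If it is not available, an
# equivalent minimal version would be:
class Bunch(dict):
    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key)

    def __setattr__(self, key, value):
        self[key] = value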
def on_epoch_start(self):
    print('''
    {}: {}: beginning epoch {}
    '''.format(get_time(), self.experiment_name, self.current_epoch))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--run', help='run name')
    args = parser.parse_args()
    experiment_name = args.run

    test_set_transformed_path = Path().home().joinpath(
        'CubeFlowData').joinpath('dbs').joinpath('test_transformed.db')
    dirs, config = get_dirs_and_config(experiment_name, False)
    errors_db_path = dirs['dbs'].joinpath('errors.db')
    predictions_db_path = dirs['dbs'].joinpath('predictions.db')

    mask_and_split = MaskAndSplit(config, dirs, ['test'])
    sets = mask_and_split.split()
    config['val_batch_size'] = 2000

    Loader = getattr(
        importlib.import_module('src.dataloaders.' + config['dataloader']),
        'Dataloader')

    if 'SRTInIcePulses' in '-'.join(config['masks']):
        config['cleaning'] = 'SRTInIcePulses'
        config['cleaning_length'] = 'srt_in_ice_pulses_event_length'
    elif 'SplitInIcePulses' in '-'.join(config['masks']):
        config['cleaning'] = 'SplitInIcePulses'
        config['cleaning_length'] = 'split_in_ice_pulses_event_length'

    sets['test'] = sets['test'][0:20000]
    dataset = Loader(sets['test'], config, test_set_transformed_path,
                     test=True)
    events = [item for sublist in dataset.events for item in sublist]

    if not predictions_db_path.is_file():
        print('{}: First run with these masks; saving truth and '
              'retro_crs_prefit to prediction db'.format(get_time()))
        TruthSaver(config, dirs, events)

    Loss = getattr(importlib.import_module('src.losses.losses'),
                   config['loss'])
    loss_init = Loss()
    loss = loss_init.loss
    Model = getattr(importlib.import_module('src.models.' + config['model']),
                    'Model')
    model = Model()
    Optimizer = getattr(
        importlib.import_module('src.optimizers.optimizers'),
        config['optimizer'])
    optimizer_init = Optimizer(model.parameters(),
                               config['min_learning_rate'])
    optimizer = optimizer_init.optimizer

    inferer = Inferer(model, optimizer, loss, dataset, config,
                      experiment_name, dirs)
    model_path = dirs['run'].joinpath('model.pt')
    print('{}: Beginning inference'.format(get_time()))
    inferer.infer(model_path, dirs['run'])

    print('{}: Beginning error calculation'.format(get_time()))
    if not errors_db_path.is_file():
        print('{}: First run with these masks; calculating '
              'retro_crs_prefit errors'.format(get_time()))
        ErrorCalculator('retro_crs_prefit', dirs)
    ErrorCalculator(experiment_name, dirs)

    print('{}: Beginning histogram calculation'.format(get_time()))
    HistogramCalculator(experiment_name, dirs)
    print('{}: Script done!'.format(get_time()))
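# The getattr/import_module calls above implement a small plugin pattern:
# the config names a module and the script resolves the class at runtime.
# The same idea in isolation (load_class is a hypothetical helper name):
import importlib

def load_class(module_path, class_name):
    # Import the module by its dotted path, then fetch the class from it
    module = importlib.import_module(module_path)
    return getattr(module, class_name)

# e.g. Model = load_class('src.models.' + config['model'], 'Model')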
def calculate_and_plot(files_and_dirs, dom_plots=False, use_train_dists=False,
                       only_use_metrics=None, legends=True, reso_hists=False,
                       use_own=True, reporter=None, wandb=False):
    file_name = files_and_dirs['run_root'].joinpath(
        'error_dataframe_parquet.gzip')
    errors_df = pd.read_parquet(file_name, engine='fastparquet')
    # Keep events with energy <= 3.0 (apparently log10(E), judging by the
    # 10** conversion used elsewhere in the project)
    errors_df = errors_df[errors_df.energy <= 3.0]
    if use_train_dists:
        TRAIN_DATA_DF_FILE = files_and_dirs['project_root'].joinpath(
            'train_distributions/train_dists_parquet.gzip')
        train_data_df = pd.read_parquet(TRAIN_DATA_DF_FILE,
                                        engine='fastparquet')
        train_data_df = train_data_df[
            train_data_df.train_true_energy <= 3.0]
    if only_use_metrics is not None:
        errors_df = errors_df[errors_df.metric.isin(only_use_metrics)]
    PLOTS_DIR = files_and_dirs['run_root'].joinpath('plots')
    PLOTS_DIR.mkdir(exist_ok=True)
    RESO_PLOTS_DIR = PLOTS_DIR.joinpath('resolution_plots')
    RESO_PLOTS_DIR.mkdir(exist_ok=True)
    errors_df.replace([np.inf, -np.inf], np.nan, inplace=True)
    errors_df.dropna(inplace=True)
    energy_bins = calculate_energy_bins(errors_df)
    dom_bins = calculate_dom_bins(errors_df)
    # Metric names are recovered from the 'own_*_error' columns
    metrics = [
        metric.replace('own_', '').replace('_error', '')
        for metric in errors_df.keys() if metric.startswith('own')
    ]
    print('{}: Calculating performance data for energy bins'.format(
        get_time()))
    performance_data = PerformanceData(metrics, df=errors_df,
                                       bins=energy_bins, bin_type='energy',
                                       percentiles=[0.16, 0.84],
                                       use_own=use_own)
    file_name = files_and_dirs['run_root'].joinpath(
        'own_performance_energy_binned_dataframe_parquet.gzip')
    performance_data.own_performances_df.to_parquet(str(file_name),
                                                    compression='gzip')
    file_name = files_and_dirs['run_root'].joinpath(
        'opponent_performance_energy_binned_dataframe_parquet.gzip')
    performance_data.opponent_performances_df.to_parquet(str(file_name),
                                                         compression='gzip')
    for metric in metrics:
        print('{}: Plotting {} metric, binned in energy'.format(
            get_time(), metric))
        fig, markers_own = comparison_plot(
            metric,
            performance_data,
            train_data_df.train_true_energy.values if use_train_dists
            else None,
            legends)
        file_name = PLOTS_DIR.joinpath('{}_{}_reso_comparison.pdf'.format(
            'energy_bins', metric))
        fig.savefig(file_name)
        if wandb:
            # Render the figure to an in-memory PNG and hand it to wandb
            buf = io.BytesIO()
            fig.savefig(buf, format='png')
            buf.seek(0)
            im = Image.open(buf)
            log_text = '{} resolution plot'.format(metric.title())
            reporter.add_plot_to_wandb(im, log_text)
            buf.close()
            log_text = '{} resolution comparison'.format(metric.title())
            reporter.add_metric_comparison_to_wandb(markers_own, log_text)
        plt.close(fig)
        fig = icecube_2d_histogram(metric, performance_data, legends)
        file_name = PLOTS_DIR.joinpath('{}_{}_ic_comparison.pdf'.format(
            'energy_bins', metric))
        fig.savefig(file_name)
        if wandb:
            buf = io.BytesIO()
            fig.savefig(buf, format='png')
            buf.seek(0)
            im = Image.open(buf)
            log_text = '{} IceCube histogram'.format(metric.title())
            reporter.add_plot_to_wandb(im, log_text)
            buf.close()
        plt.close(fig)
        if reso_hists:
            for i, ibin in enumerate(energy_bins):
                indexer = errors_df.energy_binned == ibin
                fig = plot_error_in_bin(
                    errors_df[indexer]['own_' + metric + '_error'].values,
                    errors_df[indexer]['opponent_' + metric + '_error'].values,
                    metric, ibin, 'energy', legends)
                file_name = RESO_PLOTS_DIR.joinpath(
                    '{}_{}_resolution_in_bin_{:02d}.pdf'.format(
                        'energy_bins', metric, i))
                fig.savefig(file_name)
                if wandb:
                    reporter.save_file_to_wandb(str(file_name))
                plt.close(fig)
    if dom_plots:
        print('{}: Calculating performance data for DOM bins'.format(
            get_time()))
        performance_data = PerformanceData(metrics, df=errors_df,
                                           bins=dom_bins, bin_type='doms',
                                           percentiles=[0.16, 0.84],
                                           use_own=use_own)
        for metric in metrics:
            print('{}: Plotting {} metric, binned in DOMs'.format(
                get_time(), metric))
            fig, markers_own = comparison_plot(
                metric,
                performance_data,
                train_data_df.train_event_length.values if use_train_dists
                else None,
                legends)
            file_name = PLOTS_DIR.joinpath('{}_{}_reso_comparison.pdf'.format(
                'dom_bins', metric))
            fig.savefig(file_name)
            if wandb:
                reporter.save_file_to_wandb(str(file_name))
            plt.close(fig)
            if reso_hists:
                for i, ibin in enumerate(dom_bins):
                    indexer = errors_df.doms_binned == ibin
                    fig = plot_error_in_bin(
                        errors_df[indexer]['own_' + metric + '_error'].values,
                        errors_df[indexer][
                            'opponent_' + metric + '_error'].values,
                        metric, ibin, 'dom', legends)
                    file_name = RESO_PLOTS_DIR.joinpath(
                        '{}_{}_resolution_in_bin_{:02d}.pdf'.format(
                            'dom_bins', metric, i))
                    fig.savefig(file_name)
                    if wandb:
                        reporter.save_file_to_wandb(str(file_name))
                    plt.close(fig)
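# calculate_energy_bins/calculate_dom_bins are not shown here; a plausible
# minimal version using pandas interval bins might look like this. This is
# hypothetical: the bin edges and the `energy_binned` column name are
# assumptions based on how the bins are used above.
import numpy as np
import pandas as pd

def calculate_energy_bins(errors_df, n_bins=18):
    # Assign each event to an energy interval and return the unique bins
    edges = np.linspace(errors_df.energy.min(), errors_df.energy.max(),
                        n_bins + 1)
    errors_df['energy_binned'] = pd.cut(errors_df.energy, bins=edges,
                                        include_lowest=True)
    return errors_df.energy_binned.cat.categories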
PICKLE_DIRS = [
    directory for directory in DATA_PATH.iterdir() if directory.is_dir()
]
shelve_file_exists = SHELVE_DATA_FILE.is_file()
if shelve_file_exists:
    with shelve.open(str(SHELVE_NAME), 'r') as f:
        EXISTING_EVENTS = list(f.keys())
else:
    EXISTING_EVENTS = []
for directory in PICKLE_DIRS:
    print('{}: Handling directory {}'.format(get_time(), directory.stem))
    time_start = datetime.datetime.now()
    files = [
        file for file in directory.glob('**/*') if file.suffix == '.pickle'
    ]
    # Copy each pickled event into the shelve db, skipping known events
    with shelve.open(str(SHELVE_NAME), 'c') as db:
        for file in files:
            if file.stem not in EXISTING_EVENTS:
                with open(file, 'rb') as f:
                    db[file.stem] = pickle.load(f)
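# Reading an event back out of the shelve file is then a plain key lookup,
# keyed by the pickle file stem used above ('some_event_id' is a placeholder):
with shelve.open(str(SHELVE_NAME), 'r') as db:
    event = db['some_event_id']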
def main():
    experiment_name = create_experiment_name(slug_length=2)
    dirs, config = get_dirs_and_config(experiment_name, True)
    if socket.gethostname() == 'air.local':
        # Local dev machine: reuse the test set for both train and val
        train_set = Path().home().joinpath('CubeFlowData').joinpath(
            'dbs').joinpath('test_transformed.db')
        val_set = Path().home().joinpath('CubeFlowData').joinpath(
            'dbs').joinpath('test_transformed.db')
        mask_and_split = MaskAndSplit(config, dirs, ['test'])
        sets = mask_and_split.split()
        sets['train'] = sets['test']
        sets['val'] = sets['test']
    elif socket.gethostname() == 'gpulab':
        train_set = Path(
            '/home/bjoernhm/CubeML/data/oscnext-genie-level5-v01-01-pass2/train_transformed.db')
        val_set = Path(
            '/home/bjoernhm/CubeML/data/oscnext-genie-level5-v01-01-pass2/val_transformed.db')
        mask_and_split = MaskAndSplit(config, dirs, ['train', 'val'])
        sets = mask_and_split.split()
    Loader = getattr(
        importlib.import_module('src.dataloaders.' + config['dataloader']),
        'Dataloader')
    print('{}: The overlap between train and val set is {}'.format(
        get_time(), len(set(sets['val']) & set(sets['train']))))
    if 'SRTInIcePulses' in '-'.join(config['masks']):
        config['cleaning'] = 'SRTInIcePulses'
        config['cleaning_length'] = 'srt_in_ice_pulses_event_length'
    elif 'SplitInIcePulses' in '-'.join(config['masks']):
        config['cleaning'] = 'SplitInIcePulses'
        config['cleaning_length'] = 'split_in_ice_pulses_event_length'
    if config['dev_run']:
        sets['train'] = sets['train'][0:20000]
        sets['val'] = sets['val'][0:20000]
    train_dataset = Loader(sets['train'], config, train_set, test=False)
    val_dataset = Loader(sets['val'], config, val_set, test=True)
    reporter = Reporter(config, experiment_name)
    saver = Saver(config, dirs)
    Loss = getattr(importlib.import_module('src.losses.losses'),
                   config['loss'])
    loss_init = Loss()
    loss = loss_init.loss
    Model = getattr(importlib.import_module('src.models.' + config['model']),
                    'Model')
    model = Model()
    Optimizer = getattr(
        importlib.import_module('src.optimizers.optimizers'),
        config['optimizer'])
    optimizer_init = Optimizer(model.parameters(),
                               config['min_learning_rate'])
    optimizer = optimizer_init.optimizer
    trainer = Trainer(config, model, optimizer, loss, reporter, saver,
                      train_dataset, val_dataset)
    trainer.fit()
    print('{}: Script done!'.format(get_time()))