def __init__(self, model_name, config, working_dir, network, sess, episode_runner,
             summaries_collector, curriculum_coefficient=None):
    self.model_name = model_name
    self.config = config
    self.working_dir = working_dir
    self.network = network
    self.sess = sess
    self.episode_runner = episode_runner
    self.summaries_collector = summaries_collector
    self.curriculum_coefficient = curriculum_coefficient
    self.fixed_start_goal_pairs = self.episode_runner.game.get_fixed_start_goal_pairs(challenging=False)
    self.hard_fixed_start_goal_pairs = self.episode_runner.game.get_fixed_start_goal_pairs(challenging=True)

    self.batch_size = config['model']['batch_size']
    self.steps_per_trajectory_print = config['general']['cycles_per_trajectory_print']
    self.train_episodes_per_cycle = config['general']['train_episodes_per_cycle']
    self.gain = config['model']['gain']

    self.train_episodes_counter = 0

    self.check_gradients = config['gradient_checker']['enable']
    if self.check_gradients:
        self.gradient_output_dir = os.path.join(working_dir, 'gradient', model_name)
        init_dir(self.gradient_output_dir)
        saver_dir = os.path.join(self.gradient_output_dir, 'temp_4_gradient_print')
        self.gradient_saver = ModelSaver(saver_dir, 1, 'gradient_checker', print_log=False)
    else:
        self.gradient_output_dir, self.gradient_saver = None, None
def main():
    ms = ModelSaver()
    model = load_model('../artifacts/model.h5')
    ms.save_model(model, '../artifacts/model.horn')
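
# Entry-point guard so the script can be imported without side effects and also run directly.
# (A small addition; main() above is unchanged and still assumes load_model and ModelSaver
# with the signatures shown there.)
if __name__ == '__main__':
    main()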
def run_for_config(config):
    # set the name of the model
    model_name = config['general']['name']
    now = datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S')
    model_name = now + '_' + model_name if model_name is not None else now

    # where we save all the outputs
    scenario = config['general']['scenario']
    working_dir = os.path.join(get_base_directory(), 'sgt', scenario)
    init_dir(working_dir)
    saver_dir = os.path.join(working_dir, 'models', model_name)
    init_dir(saver_dir)
    init_log(log_file_path=os.path.join(saver_dir, 'log.txt'))
    copy_config(config, os.path.join(saver_dir, 'config.yml'))
    episodic_success_rates_path = os.path.join(saver_dir, 'results.txt')
    test_trajectories_dir = os.path.join(working_dir, 'test_trajectories', model_name)
    init_dir(test_trajectories_dir)

    # generate game
    game = _get_game(config)

    network = Network(config, game)
    network_variables = network.get_all_variables()

    # save model
    latest_saver = ModelSaver(os.path.join(saver_dir, 'latest_model'), 2, 'latest', variables=network_variables)
    best_saver = ModelSaver(os.path.join(saver_dir, 'best_model'), 1, 'best', variables=network_variables)

    summaries_collector = SummariesCollector(os.path.join(working_dir, 'tensorboard', model_name), model_name)

    with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(
            gpu_options=tf.compat.v1.GPUOptions(
                per_process_gpu_memory_fraction=config['general']['gpu_usage']))) as sess:
        sess.run(tf.compat.v1.global_variables_initializer())

        def policy_function(starts, goals, level, is_train):
            res = network.predict_policy(starts, goals, level, sess, is_train)
            means = 0.5 * (np.array(starts) + np.array(goals))
            distance = np.linalg.norm(res[0] - means, axis=1)
            print(f'distance from mean: mean {distance.mean()} min {distance.min()} max {distance.max()}')
            if np.any(np.isnan(res)):
                print_and_log('######################## Nan predictions detected...')
            return res

        episode_runner = EpisodeRunnerSubgoal(config, game, policy_function)

        trainer = TrainerSubgoal(model_name, config, working_dir, network, sess, episode_runner,
                                 summaries_collector,
                                 curriculum_coefficient=get_initial_curriculum(config))

        decrease_learn_rate_if_static_success = config['model']['decrease_learn_rate_if_static_success']
        stop_training_after_learn_rate_decrease = config['model']['stop_training_after_learn_rate_decrease']
        reset_best_every = config['model']['reset_best_every']

        global_step = 0
        best_curriculum_coefficient = None

        for current_level in range(config['model']['starting_level'], config['model']['levels'] + 1):
            best_cost, best_cost_global_step = None, None
            no_test_improvement, consecutive_learn_rate_decrease = 0, 0

            if config['model']['init_from_lower_level'] and current_level > 1:
                print_and_log('initiating level {} from previous level'.format(current_level))
                network.init_policy_from_lower_level(sess, current_level)

            for cycle in range(config['general']['training_cycles_per_level']):
                print_and_log('starting cycle {}, level {}'.format(cycle, current_level))

                new_global_step, success_ratio = trainer.train_policy_at_level(current_level, global_step)
                if new_global_step == global_step:
                    print_and_log('no data found in training cycle {} global step still {}'.format(cycle, global_step))
                    continue
                else:
                    global_step = new_global_step

                if (cycle + 1) % config['policy']['decrease_std_every'] == 0:
                    network.decrease_base_std(sess, current_level)
                    print_and_log('new base stds {}'.format(network.get_base_stds(sess, current_level)))

                print_and_log('done training cycle {} global step {}'.format(cycle, global_step))

                # save every now and then
                if cycle % config['general']['save_every_cycles'] == 0:
                    latest_saver.save(sess, global_step=global_step)

                if cycle % config['general']['test_frequency'] == 0:
                    # do test
                    test_successes, test_cost, _, endpoints_by_path = trainer.collect_test_data(current_level, False)
                    summaries_collector.write_test_success_summaries(
                        sess, global_step, test_successes, test_cost, trainer.curriculum_coefficient)
                    with open(episodic_success_rates_path, 'a') as f:
                        f.write('{} {} {} {} {}'.format(current_level, trainer.train_episodes_counter,
                                                        test_successes, test_cost, os.linesep))

                    # decide how to act next
                    print_and_log('old cost was {} at step {}'.format(best_cost, best_cost_global_step))
                    print_and_log('current learn rates {}'.format(network.get_learn_rates(sess, current_level)))
                    print_and_log('current base stds {}'.format(network.get_base_stds(sess, current_level)))

                    if best_cost is None or test_cost < best_cost:
                        print_and_log('new best cost {} at step {}'.format(test_cost, global_step))
                        best_cost, best_cost_global_step = test_cost, global_step
                        best_curriculum_coefficient = trainer.curriculum_coefficient
                        no_test_improvement, consecutive_learn_rate_decrease = 0, 0
                        best_saver.save(sess, global_step)
                        test_trajectories_file = os.path.join(test_trajectories_dir, '{}.txt'.format(global_step))
                        serialize_compress(endpoints_by_path, test_trajectories_file)
                    else:
                        print_and_log('new model is not the best with cost {} at step {}'.format(test_cost, global_step))
                        no_test_improvement += 1
                        print_and_log('no improvement count {} of {}'.format(
                            no_test_improvement, decrease_learn_rate_if_static_success))
                        if reset_best_every > 0 and no_test_improvement % reset_best_every == reset_best_every - 1:
                            # restore the model every once in a while if did not find a better solution in a while
                            restore_best(sess, best_saver, best_curriculum_coefficient, trainer)
                        if no_test_improvement == decrease_learn_rate_if_static_success:
                            # restore the best model
                            if config['model']['restore_on_decrease']:
                                restore_best(sess, best_saver, best_curriculum_coefficient, trainer)
                            # decrease learn rates
                            network.decrease_learn_rates(sess, current_level)
                            no_test_improvement = 0
                            consecutive_learn_rate_decrease += 1
                            print_and_log('decreasing learn rates {} of {}'.format(
                                consecutive_learn_rate_decrease, stop_training_after_learn_rate_decrease))
                            print_and_log('new learn rates {}'.format(network.get_learn_rates(sess, current_level)))
                            if consecutive_learn_rate_decrease == stop_training_after_learn_rate_decrease:
                                break

                if trainer.curriculum_coefficient is not None:
                    if success_ratio > config['curriculum']['raise_when_train_above']:
                        print_and_log('current curriculum coefficient {}'.format(trainer.curriculum_coefficient))
                        trainer.curriculum_coefficient *= config['curriculum']['raise_times']
                        print_and_log('curriculum coefficient raised to {}'.format(trainer.curriculum_coefficient))

                # mark in log the end of cycle
                print_and_log(os.linesep)

            # if we finished because we ran out of cycles, we still need to make one more test
            end_of_level_test(best_cost, best_cost_global_step, best_curriculum_coefficient, best_saver,
                              sess, test_trajectories_dir, trainer, current_level)

        print_and_log('trained all levels - needs to stop')
        close_log()
        return best_cost
import tensorflow as tf

from simple_ddqrn import DDQRN
from target_ddqrn import target_ddqrn
import parameter_config as cfg
from model_saver import ModelSaver
from ddqrn_trainer import DDQRNTrainer

sess = tf.Session()

ddqrn = DDQRN(sess, "main_DDQRN")
ddqrn_target = target_ddqrn(
    DDQRN(sess, "target_DDQRN"),
    [tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="main_DDQRN"),
     tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_DDQRN")])

sess.run(tf.global_variables_initializer())

trainer = DDQRNTrainer(ddqrn, ddqrn_target, sess)

model = ModelSaver(ddqrn, trainer)
model.load(cfg.save_path)

ddqrn_target.update(sess, tau=1.0)

model.save(cfg.save_path)
class MfTrainer:
    def __init__(self, base_folder, json_files, batch_size=64):
        self.verbose = False
        self.spot = False
        serialization_dir = 'tmp'
        self.mfb = MultiLabelBinarizer()
        mf_labels = np.arange(0, 228)
        # NUM_CLASSES = len(mf_labels)
        # class_names = mf_labels  # image_datasets['train'].classes
        self.mfb.fit_transform([mf_labels])
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        log_dirs = {
            'train': SummaryWriter(os.path.join(serialization_dir, "log", "train")),
            'val': SummaryWriter(os.path.join(serialization_dir, "log", "validation"))
        }
        self.tensorboard = TensorboardWriter(log_dirs['train'], log_dirs['val'])
        # summary_interval = 100
        self.model_saver = ModelSaver(serialization_dir)

        data_transforms = {
            'train': transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ]),
            'val': transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ]),
        }

        print('[i] Loading datasets...')
        dataset = MaterialistFashion(base_folder, json_files, data_transforms['train'],
                                     id_as_path=True, load_first=8 * 139)
        validation_split = .2

        # Creating data indices for training and validation splits:
        dataset_size = len(dataset)
        indices = list(range(dataset_size))
        split = int(np.floor(validation_split * dataset_size))
        shuffle_dataset = True
        random_seed = 42
        if shuffle_dataset:
            np.random.seed(random_seed)
            np.random.shuffle(indices)
        train_indices, val_indices = indices[split:], indices[:split]

        # Creating PT data samplers and loaders:
        train_sampler = SubsetRandomSampler(train_indices)
        valid_sampler = SubsetRandomSampler(val_indices)

        # image_datasets = {
        #     'train': MaterialistFashion(train_folder, train_json, data_transforms['train']),
        #     'val': MaterialistFashion(val_folder, val_json, data_transforms['val'])
        # }
        self.dataloaders = {
            'train': torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                 sampler=train_sampler, num_workers=8),
            'val': torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                               sampler=valid_sampler, num_workers=8)
        }
        self.dataset_sizes = {
            'train': len(train_indices),
            'val': len(val_indices)
        }  # {x: len(image_datasets[x]) for x in ['train', 'val']}
        print('[i] Done loading datasets.')

    def check_label_distribution(self, dataloader, filter_threshold=100):
        # Count how often each of the 228 labels occurs in the dataloader
        label_bins = {}
        for i in range(0, 228):
            label_bins[i] = 0
        for i, (_, labels, _) in enumerate(dataloader):
            for batch in self.mfb.inverse_transform(labels):
                for label in batch:
                    label_bins[int(label)] += 1
        label_bins = sorted(label_bins.items(), key=lambda x: x[1], reverse=True)
        max_count = label_bins[0][1]
        rescaling = [1] * 228
        filtered_mask = [0] * 228
        for key, value in label_bins:
            print("Label ID: {} -> Count: {}".format(key, value))
            if value != max_count:
                rescaling[key] = rescaling[key] - value / max_count
            else:
                rescaling[key] = 1e-10
            if value >= filter_threshold:
                filtered_mask[key] = 1
        return rescaling, filtered_mask

    def imshow(self, inp, title=None):
        """Shows a batch of images."""
        inp = inp.numpy().transpose((1, 2, 0))
        mean = np.array([0.485, 0.456, 0.406])
        std = np.array([0.229, 0.224, 0.225])
        inp = std * inp + mean
        inp = np.clip(inp, 0, 1)
        plt.imshow(inp)
        if title is not None:
            plt.title(title)
        plt.pause(0.001)  # pause a bit so that plots are updated

    def show_first_batch(self):
        inputs, labels, image_id = next(iter(self.dataloaders['train']))
        # Make a grid from batch
        out = torchvision.utils.make_grid(inputs)
        self.imshow(out, title=[x for x in image_id])
        print("Images shown")

    # def metrics_to_tensorboard(epoch: int, train_metrics: dict, val_metrics: dict = None) -> None:
    #     """
    #     Sends all of the train metrics (and validation metrics, if provided) to tensorboard.
    #     """
    #     metric_names = set(train_metrics.keys())
    #     if val_metrics is not None:
    #         metric_names.update(val_metrics.keys())
    #     val_metrics = val_metrics or {}
    #
    #     for name in metric_names:
    #         train_metric = train_metrics.get(name)
    #         if train_metric is not None:
    #             tensorboard.add_train_scalar(name, train_metric, epoch)
    #         val_metric = val_metrics.get(name)
    #         if val_metric is not None:
    #             tensorboard.add_validation_scalar(name, val_metric, epoch)

    ######################################################################
    # Training the model

    def train_model(self, model, criterion, optimizer, scheduler, num_epochs=10):
        since = time.time()

        best_model_wts = copy.deepcopy(model.state_dict())
        best_f1 = 0.0

        model, optimizer, epoch_counter, global_step = self.model_saver.restore_checkpoint(model, optimizer)
        val_step = global_step
        chosen_threshold = 0.2

        for epoch in range(epoch_counter, num_epochs):
            print('Epoch {}/{}'.format(epoch, num_epochs - 1))
            print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                if phase == 'train':
                    scheduler.step()
                    model.train()  # Set model to training mode
                else:
                    model.eval()  # Set model to evaluate mode

                running_loss = 0.0
                running_f1 = 0.0
                low_threshold = 0.01
                high_threshold = 0.5
                step_threshold = 0.01
                running_th_f1 = {}
                for threshold in np.arange(low_threshold, high_threshold, step=step_threshold):
                    running_th_f1[threshold] = 0.0

                # Iterate over data.
                for n_iter, (inputs, labels, _) in enumerate(self.dataloaders[phase]):
                    inputs = inputs.to(self.device)
                    labels = labels.to(self.device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        soft_out = F.sigmoid(outputs)  # F.softmax(outputs, dim=1)
                        # if n_iter % 100 == 0:
                        #     self.imshow(torchvision.utils.make_grid(torch.cat((inputs.detach().cpu(), model.stn(inputs).detach().cpu()))), title='stn')
                        th_selection_preds = {}
                        for threshold in np.arange(low_threshold, high_threshold, step=step_threshold):
                            th_selection_preds[threshold] = soft_out.ge(threshold).type(torch.cuda.FloatTensor)
                        preds = soft_out.ge(chosen_threshold).type(torch.cuda.FloatTensor)
                        loss = criterion(outputs, labels)

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            global_step += 1
                            loss.backward()
                            optimizer.step()
                        else:
                            val_step += 1

                    # statistics
                    if (n_iter == 1 or self.verbose) and phase == 'val':
                        # self.imshow(torchvision.utils.make_grid(torch.cat((inputs.detach().cpu(), model.stn(inputs).detach().cpu()))), title='stn')
                        for i, (true_label, pred_label) in enumerate(
                                zip(self.mfb.inverse_transform(labels), self.mfb.inverse_transform(preds))):
                            true_label_output_probs = [soft_out.cpu().data.numpy()[i][x] for x in true_label]
                            pred_label_output_probs = [soft_out.cpu().data.numpy()[i][x] for x in pred_label]
                            print('{} True labels[{}]: {}'.format(phase, i, true_label))
                            print('{} True probs [{}]: {}'.format(phase, i, true_label_output_probs))
                            print('{} Pred labels[{}]: {}'.format(phase, i, pred_label))
                            print('{} Pred probs [{}]: {}'.format(phase, i, pred_label_output_probs))

                    running_loss += loss.item() * inputs.size(0)
                    mf_f1 = f1_score(labels, preds, average='micro')
                    running_f1 += mf_f1 * inputs.size(0)  # torch.sum(preds == labels.data)
                    for key in th_selection_preds.keys():
                        th_f1 = f1_score(labels, th_selection_preds[key], average='micro')
                        running_th_f1[key] += th_f1 * inputs.size(0)

                    spot_loss = running_loss / (n_iter + 1)
                    if phase == 'train':
                        self.tensorboard.add_train_scalar('loss', spot_loss, global_step)
                        self.tensorboard.add_train_scalar('microF1', mf_f1, global_step)
                    else:
                        self.tensorboard.add_validation_scalar('loss', spot_loss, val_step)
                        self.tensorboard.add_validation_scalar('microF1', mf_f1, val_step)
                    if self.spot:
                        print('{} Spot: Loss: {:.4f} F1: {:.4f} Step: {}'.format(
                            phase, spot_loss, mf_f1, global_step))

                epoch_loss = running_loss / self.dataset_sizes[phase]
                epoch_f1 = running_f1 / self.dataset_sizes[phase]

                # print('F1 for different thresholds: ')
                sorted_thresholds = sorted(running_th_f1.items(), key=lambda x: x[1], reverse=True)
                if sorted_thresholds[0][1] > running_f1 and phase == 'val':
                    chosen_threshold = (chosen_threshold + sorted_thresholds[0][0]) / 2
                # for item in sorted_thresholds:
                #     print('{} Threshold: {}, F1: {}'.format(phase, item[0], item[1] / self.dataset_sizes[phase]))

                print('{} Loss: {:.4f} F1: {:.4f} Chosen threshold {:.4f} '
                      '<------------------------------------------------------------------'.format(
                          phase, epoch_loss, epoch_f1, chosen_threshold))

                # deep copy the model
                if phase == 'val':
                    if epoch_f1 > best_f1:
                        best_f1 = epoch_f1
                        best_model_wts = copy.deepcopy(model.state_dict())
                        print('[i] Saving new best F1 {:.4f}'.format(best_f1))
                        self.model_saver.save_checkpoint(model, epoch, optimizer, global_step, True)
                        chosen_threshold = sorted_thresholds[0][0]
                    print("[i] Saving last epoch model.")
                    self.model_saver.save_checkpoint(model, epoch, optimizer, global_step, False)

            print()

        time_elapsed = time.time() - since
        print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
        print('Best val F1: {:.4f}'.format(best_f1))
        print('Best threshold: {}'.format(chosen_threshold))

        # load best model weights
        model.load_state_dict(best_model_wts)
        return model

    ######################################################################
    # Visualizing the model predictions

    def visualize_model(self, model, num_images=6):
        was_training = model.training
        model.eval()
        images_so_far = 0
        fig = plt.figure()

        with torch.no_grad():
            for i, (inputs, labels, _) in enumerate(self.dataloaders['val']):
                inputs = inputs.to(self.device)
                labels = labels.to(self.device)

                outputs = model(inputs)
                preds = outputs.ge(0.2).type(torch.cuda.FloatTensor)

                for j in range(inputs.size()[0]):
                    images_so_far += 1
                    ax = plt.subplot(num_images // 2, 2, images_so_far)
                    ax.axis('off')
                    ax.set_title('predicted: {}'.format(self.mfb.inverse_transform(preds[j])))
                    self.imshow(inputs.cpu().data[j])

                    if images_so_far == num_images:
                        model.train(mode=was_training)
                        return
            model.train(mode=was_training)

    ######################################################################

    def train_fashion_model(self, num_epochs=10):
        model_ft = FashionModel()
        model_ft = model_ft.to(self.device)

        criterion = nn.MultiLabelSoftMarginLoss()

        params = list(model_ft.localization.parameters()) + \
                 list(model_ft.fc_loc.parameters()) + \
                 list(model_ft.resnet.fc.parameters()) + \
                 list(model_ft.fc.parameters())
        # params = list(model_ft.resnet.fc.parameters()) + list(model_ft.fc.parameters())
        optimizer_ft = optim.Adam(params)  # , lr=0.001, momentum=0.9)

        # Decay LR by a factor of 0.1 every 10 epochs
        exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=10, gamma=0.1)

        return self.train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=num_epochs)

    def train_standard_resnet(self, num_epochs):
        # rescaling_weights, filtered_mask = check_label_distribution(dataloaders['train'])
        model_ft = models.resnet50(pretrained=True)
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, 228)
        model_ft = model_ft.to(self.device)

        criterion = nn.MultiLabelSoftMarginLoss()

        optimizer_ft = optim.Adam(model_ft.parameters())  # , lr=0.001, momentum=0.9)

        # Decay LR by a factor of 0.1 every 7 epochs
        exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

        return self.train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=num_epochs)

    def train_extended_standard_resnet(self, num_epochs):
        # rescaling_weights, filtered_mask = check_label_distribution(dataloaders['train'])
        model_ft = nn.Sequential(models.resnet50(pretrained=True),
                                 nn.Linear(1000, 512, bias=True),
                                 nn.Dropout(),
                                 nn.ReLU(),
                                 nn.Linear(512, 228, bias=True))
        model_ft = model_ft.to(self.device)

        criterion = nn.MultiLabelSoftMarginLoss()

        params = list(model_ft[0].fc.parameters()) + \
                 list(model_ft[1].parameters()) + \
                 list(model_ft[4].parameters())
        optimizer_ft = optim.Adam(params)

        # Decay LR by a factor of 0.1 every 7 epochs
        exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

        return self.train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=num_epochs)
def train_and_evaluate():
    """Train the model with custom training loop, evaluating at given intervals."""
    # Set mixed precision policy
    if FLAGS.mixed_precision:
        policy = mixed_precision.Policy('mixed_float16')
        mixed_precision.set_policy(policy)

    # Get dataset
    dataset = _get_dataset(dataset=FLAGS.dataset, label_mode=FLAGS.label_mode,
                           input_mode=FLAGS.input_mode, input_length=FLAGS.input_length,
                           seq_shift=FLAGS.seq_shift, def_val=DEF_VAL)

    # Define representation
    rep = Representation(blank_index=BLANK_INDEX, def_val=DEF_VAL, loss_mode=FLAGS.loss_mode,
                         num_event_classes=dataset.num_event_classes(), pad_val=PAD_VAL,
                         use_def=FLAGS.use_def, decode_fn=FLAGS.decode_fn,
                         beam_width=FLAGS.beam_width)

    # Get model
    model = _get_model(model=FLAGS.model, dataset=FLAGS.dataset,
                       num_classes=rep.get_num_classes(), input_length=FLAGS.input_length,
                       l2_lambda=L2_LAMBDA)
    seq_length = model.get_seq_length()
    rep.set_seq_length(seq_length)

    # Instantiate learning rate schedule and optimizer
    if FLAGS.lr_decay_fn == "exponential":
        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=FLAGS.lr_base, decay_steps=LR_DECAY_STEPS,
            decay_rate=FLAGS.lr_decay_rate, staircase=True)
    elif FLAGS.lr_decay_fn == "piecewise_constant":
        values = np.divide(FLAGS.lr_base, LR_VALUE_DIV)
        lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
            boundaries=LR_BOUNDARIES, values=values.tolist())
    elif FLAGS.lr_decay_fn == "constant":
        lr_schedule = ConstantLR(FLAGS.lr_base)
    optimizer = Adam(learning_rate=lr_schedule)

    # Get LossScaleOptimizer
    if FLAGS.mixed_precision:
        optimizer = LossScaleOptimizer(optimizer=optimizer, loss_scale='dynamic')

    # Get loss function
    train_loss_fn = rep.get_loss_fn(batch_size=FLAGS.batch_size)
    eval_loss_fn = rep.get_loss_fn(batch_size=FLAGS.eval_batch_size)

    # Get train and eval dataset
    collapse_fn = rep.get_loss_collapse_fn()
    train_dataset = dataset(batch_size=FLAGS.batch_size, data_dir=FLAGS.train_dir,
                            is_predicting=False, is_training=True,
                            label_fn=model.get_label_fn(FLAGS.batch_size),
                            collapse_fn=collapse_fn, num_shuffle=FLAGS.num_shuffle)
    eval_dataset = dataset(batch_size=FLAGS.eval_batch_size, data_dir=FLAGS.eval_dir,
                           is_predicting=False, is_training=False,
                           label_fn=model.get_label_fn(FLAGS.eval_batch_size),
                           collapse_fn=collapse_fn, num_shuffle=FLAGS.num_shuffle)

    # Load model
    if FLAGS.model_ckpt is not None:
        logging.info("Loading model from {}".format(FLAGS.model_ckpt))
        load_status = model.load_weights(os.path.join(FLAGS.model_dir, "checkpoints", FLAGS.model_ckpt))
        load_status.assert_consumed()

    # Set up log writer and metrics
    train_writer = tf.summary.create_file_writer(os.path.join(FLAGS.model_dir, "log/train"))
    eval_writer = tf.summary.create_file_writer(os.path.join(FLAGS.model_dir, "log/eval"))
    train_metrics = TrainMetrics(representation=rep, writer=train_writer)
    eval_metrics = EvalMetrics(representation=rep, writer=eval_writer)

    # Save best checkpoints in terms of f1
    model_saver = ModelSaver(os.path.join(FLAGS.model_dir, "checkpoints"),
                             compare_fn=lambda x, y: x.score > y.score, sort_reverse=True)

    # Keep track of total global step
    global_step = 0

    # Iterate over epochs
    for epoch in range(FLAGS.train_epochs):
        logging.info('Starting epoch %d' % (epoch,))

        # Iterate over training batches
        for step, (train_features, train_labels, train_labels_c, train_labels_l) in enumerate(train_dataset):
            # Assert sizes
            assert train_labels.shape == [FLAGS.batch_size, seq_length], \
                "Labels shape [batch_size, seq_length]"

            # Run the train step
            train_logits, train_loss, train_l2_loss, train_grads = train_step(
                model, train_features, train_labels, train_labels_c, train_labels_l,
                train_loss_fn, optimizer)

            # Assert sizes
            assert train_logits.shape == [FLAGS.batch_size, seq_length, rep.get_num_classes()], \
                "Logits shape [batch_size, seq_length, num_classes]"

            # Log every FLAGS.log_steps steps.
            if global_step % FLAGS.log_steps == 0:
                logging.info("Memory used: {} GB".format(psutil.virtual_memory().used / 2**30))

                # Decode logits into predictions
                train_predictions_u = None
                if FLAGS.loss_mode == "ctc":
                    train_predictions_u, _ = rep.get_decode_fn(FLAGS.batch_size)(train_logits)
                    train_predictions_u = rep.get_inference_collapse_fn()(train_predictions_u)

                # General logs
                logging.info('Step %s in epoch %s; global step %s' % (step, epoch, global_step))
                logging.info('Seen this epoch: %s samples' % ((step + 1) * FLAGS.batch_size))
                logging.info('Total loss (this step): %s' % float(train_loss + train_l2_loss))
                with train_writer.as_default():
                    tf.summary.scalar("training/global_gradient_norm",
                                      data=tf.linalg.global_norm(train_grads), step=global_step)
                    tf.summary.scalar('training/loss', data=train_loss, step=global_step)
                    tf.summary.scalar('training/l2_loss', data=train_l2_loss, step=global_step)
                    tf.summary.scalar('training/total_loss', data=train_loss + train_l2_loss, step=global_step)
                    tf.summary.scalar('training/learning_rate', data=lr_schedule(epoch), step=global_step)

                # Update metrics
                train_metrics.update(train_labels, train_logits, train_predictions_u)

                # Log metrics
                train_metrics.log(global_step)

                # Save latest model
                model_saver.save_latest(model=model, step=global_step, file="model")

                # Flush TensorBoard
                train_writer.flush()

            # Evaluate every FLAGS.eval_steps steps.
            if global_step % FLAGS.eval_steps == 0:
                logging.info('Evaluating at global step %s' % global_step)

                # Keep track of eval losses
                eval_losses = []
                eval_l2_losses = []

                # Iterate through eval batches
                for i, (eval_features, eval_labels, eval_labels_c, eval_labels_l) in enumerate(eval_dataset):
                    # Assert sizes
                    assert eval_labels.shape == [FLAGS.eval_batch_size, seq_length], \
                        "Labels shape [batch_size, seq_length]"

                    # Run the eval step
                    eval_logits, eval_loss, eval_l2_loss = eval_step(
                        model, eval_features, eval_labels, eval_labels_c, eval_labels_l, eval_loss_fn)
                    eval_losses.append(eval_loss.numpy())
                    eval_l2_losses.append(eval_l2_loss.numpy())

                    # Assert sizes
                    assert eval_logits.shape == [FLAGS.eval_batch_size, seq_length, rep.get_num_classes()], \
                        "Logits shape [batch_size, seq_length, num_classes]"

                    # Decode logits into predictions
                    eval_predictions_u = None
                    if FLAGS.loss_mode == "ctc":
                        eval_predictions_u, _ = rep.get_decode_fn(FLAGS.eval_batch_size)(eval_logits)
                        eval_predictions_u = rep.get_inference_collapse_fn()(eval_predictions_u)

                    # Update metrics for this batch
                    eval_metrics.update_i(eval_labels, eval_logits, eval_predictions_u)

                # Update mean metrics
                eval_score = eval_metrics.update()

                # General logs
                eval_loss = np.mean(eval_losses)
                eval_l2_loss = np.mean(eval_l2_losses)
                logging.info('Evaluation loss: %s' % float(eval_loss + eval_l2_loss))
                with eval_writer.as_default():
                    tf.summary.scalar('training/loss', data=eval_loss, step=global_step)
                    tf.summary.scalar('training/l2_loss', data=eval_l2_loss, step=global_step)
                    tf.summary.scalar('training/total_loss', data=eval_loss + eval_l2_loss, step=global_step)

                # Log metrics
                eval_metrics.log(global_step)

                # Save best models
                model_saver.save_best(model=model, score=float(eval_score), step=global_step, file="model")

                # Flush TensorBoard
                eval_writer.flush()

                # Clean up memory
                tf.keras.backend.clear_session()
                gc.collect()

            # Increment global step
            global_step += 1

        # Save and keep latest model for every 10th epoch
        if epoch % 10 == 9:
            model_saver.save_keep(model=model, step=global_step, file="model")

        logging.info('Finished epoch %s' % (epoch,))
        optimizer.finish_epoch()

    # Save final model
    model_saver.save_latest(model=model, step=global_step, file="model")

    # Finished training
    logging.info("Finished training")
class GloveModel:
    def __init__(self, one_file, class_file):
        self.saver = ModelSaver(one_file, class_file)
        self.one_class, self.multi_class = self.saver.load_all()
        self.one_file = one_file
        self.class_file = class_file

    def train(self, data, data_novelty):
        X, y = self.preprocessing(data)
        X_novelty, _ = self.preprocessing(data_novelty)
        print("start one")
        self.train_outlier(X, X_novelty)
        print("start two")
        self.train_class(X, y)
        print("end")
        self.saver.save_all(self.one_class, self.multi_class)

    def train_outlier(self, X, X_novelty):
        X_train = np.concatenate((X, X_novelty))
        y_train = [1] * len(X) + [-1] * len(X_novelty)
        tuned_parameters = [{'kernel': ['rbf'], 'gamma': ['scale'], 'nu': [.5, .7, .9]}]
        clf = GridSearchCV(OneClassSVM(), tuned_parameters, scoring="recall", verbose=0)
        clf.fit(X_train, y_train)
        self.one_class = clf

    def train_class(self, X, y):
        param_grid = {'C': [0.1, 1, 10, 100],
                      'gamma': ['scale'],
                      'kernel': ['rbf', 'linear'],
                      'degree': [1, 2, 3, 4]}
        self.multi_class = GridSearchCV(SVC(), param_grid, refit=True, verbose=0)
        self.multi_class.fit(X, y)

    def predict(self, X):
        is_predict = self.predict_outlier(X)
        # print(is_predict)
        if is_predict[0] == -1:
            return [""]
        pred = self.predict_class(X)
        return pred

    def predict_outlier(self, X):
        return self.one_class.predict(X)

    def predict_class(self, X):
        return self.multi_class.predict(X)

    def load_model(self):
        pass

    def save_model(self):
        with open(self.class_file, "wb") as f:
            pickle.dump(self.multi_class, f)

    def preprocessing(self, data):
        other = []
        for i in range(6):
            other.extend(["gx%d" % i, "gy%d" % i, "gz%d" % i])
        other.append("category")
        other.append("timestamp")
        X = data.drop(other, axis=1).to_numpy()
        y = data["category"].values
        return X, y
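
# Minimal usage sketch (hypothetical paths and DataFrames; assumes ModelSaver(one_file, class_file)
# loads/saves both classifiers and that the DataFrames carry the gx/gy/gz, "category" and
# "timestamp" columns expected by preprocessing() above):
#
#   glove = GloveModel('one_class.pkl', 'multi_class.pkl')
#   glove.train(train_df, novelty_df)
#   X_sample, _ = glove.preprocessing(sample_df)
#   print(glove.predict(X_sample))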
def SaveModel(self, model_name, op_parser):
    saver = ModelSaver(model_name, op_parser)

    if self.save_config:
        saver.SaveConfigInfo(self.save_prefix)
trajectory_global_step = '128600'
trajectory_name = 'success_310.txt'

# read the config
config = read_config()

# where we save all the outputs
scenario = config['general']['scenario']
working_dir = os.path.join(get_base_directory(), scenario)
saver_dir = os.path.join(working_dir, 'models', model_name)
best_saver_path = os.path.join(saver_dir, 'best_model')

# generate graph:
network = Network(config, )
best_saver = ModelSaver(best_saver_path, 1, 'best')

# read trajectory
trajectory_file_path = os.path.join(working_dir, 'trajectories', model_name,
                                    trajectory_global_step, trajectory_name)
with open(trajectory_file_path, 'r') as f:
    endpoints = [parse_trajectory_line(l) for l in f.readlines()]
start = endpoints[0]
goal = endpoints[-1]
mid = endpoints[(len(endpoints) - 1) // 2]  # integer division: list indices must be ints

with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
        per_process_gpu_memory_fraction=config['general']['gpu_usage']))) as sess:
    x = np.linspace(-1, 1, 500)
def train_on_tasks(task_dict, PARAMS, logger, is_fine_tuning):
    # Clear GPU cache
    torch.cuda.empty_cache()

    # Initialize the object for saving models
    model_saver = ModelSaver(model_dir="./models")

    # Get the per task eval metric against which best models are chosen
    task_eval_metrics = {task_name: [0] for task_name, task in task_dict.items()}

    # Evaluation engine for each task
    task_eval_engines = {task_name: create_eval_engine(model=task.model,
                                                       is_multilabel=task.is_multilabel,
                                                       n_classes=task.n_classes,
                                                       cpu=PARAMS.cpu)
                         for task_name, task in task_dict.items()}

    # Get a list of task names to determine order of training, with one entry for each batch of
    # that task (e.g. [Maalej2015, Maalej2015, Maalej2015] for Maalej2015 if it had 3 batches)
    task_training_list = []
    for task_name, task in task_dict.items():
        task_training_list.extend([task_name] * task.train_length)

    # You only need to shuffle training tasks when the tasks have a shared layer (i.e. are not fine tuning)
    if not is_fine_tuning:
        # Set the random seeds for shuffling the train task list
        random.seed(PARAMS.random_state)
        # Shuffle task list during multi-task training so that tasks are trained roughly evenly throughout
        random.shuffle(task_training_list)

    # initialize global step number
    step_num = 0

    # Record the number of steps taken for each task in a dict
    task_steps = {task_name: 0 for task_name, task in task_dict.items()}

    # Record the number of epochs since the best performance of the model
    epochs_since_last_best = {task_name: 0 for task_name, task in task_dict.items()}

    # Specify in the logs whether a given result is from fine tuning or multi-task training
    run_type_log_prefix = "FT " if is_fine_tuning else "MTL "

    # Get the required number of epochs for training
    epochs = PARAMS.num_fine_tuning_epochs if is_fine_tuning else PARAMS.num_epochs

    def is_patience_exceeded(task_name):
        return is_fine_tuning and epochs_since_last_best[task_name] >= PARAMS.early_stopping_patience

    # Start clock before training to measure how long it takes to find a validated best model
    train_time_start = time.time()

    # Save initial model before training starts (overwriting any previous models that may have been on disc)
    for task_name, task in task_dict.items():
        model_saver.save_model(file_name=task_name, model=task.model)

    for epoch in range(epochs):
        # Clean GPU cache
        torch.cuda.empty_cache()

        # Reset iterable for each task and set model for training
        for task_name, task in task_dict.items():
            task.model.train()
            task.training_iterable = iter(task.train_data)

        # TRAIN
        for task_name in task_training_list:
            # Skip training this task if training patience already exceeded (during fine tuning only).
            # We do not skip on MTL training as there could be complex interactions between the
            # training of multiple tasks.
            if is_patience_exceeded(task_name):
                print(f"{task_name} patience exceeded, ceasing training on this task")
                continue

            task = task_dict[task_name]
            X, y = next(task.training_iterable)
            loss_fn = task.loss_fn()

            if PARAMS.cpu:
                logits = task.model(X.cpu())
                golds = y.cpu()
            else:
                logits = task.model(X.cuda())
                golds = y.cuda()

            if task.is_multilabel:
                loss = loss_fn(logits.view(-1, task.n_classes), golds)
            else:
                loss = loss_fn(logits.view(-1, task.n_classes), golds.view(-1))

            loss.backward()
            task.optimizer.step()
            task.model.zero_grad()

            logger.log_metric(f'{run_type_log_prefix} {task_name} - loss',
                              x=task_steps[task_name], y=loss.item())

            # Only log overall loss when the tasks have a shared language model layer. During fine
            # tuning, their models are no longer shared, making this metric useless.
            if not is_fine_tuning:
                logger.log_metric(f'{run_type_log_prefix} overall loss', x=step_num, y=loss.item())

            step_num += 1
            task_steps[task_name] += 1

            # Moves the golds and logits from the GPU
            del golds, logits, loss

        # VALIDATE
        for task_name, task in task_dict.items():
            torch.cuda.empty_cache()
            with torch.no_grad():
                task.model.eval()

                if is_patience_exceeded(task_name):
                    print(f"{task_name} patience exceeded, ceasing evaluation on this task")
                    continue

                validation_results = task_eval_engines[task_name].run(task.valid_data).metrics
                logger.log_results(run_type_log_prefix + task_name, "valid", epoch, validation_results)

                # What metric will we compare all previous performance against
                comparison_metric = validation_results[PARAMS.best_metric]
                if comparison_metric > max(task_eval_metrics[task_name]):
                    model_saver.save_model(file_name=task_name, model=task.model)
                    epochs_since_last_best[task_name] = 0
                else:
                    epochs_since_last_best[task_name] += 1

                task_eval_metrics[task_name].append(comparison_metric)

    train_time_end = time.time()
    task_eval_metrics["time_elapsed"] = train_time_end - train_time_start

    # TEST
    task_test_metrics = {task_name: None for task_name, task in task_dict.items()}
    for task_name, task in task_dict.items():
        torch.cuda.empty_cache()
        with torch.no_grad():
            task.model.eval()
            model_saver.load_model(file_name=task_name, model=task.model)
            test_engine = create_eval_engine(model=task.model, is_multilabel=task.is_multilabel,
                                             n_classes=task.n_classes, cpu=PARAMS.cpu)
            test_results = test_engine.run(task.test_data).metrics
            task_test_metrics[task_name] = test_results
            epoch = 1 if is_fine_tuning else 0
            logger.log_results(run_type_log_prefix + task_name, "test", epoch, test_results)

    return task_eval_metrics, task_test_metrics
def run_for_config(config):
    # set the name of the model
    model_name = config['general']['name']
    now = datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S')
    model_name = now + '_' + model_name if model_name is not None else now

    # where we save all the outputs
    scenario = config['general']['scenario']
    working_dir = os.path.join(get_base_directory(), 'sequential', scenario)
    init_dir(working_dir)
    saver_dir = os.path.join(working_dir, 'models', model_name)
    init_dir(saver_dir)
    init_log(log_file_path=os.path.join(saver_dir, 'log.txt'))
    copy_config(config, os.path.join(saver_dir, 'config.yml'))
    episodic_success_rates_path = os.path.join(saver_dir, 'results.txt')
    weights_log_dir = os.path.join(saver_dir, 'weights_logs')
    init_dir(weights_log_dir)
    test_trajectories_dir = os.path.join(working_dir, 'test_trajectories', model_name)
    init_dir(test_trajectories_dir)

    # generate game
    game = _get_game(config)

    network = NetworkSequential(config, game.get_state_space_size(), game.get_action_space_size(),
                                is_rollout_agent=False)
    network_variables = network.get_all_variables()

    # save model
    latest_saver = ModelSaver(os.path.join(saver_dir, 'latest_model'), 2, 'latest', variables=network_variables)
    best_saver = ModelSaver(os.path.join(saver_dir, 'best_model'), 1, 'best', variables=network_variables)

    summaries_collector = SummariesCollector(os.path.join(working_dir, 'tensorboard', model_name), model_name)

    with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(
            gpu_options=tf.compat.v1.GPUOptions(
                per_process_gpu_memory_fraction=config['general']['gpu_usage']))) as sess:
        sess.run(tf.compat.v1.global_variables_initializer())

        episode_runner = EpisodeRunnerSequential(config, game,
                                                 curriculum_coefficient=get_initial_curriculum(config))

        trainer = TrainerSequential(model_name, config, working_dir, network, sess, episode_runner,
                                    summaries_collector)

        decrease_learn_rate_if_static_success = config['model']['decrease_learn_rate_if_static_success']
        stop_training_after_learn_rate_decrease = config['model']['stop_training_after_learn_rate_decrease']
        reset_best_every = config['model']['reset_best_every']

        global_step = 0
        best_cost, best_cost_global_step, best_curriculum_coefficient = None, None, None
        no_test_improvement, consecutive_learn_rate_decrease = 0, 0

        for cycle in range(config['general']['training_cycles']):
            print_and_log('starting cycle {}'.format(cycle))

            global_step, success_ratio = trainer.train_policy(global_step)

            if (cycle + 1) % config['policy']['decrease_std_every'] == 0:
                network.decrease_base_std(sess)
                print_and_log('new base stds {}'.format(network.get_base_std(sess)))

            print_and_log('done training cycle {} global step {}'.format(cycle, global_step))

            # save every now and then
            if cycle % config['general']['save_every_cycles'] == 0:
                latest_saver.save(sess, global_step=global_step)

            if cycle % config['general']['test_frequency'] == 0:
                # do test
                test_successes, test_cost, _, endpoints_by_path = trainer.collect_data(
                    config['general']['test_episodes'], is_train=False, use_fixed_start_goal_pairs=True)
                summaries_collector.write_test_success_summaries(
                    sess, global_step, test_successes, test_cost, episode_runner.curriculum_coefficient)
                with open(episodic_success_rates_path, 'a') as f:
                    f.write('{} {} {} {}'.format(trainer.train_episodes_counter, test_successes,
                                                 test_cost, os.linesep))

                # decide how to act next
                print_and_log('old cost was {} at step {}'.format(best_cost, best_cost_global_step))
                print_and_log('current learn rates {}'.format(network.get_learn_rate(sess)))
                print_and_log('current base stds {}'.format(network.get_base_std(sess)))

                if best_cost is None or test_cost < best_cost:
                    print_and_log('new best cost {} at step {}'.format(test_cost, global_step))
                    best_cost, best_cost_global_step = test_cost, global_step
                    best_curriculum_coefficient = episode_runner.curriculum_coefficient
                    no_test_improvement = 0
                    consecutive_learn_rate_decrease = 0
                    best_saver.save(sess, global_step)
                    test_trajectories_file = os.path.join(test_trajectories_dir, '{}.txt'.format(global_step))
                    serialize_compress(endpoints_by_path, test_trajectories_file)
                else:
                    print_and_log('new model is not the best with cost {} at step {}'.format(test_cost, global_step))
                    no_test_improvement += 1
                    print_and_log('no improvement count {} of {}'.format(
                        no_test_improvement, decrease_learn_rate_if_static_success))
                    if reset_best_every > 0 and no_test_improvement % reset_best_every == reset_best_every - 1:
                        # restore the model every once in a while if did not find a better solution in a while
                        best_saver.restore(sess)
                        episode_runner.curriculum_coefficient = best_curriculum_coefficient
                    if no_test_improvement == decrease_learn_rate_if_static_success:
                        # restore the best model
                        if config['model']['restore_on_decrease']:
                            best_saver.restore(sess)
                            episode_runner.curriculum_coefficient = best_curriculum_coefficient
                        network.decrease_learn_rates(sess)
                        no_test_improvement = 0
                        consecutive_learn_rate_decrease += 1
                        print_and_log('decreasing learn rates {} of {}'.format(
                            consecutive_learn_rate_decrease, stop_training_after_learn_rate_decrease))
                        print_and_log('new learn rates {}'.format(network.get_learn_rate(sess)))
                        if consecutive_learn_rate_decrease == stop_training_after_learn_rate_decrease:
                            print_and_log('needs to stop')
                            best_saver.restore(sess)
                            break

            if episode_runner.curriculum_coefficient is not None:
                if success_ratio > config['curriculum']['raise_when_train_above']:
                    print_and_log('current curriculum coefficient {}'.format(episode_runner.curriculum_coefficient))
                    episode_runner.curriculum_coefficient *= config['curriculum']['raise_times']
                    print_and_log('curriculum coefficient raised to {}'.format(episode_runner.curriculum_coefficient))

            # mark in log the end of cycle
            print_and_log(os.linesep)

        print_and_log('end of run best: {} from step: {}'.format(best_cost, best_cost_global_step))
        print_and_log('testing on a new set of start-goal pairs')
        best_saver.restore(sess)
        test_trajectories_file = os.path.join(test_trajectories_dir, '-1.txt')
        endpoints_by_path = trainer.collect_data(config['general']['test_episodes'], is_train=False,
                                                 use_fixed_start_goal_pairs=True)[-1]
        serialize_compress(endpoints_by_path, test_trajectories_file)

    close_log()
    return best_cost
def _get_callbacks(self):
    tensor_board_callback = MyTensorBoard(log_dir=self._log_directory, histogram_freq=1,
                                          embeddings_layer_names=True, write_graph=True)
    model_saver_callback = ModelSaver(self._save_model_path, monitor='mean_q', mode='max',
                                      logger=self._logger)
    episode_logger_callback = EpisodeLogger(logger=self._logger)
    callbacks = [tensor_board_callback, model_saver_callback, episode_logger_callback]
    return callbacks
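
# The callbacks returned above would typically be passed to the agent's fit call, e.g.
# agent.fit(env, callbacks=self._get_callbacks()) for a keras-rl style agent
# (an assumption; the agent and environment types are not shown in this snippet).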