def eval_on_dataset(self, data_loader):
    def reset_confidence_interval_95():
        try:
            del self.eval_acc
        except AttributeError as e:
            pass

    self.eval_acc = {"Top1Acc": [], "Top5Acc": []}
    # reset_confidence_interval_95()
    self._model.eval()
    set_random_seeds(0)  # always evaluate on the same dataset.
    eval_stats = DAverageMeter()
    n = len(data_loader)
    with torch.no_grad():
        for i, batch in enumerate(
                tqdm(data_loader) if self.is_tqdm else data_loader):
            eval_stats_this = self.eval_on_batch(batch)
            eval_stats.update(eval_stats_this)
            self.eval_acc['Top1Acc'].append(eval_stats_this['Top1Acc'])
            self.eval_acc['Top5Acc'].append(eval_stats_this['Top5Acc'])
            if (i + 1) == n:
                for k, v in self.eval_acc.items():
                    stds = torch.std(torch.tensor(v).float())
                    ci95 = 1.96 * stds / torch.sqrt(torch.tensor(n).float())
                    eval_stats.update({
                        k + '_std': stds.item(),
                        k + '_cnf95': ci95.item()
                    })
    return eval_stats.average()
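# -----------------------------------------------------------------------------
# Note: every snippet in this collection calls a set_random_seeds helper that is
# imported from a project-specific utils module and never shown here. A minimal
# sketch of what such a helper typically looks like for the PyTorch snippets is
# given below; it is an assumption, not the actual implementation from any of
# these repositories.
import random

import numpy as np
import torch


def set_random_seeds(random_seed=0):
    """Seed every RNG the training code touches so runs are repeatable."""
    random.seed(random_seed)                 # Python's built-in RNG
    np.random.seed(random_seed)              # NumPy RNG
    torch.manual_seed(random_seed)           # seeds CPU and CUDA generators in recent PyTorch
    torch.cuda.manual_seed_all(random_seed)  # explicit per-device seeding for multi-GPU setups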
def main(unused_argv):
    set_random_seeds()
    get_datapath()  # the dataset path
    get_steps()  # set the number of steps according to data_size
    tf.logging.set_verbosity(tf.logging.INFO)
    print('Now the mode of this run is {}!'.format(FLAGS.mode))

    # If log_dir does not exist, create it.
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)

    if FLAGS.mode == 'decode':
        FLAGS.branch_batch_size = FLAGS.beam_size  # for beam search
        FLAGS.TS_mode = False
    hps = make_hps()  # make an hps namedtuple

    # Vocabulary
    vocab = Vocab(hps.vocab_path, hps.vocab_size)

    # Train or inference
    if hps.mode == 'train':
        batcher = Batcher(hps.data_path, vocab, hps)
        eval_hps = hps._replace(mode='eval')
        eval_batcher = Batcher(hps.eval_data_path, vocab, eval_hps)
        model = GSNModel(hps, vocab)
        train(model, batcher, eval_batcher, vocab, hps)
    elif hps.mode == 'decode':
        decode_mdl_hps = hps._replace(max_dec_steps=1)
        batcher = Batcher(hps.test_data_path, vocab, decode_mdl_hps)  # for test
        model = GSNModel(decode_mdl_hps, vocab)
        decoder = BeamSearchDecoder(model, batcher, vocab)
        decoder._decode()
def test_epoch(epoch, experiment):
    testloaders, testsets = experiment.create_test_dataloaders()
    use_cuda = experiment.use_cuda
    net = experiment.net
    summaries = experiment.summaries
    criterion = experiment.criterion
    net.eval()
    utils.set_random_seeds(1234)
    with torch.no_grad():
        for i, (testloader, testname) in enumerate(testloaders):
            stats = get_stats()
            print("Testing on {}".format(testname))
            for batch_idx, input_set in enumerate(testloader):
                experiment.step = epoch * len(experiment.trainloader) + int(
                    batch_idx / len(testloader) * len(experiment.trainloader))
                experiment.iter = batch_idx
                torch.cuda.empty_cache()
                inputs, targets = input_set
                if use_cuda:
                    inputs = inputs.cuda()
                    targets = targets.cuda()
                # inputs, targets = experiment.data_preprocessing(inputs)
                # inputs, targets = Variable(inputs, requires_grad=False), Variable(targets, requires_grad=False)
                pred = torch.clamp(net(inputs), 0.0, 1.0)
                batch_loss = criterion(pred, targets)
                loss = batch_loss.mean()
                stats["loss"].update(loss.data)
                psnr_iter = metrics.psnr(pred, targets, maxval=1).mean().data
                ssim_iter = metrics.ssim(pred, targets)
                stats["psnr"].update(psnr_iter, pred.size(0))
                stats["ssim"].update(ssim_iter.data, pred.size(0))
                progress_bar(
                    batch_idx, len(testloader),
                    'Loss: %.5f | PSNR: %.2f | SSIM: %.3f' %
                    (stats["loss"].avg, stats["psnr"].avg, stats["ssim"].avg))
                # save predicted image
                learned_img = Image.fromarray(
                    (255 * pred[0, 0].cpu().data.numpy()).astype(np.uint8))
                filename = os.path.join(
                    './n3net-results',
                    testsets[0][i].at(batch_idx).split(
                        '/home/pacole2/Projects/datasets/DeepLesionTestPreprocessed/miniStudies/'
                    )[1])
                directory = os.path.dirname(filename)
                if not os.path.exists(directory):
                    os.makedirs(directory)
                learned_img.save(filename)
                del pred, inputs, targets
            add_summary(experiment, summaries, testname + "/epoch", epoch)
            for k, stat in stats.items():
                add_summary(experiment, summaries, testname + "/" + k, stat.avg)
def run_one_config(opt, model_type, case_study=False):
    set_random_seeds()
    dataset = DataSet(opt, model_type)
    model_manager = ModelManager(opt)
    model, train_time = model_manager.build_model(model_type, dataset)
    evaluator = Evaluator(opt)
    metrics = evaluator.eval(model, model_type, dataset.test_loader)
    evaluator.write_performance(model_type, metrics, train_time)
    run_case_study(model, dataset, opt, case_study)
def train_on_dataset(self, data_loader):
    self._model.train()
    # Train on a different subset each time; all epochs come in a fixed order.
    set_random_seeds(self.curr_epoch)
    train_stats = DAverageMeter()
    for i, batch in enumerate(
            tqdm(data_loader) if self.is_tqdm else data_loader):
        train_stats_this = self.train_on_batch(batch)
        train_stats.update(train_stats_this)
    return train_stats.average()
def main():
    random_seed = 0
    num_classes = 10
    l1_regularization_strength = 0
    l2_regularization_strength = 1e-4
    learning_rate = 1e-1
    num_epochs = 200
    cuda_device = torch.device("cuda:0")
    cpu_device = torch.device("cpu:0")

    model_dir = "saved_models"
    model_filename = "resnet18_cifar10.pt"
    model_filepath = os.path.join(model_dir, model_filename)

    set_random_seeds(random_seed=random_seed)

    # Create an untrained model.
    model = create_model(num_classes=num_classes)

    train_loader, test_loader, classes = prepare_dataloader(
        num_workers=8, train_batch_size=128, eval_batch_size=256)

    # Train model.
    print("Training Model...")
    model = train_model(model=model,
                        train_loader=train_loader,
                        test_loader=test_loader,
                        device=cuda_device,
                        l1_regularization_strength=l1_regularization_strength,
                        l2_regularization_strength=l2_regularization_strength,
                        learning_rate=learning_rate,
                        num_epochs=num_epochs)

    # Save model.
    save_model(model=model, model_dir=model_dir, model_filename=model_filename)

    # Load a pretrained model.
    model = load_model(model=model,
                       model_filepath=model_filepath,
                       device=cuda_device)

    _, eval_accuracy = evaluate_model(model=model,
                                      test_loader=test_loader,
                                      device=cuda_device,
                                      criterion=None)

    classification_report = create_classification_report(
        model=model, test_loader=test_loader, device=cuda_device)

    print("Test Accuracy: {:.3f}".format(eval_accuracy))
    print("Classification Report:")
    print(classification_report)
def run_weight_test(reset_rmsprop):
    tf.reset_default_graph()
    utils.set_random_seeds(0)
    sess = tf.Session()

    env = generic_preprocess(gym.make('Pong-v0'), max_n_noops=0)
    env.seed(0)

    with tf.variable_scope('global'):
        make_inference_network(n_actions=env.action_space.n,
                               weight_inits='glorot')
    shared_variables = tf.global_variables()

    optimizer = tf.train.RMSPropOptimizer(learning_rate=5e-4,
                                          decay=0.99,
                                          epsilon=1e-5)

    network1 = Network(scope="worker_1",
                       n_actions=env.action_space.n,
                       entropy_bonus=0.01,
                       value_loss_coef=0.5,
                       weight_inits='glorot',
                       max_grad_norm=0.5,
                       optimizer=optimizer,
                       summaries=False,
                       debug=False)
    w1 = Worker(sess=sess, env=env, network=network1, log_dir='/tmp')

    network2 = Network(scope="worker_2",
                       n_actions=env.action_space.n,
                       entropy_bonus=0.01,
                       value_loss_coef=0.5,
                       weight_inits='glorot',
                       max_grad_norm=0.5,
                       optimizer=optimizer,
                       summaries=False,
                       debug=False)
    w2 = Worker(sess=sess, env=env, network=network2, log_dir='/tmp')

    rmsprop_init_ops = [v.initializer for v in optimizer.variables()]

    sess.run(tf.global_variables_initializer())

    vars_sum_init = sess.run(get_var_sum(shared_variables))
    w1.run_update(n_steps=1)
    vars_sum_post_w1_update = sess.run(get_var_sum(shared_variables))
    if reset_rmsprop:
        sess.run(rmsprop_init_ops)
    w2.run_update(n_steps=1)
    vars_sum_post_w2_update = sess.run(get_var_sum(shared_variables))

    return vars_sum_init, vars_sum_post_w1_update, vars_sum_post_w2_update
def test_epoch(epoch, experiment):
    testloaders = experiment.create_test_dataloaders()
    use_cuda = experiment.use_cuda
    net = experiment.net
    summaries = experiment.summaries
    criterion = experiment.criterion
    net.eval()
    utils.set_random_seeds(1234)
    with torch.no_grad():
        for testloader, testname in testloaders:
            stats = get_stats()
            print("Testing on {}".format(testname))
            for batch_idx, inputs in enumerate(testloader):
                experiment.step = epoch * len(experiment.trainloader) + int(
                    batch_idx / len(testloader) * len(experiment.trainloader))
                experiment.iter = batch_idx
                torch.cuda.empty_cache()
                if use_cuda:
                    inputs = inputs.cuda()
                inputs, targets = experiment.data_preprocessing(inputs)
                # CLAMP values to [0,1] after adding noise
                inputs = torch.clamp(inputs, min=0, max=1)
                inputs, targets = Variable(inputs, requires_grad=False), Variable(
                    targets, requires_grad=False)
                pred = net(inputs)
                batch_loss = criterion(pred, targets)
                loss = batch_loss.mean()
                stats["loss"].update(loss.data)
                psnr_iter = metrics.psnr(pred, targets, maxval=1).mean().data
                ssim_iter = metrics.ssim(pred, targets)
                stats["psnr"].update(psnr_iter, pred.size(0))
                stats["ssim"].update(ssim_iter.data, pred.size(0))
                progress_bar(
                    batch_idx, len(testloader),
                    'Loss: %.5f | PSNR: %.2f | SSIM: %.3f' %
                    (stats["loss"].avg, stats["psnr"].avg, stats["ssim"].avg))
                del pred, inputs, targets
            add_summary(experiment, summaries, testname + "/epoch", epoch)
            for k, stat in stats.items():
                add_summary(experiment, summaries, testname + "/" + k, stat.avg)
def main() -> None:
    """
    Program entry point. Parses command line arguments to decide which dataset
    and model to use.
    Originally written as a group for the common pipeline. Later amended by
    Adam Jaamour.
    :return: None.
    """
    set_random_seeds()
    parse_command_line_arguments()
    print_num_gpus_available()

    # Create label encoder.
    l_e = create_label_encoder()

    # Run in training mode.
    if config.run_mode == "train":
        print("-- Training model --\n")

        # Start recording time.
        start_time = time.time()

        # Multi-class classification (mini-MIAS dataset).
        if config.dataset == "mini-MIAS":
            # Import entire dataset.
            images, labels = import_minimias_dataset(
                data_dir="../data/{}/images_processed".format(config.dataset),
                label_encoder=l_e)

            # Split dataset into training/test/validation sets (80/20% split).
            X_train, X_test, y_train, y_test = dataset_stratified_split(
                split=0.20, dataset=images, labels=labels)

            # Create CNN model and split training/validation set (80/20% split).
            model = CnnModel(config.model, l_e.classes_.size)
            X_train, X_val, y_train, y_val = dataset_stratified_split(
                split=0.25, dataset=X_train, labels=y_train)

            # Calculate class weights.
            class_weights = calculate_class_weights(y_train, l_e)

            # Data augmentation.
            y_train_before_data_aug = y_train
            X_train, y_train = generate_image_transforms(X_train, y_train)
            y_train_after_data_aug = y_train
            np.random.shuffle(y_train)

            if config.verbose_mode:
                print("Before data augmentation:")
                print(Counter(list(map(str, y_train_before_data_aug))))
                print("After data augmentation:")
                print(Counter(list(map(str, y_train_after_data_aug))))

            # Fit model.
            if config.verbose_mode:
                print("Training set size: {}".format(X_train.shape[0]))
                print("Validation set size: {}".format(X_val.shape[0]))
                print("Test set size: {}".format(X_test.shape[0]))
            model.train_model(X_train, X_val, y_train, y_val, class_weights)

        # Binary classification (binarised mini-MIAS dataset).
        elif config.dataset == "mini-MIAS-binary":
            # Import entire dataset.
            images, labels = import_minimias_dataset(
                data_dir="../data/{}/images_processed".format(config.dataset),
                label_encoder=l_e)

            # Split dataset into training/test/validation sets (80/20% split).
            X_train, X_val, y_train, y_val = dataset_stratified_split(
                split=0.20, dataset=images, labels=labels)

            # Create CNN model and split training/validation set (80/20% split).
            model = CnnModel(config.model, l_e.classes_.size)
            # model.load_minimias_weights()
            # model.load_minimias_fc_weights()

            # Fit model.
            if config.verbose_mode:
                print("Training set size: {}".format(X_train.shape[0]))
                print("Validation set size: {}".format(X_val.shape[0]))
            model.train_model(X_train, X_val, y_train, y_val, None)

        # Binary classification (CBIS-DDSM dataset).
        elif config.dataset == "CBIS-DDSM":
            images, labels = import_cbisddsm_training_dataset(l_e)

            # Split training dataset into training/validation sets (75%/25% split).
            X_train, X_val, y_train, y_val = dataset_stratified_split(
                split=0.25, dataset=images, labels=labels)
            train_dataset = create_dataset(X_train, y_train)
            validation_dataset = create_dataset(X_val, y_val)

            # Calculate class weights.
            class_weights = calculate_class_weights(y_train, l_e)

            # Create and train CNN model.
            model = CnnModel(config.model, l_e.classes_.size)
            # model.load_minimias_fc_weights()
            # model.load_minimias_weights()

            # Fit model.
            if config.verbose_mode:
                print("Training set size: {}".format(X_train.shape[0]))
                print("Validation set size: {}".format(X_val.shape[0]))
            model.train_model(train_dataset, validation_dataset, None, None,
                              class_weights)

        # Save training runtime.
        runtime = round(time.time() - start_time, 2)

        # Save the model and its weights/biases.
        model.save_model()
        model.save_weights()

        # Evaluate training results.
        print_cli_arguments()
        if config.dataset == "mini-MIAS":
            model.make_prediction(X_val)
            model.evaluate_model(y_val, l_e, 'N-B-M', runtime)
        elif config.dataset == "mini-MIAS-binary":
            model.make_prediction(X_val)
            model.evaluate_model(y_val, l_e, 'B-M', runtime)
        elif config.dataset == "CBIS-DDSM":
            model.make_prediction(validation_dataset)
            model.evaluate_model(y_val, l_e, 'B-M', runtime)
        print_runtime("Training", runtime)

    # Run in testing mode.
    elif config.run_mode == "test":
        print("-- Testing model --\n")

        # Start recording time.
        start_time = time.time()

        # Test multi-class classification (mini-MIAS dataset).
        if config.dataset == "mini-MIAS":
            images, labels = import_minimias_dataset(
                data_dir="../data/{}/images_processed".format(config.dataset),
                label_encoder=l_e)
            _, X_test, _, y_test = dataset_stratified_split(
                split=0.20, dataset=images, labels=labels)
            model = load_trained_model()
            predictions = model.predict(x=X_test)
            runtime = round(time.time() - start_time, 2)
            test_model_evaluation(y_test, predictions, l_e, 'N-B-M', runtime)

        # Test binary classification (binarised mini-MIAS dataset).
        elif config.dataset == "mini-MIAS-binary":
            pass

        # Test binary classification (CBIS-DDSM dataset).
        elif config.dataset == "CBIS-DDSM":
            images, labels = import_cbisddsm_testing_dataset(l_e)
            test_dataset = create_dataset(images, labels)
            model = load_trained_model()
            predictions = model.predict(x=test_dataset)
            runtime = round(time.time() - start_time, 2)
            test_model_evaluation(labels, predictions, l_e, 'B-M', runtime)

        print_runtime("Testing", runtime)
# Required imports
import os
import sys
import numpy as np
import pandas as pd
import argparse
from datetime import datetime
import multiprocessing as mp
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

from utils import (debug, get_parser, timeit, parse_arg, get_seed,
                   set_random_seeds)
from data_utils import RandomForestFeatureSelector, get_bins
# Keras imports in main() so that we can select cpu or specific gpu

# Set random seeds
set_random_seeds()

# Set matplotlib params
font = {
    'family': 'sans-serif',
    'style': 'normal',
    'weight': 'bold',
    'size': 22
}
plt.rc('font', **font)


# Debug (print) arguments
###########################################
def debug_args(args):
    """Print out all of the arguments if debugging."""
def main(argv):
    args = parser.parse_args(argv)

    # Load configuration
    conf = Configuration.from_json(args.config)
    conf.args = args
    if args.conf:
        new_conf_entries = {}
        for arg in args.conf:
            key, value = arg.split('=')
            new_conf_entries[key] = value
        conf.update(new_conf_entries)

    # Setup log directory
    if args.run_dir:
        conf.run_dir = args.run_dir
    elif args.resume:
        if os.path.exists(args.resume):
            conf.run_dir = os.path.dirname(args.resume)
    if not conf.has_attr('run_dir'):
        run_name = conf.get_attr('run_name', default='unnamed_run')
        conf.run_dir = get_run_dir(args.log_dir, run_name)

    if not args.dry:
        if not os.path.isdir(conf.run_dir):
            os.mkdir(conf.run_dir)

    setup_logging(conf.run_dir, 'train', args.verbose, args.dry)
    logging.info('Commandline arguments: {}'.format(' '.join(argv)))

    if not args.dry:
        logging.info('This run is saved to: {}'.format(conf.run_dir))
        config_path = get_config_path(conf.run_dir)
        conf.serialize(config_path)

    if args.cuda != '':
        try:
            args.cuda = utils.set_cuda_env(args.cuda)
        except Exception:
            logging.critical('No free GPU on this machine. Aborting run.')
            return
        logging.info('Running on GPU {}'.format(args.cuda))

    if args.verbose:
        logging.debug(str(conf))

    utils.set_random_seeds(conf.seed)

    # Setup model
    logging.info('Setting up training runner {}'.format(conf.runner_type))
    runner = build_runner(conf, conf.runner_type, args.cuda, mode='train')
    if args.print_model:
        print(str(runner))
    if args.print_parameters:
        print_model_parameters(runner)

    # Handle resuming from checkpoint
    restore_state = None
    if args.resume:
        if os.path.exists(args.resume):
            restore_state = restore_checkpoint(args.resume, runner)
            logging.info('Restored checkpoint from {}'.format(args.resume))
        else:
            logging.critical(('Checkpoint {} to restore '
                              'from not found').format(args.resume))
            return

    use_tensorboard = conf.get_attr('use_tensorboard',
                                    default=DEFAULT_USE_TENSORBOARD)
    if use_tensorboard and not args.dry:
        from tensorboardX import SummaryWriter
        summary_writer = SummaryWriter(conf.run_dir)
        logging.debug('Using tensorboardX summary writer')
    else:
        summary_writer = None

    # Load datasets
    num_workers = conf.get_attr('num_data_workers',
                                default=DEFAULT_NUM_WORKERS)
    num_train_samples = conf.get_attr('num_train_subset_samples', default=None)
    num_val_samples = conf.get_attr('num_validation_subset_samples',
                                    default=None)

    train_dataset_name = conf.get_attr('train_dataset', alternative='dataset')
    logging.info('Loading training dataset {}'.format(train_dataset_name))
    train_dataset = load_dataset(conf, args.data_dir, train_dataset_name,
                                 'train')
    train_sampler = maybe_get_subset_sampler(num_train_samples, train_dataset)
    train_loader = DataLoader(dataset=train_dataset,
                              num_workers=num_workers,
                              batch_size=conf.batch_size,
                              sampler=train_sampler,
                              shuffle=train_sampler is None,
                              worker_init_fn=utils.set_worker_seeds)

    val_dataset_name = conf.get_attr('validation_dataset',
                                     alternative='dataset')
    logging.info('Loading validation dataset {}'.format(val_dataset_name))
    val_dataset = load_dataset(conf, args.data_dir, val_dataset_name, 'val')
    val_sampler = maybe_get_subset_sampler(num_val_samples, val_dataset)
    val_loader = DataLoader(dataset=val_dataset,
                            num_workers=num_workers,
                            batch_size=conf.get_attr('validation_batch_size',
                                                     default=conf.batch_size),
                            sampler=val_sampler,
                            shuffle=False,
                            worker_init_fn=utils.set_worker_seeds)

    # Setup validation checkpoints
    chkpt_metrics = conf.get_attr('validation_checkpoint_metrics', default=[])
    chkpt_metric_dirs = {
        metric: os.path.join(conf.run_dir, 'best_' + metric)
        for metric in chkpt_metrics
    }
    for metric_dir in chkpt_metric_dirs.values():
        if not args.dry and not os.path.isdir(metric_dir):
            os.mkdir(metric_dir)

    # Setup early stopping
    if conf.has_attr('early_stopping'):
        from training.early_stopping import EarlyStopper
        early_stoppers = [
            EarlyStopper(conf.early_stopping['metric_name'],
                         conf.early_stopping['patience'],
                         conf.early_stopping.get('min_value', None),
                         conf.early_stopping.get('max_difference', None))
        ]
    elif conf.has_attr('early_stoppers'):
        from training.early_stopping import EarlyStopper
        early_stoppers = []
        for early_stopping_conf in conf.early_stoppers:
            min_value = early_stopping_conf.get('min_value', None)
            max_diff = early_stopping_conf.get('max_difference', None)
            early_stoppers.append(
                EarlyStopper(early_stopping_conf['metric_name'],
                             early_stopping_conf['patience'],
                             min_value, max_diff))
    else:
        early_stoppers = []

    logging.info('Starting training run of {} epochs'.format(conf.num_epochs))

    # Train
    try:
        train_net(conf, runner, train_loader, val_loader, args.cuda,
                  chkpt_metric_dirs, restore_state, summary_writer,
                  early_stoppers)
    except KeyboardInterrupt:
        if summary_writer is not None:
            summary_writer.close()
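# -----------------------------------------------------------------------------
# The training script above also passes utils.set_worker_seeds as the
# DataLoader's worker_init_fn; that helper is likewise not shown. A minimal
# sketch, assuming it follows the standard PyTorch recipe of deriving each
# worker's seed from torch.initial_seed(), is given below; the real helper may
# differ.
import random

import numpy as np
import torch


def set_worker_seeds(worker_id):
    """Re-seed NumPy and Python RNGs inside each DataLoader worker process."""
    # torch.initial_seed() inside a worker already includes a per-worker offset,
    # so every worker gets a distinct but reproducible seed.
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)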
parser.add_argument('--use-linear-lr-decay', action='store_true')
parser.add_argument('--use-clipped-value-loss', action='store_true')
parser.add_argument('--use-tensorboard', action='store_true')
parser.add_argument('--debug', action='store_true')
parser.add_argument('--no-render', action='store_true', default=False)

if __name__ == '__main__':
    # parse arguments
    args = parser.parse_args()
    args.cuda = False
    args.render = not args.no_render

    # set device and random seeds
    device = torch.device("cpu")
    torch.set_num_threads(1)
    utils.set_random_seeds(args.seed, args.cuda, args.debug)

    # setup environment
    envs = make_vec_envs(env_id=args.env_id,
                         seed=args.seed,
                         num_processes=args.num_processes,
                         gamma=None,
                         log_dir=None,
                         device=device,
                         obs_keys=['observation', 'desired_goal']
                         if not args.env_id.startswith('metaworld') else None,
                         allow_early_resets=True,
                         max_steps=args.num_steps,
                         evaluating=True)

    # create agent
def main():
    args, lr_args, log_dir, preprocess_wrapper, ckpt_timer = parse_args()
    easy_tf_log.set_dir(log_dir)
    utils.set_random_seeds(args.seed)

    sess = tf.Session()

    envs = make_envs(args.env_id, preprocess_wrapper, args.max_n_noops,
                     args.n_workers, args.seed, args.debug, log_dir)

    step_counter = utils.GraphCounter(sess)
    update_counter = utils.GraphCounter(sess)
    lr = make_lr(lr_args, step_counter.value)
    optimizer = make_optimizer(lr)

    networks = make_networks(n_workers=args.n_workers,
                             n_actions=envs[0].action_space.n,
                             weight_inits=args.weight_inits,
                             value_loss_coef=args.value_loss_coef,
                             entropy_bonus=args.entropy_bonus,
                             max_grad_norm=args.max_grad_norm,
                             optimizer=optimizer,
                             debug=args.debug)

    # Why save_relative_paths=True?
    # So that the plain-text 'checkpoint' file written uses relative paths,
    # which seems to be needed in order to avoid confusing saver.restore()
    # when restoring from FloydHub runs.
    global_vars = tf.trainable_variables('global')
    saver = tf.train.Saver(global_vars, max_to_keep=1,
                           save_relative_paths=True)
    checkpoint_dir = osp.join(log_dir, 'checkpoints')
    os.makedirs(checkpoint_dir)
    checkpoint_file = osp.join(checkpoint_dir, 'network.ckpt')

    if args.load_ckpt:
        print("Restoring from checkpoint '%s'..." % args.load_ckpt,
              end='', flush=True)
        saver.restore(sess, args.load_ckpt)
        print("done!")
    else:
        sess.run(tf.global_variables_initializer())

    workers = make_workers(sess=sess,
                           envs=envs,
                           networks=networks,
                           n_workers=args.n_workers,
                           log_dir=log_dir)

    worker_threads = start_workers(n_steps=args.n_steps,
                                   steps_per_update=args.steps_per_update,
                                   step_counter=step_counter,
                                   update_counter=update_counter,
                                   workers=workers)
    ckpt_timer.reset()
    step_rate = utils.RateMeasure()
    step_rate.reset(int(step_counter))
    while True:
        time.sleep(args.wake_interval_seconds)

        steps_per_second = step_rate.measure(int(step_counter))
        easy_tf_log.tflog('misc/steps_per_second', steps_per_second)
        easy_tf_log.tflog('misc/steps', int(step_counter))
        easy_tf_log.tflog('misc/updates', int(update_counter))
        easy_tf_log.tflog('misc/lr', sess.run(lr))

        alive = [t.is_alive() for t in worker_threads]

        if ckpt_timer.done() or not any(alive):
            saver.save(sess, checkpoint_file, int(step_counter))
            print("Checkpoint saved to '{}'".format(checkpoint_file))
            ckpt_timer.reset()

        if not any(alive):
            break

    for env in envs:
        env.close()
def run_worker(env_id, preprocess_wrapper, seed, worker_n, n_steps_to_run,
               ckpt_timer, load_ckpt_file, render, log_dir, max_n_noops, debug,
               steps_per_update):
    utils.set_random_seeds(seed)

    mem_log = osp.join(log_dir, "worker_{}_memory.log".format(worker_n))
    memory_profiler = MemoryProfiler(pid=-1, log_path=mem_log)
    memory_profiler.start()

    worker_log_dir = osp.join(log_dir, "worker_{}".format(worker_n))
    easy_tf_log_dir = osp.join(worker_log_dir, 'easy_tf_log')
    os.makedirs(easy_tf_log_dir)
    easy_tf_log.set_dir(easy_tf_log_dir)

    server = tf.train.Server(cluster, job_name="worker", task_index=worker_n)
    sess = tf.Session(server.target)

    with tf.device("/job:worker/task:0"):
        create_network('global')
    with tf.device("/job:worker/task:%d" % worker_n):
        w = Worker(sess=sess,
                   env_id=env_id,
                   preprocess_wrapper=preprocess_wrapper,
                   worker_n=worker_n,
                   seed=seed,
                   log_dir=worker_log_dir,
                   max_n_noops=max_n_noops,
                   debug=debug)
        init_op = tf.global_variables_initializer()

    if render:
        w.render = True

    # Worker 0 initialises the global network as well as the per-worker networks.
    # Other workers only initialise their own per-worker networks.
    sess.run(init_op)

    if worker_n == 0:
        saver = tf.train.Saver()
        checkpoint_dir = osp.join(log_dir, 'checkpoints')
        os.makedirs(checkpoint_dir)
        checkpoint_file = osp.join(checkpoint_dir, 'network.ckpt')

    if load_ckpt_file is not None:
        print("Restoring from checkpoint '%s'..." % load_ckpt_file,
              end='', flush=True)
        saver.restore(sess, load_ckpt_file)
        print("done!")

    updates = 0
    steps = 0
    ckpt_timer.reset()
    while steps < n_steps_to_run:
        start_time = time.time()

        steps_ran = w.run_update(steps_per_update)
        steps += steps_ran
        updates += 1

        end_time = time.time()
        steps_per_second = steps_ran / (end_time - start_time)

        easy_tf_log.tflog('misc/steps_per_second', steps_per_second)
        easy_tf_log.tflog('misc/steps', steps)
        easy_tf_log.tflog('misc/updates', updates)

        if worker_n == 0 and ckpt_timer.done():
            saver.save(sess, checkpoint_file)
            print("Checkpoint saved to '{}'".format(checkpoint_file))
            ckpt_timer.reset()

    memory_profiler.stop()
    if early_stopper.early_stop_check(early_stopper_metric):
        logger.info('No improvement over {} epochs, stop training'.format(
            early_stopper.max_round))
        logger.info(f'Loading the best model at epoch {early_stopper.best_epoch}')
        encoder.load_state_dict(torch.load(MODEL_SAVE_PATH + get_model_name('Encoder')))
        decoder.load_state_dict(torch.load(MODEL_SAVE_PATH + get_model_name('Decoder')))
        test_result = [early_stopper.best_ap, early_stopper.best_auc,
                       early_stopper.best_acc, early_stopper.best_loss]
        break

    test_ap, test_auc, test_acc, test_loss = eval_epoch(
        args, logger, g, test_loader, encoder, decoder, msg2mail, loss_fcn,
        device, num_test_samples)
    logger.info('Test {} Task | ap: {:.4f} | auc: {:.4f} | acc: {:.4f} | Loss: {:.4f}'.format(
        args.tasks, test_ap, test_auc, test_acc, test_loss))
    test_result = [test_ap, test_auc, test_acc, test_loss]

    if early_stopper.best_epoch == epoch:
        early_stopper.best_ap = test_ap
        early_stopper.best_auc = test_auc
        early_stopper.best_acc = test_acc
        early_stopper.best_loss = test_loss
        logger.info(f'Saving the best model at epoch {early_stopper.best_epoch}')
        torch.save(encoder.state_dict(), MODEL_SAVE_PATH + get_model_name('Encoder'))
        torch.save(decoder.state_dict(), MODEL_SAVE_PATH + get_model_name('Decoder'))


if __name__ == '__main__':
    args = get_args()
    logger = set_logger()
    logger.info(args)
    set_random_seeds(args.seed)
    train(args, logger)
def test_random_seed(self):
    # Note: TensorFlow random seeding doesn't work completely as expected.
    # tf.set_random_seed sets the graph-level seed in the current graph.
    # But operations also have their own operation-level seed, which is
    # chosen deterministically based on the graph-level seed, but also
    # based on other things.
    #
    # So if you create multiple operations in the same graph,
    # each one will be given a different operation-level seed.
    # The graph-level seed just determines what the sequence of
    # operation-level seeds will be.
    #
    # To get a bunch of operations with the same sequence of
    # operation-level seeds, we need to reset the graph before creation
    # of each bunch of operations.

    # Generate some random numbers from a specific seed
    tf.reset_default_graph()
    sess = tf.Session()
    set_random_seeds(0)
    tf_rand_var = tf.random_normal([10])

    numpy_rand_1 = np.random.rand(10)
    numpy_rand_2 = np.random.rand(10)
    np.testing.assert_raises(AssertionError, np.testing.assert_array_equal,
                             numpy_rand_1, numpy_rand_2)

    tensorflow_rand_1 = sess.run(tf_rand_var)
    tensorflow_rand_2 = sess.run(tf_rand_var)
    np.testing.assert_raises(AssertionError, np.testing.assert_array_equal,
                             tensorflow_rand_1, tensorflow_rand_2)

    python_rand_1 = [random.random() for _ in range(10)]
    python_rand_2 = [random.random() for _ in range(10)]
    np.testing.assert_raises(AssertionError, np.testing.assert_array_equal,
                             python_rand_1, python_rand_2)

    # Put the seed back and check we get the same numbers
    tf.reset_default_graph()
    sess = tf.Session()
    set_random_seeds(0)
    tf_rand_var = tf.random_normal([10])

    numpy_rand_3 = np.random.rand(10)
    numpy_rand_4 = np.random.rand(10)
    np.testing.assert_equal(numpy_rand_1, numpy_rand_3)
    np.testing.assert_equal(numpy_rand_2, numpy_rand_4)

    tensorflow_rand_3 = sess.run(tf_rand_var)
    tensorflow_rand_4 = sess.run(tf_rand_var)
    np.testing.assert_equal(tensorflow_rand_1, tensorflow_rand_3)
    np.testing.assert_equal(tensorflow_rand_2, tensorflow_rand_4)

    python_rand_3 = [random.random() for _ in range(10)]
    python_rand_4 = [random.random() for _ in range(10)]
    np.testing.assert_equal(python_rand_1, python_rand_3)
    np.testing.assert_equal(python_rand_2, python_rand_4)

    # Set a different seed and make sure we get different numbers
    set_random_seeds(1)

    numpy_rand_5 = np.random.rand(10)
    numpy_rand_6 = np.random.rand(10)
    np.testing.assert_raises(AssertionError, np.testing.assert_array_equal,
                             numpy_rand_5, numpy_rand_1)
    np.testing.assert_raises(AssertionError, np.testing.assert_array_equal,
                             numpy_rand_6, numpy_rand_2)

    tensorflow_rand_5 = sess.run(tf_rand_var)
    tensorflow_rand_6 = sess.run(tf_rand_var)
    np.testing.assert_raises(AssertionError, np.testing.assert_array_equal,
                             tensorflow_rand_5, tensorflow_rand_1)
    np.testing.assert_raises(AssertionError, np.testing.assert_array_equal,
                             tensorflow_rand_6, tensorflow_rand_2)

    python_rand_5 = [random.random() for _ in range(10)]
    python_rand_6 = [random.random() for _ in range(10)]
    np.testing.assert_raises(AssertionError, np.testing.assert_array_equal,
                             python_rand_5, python_rand_1)
    np.testing.assert_raises(AssertionError, np.testing.assert_array_equal,
                             python_rand_6, python_rand_2)
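# -----------------------------------------------------------------------------
# The test above exercises a TF1-era variant of set_random_seeds. Consistent
# with the comments in the test (the graph-level seed only fixes the sequence
# of operation-level seeds, so the caller resets the default graph first), a
# minimal sketch might look like the following; it is an assumption, not the
# repository's actual helper.
import random

import numpy as np
import tensorflow as tf


def set_random_seeds(seed):
    """Seed Python, NumPy, and the TensorFlow graph-level RNG."""
    random.seed(seed)
    np.random.seed(seed)
    # Only sets the graph-level seed; call tf.reset_default_graph() beforehand
    # to get a reproducible sequence of operation-level seeds.
    tf.set_random_seed(seed)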
from utils import get_configuration
from utils import set_random_seeds
from utils import set_configuration
from TSC_Env import TSC_Env

if __name__ == '__main__':
    args = set_configuration()
    para_config = get_configuration(args.para_dir)
    env_name = args.env
    port = args.port
    gui = args.gui
    print(para_config)
    print(env_name)
    total_episodes = para_config['total_episodes']
    sim_seed = para_config['sim_seed']
    set_random_seeds(sim_seed)
    env = TSC_Env(env_name, para_config, gui=gui, port=args.port)
    if args.load_model:
        env.agents.load_model(args.load_model_dir)
    env.run()
    if args.save_model:
        env.agents.save_model(args.save_model_dir)
    env.output_data()
    env.close()
def main(argv):
    args = parser.parse_args(argv)

    setup_logging(os.path.dirname(args.checkpoint), 'eval', args.verbose,
                  args.dry)
    logging.info('Commandline arguments: {}'.format(' '.join(argv)))

    if args.cuda != '':
        try:
            args.cuda = utils.set_cuda_env(args.cuda)
        except Exception:
            logging.critical('No free GPU on this machine. Aborting run.')
            return
        logging.info('Running on GPU {}'.format(args.cuda))

    # Load configuration
    conf = Configuration.from_json(args.config)
    conf.args = args
    if args.conf:
        new_conf_entries = {}
        for arg in args.conf:
            key, value = arg.split('=')
            new_conf_entries[key] = value
        conf.update(new_conf_entries)

    if args.verbose:
        logging.debug(conf)

    utils.set_random_seeds(conf.seed)

    if args.raw:
        # This is a hack to suppress the output transform when we request raw data
        conf.application = 'none'
        if conf.has_attr('tasks'):
            for name, task in conf.tasks.items():
                if 'application' in task:
                    logging.debug(('Changing output transform in task {} '
                                   'from {} to none').format(
                                       name, task['application']))
                    task['application'] = 'none'

    # Setup model
    runner = build_runner(conf, conf.runner_type, args.cuda, mode='test')

    # Handle resuming from checkpoint
    if args.checkpoint != 'NONE':
        if os.path.exists(args.checkpoint):
            _ = restore_checkpoint(args.checkpoint, runner, cuda=args.cuda)
            logging.info('Restored checkpoint from {}'.format(args.checkpoint))
        else:
            logging.critical(('Checkpoint {} to restore '
                              'from not found').format(args.checkpoint))
            return

    # Load datasets
    mode = 'dataset'
    if len(args.files_or_dirs) == 0:
        datasets = [load_dataset(conf, args.data_dir, conf.validation_dataset,
                                 args.fold)]
    else:
        datasets = []
        for f in args.files_or_dirs:
            if is_dataset(f):
                dataset = load_dataset(conf, args.data_dir, f, args.fold)
                datasets.append(dataset)

    if args.raw:
        mode = 'raw'

    num_samples = conf.get_attr('num_validation_subset_samples', default=None)

    # Evaluate all datasets
    for dataset in datasets:
        logging.info('Evaluating dataset {}'.format(dataset.name))
        sampler = maybe_get_subset_sampler(num_samples, dataset)
        loader = DataLoader(dataset=dataset,
                            num_workers=DEFAULT_NUM_WORKERS,
                            batch_size=1,
                            sampler=sampler,
                            shuffle=False)
        if mode == 'dataset':
            data, _, val_metrics = runner.validate(loader, len(loader))
            res_str = 'Average metrics for {}\n'.format(dataset.name)
            for metric_name, metric in val_metrics.items():
                res_str += ' {}: {}\n'.format(metric_name, metric)
            logging.info(res_str)
        else:
            data = runner.infer(loader)

        if not args.dry and (args.infer or args.dump):
            if mode == 'dataset' or mode == 'raw':
                conf_name = os.path.splitext(os.path.basename(conf.file))[0]
                output_dir = get_run_dir(args.out_dir,
                                         '{}_{}'.format(dataset.name,
                                                        conf_name))
                if not os.path.isdir(output_dir):
                    os.mkdir(output_dir)

            logging.info('Writing images to {}'.format(output_dir))

            file_idx = 0
            for batch in data:
                if mode == 'image':
                    output_dir = os.path.dirname(dataset.images[file_idx])

                named_batch = runner.get_named_outputs(batch)
                inp = named_batch['input']

                if 'prediction' in named_batch:
                    batch_size = named_batch['prediction'].shape[0]
                    filenames = [dataset.get_filename(idx)
                                 for idx in range(file_idx,
                                                  file_idx + batch_size)]
                    save_output_images(dataset, inp, named_batch['prediction'],
                                       named_batch['target'], output_dir,
                                       filenames, 'default', args.dump,
                                       args.raw)
                    file_idx += len(filenames)

            logging.info(('Finished writing images for '
                          'dataset {}').format(dataset.name))
def train(params, _run=None):
    params = Params(params)
    set_random_seeds(params.seed)

    time_now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    params.save_root = params.save_root + f'/{params.project_name}_{time_now}_{params.version}'
    os.makedirs(params.save_root, exist_ok=True)
    logging.basicConfig(
        filename=f'{params.save_root}/{params.project_name}_{time_now}_{params.version}.log',
        filemode='a',
        format='%(asctime)s - %(levelname)s: %(message)s')

    if params.num_gpus == 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    logging.info(f'Available GPUs: {torch.cuda.device_count()}')

    train2007, train_label_2007, train_bb_2007 = load_annotation(
        os.path.join(params.data_root, 'VOC2007'), 'trainval')
    test2007, test_label_2007, test_bb_2007 = load_annotation(
        os.path.join(params.data_root, 'VOC2007'), 'test')
    train2012, train_label_2012, train_bb_2012 = load_annotation(
        os.path.join(params.data_root, 'VOC2012'), 'trainval')
    test2012, test_label_2012, test_bb_2012 = load_annotation(
        os.path.join(params.data_root, 'VOC2012'), 'test')

    train_data = train2007 + test2007 + train2012
    train_label = train_label_2007 + test_label_2007 + train_label_2012
    train_bb = train_bb_2007 + test_bb_2007 + train_bb_2012
    test_data = test2012
    test_label = test_label_2012
    test_bb = test_bb_2012

    train_dataset = YoloDataset(train_data, train_bb, train_label, params, train=True)
    eval_dataset = YoloDataset(test_data, test_bb, test_label, params, train=False)
    train_loader = DataLoader(dataset=train_dataset,
                              num_workers=params.num_gpus * 8,
                              batch_size=params.batch_size,
                              shuffle=True,
                              drop_last=True,
                              pin_memory=True)
    eval_loader = DataLoader(dataset=eval_dataset,
                             num_workers=1,
                             batch_size=1,
                             shuffle=False,
                             pin_memory=True)

    model = Backbone()
    last_step = 0
    last_epoch = 0

    if params.num_gpus > 0:
        model = model.cuda()
        if params.num_gpus > 1:
            model = nn.DataParallel(model)

    if params.optim == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=params.learning_rate)
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr=params.learning_rate,
                                    momentum=0.9, nesterov=True,
                                    weight_decay=0.0005)
    criterion = SumSquareError()
    schedule = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                          factor=0.5,
                                                          verbose=True,
                                                          patience=10)

    epoch = 0
    begin_epoch = max(0, last_epoch)
    step = max(0, last_step)
    best_loss = 1e6
    logging.info('Begin to train...')
    model.train()

    import cv2 as cv
    try:
        for epoch in range(begin_epoch, params.epoch):
            for iter, (img, annotation) in enumerate(train_loader):
                output = model(img.cuda())
                loss = criterion(output, annotation.cuda())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if iter % params.save_interval == 0:
                    logging.info(f'{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")} '
                                 f'Train Epoch: {epoch} iter: {iter} loss: {loss.item()}')
                step += 1

            if epoch % params.eval_interval == 0:
                model.eval()
                epoch_loss = 0
                with torch.no_grad():
                    for iter, (img, annotation) in enumerate(eval_loader):
                        output = model(img.cuda())
                        loss = criterion(output, annotation.cuda()).item()
                        epoch_loss += loss * len(img)
                loss = epoch_loss / len(eval_dataset)
                logging.info(f'{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")} '
                             f'Eval Epoch: {epoch} loss: {loss}')
                schedule.step(loss)
                if loss < best_loss:
                    best_loss = loss
                    save_checkpoint(model, f'{params.save_root}/{epoch}_{step}.pth')
                model.train()
    except KeyboardInterrupt:
        save_checkpoint(model, f'{params.save_root}/Interrupt_{epoch}_{step}.pth')
import torch
from torch import nn
from torch import optim
import torchvision
import torchvision.transforms as transforms

import config as con
from models import Net
from time import time
from utils import (accuracy, get_cifar10_data, set_random_seeds,
                    get_model_layers, get_prms_rqr_grd)

try:
    from apex import amp
except ImportError:
    print('the apex module does not exist')

set_random_seeds(con.random_seed)

trainset, trainloader, testset, testloader = next(get_cifar10_data())
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
           'ship', 'truck')

if con.use_cuda:
    net = Net().cuda()
else:
    net = Net()

optimizer = optim.Adam(net.parameters())
if con.use_apex:
    net, optimizer = amp.initialize(net, optimizer,
                                    opt_level=con.apex_opt_level)
criterion = nn.CrossEntropyLoss()

print('num of net parameters:', sum(p.numel() for p in net.parameters()))
def main(argv):
    args = parser.parse_args(argv)

    if args.cuda != '':
        try:
            args.cuda = utils.set_cuda_env(args.cuda)
        except Exception:
            print('No free GPU on this machine. Aborting run.')
            return
        print('Running on GPU {}'.format(args.cuda))

    # Load configuration
    conf = Configuration.from_json(args.config)
    conf.args = args
    if args.conf:
        new_conf_entries = {}
        for arg in args.conf:
            key, value = arg.split('=')
            new_conf_entries[key] = value
        conf.update(new_conf_entries)

    if args.verbose:
        print(conf)

    utils.set_random_seeds(conf.seed)

    # Setup model
    runner = build_runner(conf, conf.runner_type, args.cuda, mode='train',
                          resume=args.resume is not None)
    if args.print_model:
        print(str(runner))

    # Handle resuming from checkpoint
    restore_state = None
    if args.resume:
        if os.path.exists(args.resume):
            restore_state = restore_checkpoint(args.resume, runner)
            conf.run_dir = os.path.dirname(args.resume)
            print('Restored checkpoint from {}'.format(args.resume))
        else:
            print('Checkpoint {} to restore from not found'.format(
                args.resume))
            return

    # Setup log directory
    if args.run_dir:
        conf.run_dir = args.run_dir
    if not conf.has_attr('run_dir'):
        run_name = conf.get_attr('run_name', default='unnamed_run')
        conf.run_dir = get_run_dir(args.log_dir, run_name)

    if not args.dry:
        if not os.path.isdir(conf.run_dir):
            os.mkdir(conf.run_dir)
        print('This run is saved to: {}'.format(conf.run_dir))
        config_path = get_config_path(conf.run_dir)
        conf.serialize(config_path)

    use_tensorboard = conf.get_attr('use_tensorboard',
                                    default=DEFAULT_USE_TENSORBOARD)
    if use_tensorboard and not args.dry:
        from tensorboardX import SummaryWriter
        summary_writer = SummaryWriter(conf.run_dir)
    else:
        summary_writer = None

    # Load datasets
    num_workers = conf.get_attr('num_data_workers',
                                default=DEFAULT_NUM_WORKERS)
    num_train_samples = conf.get_attr('num_train_subset_samples', default=None)
    num_val_samples = conf.get_attr('num_validation_subset_samples',
                                    default=None)

    train_dataset_name = conf.get_attr('train_dataset', alternative='dataset')
    train_dataset = load_dataset(conf, args.data_dir, train_dataset_name,
                                 'train')
    train_sampler = maybe_get_subset_sampler(num_train_samples, train_dataset)
    train_loader = DataLoader(dataset=train_dataset,
                              num_workers=num_workers,
                              batch_size=conf.batch_size,
                              sampler=train_sampler,
                              shuffle=train_sampler is None)

    val_dataset_name = conf.get_attr('validation_dataset',
                                     alternative='dataset')
    val_dataset = load_dataset(conf, args.data_dir, val_dataset_name, 'val')
    val_sampler = maybe_get_subset_sampler(num_val_samples, val_dataset)
    val_loader = DataLoader(dataset=val_dataset,
                            num_workers=num_workers,
                            batch_size=conf.get_attr('validation_batch_size',
                                                     default=conf.batch_size),
                            sampler=val_sampler,
                            shuffle=False)

    chkpt_metrics = conf.get_attr('validation_checkpoint_metrics', default=[])
    chkpt_metric_dirs = {
        metric: os.path.join(conf.run_dir, 'best_' + metric)
        for metric in chkpt_metrics
    }
    for metric_dir in chkpt_metric_dirs.values():
        if not args.dry and not os.path.isdir(metric_dir):
            os.mkdir(metric_dir)

    # Train
    try:
        train_net(conf, runner, train_loader, val_loader, args.cuda,
                  chkpt_metric_dirs, restore_state, summary_writer)
    except KeyboardInterrupt:
        if summary_writer is not None:
            summary_writer.close()
def main():
    num_classes = 10
    random_seed = 1
    l1_regularization_strength = 0
    l2_regularization_strength = 1e-4
    learning_rate = 1e-3
    learning_rate_decay = 1

    cuda_device = torch.device("cuda:0")
    cpu_device = torch.device("cpu:0")

    model_dir = "saved_models"
    model_filename = "resnet18_cifar10.pt"
    model_filename_prefix = "pruned_model"
    pruned_model_filename = "resnet18_pruned_cifar10.pt"
    model_filepath = os.path.join(model_dir, model_filename)
    pruned_model_filepath = os.path.join(model_dir, pruned_model_filename)

    set_random_seeds(random_seed=random_seed)

    # Create an untrained model.
    model = create_model(num_classes=num_classes)

    # Load a pretrained model.
    model = load_model(model=model,
                       model_filepath=model_filepath,
                       device=cuda_device)

    train_loader, test_loader, classes = prepare_dataloader(
        num_workers=8, train_batch_size=128, eval_batch_size=256)

    _, eval_accuracy = evaluate_model(model=model,
                                      test_loader=test_loader,
                                      device=cuda_device,
                                      criterion=None)

    classification_report = create_classification_report(
        model=model, test_loader=test_loader, device=cuda_device)

    num_zeros, num_elements, sparsity = measure_global_sparsity(model)

    print("Test Accuracy: {:.3f}".format(eval_accuracy))
    print("Classification Report:")
    print(classification_report)
    print("Global Sparsity:")
    print("{:.2f}".format(sparsity))

    print("Iterative Pruning + Fine-Tuning...")

    pruned_model = copy.deepcopy(model)

    # iterative_pruning_finetuning(
    #     model=pruned_model,
    #     train_loader=train_loader,
    #     test_loader=test_loader,
    #     device=cuda_device,
    #     learning_rate=learning_rate,
    #     learning_rate_decay=learning_rate_decay,
    #     l1_regularization_strength=l1_regularization_strength,
    #     l2_regularization_strength=l2_regularization_strength,
    #     conv2d_prune_amount=0.3,
    #     linear_prune_amount=0,
    #     num_iterations=8,
    #     num_epochs_per_iteration=50,
    #     model_filename_prefix=model_filename_prefix,
    #     model_dir=model_dir,
    #     grouped_pruning=True)

    iterative_pruning_finetuning(
        model=pruned_model,
        train_loader=train_loader,
        test_loader=test_loader,
        device=cuda_device,
        learning_rate=learning_rate,
        learning_rate_decay=learning_rate_decay,
        l1_regularization_strength=l1_regularization_strength,
        l2_regularization_strength=l2_regularization_strength,
        conv2d_prune_amount=0.98,
        linear_prune_amount=0,
        num_iterations=1,
        num_epochs_per_iteration=200,
        model_filename_prefix=model_filename_prefix,
        model_dir=model_dir,
        grouped_pruning=True)

    # Apply mask to the parameters and remove the mask.
    remove_parameters(model=pruned_model)

    _, eval_accuracy = evaluate_model(model=pruned_model,
                                      test_loader=test_loader,
                                      device=cuda_device,
                                      criterion=None)

    classification_report = create_classification_report(
        model=pruned_model, test_loader=test_loader, device=cuda_device)

    num_zeros, num_elements, sparsity = measure_global_sparsity(pruned_model)

    print("Test Accuracy: {:.3f}".format(eval_accuracy))
    print("Classification Report:")
    print(classification_report)
    print("Global Sparsity:")
    print("{:.2f}".format(sparsity))

    save_model(model=model, model_dir=model_dir, model_filename=model_filename)
def main(argv):
    args = parser.parse_args(argv)

    if args.cuda != '':
        try:
            args.cuda = utils.set_cuda_env(args.cuda)
        except Exception:
            print('No free GPU on this machine. Aborting run.')
            return
        print('Running on GPU {}'.format(args.cuda))

    # Load configuration
    conf = Configuration.from_json(args.config)
    conf.args = args
    if args.conf:
        new_conf_entries = {}
        for arg in args.conf:
            key, value = arg.split('=')
            new_conf_entries[key] = value
        conf.update(new_conf_entries)

    if args.verbose:
        print(conf)

    utils.set_random_seeds(conf.seed)

    # Setup model
    runner = build_runner(conf, conf.runner_type, args.cuda, mode='test')

    # Handle resuming from checkpoint
    if args.checkpoint != 'NONE':
        if os.path.exists(args.checkpoint):
            _ = restore_checkpoint(args.checkpoint, runner, cuda=args.cuda)
            print('Restored checkpoint from {}'.format(args.checkpoint))
        else:
            print('Checkpoint {} to restore from not found'.format(
                args.checkpoint))
            return

    # Evaluate on full image, not crops
    conf.full_image = True

    # Load datasets
    mode = 'dataset'
    if len(args.files_or_dirs) == 0:
        datasets = [load_dataset(conf, args.data_dir, conf.validation_dataset,
                                 args.fold)]
    else:
        datasets = []
        for f in args.files_or_dirs:
            if is_dataset(f):
                dataset = load_dataset(conf, args.data_dir, f, args.fold)
                datasets.append(dataset)
            else:
                mode = 'image'
                transform = get_sr_transform(conf, 'test', downscale=False)
                datasets = [make_sr_dataset_from_folder(conf, f, transform,
                                                        inference=True)
                            for f in args.files_or_dirs]

    num_workers = conf.get_attr('num_data_workers',
                                default=DEFAULT_NUM_WORKERS)

    # Evaluate all datasets
    for dataset in datasets:
        loader = DataLoader(dataset=dataset,
                            num_workers=num_workers,
                            batch_size=1,
                            shuffle=False)
        if mode == 'dataset':
            data, _, val_metrics = runner.validate(loader, len(loader))
            print('Average metrics for {}'.format(dataset.name))
            for metric_name, metric in val_metrics.items():
                print(' {}: {}'.format(metric_name, metric))
        else:
            data = runner.infer(loader)

        if args.infer or args.dump:
            if mode == 'dataset':
                output_dir = get_run_dir(args.out_dir, dataset.name)
                if not os.path.isdir(output_dir):
                    os.mkdir(output_dir)

            file_idx = 0
            for batch in data:
                if mode == 'image':
                    output_dir = os.path.dirname(dataset.images[file_idx])

                named_batch = runner.get_named_outputs(batch)
                inputs = named_batch['input']
                predictions = named_batch['prediction']
                targets = named_batch['target']
                for (inp, target, prediction) in zip(inputs, targets,
                                                     predictions):
                    image_file = os.path.basename(dataset.images[file_idx])
                    name, _ = os.path.splitext(image_file)
                    file_idx += 1

                    if args.dump:
                        input_file = os.path.join(output_dir,
                                                  '{}_input.png'.format(name))
                        save_image(inp.data, input_file)
                        target_file = os.path.join(output_dir,
                                                   '{}_target.png'.format(name))
                        save_image(target.data, target_file)

                    pred_file = os.path.join(output_dir,
                                             '{}_pred.png'.format(name))
                    save_image(prediction.data, pred_file)