def main():
    """Solve both parts of the day-12 puzzle and print the distinct-path counts."""
    with open("12/input.txt", encoding="UTF-8") as file:
        puzzle_lines = file.read().splitlines()

    # Part 1: count every distinct path through the cave network.
    part1_paths = PathFinder1(get_network(puzzle_lines)).get_distinct_paths()
    print("Part 1: ", len(part1_paths))

    # Part 2: rebuild the network under the relaxed part-2 rules and recount.
    part2_paths = PathFinder2(get_network(puzzle_lines, part2=True)).get_distinct_paths()
    print("Part 2: ", len(part2_paths))
def main(args):
    """Reconstruct MNIST test digits with a restored autoencoder and save image grids."""
    # Restore the trained autoencoder from the checkpoint directory.
    config = load_config(os.path.join(args.restore, 'config.json'))
    ae = get_network(config['hiddens'], logger=g_logger)
    sess, saver, _ = build_graph(ae, input_shape=[None, 784])
    restore(sess, saver, args.restore)

    # Results go under <result>/test; create the directory on first run.
    test_result = os.path.join(args.result, 'test')
    if not os.path.exists(test_result):
        os.makedirs(test_result)

    # Use the MNIST test split, one 10x10 grid of digits per batch.
    mnist = tf.contrib.learn.datasets.load_dataset('mnist')
    row_col_size = 10
    for cnt, (x, _label) in enumerate(
            next_mnist_data(mnist, 'test', batch_size=row_col_size**2)):
        # Run the batch through the autoencoder, then save input vs. reconstruction.
        x_ = sess.run(ae.x_, feed_dict={ae.x: x})
        save_mnist_images(x, test_result, cnt, suffix='original',
                          row_col_size=row_col_size)
        save_mnist_images(x_, test_result, cnt, suffix='reconstruct',
                          row_col_size=row_col_size)
def autotvm_tune(network, batch_size, dtype, target, log_prefix):
    """Tune kernels (and optionally the graph) for a network with AutoTVM.

    Records go to "<log_prefix>.kernel.log" and "<log_prefix>.graph.log";
    stale logs from a previous run are deleted before tuning starts.
    """
    kernel_log = log_prefix + ".kernel.log"
    graph_log = log_prefix + ".graph.log"
    os.makedirs(os.path.dirname(graph_log), exist_ok=True)
    for stale_log in (kernel_log, graph_log):
        if os.path.exists(stale_log):
            os.remove(stale_log)

    mod, params, input_name, input_shape, output_shape = get_network(
        network, batch_size, dtype, "NCHW")
    tuning_opt = get_tuning_option(network, batch_size, dtype, target, kernel_log)

    # Only these operator kinds are extracted as tuning tasks.
    tunable_ops = [
        relay.op.get("nn.batch_matmul"),
        relay.op.get("nn.dense"),
        relay.op.get("nn.conv2d"),
    ]
    tasks = autotvm.task.extract_from_program(mod["main"], target=target,
                                              params=params, ops=tunable_ops)
    tune_kernels(tasks, **tuning_opt)

    # Graph-level tuning only pays off for some network/target combinations.
    if use_graph_tuner(network, batch_size, dtype, target):
        tune_graph(mod["main"], input_name, input_shape, target,
                   kernel_log, graph_log)
def main():
    """Entry point: build the network, optionally resume from a checkpoint, then train."""
    args = parser()
    cfg = Config.fromfile(args.config)
    log = Logger('./cache/log/' + args.net + '_trainlog.txt', level='info')

    log.logger.info('Preparing data')
    train_loader, val_loader = dataLoad(cfg)

    start_epoch = 0
    if args.pretrain:
        log.logger.info('Loading Pretrain Data')

    # Model, loss and optimizer; training runs data-parallel on the configured GPUs.
    net = get_network(args).cuda()
    model_params(net, log)
    criterion = CrossEntropy().cuda()
    optimizer = optim.SGD(
        net.parameters(),
        lr=cfg.PARA.train.LR,
        momentum=cfg.PARA.train.momentum,
        weight_decay=cfg.PARA.train.wd,
    )
    net = torch.nn.DataParallel(net, device_ids=cfg.PARA.train.device_ids)
    torch.backends.cudnn.benchmark = True

    if args.resume:
        log.logger.info('Resuming from checkpoint')
        # Checkpoint name encodes the epoch to resume from, e.g. "120ckpt.pth".
        weighted_file = os.path.join('./cache/checkpoint/' + args.net,
                                     args.epoch + 'ckpt.pth')
        checkpoint = torch.load(weighted_file)
        net.load_state_dict(checkpoint['net'])
        start_epoch = checkpoint['epoch']

    train(start_epoch, train_loader, val_loader, cfg, net, criterion,
          optimizer, args, log)
    log.logger.info("Training Finished, Total EPOCH=%d" % cfg.PARA.train.EPOCH)
def plot():
    """Render the plot page for the current network graph."""
    graph = utils.get_network()
    return render_template('plot.html', result={'plot': utils.plot(graph)})
def __init__(self, config, storage, replay_buffer, state=None):
    """Set up the learner worker: device, network, optimizer, schedulers and counters."""
    set_all_seeds(config.seed)

    self.run_tag = config.run_tag
    self.group_tag = config.group_tag
    self.worker_id = 'learner'
    self.replay_buffer = replay_buffer
    self.storage = storage
    self.config = deepcopy(config)

    # Pick the training device; requesting a GPU without CUDA is a hard error.
    if "learner" not in self.config.use_gpu_for:
        self.device = torch.device("cpu")
    elif not torch.cuda.is_available():
        raise RuntimeError(
            "GPU was requested but torch.cuda.is_available() is False."
        )
    elif self.config.learner_gpu_device_id is not None:
        self.device = torch.device(
            "cuda:{}".format(self.config.learner_gpu_device_id))
    else:
        self.device = torch.device("cuda")

    self.network = get_network(config, self.device)
    self.network.to(self.device)
    self.network.train()

    self.optimizer = get_optimizer(config, self.network.parameters())
    self.lr_scheduler = get_lr_scheduler(config, self.optimizer)
    self.scalar_loss_fn, self.policy_loss_fn = get_loss_functions(config)

    # Running counters reported by the logger.
    self.training_step = 0
    self.losses_to_log = {'reward': 0., 'value': 0., 'policy': 0.}
    self.throughput = {
        'total_frames': 0,
        'total_games': 0,
        'training_step': 0,
        'time': {'ups': 0, 'fps': 0},
    }

    # Precompute min/max/range for observation normalisation if enabled
    # (obs_range interleaves min/max values per dimension).
    if self.config.norm_obs:
        self.obs_min = np.array(self.config.obs_range[::2], dtype=np.float32)
        self.obs_max = np.array(self.config.obs_range[1::2], dtype=np.float32)
        self.obs_range = self.obs_max - self.obs_min

    if state is not None:
        self.load_state(state)

    Logger.__init__(self)
def evaluate_network(test_data, targets, model_file, model_type, batch_size, extra_args=None):
    """Evaluate a serialized network on `test_data`.

    Returns the (errors, probs) pair from `predict`, or False when the data's
    dimensionality / class count does not match the stored model.
    """
    # Fix: the original leaked the file handle and opened the pickle in text
    # mode ('r'); pickled data must be read in binary mode.
    with open(model_file, 'rb') as fh:
        model = cPickle.load(fh)

    n_train_samples, data_dim = test_data.shape
    n_classes = len(set(targets))
    if data_dim != model['in_dim'] or n_classes != model['n_classes']:
        print("This data is not compatible with this network, exiting", file=sys.stderr)
        return False

    net = get_network(x=test_data, in_dim=model['in_dim'],
                      n_classes=model['n_classes'], model_type=model_type,
                      hidden_dim=model['hidden_dim'], extra_args=extra_args)
    net.load_from_object(model=model, careful=True)

    errors, probs = predict(test_data=test_data, true_labels=targets,
                            batch_size=batch_size, model=net, model_file=None)
    return errors, probs
def main():
    """Parse CLI options, build the data loaders and model, then train and evaluate."""
    args = parser.parse_args()
    config = vars(args)

    # One call yields all three splits, sized per the CLI batch-size options.
    loaders = get_data_loader(dataset_name=config['dataset'],
                              data_path=config['dataset_path'],
                              TRAIN_BATCH_SIZE=config['train_batch_size'],
                              VAL_BATCH_SIZE=config['val_batch_size'],
                              TEST_BATCH_SIZE=config['test_batch_size'])
    train_loader, val_loader, test_loader = loaders

    model = get_network(config['network'])
    model.train()
    model.cuda()

    train_reg2(model, train_loader, config)
    evaluate(model, val_loader, test_loader, config)
def inference(config_file, image_file):
    """Run the text-recognition network on one image file.

    Builds the inference graph from the config, restores the trained weights,
    decodes the prediction into a string, prints it and returns it.
    """
    # Config, charset and model.
    FLAGS = Flags(config_file).get()
    out_charset = load_charset(FLAGS.charset)
    num_classes = len(out_charset)
    net = get_network(FLAGS, out_charset)

    if FLAGS.use_rgb:
        num_channel = 3
        mode = cv2.IMREAD_COLOR
    else:
        num_channel = 1
        mode = cv2.IMREAD_GRAYSCALE

    # Input node
    image = tf.placeholder(tf.uint8, shape=[None, None, num_channel],
                           name='input_node')

    # Network: preprocess -> logits -> sparse prediction -> dense output node.
    proc_image = net.preprocess_image(image, is_train=False)
    proc_image = tf.expand_dims(proc_image, axis=0)
    proc_image.set_shape(
        [None, FLAGS.resize_hw.height, FLAGS.resize_hw.width, num_channel])
    logits, sequence_length = net.get_logits(proc_image, is_train=False,
                                             label=None)
    prediction, log_prob = net.get_prediction(logits, sequence_length)
    prediction = tf.sparse_to_dense(sparse_indices=prediction.indices,
                                    sparse_values=prediction.values,
                                    output_shape=prediction.dense_shape,
                                    default_value=num_classes,
                                    name='output_node')

    # Restore and run. Fix: the original never closed the session; the context
    # manager releases its resources once the prediction is fetched.
    restore_model = get_init_trained()
    with tf.Session() as sess:
        restore_model(sess, FLAGS.eval.model_path)
        img = cv2.imread(image_file, mode)
        img = np.reshape(img, [img.shape[0], img.shape[1], num_channel])
        predicted = sess.run(prediction, feed_dict={image: img})

    string = get_string(predicted[0], out_charset)
    string = adjust_string(string, FLAGS.eval.lowercase,
                           FLAGS.eval.alphanumeric)
    print(string)
    return string
def do_partition(partition, device_id):
    """Re-execute the epochs in `partition` on worker `device_id`.

    Restores state from the epoch just before the partition, then replays
    every epoch in the partition with flor's skip mode turned off.
    """
    # These names must be module-global so train()/eval_training() see them.
    global net, optimizer, clr_scheduler, loss_function, fprint
    # Only the predecessor's end state is loaded; everything else is re-executed.
    predecessor_epoch = partition[0] - 1
    if not flor.is_initialized():
        # Ray creates a new instance of the library per worker, so re-init here.
        flor.initialize(**user_settings, predecessor_id=predecessor_epoch)
    # Per-device output file so parallel workers don't collide.
    fprint = flor.utils.fprint(['data', 'rogarcia', 'flor_output'], device_id)
    # General initialization, copy/pasted from __main__: each worker builds its
    # own network/optimizer so GPU-resident state lives on the right device.
    net = get_network(args, use_gpu=True)
    flor.namespace_stack.test_force(net, 'net')
    loss_function = nn.CrossEntropyLoss()
    flor.namespace_stack.test_force(loss_function, 'loss_function')
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.0,
                          weight_decay=0.0)
    flor.namespace_stack.test_force(optimizer, 'optimizer')
    clr_scheduler = CLR_Scheduler(optimizer,
                                  net_steps=(iter_per_epoch * settings.EPOCH),
                                  min_lr=args.lr, max_lr=3.0, tail_frac=0.0)
    flor.namespace_stack.test_force(clr_scheduler, 'clr_scheduler')
    # Load the end state of the predecessor so we can re-execute in the middle.
    if predecessor_epoch >= 0:
        # Initialize the previous epoch.
        train(predecessor_epoch)
        eval_training(predecessor_epoch)
    # IMPORTANT: without this, flor would keep skipping instead of re-executing.
    flor.SKIP = False
    for epoch in partition:
        # Plain re-execution of the partition's epochs.
        train(epoch)
        (loss, acc) = eval_training(epoch)
        fprint('Test set: Average loss: {:.4f}, Accuracy: {:.4f}'.format(
            loss, acc))
    # Free cached GPU memory; original notes this step might be optional.
    torch.cuda.empty_cache()
def benchmark(network, batch_size, dtype, target, log_prefix, repeat):
    """Benchmark a network compiled with the best AutoTVM tuning records.

    Returns a numpy array of per-repeat runtimes (seconds).
    """
    layout = "NCHW"
    mod, params, input_name, input_shape, output_shape = get_network(
        network, batch_size, dtype, layout)

    # Choose graph-level or kernel-level records depending on the workload.
    if use_graph_tuner(network, batch_size, dtype, target):
        log_file = log_prefix + ".graph.log"
        apply_best = autotvm.apply_graph_best
    else:
        log_file = log_prefix + ".kernel.log"
        apply_best = autotvm.apply_history_best
    # Fix: the original only asserted existence AFTER handing the path to
    # apply_*_best; validate before the file is used.
    assert os.path.exists(
        log_file), "The log file '%s' does not exist." % log_file
    print("Use log file %s" % log_file)
    history_best_context = apply_best(log_file)

    # Build module once; the original duplicated this block in both branches.
    with history_best_context:
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(mod, target=target, params=params)
    ctx = tvm.context(str(target), 0)
    module = runtime.GraphModule(lib["default"](ctx))

    # Feed random input data; BERT takes three inputs, everything else one.
    if network in ["bert"]:
        seq_length = input_shape[0][1]
        data = np.random.uniform(size=input_shape[0])
        token_types = np.random.uniform(size=input_shape[1])
        valid_length = np.array([seq_length] * batch_size)
        module.set_input(data0=data, data1=token_types, data2=valid_length)
    else:
        data = np.random.uniform(size=input_shape)
        module.set_input(input_name, data)

    # Evaluate
    ftimer = module.module.time_evaluator("run", ctx, min_repeat_ms=500,
                                          repeat=repeat)
    return np.array(ftimer().results)
def main():
    """Train the reference/target model and dump predictions before and after."""

    def _reseed():
        # The original resets every RNG at several milestones to keep data
        # order reproducible; preserve the exact call order.
        random.seed(1234)
        np.random.seed(1234)
        tf.random.set_seed(1234)

    _reseed()
    tf.keras.backend.set_floatx('float32')

    args = get_args()
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    ref_dataloader = DataLoader(args.ref_train_path, args.ref_val_path,
                                args.ref_test_path, args.cls_num,
                                args.input_size, name="ref_dataloader",
                                output_path=args.output_path)
    tar_dataloader = DataLoader(args.tar_train_path, args.tar_val_path,
                                args.tar_test_path, args.cls_num,
                                args.input_size, name="tar_dataloader",
                                output_path=args.output_path)

    network = utils.get_network(args.nntype)
    network.freeze_layers(19)

    optimizer = tf.keras.optimizers.Adam(learning_rate=args.lr)
    D_loss = tf.keras.losses.CategoricalCrossentropy()
    C_loss = compactnes_loss
    features_model = network.get_features_model(args.test_layer)

    trainer = TrainTestHelper(network, optimizer, D_loss, C_loss, args.lambd,
                              training=True)
    validator = TrainTestHelper(network, optimizer, D_loss, C_loss, args.lambd,
                                training=False)

    # Snapshot predictions on a fixed test batch before any training.
    test_images, labels = ref_dataloader.read_batch(200, "test")
    save_predicted_results(test_images, labels, network,
                           ref_dataloader.paths_logger["test"], D_loss,
                           "before_training", args.output_path)

    _reseed()
    test_helper = TestHelper(ref_dataloader, tar_dataloader,
                             args.templates_num, args.test_num,
                             features_model, args.output_path)

    _reseed()
    train(ref_dataloader, tar_dataloader, trainer, validator, args.batchs_num,
          args.train_iterations, args.print_freq, test_helper,
          args.output_path, network)

    # Same fixed batch after training, for a direct comparison.
    save_predicted_results(test_images, labels, network,
                           ref_dataloader.paths_logger["test"], D_loss,
                           "after_training", args.output_path)
    network.save_model(args.train_iterations, args.output_path)
def load_model(session, model_name):
    """Restore the named model's latest checkpoint into `session`.

    Builds the network from params/<model_name>.json and restores weights
    from models/<model_name>; returns the constructed network.

    Raises FileNotFoundError when the model directory does not exist.
    """
    base = Path(__file__).parent
    model_path = base / 'models' / model_name
    params_path = base / 'params' / ('%s.json' % model_name)

    # Fix: validate before reading any file; the original used a bare
    # `assert` (stripped under -O) and only after the params were loaded.
    if not model_path.exists():
        raise FileNotFoundError('model directory not found: %s' % model_path)

    with open(params_path) as f:
        params = json.load(f)

    inputs = tf.placeholder(tf.float32)
    network = get_network(inputs, params)

    checkpoint = tf.train.get_checkpoint_state(model_path)
    saver = tf.train.Saver()
    saver.restore(session, checkpoint.model_checkpoint_path)
    return network
def main():
    """Load the fixed epoch-121 checkpoint and run it over the test set."""
    args = parser()
    cfg = Config.fromfile(args.config)
    log = Logger('./cache/log/' + args.net + '_testlog.txt', level='info')

    log.logger.info('==> Preparing data <==')
    test_loader = dataLoad(cfg)

    log.logger.info('==> Loading model <==')
    net = torch.nn.DataParallel(get_network(args).cuda(),
                                device_ids=cfg.PARA.train.device_ids)

    log.logger.info("==> Waiting Test <==")
    # A single hard-coded checkpoint; the original loop over all epochs
    # was left disabled.
    epoch = 121
    checkpoint = torch.load('./cache/checkpoint/' + args.net + '/' +
                            str(epoch) + 'ckpt.pth')
    net.load_state_dict(checkpoint['net'])
    test(net, epoch, test_loader, log, args, cfg)
def main():
    """Build dataloaders, model and helpers, then launch training."""
    # Deterministic runs: fixed seeds, float32 everywhere.
    np.random.seed(1234)
    tf.random.set_seed(1234)
    tf.keras.backend.set_floatx('float32')

    args = configurations.get_args()
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    ref_dataloader = DataLoader(args.ref_train_path, args.ref_val_path,
                                args.ref_test_path, args.cls_num,
                                args.input_size, name="ref_dataloader",
                                output_path=args.output_path)
    tar_dataloader = DataLoader(args.tar_train_path, args.tar_val_path,
                                args.tar_test_path, args.cls_num,
                                args.input_size, name="tar_dataloader",
                                output_path=args.output_path)

    network = utils.get_network(args.nntype)
    network.freeze_layers(args.last_frozen_layer)

    optimizer = tf.keras.optimizers.Adam(learning_rate=args.lr)
    D_loss = tf.keras.losses.SparseCategoricalCrossentropy()
    C_loss = compactnes_loss

    trainer = TrainTestHelper(network, optimizer, D_loss, C_loss, args.lambd,
                              training=True)
    validator = TrainTestHelper(network, optimizer, D_loss, C_loss, args.lambd,
                                training=False)
    test_helper = TestHelper(ref_dataloader, tar_dataloader,
                             args.templates_num, args.test_num, network)

    train(ref_dataloader, tar_dataloader, trainer, validator, args.batchs_num,
          args.train_iterations, args.print_freq, test_helper,
          args.output_path)
def benchmark(network, batch_size, dtype, target, log_file, repeat):
    """Benchmark a network compiled with the best auto-scheduler records.

    Returns a numpy array of per-repeat runtimes (seconds).
    """
    layout = "NHWC"
    mod, params, input_name, input_shape, output_shape = get_network(
        network, batch_size, dtype, layout
    )

    assert os.path.exists(log_file), "The log file '%s' does not exist." % log_file
    print("Use log file %s" % log_file)

    # Build module once; the original duplicated this block in both branches.
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
        ):
            lib = relay.build(mod, target=target, params=params)
    ctx = tvm.context(str(target), 0)
    module = runtime.GraphModule(lib["default"](ctx))

    # Feed random input data; BERT takes three inputs, everything else one.
    if network in ["bert"]:
        seq_length = input_shape[0][1]
        data = np.random.uniform(size=input_shape[0])
        token_types = np.random.uniform(size=input_shape[1])
        valid_length = np.array([seq_length] * batch_size)
        module.set_input(data0=data, data1=token_types, data2=valid_length)
    else:
        data = np.random.uniform(size=input_shape)
        module.set_input(input_name, data)

    # Evaluate
    ftimer = module.module.time_evaluator("run", ctx, min_repeat_ms=500, repeat=repeat)
    return np.array(ftimer().results)
def main():
    """Train on reference data plus the "knife"/"sword" target classes."""
    args = configurations.get_args()

    ref_labels = dataloader.read_labels_file(args.reflabelpath)
    classes_num = len(np.unique(ref_labels))
    ref_images_paths = dataloader.get_images_path(args.refpath)
    target_images_paths = get_target_images_by_classes(args.targetpath,
                                                       ["knife", "sword"])

    ref_dataloader = dataloader.Dataloader(ref_images_paths, classes_num,
                                           ref_labels)
    target_dataloader = dataloader.Dataloader(target_images_paths, classes_num)

    network = utils.get_network(args.nntype)
    optimizer = tf.keras.optimizers.Adam(learning_rate=args.lr)
    trainer = Trainer(network, optimizer, args.lambd, compactnes_loss,
                      descriptiveness_loss)

    # At least one iteration per epoch, even for tiny datasets
    # (true division kept, as in the original).
    num_iterations = max(len(ref_images_paths) / args.batches, 1)
    train(ref_dataloader, target_dataloader, trainer, args.batches,
          num_iterations, args.epochs)
def auto_scheduler_tune(network, batch_size, dtype, target, log_file):
    """Extract auto-scheduler tasks for a network and record them to `log_file`.

    NOTE: the per-task tuning loop and `TaskScheduler(...).tune(tuning_opt)`
    were unreachable dead code after an early `return` in the original; they
    have been removed. Re-add a TaskScheduler run before the final return if
    tuning should be re-enabled (tuning_opt is still built for that purpose).
    """
    os.makedirs(os.path.dirname(log_file), exist_ok=True)

    layout = "NHWC"
    mod, params, input_name, input_shape, output_shape = get_network(
        network, batch_size, dtype, layout)
    n_trials = network_to_n_trials[(network, batch_size, dtype,
                                    str(target.kind))]

    # Build the tuning options even though tuning is currently disabled, so
    # side effects (e.g. starting the RPC measure context) are preserved.
    if "cpu" in target.keys:
        tuning_opt = auto_scheduler.TuningOptions(
            num_measure_trials=n_trials,
            runner=auto_scheduler.LocalRunner(repeat=10,
                                              enable_cpu_cache_flush=True),
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )
    else:
        # BERT needs a longer minimum measurement window than the CNNs.
        min_repeat_ms = 450 if network in ["bert"] else 300
        measure_ctx = auto_scheduler.LocalRPCMeasureContext(
            repeat=1, min_repeat_ms=min_repeat_ms, timeout=10)
        tuning_opt = auto_scheduler.TuningOptions(
            num_measure_trials=n_trials,
            runner=measure_ctx.runner,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        )

    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params,
                                                       target)
    print(log_file)
    update_file(log_file, tasks)
    return
def main():
    """Load a trained checkpoint and optionally render hot maps for listed images."""
    # Fixed seeds for reproducibility.
    random.seed(1234)
    np.random.seed(1234)
    tf.random.set_seed(1234)
    tf.keras.backend.set_floatx('float32')

    args = get_args()
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    ref_dataloader = DataLoader(args.ref_train_path, args.ref_val_path,
                                args.ref_test_path, args.cls_num,
                                args.input_size, name="ref_dataloader",
                                output_path=args.output_path)
    tar_dataloader = DataLoader(args.tar_train_path, args.tar_val_path,
                                args.tar_test_path, args.cls_num,
                                args.input_size, name="tar_dataloader",
                                output_path=args.output_path)

    network = utils.get_network(args.nntype)
    network.freeze_layers(19)
    network.load_model(args.ckpt_dir)
    features_model = network.get_features_model(args.test_layer)

    test_helper = TestHelper(ref_dataloader, tar_dataloader,
                             args.templates_num, args.test_num,
                             features_model, args.output_path)

    # Fix: compare against None with `is not None`, not `!=`.
    if args.hot_map_paths is not None:
        # One image path per line in the listing file.
        with open(args.hot_map_paths, "r") as f:
            paths = [line.rstrip('\n') for line in f]
        test_helper.predict_hot_maps(paths, args.kernel_size, args.stride,
                                     args.input_size)
def main():
    """Fine-tune the classifier and save predictions before and after training."""
    tf.keras.backend.set_floatx('float32')

    args = get_args()
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    dataloader = DataLoader(args.train_path, args.val_path, args.test_path,
                            args.cls_num, args.input_size, name="dataloader",
                            output_path=args.output_path)

    network = utils.get_network(args.nntype)
    network.freeze_layers(19)

    optimizer = tf.keras.optimizers.Adam(learning_rate=args.lr)
    loss = tf.keras.losses.SparseCategoricalCrossentropy()
    trainer = TrainTestHelper(network, optimizer, loss, training=True)
    validator = TrainTestHelper(network, optimizer, loss, training=False)

    # The same fixed test batch is scored before and after training so the
    # two result dumps are directly comparable.
    test_images, labels = dataloader.read_batch(200, "test")
    save_predicted_results(test_images, labels, network,
                           dataloader.paths_logger["test"], loss,
                           "before_training", args.output_path)
    train(dataloader, trainer, validator, args.batchs_num,
          args.train_iterations, args.print_freq)
    save_predicted_results(test_images, labels, network,
                           dataloader.paths_logger["test"], loss,
                           "after_training", args.output_path)
def plot():
    """Serve the network plot plus the career bar charts."""
    scale = int(request.args.get('scale') or '10')

    # Query-string flags: `log` defaults to off, `drop` defaults to on.
    log_arg = request.args.get('log') or 'false'
    log = log_arg.lower() not in ['0', 'false', 'off', 'no']
    drop_arg = request.args.get('drop') or 'true'
    drop = drop_arg.lower() in ['1', 'true', 'on', 'yes']

    years = utils.get_years()
    G = utils.get_network(years)
    if len(G) < 1:
        # Nothing to draw.
        return render_template('plot.html', result={})

    result = {
        'network_plot': utils.plot_network(G, years, scale=scale),
        'years_plot': utils.plot_bars(years, sort=True, drop=drop, log=log),
        'lasts_plot': utils.plot_bars(utils.get_lasts(),
                                      title="Current position"),
        'lens_plot': utils.plot_bars(utils.get_lens(),
                                     title="Career length so far", lpos=0.5),
    }
    return render_template('plot.html', result=result)
parser.add_argument('-eval', action='store_true', default=False, help='evaluate only') parser.add_argument('-pth', type=str, default=None, help='path to model folder') parser.add_argument('-ckpt', type=str, default=None, help='path to model .pth file') args = parser.parse_args() net = get_network(args) # net = torchvision.models.resnet50().cuda() if args.distill: teacher_net = get_network(args) distill_loss = DistillationOrthogonalProjectionLoss() else: teacher_net = None distill_loss = None if args.dataset == "aircraft": # data preprocessing: normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) transform_list = transforms.Compose([ transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(),
default=0.01, help='initial learning rate') parser.add_argument('-act', type=str, default='RELU', help='Activation function to use') parser.add_argument('-error', type=float, default=0.1, help='Error Rate') parser.add_argument('-resume', type=str, default='yes', help='Resume the training') args = parser.parse_args() error = args.error print(args.act) net = get_network(args, use_gpu=args.gpu) #net = vgg16_bn() #data preprocessing: cifar100_training_loader = get_training_dataloader( settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD, num_workers=args.w, batch_size=args.b, shuffle=args.s) cifar100_test_loader = get_test_dataloader(settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD, num_workers=args.w, batch_size=args.b, shuffle=args.s)
def main(args):
    """Incrementally train a CIFAR-100 classifier over class batches.

    Classes are split into groups of `args.step_classes`; each group is
    trained in turn while a replay buffer of earlier classes limits
    forgetting. Updates are weighted by the ratio of old-data to new-data
    gradient magnitudes.
    """
    ### Run constants (kept inline, as in the original script).
    CHECKPOINT_PATH = 'checkpoint'
    EPOCH = 75
    MILESTONES = [50]
    TIME_NOW = datetime.now().isoformat()
    LOG_DIR = 'runs'
    DATASET = 'cifar-100'
    SAVE_EPOCH = 15
    ###
    # Partition the 100 class ids into contiguous groups of step_classes.
    classes = [i for i in range(100)]
    training_batches = [
        classes[i:i + args.step_classes]
        for i in range(0, len(classes), args.step_classes)
    ]
    net = get_network(args, use_gpu=True)
    checkpoint_path = os.path.join(CHECKPOINT_PATH, DATASET,
                                   str(args.step_classes),
                                   str(args.buffer_size), args.net,
                                   str(TIME_NOW))
    old_data_batch = []          # class ids already trained on
    incremental_accuracy = []    # final accuracy after each increment
    criterion = nn.CrossEntropyLoss()
    replay_dataloader = None
    replay_dataset = get_buffer_dataset(buffer_size=args.buffer_size)
    for idx, training_batch in enumerate(training_batches):
        # NOTE(review): format string has no '{}' placeholder, so the batch
        # ids are not actually printed — looks like a latent bug.
        print('Training batch: '.format(training_batch))
        # data preprocessing:
        training_loader = get_training_dataloader(include_list=training_batch,
                                                  num_workers=args.w,
                                                  batch_size=args.b,
                                                  shuffle=args.s)
        test_loader = get_test_dataloader(include_list=training_batch +
                                          old_data_batch,
                                          num_workers=args.w,
                                          batch_size=args.b,
                                          shuffle=args.s)
        new_test_loader = get_test_dataloader(include_list=training_batch,
                                              num_workers=args.w,
                                              batch_size=args.b,
                                              shuffle=args.s)
        if idx > 0:
            old_test_loader = get_test_dataloader(include_list=old_data_batch,
                                                  num_workers=args.w,
                                                  batch_size=args.b,
                                                  shuffle=args.s)
        if idx > 0:
            # Later increments train for fewer epochs than the first one.
            EPOCH = 30  # Monica
        if idx > len(training_batches) // 3:
            lr = 0.01
        else:
            lr = 0.1
        # NOTE(review): the optimizer uses a fixed lr=0.1, not the `lr`
        # variable above; `lr` is only used in the manual update steps below.
        new_data_optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9,
                                       weight_decay=5e-4)
        train_scheduler = optim.lr_scheduler.MultiStepLR(new_data_optimizer,
                                                         milestones=MILESTONES,
                                                         gamma=0.1)
        iter_per_epoch = float(len(training_loader))
        # create checkpoint folder to save model
        if not os.path.exists(checkpoint_path):
            Path(checkpoint_path).mkdir(parents=True, exist_ok=True)
        ckp_path = os.path.join(checkpoint_path,
                                '{net}-{idx}-{epoch}-{type}.pth')
        with tqdm(total=EPOCH) as pbar:
            for epoch in range(1, EPOCH):
                # Manual LR decay for the hand-rolled update path.
                if epoch == EPOCH // 3 and idx > 0:
                    lr *= .1
                net.train()
                avg_learning_ratio = 0
                if idx > 0:
                    # Fresh shuffled pass over the replay buffer each epoch.
                    replay_dataloader = DataLoader(dataset=replay_dataset,
                                                   shuffle=True,
                                                   batch_size=args.b)
                    old_dataiter = iter(replay_dataloader)
                for batch_index, (images, labels) in enumerate(training_loader):
                    if idx > 0:
                        # Draw a replay batch; restart the iterator when the
                        # buffer is exhausted.
                        try:
                            old_images, old_labels = next(old_dataiter)
                        except StopIteration:
                            old_dataiter = iter(replay_dataloader)
                            old_images, old_labels = next(old_dataiter)
                        from PIL import Image  # kept: used by the (disabled) sample dump
                        old_images_gpu = old_images.cuda()
                        old_labels_gpu = old_labels.cuda()
                        # Backward pass on the old data only to measure its
                        # gradient magnitude (squared L2 norm per parameter).
                        net.zero_grad()
                        old_outputs = net(old_images_gpu)
                        old_data_loss = criterion(old_outputs, old_labels_gpu)
                        old_data_loss.backward()
                        old_data_gradient_magnitudes = []
                        for f in net.parameters():
                            old_data_gradient_magnitudes.append(
                                f.grad.norm(2).item()**2)
                        old_magnitude = np.sum(
                            np.asarray(old_data_gradient_magnitudes))
                    # Same measurement for the new-class batch.
                    new_labels_gpu = labels.cuda()
                    new_images_gpu = images.cuda()
                    net.zero_grad()
                    outputs = net(new_images_gpu)
                    new_data_loss = criterion(outputs, new_labels_gpu)
                    new_data_loss.backward()
                    new_data_gradient_magnitudes = []
                    for f in net.parameters():
                        new_data_gradient_magnitudes.append(
                            f.grad.norm(2).item()**2)
                    new_magnitude = np.sum(
                        np.asarray(new_data_gradient_magnitudes))
                    if idx > 0:
                        # Old/new gradient ratio decides which data to step on.
                        learning_ratio = old_magnitude / new_magnitude
                        avg_learning_ratio += learning_ratio
                        if learning_ratio < .01:
                            # Old data barely matters: manual SGD step on new data.
                            net.zero_grad()
                            outputs = net(new_images_gpu)
                            new_data_loss = criterion(outputs, new_labels_gpu)
                            new_data_loss.backward()
                            for f in net.parameters():
                                f.data.sub_(lr * f.grad.data)
                        elif learning_ratio < .1:
                            # Mixed regime: step on the concatenated batches.
                            combined_images = torch.cat([images, old_images],
                                                        axis=0)
                            combined_labels = torch.cat([labels, old_labels],
                                                        axis=0)
                            combined_images = combined_images.cuda()
                            combined_labels = combined_labels.cuda()
                            net.zero_grad()
                            outputs = net(combined_images)
                            combined_data_loss = criterion(
                                outputs, combined_labels)
                            combined_data_loss.backward()
                            for f in net.parameters():
                                f.data.sub_(lr * f.grad.data)
                        else:
                            # Old data dominates: step on old data with fixed 0.1 lr.
                            net.zero_grad()
                            old_outputs = net(old_images_gpu)
                            old_data_loss = criterion(old_outputs,
                                                      old_labels_gpu)
                            old_data_loss.backward()
                            for f in net.parameters():
                                f.data.sub_(0.1 * f.grad.data)
                    else:
                        # First increment: plain optimizer + scheduler.
                        new_data_optimizer.step()
                        train_scheduler.step(epoch)
                    # Diagnostics on the first batch of the first/last epoch.
                    if (epoch == 1 or epoch == EPOCH - 1) and batch_index == 0:
                        print('New Batch Magnitude is {} at epoch {}'.format(
                            new_magnitude, epoch))
                        draw_magnitudes(
                            new_data_gradient_magnitudes,
                            '_'.join(str(i) for i in training_batch),
                            checkpoint_path, '{}_{}'.format(idx, epoch))
                        if idx > 0:
                            print(
                                'Old Batch Magnitude is {} at epoch {}'.format(
                                    old_magnitude, epoch))
                            draw_magnitudes(old_data_gradient_magnitudes,
                                            'old Class', checkpoint_path,
                                            'old_{}_{}'.format(idx, epoch))
                # Per-epoch reporting (uses the last batch's losses).
                print('Learning magnitude ratio {}'.format(
                    avg_learning_ratio / iter_per_epoch))
                if idx > 0:
                    print(
                        'Training Epoch: {epoch} \tNew Loss: {:0.4f}\t Old Loss: {:0.4f}'
                        .format(new_data_loss.item() / images.size(0),
                                old_data_loss.item() / old_images.size(0),
                                epoch=epoch))
                loss_value, acc = evaluate(net, new_test_loader, criterion)
                print('New Test set: Average loss: {:.4f}, Accuracy: {:.4f}'.
                      format(loss_value, acc))
                if idx > 0:
                    loss_value, acc = evaluate(net, old_test_loader, criterion)
                    print(
                        'Old Test set: Average loss: {:.4f}, Accuracy: {:.4f}'.
                        format(loss_value, acc))
                loss_value, acc = evaluate(net, test_loader, criterion)
                print(
                    'Complete Test set: Average loss: {:.4f}, Accuracy: {:.4f}'
                    .format(loss_value, acc))
                if epoch == EPOCH - 1:
                    incremental_accuracy.append(acc.float())
                if not epoch % SAVE_EPOCH:
                    torch.save(
                        net.state_dict(),
                        ckp_path.format(net=args.net,
                                        idx=idx,
                                        epoch=epoch,
                                        type='regular'))
                pbar.update(1)
        torch.save(
            net.state_dict(),
            ckp_path.format(net=args.net, idx=idx, epoch=epoch, type='end'))
        # Populate Replay Buffer with the just-trained classes.
        replay_dataset.append_data(training_batch)
        old_data_batch += training_batch
        # Sanity check: accuracy over the whole replay buffer so far.
        replay_dataloader = DataLoader(dataset=replay_dataset,
                                       batch_size=args.b)
        loss_value, acc = evaluate(net, replay_dataloader, criterion)
        print(
            'Replay Train set: Average loss: {:.4f}, Accuracy: {:.4f}'.format(
                loss_value, acc))
    print(incremental_accuracy)
}, is_best=is_best, checkpoint=model_dir) if __name__ == '__main__': # Load the parameters from parser args = parser.parse_args() model_name = args.model lr = args.lr epochs = args.epoch batch_size = args.batch_size logging.info("Loading the training dataset...") # fetch train dataloader train_dataloader = data_loader.train_data_loader() logging.info("- done.") # Define the model and optimizer model = utils.get_network(args) optimizer = utils.get_optimizer(model_name, model, lr) # fetch loss function loss_fn = nn.CrossEntropyLoss() # Train the model logging.info("Starting training for {} epoch(s).".format(epochs)) train(model, optimizer, loss_fn, train_dataloader)
parser.add_argument('--norecord', dest='record', action='store_false', help='whether to save checkpoint and events') parser.add_argument('--record', dest='record', action='store_true', help='whether to save checkpoint and events') parser.add_argument('--nomean', dest='meanweight', action='store_false', help='whether to') parser.set_defaults(record=False) parser.set_defaults(meanweight=True) args = parser.parse_args() print(args) print(settings.TIME_NOW) if args.gpu=='-1': use_gpu=False print(use_gpu) else: use_gpu=True os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu) net = get_network(args.net,use_gpu=use_gpu) #data preprocessing: cifar100_training_loader = get_training_dataloader( settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD, num_workers=args.w, batch_size=args.b, shuffle=args.s ) cifar100_test_loader = get_test_dataloader( settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD, num_workers=args.w, batch_size=args.b,
def main(config_file):
    """ Train text recognition network.

    Builds a multi-tower (one tower per visible GPU) TF1 training graph for a
    text-recognition model, then runs a MonitoredTrainingSession loop that
    periodically evaluates on the validation datasets and logs progress.

    Args:
        config_file: path handed to Flags(); all hyperparameters are read
            from the resulting FLAGS namedtuple.
    """
    # Parse configs
    FLAGS = Flags(config_file).get()

    # Set directory, seed, logger
    model_dir = create_model_dir(FLAGS.model_dir)
    logger = get_logger(model_dir, 'train')
    best_model_dir = os.path.join(model_dir, 'best_models')
    set_seed(FLAGS.seed)

    # Print configs (FLAGS is namedtuple-like; _asdict gives name->value)
    flag_strs = [
        '{}:\t{}'.format(name, value)
        for name, value in FLAGS._asdict().items()
    ]
    log_formatted(logger, '[+] Model configurations', *flag_strs)

    # Print system environments
    num_gpus = count_available_gpus()
    num_cpus = os.cpu_count()
    mem_size = virtual_memory().available // (1024**3)  # available RAM in GiB
    log_formatted(logger, '[+] System environments',
                  'The number of gpus : {}'.format(num_gpus),
                  'The number of cpus : {}'.format(num_cpus),
                  'Memory Size : {}G'.format(mem_size))

    # Get optimizer and network
    global_step = tf.train.get_or_create_global_step()
    optimizer, learning_rate = get_optimizer(FLAGS.train.optimizer,
                                             global_step)
    out_charset = load_charset(FLAGS.charset)
    net = get_network(FLAGS, out_charset)
    # CTC-loss models need different label handling in the loaders below.
    is_ctc = (net.loss_fn == 'ctc_loss')

    # Multi tower for multi-gpu training
    tower_grads = []
    tower_extra_update_ops = []
    tower_preds = []
    tower_gts = []
    tower_losses = []
    batch_size = FLAGS.train.batch_size
    # Even split; the last tower absorbs the remainder (see loop below).
    tower_batch_size = batch_size // num_gpus
    val_tower_outputs = []
    eval_tower_outputs = []  # NOTE(review): never appended to or read here.

    for gpu_indx in range(num_gpus):

        # Train tower
        print('[+] Build Train tower GPU:%d' % gpu_indx)
        input_device = '/gpu:%d' % gpu_indx

        # Last tower takes whatever remains so the towers sum to batch_size.
        tower_batch_size = tower_batch_size \
            if gpu_indx < num_gpus-1 \
            else batch_size - tower_batch_size * (num_gpus-1)

        # NOTE: "DatasetLodaer" is the (misspelled) project class name.
        train_loader = DatasetLodaer(
            dataset_paths=FLAGS.train.dataset_paths,
            dataset_portions=FLAGS.train.dataset_portions,
            batch_size=tower_batch_size,
            label_maxlen=FLAGS.label_maxlen,
            out_charset=out_charset,
            preprocess_image=net.preprocess_image,
            is_train=True,
            is_ctc=is_ctc,
            shuffle_and_repeat=True,
            concat_batch=True,
            input_device=input_device,
            num_cpus=num_cpus,
            num_gpus=num_gpus,
            worker_index=gpu_indx,
            use_rgb=FLAGS.use_rgb,
            seed=FLAGS.seed,
            name='train')

        tower_output = single_tower(net,
                                    gpu_indx,
                                    train_loader,
                                    out_charset,
                                    optimizer,
                                    name='train',
                                    is_train=True)
        # Drop (grad, var) pairs with a None gradient before averaging.
        tower_grads.append(
            [x for x in tower_output.grads if x[0] is not None])
        tower_extra_update_ops.append(tower_output.extra_update_ops)
        tower_preds.append(tower_output.prediction)
        tower_gts.append(tower_output.text)
        tower_losses.append(tower_output.loss)

        # Print network structure (only once, from the first tower)
        if gpu_indx == 0:
            param_stats = tf.profiler.profile(tf.get_default_graph())
            logger.info('total_params: %d\n' % param_stats.total_parameters)

        # Valid tower
        print('[+] Build Valid tower GPU:%d' % gpu_indx)
        valid_loader = DatasetLodaer(
            dataset_paths=FLAGS.valid.dataset_paths,
            dataset_portions=None,
            batch_size=FLAGS.valid.batch_size // num_gpus,
            label_maxlen=FLAGS.label_maxlen,
            out_charset=out_charset,
            preprocess_image=net.preprocess_image,
            is_train=False,
            is_ctc=is_ctc,
            shuffle_and_repeat=False,
            concat_batch=False,
            input_device=input_device,
            num_cpus=num_cpus,
            num_gpus=num_gpus,
            worker_index=gpu_indx,
            use_rgb=FLAGS.use_rgb,
            seed=FLAGS.seed,
            name='valid')

        val_tower_output = single_tower(net,
                                        gpu_indx,
                                        valid_loader,
                                        out_charset,
                                        optimizer=None,
                                        name='valid',
                                        is_train=False)
        val_tower_outputs.append(
            (val_tower_output.loss, val_tower_output.prediction,
             val_tower_output.text, val_tower_output.filename,
             val_tower_output.dataset))

    # Aggregate gradients: mean loss across towers, averaged grads per var.
    losses = tf.reduce_mean(tower_losses)
    grads = _average_gradients(tower_grads)

    # Run the apply step after the last tower's extra update ops
    # (presumably batch-norm moving-average updates -- TODO confirm).
    with tf.control_dependencies(tower_extra_update_ops[-1]):
        if FLAGS.train.optimizer.grad_clip_norm is not None:
            grads, global_norm = _clip_gradients(
                grads, FLAGS.train.optimizer.grad_clip_norm)
            tf.summary.scalar('global_norm', global_norm)
        train_op = optimizer.apply_gradients(grads, global_step=global_step)

    # Define config, scaffold
    saver = tf.train.Saver()
    sess_config = get_session_config()
    scaffold = get_scaffold(saver, FLAGS.train.tune_from, 'train')
    restore_model = get_init_trained()  # NOTE(review): unused below.

    # Define validation saver, summary writer
    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
    val_summary_op = tf.summary.merge(
        [s for s in summaries if 'valid' in s.name])
    # One FileWriter per validation dataset, plus an aggregate writer.
    val_summary_writer = {
        dataset_name: tf.summary.FileWriter(
            os.path.join(model_dir, 'valid', dataset_name))
        for dataset_name in valid_loader.dataset_names
    }
    val_summary_writer['total_valid'] = tf.summary.FileWriter(
        os.path.join(model_dir, 'valid', 'total_valid'))
    val_saver = tf.train.Saver(
        max_to_keep=len(valid_loader.dataset_names) + 1)
    best_val_err_rates = {}
    best_steps = {}

    # Training
    print('[+] Make Session...')

    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=model_dir,
            scaffold=scaffold,
            config=sess_config,
            save_checkpoint_steps=FLAGS.train.save_steps,
            save_checkpoint_secs=None,
            save_summaries_steps=FLAGS.train.summary_steps,
            save_summaries_secs=None,
    ) as sess:

        log_formatted(logger, 'Training started!')
        _step = 0
        train_t = 0  # cumulative time spent inside sess.run (seconds)
        start_t = time.time()

        while _step < FLAGS.train.max_num_steps \
                and not sess.should_stop():

            # Train step
            step_t = time.time()
            # lr is fetched for completeness; it is not used afterwards.
            [step_loss, _, _step, preds, gts, lr] = sess.run([
                losses, train_op, global_step, tower_preds[0], tower_gts[0],
                learning_rate
            ])
            train_t += time.time() - step_t

            # Summary (every FLAGS.valid.steps global steps)
            if _step % FLAGS.valid.steps == 0:

                # Train summary: sequence-level error on tower-0's last batch
                train_err = 0.
                for i, (p, g) in enumerate(zip(preds, gts)):
                    s = get_string(p, out_charset, is_ctc=is_ctc)
                    # Ground truth arrives as bytes with DELIMITER markers.
                    g = g.decode('utf8').replace(DELIMITER, '')
                    s = adjust_string(s, FLAGS.train.lowercase,
                                      FLAGS.train.alphanumeric)
                    g = adjust_string(g, FLAGS.train.lowercase,
                                      FLAGS.train.alphanumeric)
                    e = int(s != g)
                    train_err += e

                    if FLAGS.train.verbose and i < 5:
                        print('TRAIN :\t{}\t{}\t{}'.format(s, g, not bool(e)))

                train_err_rate = \
                    train_err / len(gts)

                # Valid summary: full validation pass; also tracks/saves the
                # best model per dataset via val_saver.
                val_cnts, val_errs, val_err_rates, _ = \
                    validate(sess, _step, val_tower_outputs, out_charset,
                             is_ctc, val_summary_op, val_summary_writer,
                             val_saver, best_val_err_rates, best_steps,
                             best_model_dir, FLAGS.valid.lowercase,
                             FLAGS.valid.alphanumeric)

                # Logging: per-dataset accuracy table + timing estimates
                log_strings = ['', '-' * 28 + ' VALID_DETAIL ' + '-' * 28, '']

                for dataset in sorted(val_err_rates.keys()):
                    if dataset == 'total_valid':
                        continue
                    cnt = val_cnts[dataset]
                    err = val_errs[dataset]
                    err_rate = val_err_rates[dataset]
                    best_step = best_steps[dataset]
                    s = '%s : %.2f%%(%d/%d)\tBEST_STEP : %d' % \
                        (dataset, (1.-err_rate)*100, cnt-err, cnt, best_step)
                    log_strings.append(s)

                elapsed_t = float(time.time() - start_t) / 60
                remain_t = (elapsed_t / (_step+1)) * \
                    (FLAGS.train.max_num_steps - _step - 1)
                log_formatted(
                    logger, 'STEP : %d\tTRAIN_LOSS : %f' % (_step, step_loss),
                    'ELAPSED : %.2f min\tREMAIN : %.2f min\t'
                    'STEP_TIME: %.1f sec' %
                    (elapsed_t, remain_t, float(train_t) / (_step + 1)),
                    'TRAIN_SEQ_ERR : %f\tVALID_SEQ_ERR : %f' %
                    (train_err_rate, val_err_rates['total_valid']),
                    'BEST_STEP : %d\tBEST_VALID_SEQ_ERR : %f' %
                    (best_steps['total_valid'],
                     best_val_err_rates['total_valid']), *log_strings)

    log_formatted(logger, 'Training is completed!')
default=True, help='whether shuffle the dataset') parser.add_argument('-warm', type=int, default=1, help='warm up training phase') parser.add_argument('-lr', type=float, default=0.1, help='initial learning rate') args = parser.parse_args() if torch.cuda.is_available() and args.gpu: device = torch.cuda.current_device() net = get_network(args, use_gpu=args.gpu, device=device) #data preprocessing: cifar100_training_loader = get_training_dataloader( settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD, num_workers=args.w, batch_size=args.b, shuffle=args.s) cifar100_test_loader = get_test_dataloader(settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD, num_workers=args.w, batch_size=args.b, shuffle=args.s)
def mini_batch_sgd_with_annealing(motif, train_data, labels, xTrain_data, xTrain_targets,
                                  learning_rate, L1_reg, L2_reg, epochs,
                                  batch_size, hidden_dim, model_type, model_file=None,
                                  trained_model_dir=None, verbose=True, extra_args=None):
    """Train a network with mini-batch SGD and a cost-driven learning-rate anneal.

    Args:
        motif: label used only in progress messages.
        train_data / labels: training matrix (n_samples x dim) and int targets.
        xTrain_data / xTrain_targets: held-out ("cross-train") set used both
            for progress reporting and for the annealing schedule.
        learning_rate: initial SGD step size.
        L1_reg, L2_reg: regularization coefficients added to the NLL cost.
        epochs, batch_size, hidden_dim, model_type: training hyperparameters.
        model_file: optional pickled model to warm-start from.
        trained_model_dir: if given, best models and summary stats are written
            there (path is used as a prefix, so it should end with '/').
        verbose: print progress to stderr.
        extra_args: passed through to get_network().

    Returns:
        (net, summary) on success, or False if get_network() fails.
    """
    # Preamble #
    # determine dimensionality of data and number of classes
    n_train_samples, data_dim = train_data.shape
    n_classes = len(set(labels))

    # compute number of mini-batches for training, validation and testing
    train_set_x, train_set_y = shared_dataset(train_data, labels, True)
    xtrain_set_x, xtrain_set_y = shared_dataset(xTrain_data, xTrain_targets, True)
    # floor division: batch counts must be ints (also Py3-safe; '/' on ints
    # only happened to work under Python 2)
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_xtrain_batches = xtrain_set_x.get_value(borrow=True).shape[0] // batch_size

    batch_index = T.lscalar()  # containers to hold mini-batches
    x = T.matrix('x')
    y = T.ivector('y')

    net = get_network(x=x, in_dim=data_dim, n_classes=n_classes, hidden_dim=hidden_dim,
                      model_type=model_type, extra_args=extra_args)
    if net is False:
        return False

    # cost function: NLL plus L1 and (sample-normalized) L2 penalties
    cost = (net.negative_log_likelihood(labels=y) +
            L1_reg * net.L1 +
            (L2_reg / n_train_samples) * net.L2_sq)

    # error rate on a cross-train mini-batch
    xtrain_fcn = theano.function(inputs=[batch_index],
                                 outputs=net.errors(y),
                                 givens={
                                     x: xtrain_set_x[batch_index * batch_size:
                                                     (batch_index + 1) * batch_size],
                                     y: xtrain_set_y[batch_index * batch_size:
                                                     (batch_index + 1) * batch_size]
                                 })

    # gradients
    nambla_params = [T.grad(cost, param) for param in net.params]

    # BUG FIX: the learning rate must be a *shared* variable.  The previous
    # revision used T.as_tensor_variable(learning_rate), which is baked into
    # the compiled graph as a constant; `dynamic_learning_rate *= 0.9` then
    # only rebound the Python name to a new symbolic node that the already
    # compiled train_fcn never saw, so annealing silently had no effect.
    dynamic_learning_rate = theano.shared(
        np.asarray(learning_rate, dtype=theano.config.floatX),
        name='learning_rate')

    # update tuple
    updates = [(param, param - dynamic_learning_rate * nambla_param)
               for param, nambla_param in zip(net.params, nambla_params)]

    # training step: returns the batch cost and applies the SGD updates
    train_fcn = theano.function(inputs=[batch_index],
                                outputs=cost,
                                updates=updates,
                                givens={
                                    x: train_set_x[batch_index * batch_size:
                                                   (batch_index + 1) * batch_size],
                                    y: train_set_y[batch_index * batch_size:
                                                   (batch_index + 1) * batch_size]
                                })

    # error rate on a training mini-batch (no updates)
    train_error_fcn = theano.function(inputs=[batch_index],
                                      outputs=net.errors(y),
                                      givens={
                                          x: train_set_x[batch_index * batch_size:
                                                         (batch_index + 1) * batch_size],
                                          y: train_set_y[batch_index * batch_size:
                                                         (batch_index + 1) * batch_size]
                                      })

    if model_file is not None:
        net.load_from_file(file_path=model_file, careful=True)

    # do the actual training
    batch_costs = [np.inf]
    add_to_batch_costs = batch_costs.append
    xtrain_accuracies = []
    add_to_xtrain_acc = xtrain_accuracies.append
    train_accuracies = []
    add_to_train_acc = train_accuracies.append
    xtrain_costs_bin = []
    prev_xtrain_cost = 1e-10
    best_xtrain_accuracy = -np.inf
    best_model = ''
    # BUG FIX: clamp to >= 1; int(epochs / 10) is 0 for epochs < 10, which
    # made `epoch % check_frequency` raise ZeroDivisionError.
    check_frequency = max(1, epochs // 10)
    # same clamp for the in-epoch cost sampling interval
    cost_sample_interval = max(1, n_train_batches // 10)

    for epoch in xrange(0, epochs):
        # evaluation of training progress and summary stat collection
        if epoch % check_frequency == 0:
            # get the accuracy on the cross-train data
            xtrain_errors = [xtrain_fcn(_) for _ in xrange(n_xtrain_batches)]
            avg_xtrain_errors = np.mean(xtrain_errors)
            avg_xtrain_accuracy = 100 * (1 - avg_xtrain_errors)
            # then the training set
            train_errors = [train_error_fcn(_) for _ in xrange(n_train_batches)]
            avg_training_errors = np.mean(train_errors)
            avg_train_accuracy = 100 * (1 - avg_training_errors)
            # collect for tracking progress
            add_to_xtrain_acc(avg_xtrain_accuracy)
            add_to_train_acc(avg_train_accuracy)
            xtrain_costs_bin += xtrain_errors
            if verbose:
                print("{0}: epoch {1}, batch cost {2}, train accuracy {3}, cross-train accuracy {4}"
                      .format(motif, epoch, batch_costs[-1], avg_train_accuracy,
                              avg_xtrain_accuracy), file=sys.stderr)

            # if we're getting better, save the model, the 'oldest' model should
            # be the one with the highest cross-train accuracy
            if avg_xtrain_accuracy >= best_xtrain_accuracy and trained_model_dir is not None:
                if not os.path.exists(trained_model_dir):
                    os.makedirs(trained_model_dir)
                # update the best accuracy and best model
                best_xtrain_accuracy = avg_xtrain_accuracy
                best_model = "{0}model{1}.pkl".format(trained_model_dir, epoch)
                net.write(best_model)

        for i in xrange(n_train_batches):
            batch_avg_cost = train_fcn(i)
            if i % cost_sample_interval == 0:
                add_to_batch_costs(float(batch_avg_cost))

        # annealing protocol
        # NOTE(review): decaying on improvement and boosting on regression
        # looks inverted relative to common schedules, but it is preserved
        # from the original -- confirm the intent before changing it.
        mean_xtrain_cost = np.mean([xtrain_fcn(_) for _ in xrange(n_xtrain_batches)])
        current_lr = dynamic_learning_rate.get_value()
        if mean_xtrain_cost / prev_xtrain_cost < 1.0:
            dynamic_learning_rate.set_value(
                np.asarray(current_lr * 0.9, dtype=theano.config.floatX))
        if mean_xtrain_cost > prev_xtrain_cost:
            dynamic_learning_rate.set_value(
                np.asarray(current_lr * 1.05, dtype=theano.config.floatX))
        prev_xtrain_cost = mean_xtrain_cost

    # pickle the summary stats for the training
    summary = {
        "batch_costs": batch_costs,
        "xtrain_accuracies": xtrain_accuracies,
        "train_accuracies": train_accuracies,
        "xtrain_errors": xtrain_costs_bin,
        "best_model": best_model
    }

    if trained_model_dir is not None:
        # BUG FIX: pickle is a binary format -- open in 'wb', not 'w'
        # (text mode corrupts the stream on Windows), and close the handle.
        with open("{}summary_stats.pkl".format(trained_model_dir), 'wb') as f:
            cPickle.dump(summary, f)

    return net, summary
help='batch size for dataloader') parser.add_argument('-warm', type=int, default=1, help='warm up training phase') parser.add_argument('-lr', type=float, default=0.1, help='initial learning rate') parser.add_argument('-resume', action='store_true', default=False, help='resume training') args = parser.parse_args() net = get_network(args) #data preprocessing: cifar100_training_loader = get_training_dataloader( settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD, num_workers=4, batch_size=args.b, shuffle=True) cifar100_test_loader = get_test_dataloader(settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD, num_workers=4, batch_size=args.b, shuffle=True)
params['spatial_stride']) logging.info('Loading validation dataset...') validation_set = data.ImageDataset(params['validation_partitions'], params['temporal_patch_size']) logging.info('Loading test dataset...') test_set = data.ImageDataset(params['test_partitions'], params['temporal_patch_size']) inputs = tf.placeholder(tf.float32) ground_truth = tf.placeholder(tf.float32) global_step = tf.Variable(0, trainable=False, name='global_step') network = get_network(inputs, params) base_loss = tf.losses.mean_squared_error(network.outputs, ground_truth) weight_loss = params['weight_decay'] * tf.reduce_sum( tf.stack([tf.nn.l2_loss(weight) for weight in network.weights])) loss = base_loss + weight_loss accuracy = tf.placeholder(tf.float32, shape=[]) precision = tf.placeholder(tf.float32, shape=[]) recall = tf.placeholder(tf.float32, shape=[]) f1_score = tf.placeholder(tf.float32, shape=[]) tf.summary.scalar('accuracy', accuracy) tf.summary.scalar('precision', precision) tf.summary.scalar('recall', recall) tf.summary.scalar('f1_score', f1_score)
def main(
        expt,
        model_name,
        device,
        gpu_id,
        optimizer,
        arch,
        num_layers,
        n_classes,
        img_size,
        batch_size,
        test_batch_size,
        subset,
        init_w,
        ckpt_g,
        n_epochs,
        lr_clfs,
        weight_decays,
        milestones,
        gamma,
):
    """Train one classifier per task on features produced by a frozen G.

    Loads a pretrained generator net_G from ckpt_g, builds one classifier per
    positive entry of n_classes, trains them on G(x) with CrossEntropyLoss,
    validates each epoch, then saves loss/accuracy plots, a pickled history,
    and the final classifier weights under cfg.ckpt_folder.

    Args:
        expt: experiment key into cfg.* lookup tables.
        optimizer: 'sgd' or 'adam' (classifier optimizers).
        n_classes: per-task class counts; tasks with count <= 0 are skipped.
        lr_clfs: one learning rate per classifier.
        weight_decays: weight_decays[0] is applied to every classifier.
        milestones, gamma: MultiStepLR schedule shared by all classifiers.
    """
    device = torch_device(device, gpu_id[0])
    num_clfs = len([_ for _ in n_classes if _ > 0])

    if arch == 'resnet':
        print('Using resnet')
        Net = get_resnet(num_layers)
    else:
        print('Using {}'.format(arch))
        Net = get_network(arch, num_layers)

    net_G = define_G(cfg.num_channels[expt],
                     cfg.num_channels[expt],
                     64,
                     gpu_id=device)
    # one classifier per task with a positive class count
    clfs = [
        Net(num_channels=cfg.num_channels[expt], num_classes=_).to(device)
        for _ in n_classes if _ > 0
    ]
    if len(gpu_id) > 1:
        net_G = nn.DataParallel(net_G, device_ids=gpu_id)
        clfs = [nn.DataParallel(clf, device_ids=gpu_id) for clf in clfs]
    assert len(clfs) == num_clfs

    print("Loading weights...\n{}".format(ckpt_g))
    net_G.load_state_dict(torch.load(ckpt_g))

    if init_w:
        print("Init weights...")
        for clf in clfs:
            clf.apply(weights_init)

    scheduler = torch.optim.lr_scheduler.MultiStepLR
    if optimizer == 'sgd':
        opt_clfs = [
            torch.optim.SGD(clf.parameters(),
                            lr=lr,
                            momentum=0.9,
                            weight_decay=weight_decays[0])
            for lr, clf in zip(lr_clfs, clfs)
        ]
    elif optimizer == 'adam':
        # BUG FIX: this branch previously constructed torch.optim.SGD, so
        # requesting 'adam' silently trained with momentum-less SGD.
        opt_clfs = [
            torch.optim.Adam(clf.parameters(),
                             lr=lr,
                             weight_decay=weight_decays[0])
            for lr, clf in zip(lr_clfs, clfs)
        ]
    sch_clfs = [
        scheduler(optim, milestones, gamma=gamma) for optim in opt_clfs
    ]
    assert len(opt_clfs) == num_clfs

    criterionNLL = nn.CrossEntropyLoss().to(device)

    train_loader = get_loader(expt,
                              batch_size,
                              True,
                              img_size=img_size,
                              subset=subset)
    valid_loader = get_loader(expt,
                              test_batch_size,
                              False,
                              img_size=img_size,
                              subset=subset)

    loss_history = defaultdict(list)
    acc_history = defaultdict(list)
    for epoch in range(n_epochs):

        logging.info(
            "Train Epoch " +
            ' '.join(["\t Clf: {}".format(_) for _ in range(num_clfs)]))
        for iteration, (image, labels) in enumerate(train_loader, 1):

            real = image.to(device)
            # G is frozen: forward it without building a graph
            with torch.no_grad():
                X = net_G(real)
            ys = [_.to(device) for _ in labels]

            [opt.zero_grad() for opt in opt_clfs]
            ys_hat = [clf(X) for clf in clfs]
            loss = [criterionNLL(y_hat, y) for y_hat, y in zip(ys_hat, ys)]
            ys_hat = [_.argmax(1, keepdim=True) for _ in ys_hat]
            acc = [
                y_hat.eq(y.view_as(y_hat)).sum().item() / len(y)
                for y_hat, y in zip(ys_hat, ys)
            ]
            [l.backward() for l in loss]
            [opt.step() for opt in opt_clfs]

            iloss = [l.item() for l in loss]
            assert len(iloss) == num_clfs

            logging.info('[{}]({}/{}) '.format(
                epoch,
                iteration,
                len(train_loader),
            ) + ' '.join([
                '\t {:.4f} ({:.2f})'.format(l, a)
                for l, a in zip(iloss, acc)
            ]))

        # record the last training batch's metrics for this epoch
        loss_history['train_epoch'].append(epoch)
        acc_history['train_epoch'].append(epoch)
        for idx, (l, a) in enumerate(zip(iloss, acc)):
            loss_history['train_M_{}'.format(idx)].append(l)
            acc_history['train_M_{}'.format(idx)].append(a)

        logging.info(
            "Valid Epoch " +
            ' '.join(["\t Clf: {}".format(_) for _ in range(num_clfs)]))
        loss_m_batch = [0 for _ in range(num_clfs)]
        acc_m_batch = [0 for _ in range(num_clfs)]
        for iteration, (image, labels) in enumerate(valid_loader, 1):

            # evaluation only: skip autograd bookkeeping entirely
            with torch.no_grad():
                X = net_G(image.to(device))
                ys = [_.to(device) for _ in labels]
                ys_hat = [clf(X) for clf in clfs]
                loss = [
                    criterionNLL(y_hat, y) for y_hat, y in zip(ys_hat, ys)
                ]
            ys_hat = [_.argmax(1, keepdim=True) for _ in ys_hat]
            acc = [
                y_hat.eq(y.view_as(y_hat)).sum().item() / len(y)
                for y_hat, y in zip(ys_hat, ys)
            ]

            iloss = [l.item() for l in loss]
            for idx, (l, a) in enumerate(zip(iloss, acc)):
                loss_m_batch[idx] += l
                acc_m_batch[idx] += a

            logging.info('[{}]({}/{}) '.format(
                epoch,
                iteration,
                len(valid_loader),
            ) + ' '.join([
                '\t {:.4f} ({:.2f})'.format(l, a)
                for l, a in zip(iloss, acc)
            ]))

        # per-batch averages over the validation set
        num_samples = len(valid_loader)
        logging.info('[{}](batch) '.format(epoch) + ' '.join([
            '\t {:.4f} ({:.2f})'.format(l / num_samples, a / num_samples)
            for l, a in zip(loss_m_batch, acc_m_batch)
        ]))

        loss_history['valid_epoch'].append(epoch)
        acc_history['valid_epoch'].append(epoch)
        for idx, (l, a) in enumerate(zip(loss_m_batch, acc_m_batch)):
            loss_history['valid_M_{}'.format(idx)].append(l / num_samples)
            acc_history['valid_M_{}'.format(idx)].append(a / num_samples)

        [sch.step() for sch in sch_clfs]

    # ---- plotting and persistence ----
    train_loss_keys = [
        _ for _ in loss_history if 'train' in _ and 'epoch' not in _
    ]
    valid_loss_keys = [
        _ for _ in loss_history if 'valid' in _ and 'epoch' not in _
    ]

    cols = 5
    rows = len(train_loss_keys) // cols + 1
    fig = plt.figure(figsize=(7 * cols, 5 * rows))
    for idx, (tr_l, val_l) in enumerate(zip(train_loss_keys,
                                            valid_loss_keys)):
        ax = fig.add_subplot(rows, cols, idx + 1)
        ax.plot(loss_history['train_epoch'], loss_history[tr_l], 'b.:')
        ax.plot(loss_history['valid_epoch'], loss_history[val_l], 'bs-.')
        ax.set_xlabel('epochs')
        ax.set_ylabel('loss')
        ax.set_title(tr_l[6:])
        ax.grid()
        # overlay accuracy on a twin y-axis when the same key exists there
        if tr_l in acc_history:
            ax2 = plt.twinx()
            ax2.plot(acc_history['train_epoch'], acc_history[tr_l], 'r.:')
            ax2.plot(acc_history['valid_epoch'], acc_history[val_l], 'rs-.')
            ax2.set_ylabel('accuracy')
    fig.subplots_adjust(wspace=0.4, hspace=0.3)

    plt_ckpt = '{}/{}/plots/{}.jpg'.format(cfg.ckpt_folder, expt, model_name)
    logging.info('Plot: {}'.format(plt_ckpt))
    plt.savefig(plt_ckpt, bbox_inches='tight', dpi=80)

    hist_ckpt = '{}/{}/history/{}.pkl'.format(cfg.ckpt_folder, expt,
                                              model_name)
    logging.info('History: {}'.format(hist_ckpt))
    # fix: close the file handle (was pkl.dump(..., open(...)))
    with open(hist_ckpt, 'wb') as f:
        pkl.dump((loss_history, acc_history), f)

    for idx, clf in enumerate(clfs):
        model_ckpt = '{}/{}/models/{}_clf_{}.stop'.format(
            cfg.ckpt_folder, expt, model_name, idx)
        logging.info('Model: {}'.format(model_ckpt))
        torch.save(clf.state_dict(), model_ckpt)