    def __getitem__(self, idx):
        # default dtype of torch: float32; default of np: float64 (double)
        return torch.as_tensor(self.X[idx].toarray()).float(), \
               torch.as_tensor(self.y[idx].toarray()).view(1, 1).float()

ds = PrepareData(X=data[:, :-1], y=data[:, -1])
# Do not shuffle: the data is a sequence!
# We cannot batch as in a feed-forward net because of h_prev; the hidden
# state has to be propagated manually inside the training loop.
ds = DataLoader(ds, batch_size=1, shuffle=False)

rnnlm = MikolovRNNLM(params.rnnlm)
criterion = torch.nn.NLLLoss()
optimizer = optim.SGD(rnnlm.parameters(), lr=params.rnnlm['lr'])
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=False)

h_prev = torch.randn(params.rnnlm['h']).view(1, params.rnnlm['h'])
test_loss = []
running_loss = []
e = 1
while 1:  # for e in range(params.rnnlm['e']):  # epochs
    epoch_loss = 0
    l_prev = sys.maxsize
    for i, (X, y) in enumerate(ds):
        X = X.view(X.shape[0], X.shape[2])
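
# --- Added sketch (not part of the original): one common way to "manually
# propagate" the hidden state noted in the comment above is truncated BPTT:
# carry h_prev across steps but detach it so gradients stop at each step
# boundary. This assumes rnnlm(X, h_prev) returns (log_probs, h_next) and
# that y holds a class index; both are assumptions, not the repo's API.
for i, (X, y) in enumerate(ds):
    X = X.view(X.shape[0], X.shape[2])
    log_probs, h_prev = rnnlm(X, h_prev)
    h_prev = h_prev.detach()  # cut the graph here (truncated BPTT)
    loss = criterion(log_probs, y.view(-1).long())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
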
            nn.LeakyReLU(inplace=True),
            nn.Linear(200, 200),
            nn.LeakyReLU(inplace=True),
            nn.Linear(200, 10),
            nn.LeakyReLU(inplace=True),
        )

    def forward(self, x):
        x = self.model(x)
        return x

device = torch.device('cuda:0')
net = MLP().to(device)
optimizer = optim.SGD(net.parameters(), lr=learning_rate)
criteon = nn.CrossEntropyLoss().to(device)

for epoch in range(epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.view(-1, 28 * 28)
        # use .to(device) for both tensors (the original mixed it with target.cuda())
        data, target = data.to(device), target.to(device)

        logits = net(data)
        loss = criteon(logits, target)

        optimizer.zero_grad()
        loss.backward()
        # print(w1.grad.norm(), w2.grad.norm())
        optimizer.step()

def run():
    batch_size = 32
    train_transform = transforms.Compose([
        transforms.Resize(144, interpolation=3),
        transforms.RandomCrop((256, 128)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    test_transform = transforms.Compose([
        transforms.Resize((288, 144), interpolation=3),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    test_flip_transform = transforms.Compose([
        transforms.Resize((288, 144), interpolation=3),
        functional.hflip,
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    train_dataset = Market1501(root + '/bounding_box_train', transform=train_transform)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    query_dataset = Market1501(root + '/query', transform=test_transform)
    query_flip_dataset = Market1501(root + '/query', transform=test_flip_transform)
    query_loader = DataLoader(query_dataset, batch_size=batch_size, shuffle=False)
    query_flip_loader = DataLoader(query_flip_dataset, batch_size=batch_size, shuffle=False)
    test_dataset = Market1501(root + '/bounding_box_test', transform=test_transform)
    test_flip_dataset = Market1501(root + '/bounding_box_test', transform=test_flip_transform)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    test_flip_loader = DataLoader(test_flip_dataset, batch_size=batch_size, shuffle=False)

    ide = IDE(num_classes=len(train_dataset.unique_ids)).to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    params = [
        {'params': ide.backbone.parameters(), 'lr': 0.01},
        {'params': ide.classifier.parameters(), 'lr': 0.1},
    ]
    optimizer = optim.SGD(params, momentum=0.9, weight_decay=5e-4, nesterov=True)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

    epochs = 50
    for epoch in range(epochs):
        ide.train()
        scheduler.step()
        running_loss = 0.0
        for i, data in enumerate(train_loader):
            inputs, labels = data
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            outputs = ide(inputs)
            loss = criterion(outputs[1], labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            print('%d/%d - %d/%d - loss: %f' % (epoch, epochs, i, len(train_loader), loss.item()))
        print('epoch: %d/%d - loss: %f' % (epoch, epochs, running_loss / len(train_loader)))

        if epoch % 10 == 9:
            ide.eval()
            query = np.concatenate([ide(inputs.to(DEVICE))[0].detach().cpu().numpy()
                                    for inputs, _ in query_loader])
            query_flip = np.concatenate([ide(inputs.to(DEVICE))[0].detach().cpu().numpy()
                                         for inputs, _ in query_flip_loader])
            test = np.concatenate([ide(inputs.to(DEVICE))[0].detach().cpu().numpy()
                                   for inputs, _ in test_loader])
            test_flip = np.concatenate([ide(inputs.to(DEVICE))[0].detach().cpu().numpy()
                                        for inputs, _ in test_flip_loader])

            # dist = cdist((query + query_flip) / 2., (test + test_flip) / 2.)
            dist = cdist(normalize(query + query_flip), normalize(test + test_flip))
            r = cmc(dist, query_dataset.ids, test_dataset.ids,
                    query_dataset.cameras, test_dataset.cameras,
                    separate_camera_set=False,
                    single_gallery_shot=False,
                    first_match_break=True)
            m_ap = mean_ap(dist, query_dataset.ids, test_dataset.ids,
                           query_dataset.cameras, test_dataset.cameras)
            print('epoch[%d]: mAP=%f, r@1=%f, r@3=%f, r@5=%f, r@10=%f'
                  % (epoch + 1, m_ap, r[0], r[2], r[4], r[9]))

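# --- Added side note: cdist over L2-normalized features (as in the evaluation
# above) is a monotone function of cosine similarity, since for unit vectors
# ||q - g||^2 = 2 - 2 * cos(q, g); the CMC/mAP rankings therefore match cosine.
# A minimal self-contained check:
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.preprocessing import normalize

q = normalize(np.random.randn(4, 8))
g = normalize(np.random.randn(5, 8))
assert np.allclose(cdist(q, g) ** 2, 2.0 * (1.0 - q @ g.T))
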
        in_size = x.size(0)  # one batch
        # x: 64*10*12*12
        x = F.relu(self.mp(self.conv1(x)))
        # x: 64*20*4*4
        x = F.relu(self.mp(self.conv2(x)))
        # x: 64*320
        x = x.view(in_size, -1)  # flatten the tensor
        # x: 64*10
        x = self.fc(x)
        return F.log_softmax(x, dim=1)

model = Net()
if torch.cuda.is_available():
    model = model.cuda()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

def train(epoch):
    loss_all = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = Variable(data), Variable(target)
        if torch.cuda.is_available():
            data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        loss_all += loss.item()  # .item(), not .data, to accumulate a Python float
        if batch_idx % 200 == 0:

def main():
    """Do stuff."""
    args = parser.parse_args()
    # Don't do this, and don't scale the learning rate linearly with the GPU
    # count either; both lower accuracy here.
    # args.batch_size = args.batch_size * torch.cuda.device_count()
    if args.mode == 'prune':
        args.save_folder = os.path.join(args.save_folder, str(args.target_sparsity))
        if args.initial_sparsity != 0.0:
            args.load_folder = os.path.join(args.load_folder, str(args.initial_sparsity))

    if args.save_folder and not os.path.isdir(args.save_folder):
        os.makedirs(args.save_folder)

    if args.log_path:
        set_logger(args.log_path)

    if args.pruning_ratio_to_acc_record_file and not os.path.isdir(
            args.pruning_ratio_to_acc_record_file.rsplit('/', 1)[0]):
        os.makedirs(args.pruning_ratio_to_acc_record_file.rsplit('/', 1)[0])

    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        args.cuda = False

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    cudnn.benchmark = True

    # If set > 0, will resume training from a given checkpoint.
    resume_from_epoch = 0
    resume_folder = args.load_folder
    for try_epoch in range(200, 0, -1):
        if os.path.exists(args.checkpoint_format.format(
                save_folder=resume_folder, epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break

    if args.restore_epoch:
        resume_from_epoch = args.restore_epoch

    # Set default train and test path if not provided as input.
    utils.set_dataset_paths(args)

    if resume_from_epoch:
        filepath = args.checkpoint_format.format(save_folder=resume_folder,
                                                 epoch=resume_from_epoch)
        checkpoint = torch.load(filepath)
        checkpoint_keys = checkpoint.keys()
        dataset_history = checkpoint['dataset_history']
        dataset2num_classes = checkpoint['dataset2num_classes']
        masks = checkpoint['masks']
        shared_layer_info = checkpoint['shared_layer_info']
        piggymask_floats = checkpoint['piggymask_floats']
        piggymask_task_tags = checkpoint['piggymask_task_tags']
        if 'num_for_construct' in checkpoint_keys:
            num_for_construct = checkpoint['num_for_construct']
        if args.mode == 'inference' and 'network_width_multiplier' in shared_layer_info[args.dataset]:
            # TODO, temporary solution
            args.network_width_multiplier = shared_layer_info[args.dataset]['network_width_multiplier']
    else:
        dataset_history = []
        dataset2num_classes = {}
        masks = {}
        shared_layer_info = {}
        piggymask_floats = {}
        piggymask_task_tags = {}

    if args.baseline_acc_file is None or not os.path.isfile(args.baseline_acc_file):
        sys.exit(3)
    with open(args.baseline_acc_file, 'r') as jsonfile:
        json_data = json.load(jsonfile)
        baseline_acc = float(json_data[args.dataset])

    if args.mode == 'prune' and not args.pruning_ratio_to_acc_record_file:
        sys.exit(-1)

    if args.arch == 'resnet50':
        num_for_construct = [64, 64, 64 * 4, 128, 128 * 4, 256, 256 * 4, 512, 512 * 4]
        model = models.__dict__[args.arch](pretrained=True,
                                           num_for_construct=num_for_construct,
                                           threshold=args.threshold)
    elif 'vgg' in args.arch:
        custom_cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M',
                      512, 512, 512, 'M', 512, 512, 512, 'M']
        model = models.__dict__[args.arch](
            custom_cfg,
            dataset_history=dataset_history,
            dataset2num_classes=dataset2num_classes,
            network_width_multiplier=args.network_width_multiplier,
            shared_layer_info=shared_layer_info,
            groups=int(args.network_width_multiplier))
    else:
        print('Error!')
        sys.exit(1)

    # Add and set the model dataset.
    model.add_dataset(args.dataset, args.num_classes)
    model.set_dataset(args.dataset)
    model = nn.DataParallel(model)
    model = model.cuda()

    NEED_ADJUST_MASK = False
    task_id = model.module.datasets.index(args.dataset) + 1

    if not masks:
        for name, module in model.module.named_modules():
            if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear):
                mask = torch.ByteTensor(module.weight.data.size()).fill_(0)
                if 'cuda' in module.weight.data.type():
                    mask = mask.cuda()
                masks[name] = mask
                module.packnet_mask = mask
    else:
        # when we expand the network, we need to allocate new masks
        for name, module in model.module.named_modules():
            if isinstance(module, nl.SharableConv2d):
                if masks[name].size(0) < module.weight.data.size(0):
                    assert args.mode == 'finetune'
                    NEED_ADJUST_MASK = True
                elif masks[name].size(0) > module.weight.data.size(0):
                    assert args.mode == 'inference'
                    NEED_ADJUST_MASK = True

        if NEED_ADJUST_MASK:
            if args.mode == 'finetune':
                for name, module in model.module.named_modules():
                    if isinstance(module, nl.SharableConv2d):
                        mask = torch.ByteTensor(module.weight.data.size()).fill_(task_id)
                        if 'cuda' in module.weight.data.type():
                            mask = mask.cuda()
                        mask[:masks[name].size(0), :, :, :].copy_(masks[name])
                        masks[name] = mask
                    elif isinstance(module, nl.SharableLinear):
                        mask = torch.ByteTensor(module.weight.data.size()).fill_(task_id)
                        if 'cuda' in module.weight.data.type():
                            mask = mask.cuda()
                        mask[:masks[name].size(0), :masks[name].size(1)].copy_(masks[name])
                        masks[name] = mask
            elif args.mode == 'inference':
                for name, module in model.module.named_modules():
                    if isinstance(module, nl.SharableConv2d):
                        mask = torch.ByteTensor(module.weight.data.size()).fill_(task_id)
                        if 'cuda' in module.weight.data.type():
                            mask = mask.cuda()
                        mask[:, :, :, :].copy_(masks[name][:mask.size(0), :, :, :])
                        masks[name] = mask
                    elif isinstance(module, nl.SharableLinear):
                        mask = torch.ByteTensor(module.weight.data.size()).fill_(task_id)
                        if 'cuda' in module.weight.data.type():
                            mask = mask.cuda()
                        mask[:, :].copy_(masks[name][:mask.size(0), :mask.size(1)])
                        masks[name] = mask

        for name, module in model.module.named_modules():
            if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear):
                module.packnet_mask = masks[name]

    if args.dataset not in shared_layer_info:
        shared_layer_info[args.dataset] = {
            'bias': {},
            'bn_layer_running_mean': {},
            'bn_layer_running_var': {},
            'bn_layer_weight': {},
            'bn_layer_bias': {},
        }

    NEED_ADJUST_MASK = False
    if task_id == 1:
        for name, module in model.module.named_modules():
            if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear):
                module.inference_task_id = task_id
    elif task_id == 2 and not piggymask_floats:
        for name, module in model.module.named_modules():
            if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear):
                piggymask_floats[name] = torch.zeros_like(masks[name], dtype=torch.float32)
                piggymask_task_tags[name] = torch.zeros_like(masks[name])
                piggymask_floats[name] = torch.where(
                    masks[name] != 0,
                    torch.full_like(piggymask_floats[name], 0.01),
                    piggymask_floats[name])
                piggymask_task_tags[name] = torch.where(
                    masks[name] != 0,
                    torch.full_like(piggymask_task_tags[name], task_id),
                    piggymask_task_tags[name])
                piggymask_floats[name] = Parameter(piggymask_floats[name])
                module.piggymask_float = piggymask_floats[name]
                module.piggymask_task_tag = piggymask_task_tags[name]
                module.inference_task_id = task_id
    elif task_id >= 2:
        # when we expand the network, we need to allocate new piggymasks
        for name, module in model.module.named_modules():
            if isinstance(module, nl.SharableConv2d):
                if piggymask_floats[name].size(0) < module.weight.data.size(0):
                    assert args.mode == 'finetune'
                    NEED_ADJUST_MASK = True
                elif piggymask_floats[name].size(0) > module.weight.data.size(0):
                    assert args.mode == 'inference'
                    NEED_ADJUST_MASK = True

        if NEED_ADJUST_MASK:
            if args.mode == 'finetune':
                for name, module in model.module.named_modules():
                    if isinstance(module, nl.SharableConv2d):
                        piggymask_float = torch.zeros_like(masks[name], dtype=torch.float32)
                        piggymask_task_tag = torch.zeros_like(masks[name])
                        piggymask_float[:piggymask_floats[name].size(0), :, :, :].copy_(
                            piggymask_floats[name])
                        piggymask_task_tag[:piggymask_task_tags[name].size(0), :, :, :].copy_(
                            piggymask_task_tags[name])
                        piggymask_floats[name] = Parameter(piggymask_float)
                        piggymask_task_tags[name] = piggymask_task_tag
                    elif isinstance(module, nl.SharableLinear):
                        piggymask_float = torch.zeros_like(masks[name], dtype=torch.float32)
                        piggymask_task_tag = torch.zeros_like(masks[name])
                        piggymask_float[:piggymask_floats[name].size(0),
                                        :piggymask_floats[name].size(1)].copy_(piggymask_floats[name])
                        piggymask_task_tag[:piggymask_task_tags[name].size(0),
                                           :piggymask_task_tags[name].size(1)].copy_(piggymask_task_tags[name])
                        piggymask_floats[name] = Parameter(piggymask_float)
                        piggymask_task_tags[name] = piggymask_task_tag
            elif args.mode == 'inference':
                for name, module in model.module.named_modules():
                    if isinstance(module, nl.SharableConv2d):
                        piggymask_float = torch.zeros_like(masks[name], dtype=torch.float32)
                        piggymask_task_tag = torch.zeros_like(masks[name])
                        piggymask_float[:, :, :, :].copy_(
                            piggymask_floats[name][:piggymask_float.size(0), :, :, :])
                        piggymask_floats[name] = Parameter(piggymask_float)
                        piggymask_task_tag[:, :, :, :].copy_(
                            piggymask_task_tags[name][:piggymask_task_tag.size(0), :, :, :])
                        piggymask_task_tags[name] = piggymask_task_tag
                    elif isinstance(module, nl.SharableLinear):
                        piggymask_float = torch.zeros_like(masks[name], dtype=torch.float32)
                        piggymask_task_tag = torch.zeros_like(masks[name])
                        piggymask_float[:, :].copy_(
                            piggymask_floats[name][:piggymask_float.size(0), :piggymask_float.size(1)])
                        piggymask_floats[name] = Parameter(piggymask_float)
                        piggymask_task_tag[:, :].copy_(
                            piggymask_task_tags[name][:piggymask_task_tag.size(0),
                                                      :piggymask_task_tag.size(1)])
                        piggymask_task_tags[name] = piggymask_task_tag

        for name, module in model.module.named_modules():
            if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear):
                if args.mode == 'finetune' and not args.finetune_again:
                    piggymask_task_tags[name].data[
                        piggymask_task_tags[name].data.eq(0) & (masks[name] != 0)] = task_id
                    piggymask_floats[name].data[
                        piggymask_task_tags[name].data.eq(task_id)] = 0.01
                module.piggymask_float = piggymask_floats[name]
                module.piggymask_task_tag = piggymask_task_tags[name]
                module.inference_task_id = task_id

    shared_layer_info[args.dataset]['network_width_multiplier'] = args.network_width_multiplier

    if args.num_classes == 2:
        train_loader = dataset.cifar100_train_loader_two_class(args.dataset, args.batch_size)
        val_loader = dataset.cifar100_val_loader_two_class(args.dataset, args.val_batch_size)
    elif args.num_classes == 5:
        train_loader = dataset.cifar100_train_loader(args.dataset, args.batch_size)
        val_loader = dataset.cifar100_val_loader(args.dataset, args.val_batch_size)
    else:
        print("num_classes should be either 2 or 5")
        sys.exit(1)

    # If we are going to save checkpoints to another folder, recalculate the starting epoch.
    if args.save_folder != args.load_folder:
        start_epoch = 0
    else:
        start_epoch = resume_from_epoch

    curr_prune_step = begin_prune_step = start_epoch * len(train_loader)
    end_prune_step = curr_prune_step + args.pruning_interval * len(train_loader)

    manager = Manager(args, model, shared_layer_info, masks,
                      train_loader, val_loader, begin_prune_step, end_prune_step)
    if args.mode == 'inference':
        manager.load_checkpoint_only_for_evaluate(resume_from_epoch, resume_folder)
        manager.validate(resume_from_epoch - 1)
        return
    # manager.inference_dataset_idx

    lr = args.lr
    lr_mask = args.lr_mask

    # update all layers
    named_params = dict(model.named_parameters())
    params_to_optimize_via_SGD = []
    named_of_params_to_optimize_via_SGD = []
    masks_to_optimize_via_Adam = []
    named_of_masks_to_optimize_via_Adam = []
    for name, param in named_params.items():
        if 'classifiers' in name:
            if '.{}.'.format(model.module.datasets.index(args.dataset)) in name:
                params_to_optimize_via_SGD.append(param)
                named_of_params_to_optimize_via_SGD.append(name)
            continue
        elif 'piggymask' in name:
            masks_to_optimize_via_Adam.append(param)
            named_of_masks_to_optimize_via_Adam.append(name)
        else:
            params_to_optimize_via_SGD.append(param)
            named_of_params_to_optimize_via_SGD.append(name)

    optimizer_network = optim.SGD(params_to_optimize_via_SGD,
                                  lr=lr,
                                  weight_decay=0.0,
                                  momentum=0.9,
                                  nesterov=True)
    optimizers = Optimizers()
    optimizers.add(optimizer_network, lr)

    if masks_to_optimize_via_Adam:
        optimizer_mask = optim.Adam(masks_to_optimize_via_Adam, lr=lr_mask)
        optimizers.add(optimizer_mask, lr_mask)

    manager.load_checkpoint(optimizers, resume_from_epoch, resume_folder, NEED_ADJUST_MASK)

    # k = int(args.network_width_multiplier)
    # assert k >= 2
    # for name, module in model.module.named_modules():
    #     if isinstance(module, nl.SharableConv2d):
    #         n = len(module.weight)
    #         n = int((n // k * (k-1)) * 0.1)
    #         # module.weight.data[:n, :, :, :] = 0.0
    #         module.packnet_mask[:n, :, :, :] = 255
    #     if isinstance(module, nl.SharableLinear):
    #         n = len(module.bias)
    #         n = int((n // k * (k-1)) * 0.1)
    #         # module.weight.data[:n, :] = 0.0
    #         # module.bias.data[:n] = 0.0
    #         module.packnet_mask[:n, :] = 255
    #     if isinstance(module, nn.BatchNorm2d):
    #         n = len(module.weight)
    #         n = int((n // k * (k-1)) * 0.1)
    #         # module.weight.data[:n] = 0.0

    """Performs training."""
    curr_lrs = []
    for optimizer in optimizers:
        for param_group in optimizer.param_groups:
            curr_lrs.append(param_group['lr'])
            break

    if args.mode == 'prune':
        if 'gradual_prune' in args.load_folder and args.save_folder == args.load_folder:
            args.epochs = 20 + resume_from_epoch
        logging.info('')
        logging.info('Before pruning: ')
        logging.info('Sparsity range: {} -> {}'.format(args.initial_sparsity, args.target_sparsity))

        must_pruning_ratio_for_curr_task = 0.0

        json_data = {}
        if os.path.isfile(args.pruning_ratio_to_acc_record_file):
            with open(args.pruning_ratio_to_acc_record_file, 'r') as json_file:
                json_data = json.load(json_file)

        if args.network_width_multiplier == args.max_allowed_network_width_multiplier and json_data['0.0'] < baseline_acc:
            # if we reach the upperbound and still do not get the accuracy over our target on curr task, we still do pruning
            logging.info('we reach the upperbound and still do not get the accuracy over our target on curr task')
            remain_num_tasks = args.total_num_tasks - len(dataset_history)
            logging.info('remain_num_tasks: {}'.format(remain_num_tasks))
            ratio_allow_for_curr_task = round(1.0 / (remain_num_tasks + 1), 1)
            logging.info('ratio_allow_for_curr_task: {:.4f}'.format(ratio_allow_for_curr_task))
            must_pruning_ratio_for_curr_task = 1.0 - ratio_allow_for_curr_task
            if args.initial_sparsity >= must_pruning_ratio_for_curr_task:
                sys.exit(6)

        manager.validate(start_epoch - 1)
        logging.info('')
    elif args.mode == 'finetune':
        if not args.finetune_again:
            manager.pruner.make_finetuning_mask()
            logging.info('Finetune stage...')
        else:
            logging.info('Piggymask Retrain...')
            history_best_avg_val_acc_when_retraining = manager.validate(start_epoch - 1)
            num_epochs_that_criterion_does_not_get_better = 0

        stop_lr_mask = True
        if manager.pruner.calculate_curr_task_ratio() == 0.0:
            logging.info('There is no left space in convolutional layer for curr task'
                         ', we will try to use prior experience as long as possible')
            stop_lr_mask = False

    for epoch_idx in range(start_epoch, args.epochs):
        avg_train_acc, curr_prune_step = manager.train(optimizers, epoch_idx,
                                                       curr_lrs, curr_prune_step)
        avg_val_acc = manager.validate(epoch_idx)

        # if args.mode == 'prune' and (epoch_idx+1) >= (args.pruning_interval + start_epoch) and (
        #         avg_val_acc > history_best_avg_val_acc_when_prune):
        #     pass
        if args.finetune_again:
            if avg_val_acc > history_best_avg_val_acc_when_retraining:
                history_best_avg_val_acc_when_retraining = avg_val_acc
                num_epochs_that_criterion_does_not_get_better = 0
                if args.save_folder is not None:
                    for path in os.listdir(args.save_folder):
                        if '.pth.tar' in path:
                            os.remove(os.path.join(args.save_folder, path))
                else:
                    print('Something is wrong! Block the program with pdb')
                    pdb.set_trace()
                history_best_avg_val_acc = avg_val_acc
                manager.save_checkpoint(optimizers, epoch_idx, args.save_folder)
            else:
                num_epochs_that_criterion_does_not_get_better += 1

        if args.finetune_again and num_epochs_that_criterion_does_not_get_better == 5:
            logging.info("stop retraining")
            sys.exit(0)

        if args.mode == 'finetune':
            if epoch_idx + 1 == 50 or epoch_idx + 1 == 80:
                for param_group in optimizers[0].param_groups:
                    param_group['lr'] *= 0.1
                curr_lrs[0] = param_group['lr']
            if len(optimizers.lrs) == 2:
                if epoch_idx + 1 == 50:
                    for param_group in optimizers[1].param_groups:
                        param_group['lr'] *= 0.2
                if stop_lr_mask and epoch_idx + 1 == 70:
                    for param_group in optimizers[1].param_groups:
                        param_group['lr'] *= 0.0
                curr_lrs[1] = param_group['lr']

    if args.save_folder is not None:
        pass
        # paths = os.listdir(args.save_folder)
        # if paths and '.pth.tar' in paths[0]:
        #     for checkpoint_file in paths:
        #         os.remove(os.path.join(args.save_folder, checkpoint_file))
    else:
        print('Something is wrong! Block the program with pdb')
    if task_id >= 2:
        for name, module in model.module.named_modules():
            if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear):
                if args.mode == 'finetune':
                    module.piggymask_task_tag[module.piggymask_float.le(0.005)] = 0

    if avg_train_acc > 0.95:
        manager.save_checkpoint(optimizers, epoch_idx, args.save_folder)

    logging.info('-' * 16)

    if args.pruning_ratio_to_acc_record_file:
        json_data = {}
        if os.path.isfile(args.pruning_ratio_to_acc_record_file):
            with open(args.pruning_ratio_to_acc_record_file, 'r') as json_file:
                json_data = json.load(json_file)

        if args.mode == 'finetune' and not args.test_piggymask:
            json_data[0.0] = round(avg_val_acc, 4)
            with open(args.pruning_ratio_to_acc_record_file, 'w') as json_file:
                json.dump(json_data, json_file)

            if avg_train_acc > 0.95 and avg_val_acc >= baseline_acc:
                pass
            else:
                logging.info("It's time to expand the Network")
                logging.info('Auto expand network')
                sys.exit(2)

            if manager.pruner.calculate_curr_task_ratio() == 0.0:
                logging.info('There is no left space in convolutional layer for curr task, so needless to prune')
                sys.exit(5)
        elif args.mode == 'prune':
            if avg_train_acc > 0.95:
                json_data[args.target_sparsity] = round(avg_val_acc, 4)
                with open(args.pruning_ratio_to_acc_record_file, 'w') as json_file:
                    json.dump(json_data, json_file)
            else:
                sys.exit(6)

            must_pruning_ratio_for_curr_task = 0.0
            if args.network_width_multiplier == args.max_allowed_network_width_multiplier and json_data['0.0'] < baseline_acc:
                # if we reach the upperbound and still do not get the accuracy over our target on curr task, we still do pruning
                logging.info('we reach the upperbound and still do not get the accuracy over our target on curr task')
                remain_num_tasks = args.total_num_tasks - len(dataset_history)
                logging.info('remain_num_tasks: {}'.format(remain_num_tasks))
                ratio_allow_for_curr_task = round(1.0 / (remain_num_tasks + 1), 1)
                logging.info('ratio_allow_for_curr_task: {:.4f}'.format(ratio_allow_for_curr_task))
                must_pruning_ratio_for_curr_task = 1.0 - ratio_allow_for_curr_task
                if args.target_sparsity >= must_pruning_ratio_for_curr_task:
                    sys.exit(6)

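# --- Added sketch (ours, not the repo's API): the piggymask logic above keeps
# a learned real-valued mask per shared layer and hard-thresholds it (cf. the
# 0.005 cutoff in the finetune branch) to decide which shared weights a new
# task reuses. In miniature:
import torch

def apply_piggymask(weight, piggymask_float, threshold=0.005):
    # entries whose learned mask value falls below the threshold are dropped
    return weight * (piggymask_float > threshold).float()

w = torch.randn(8, 8)
m = torch.full_like(w, 0.01)   # same init value as piggymask_floats above
m[0] = 0.0                     # this row gets masked out
print(apply_piggymask(w, m)[0].abs().sum())  # tensor(0.)
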
#############################
net = load_model(args.checkpoint, 'cpu')

#############################
#
# Loss and optimizer
#
#############################
miner = miners.BatchHardMiner().to(device)
loss_func = losses.TripletMarginLoss(margin=0.3).to(device)

if args.optimizer == 'ADAM':
    optimizer = optim.Adam(net.parameters(), lr=learning_rate_)  # 1e-3
elif args.optimizer == 'SGD':
    optimizer = optim.SGD(net.parameters(), lr=learning_rate_,
                          momentum=0.9, weight_decay=5e-4, nesterov=True)

#############################
#
# Resume
#
#############################
if dataset_name == 'DeepFashion':
    checkpoint_file_name = '{}/{}/{}_{}_{}'.format(
        checkpoint_path, dataset_name, dataset_name, args.optimizer, combinations_type)

if args.resume:
    # Load checkpoint.
    load_epoch_num = args.load_epoch_num - 1
    checkpoint_number = '{}_{}_ckpt.t7'.format(checkpoint_file_name, load_epoch_num)
    print(checkpoint_number)

# if y_hat.requires_grad:
def log_hook(grad_input):
    print("logging", grad_input.shape)
    grads[0] = grad_input
    # torch.cat((grad_input.detach().cpu(), y_hat.detach().cpu()), dim=0)
    # grad_input_batch = torch.cat(tuple(torch.cat(tuple(vis(e_0[c]) for c in range(e_0.shape[0])), dim=1) for e_0 in grad_input), dim=2)
    # self.logger.experiment.add_image(f'train_regression_grad', grad_input_batch, self.global_step)
    # handle.remove()

handle = embedding.register_hook(log_hook)
optimizer = optim.SGD(list(sslt.parameters()) + [embedding], lr=1e1)
for it in range(0, iterations):
    optimizer.zero_grad()
    loss, _, _ = sslt.loss_CPCshift(None, embedding, (4, 2))
    loss.backward()

    vis_grad = grads[0].detach()
    vis_grad = vis_grad / (vis_grad.abs().max() + 1e-8)
    vis_grad = vis_grad.detach().cpu().numpy()
    vis_emb = embedding.detach()
    vis_emb = vis_emb / (vis_emb.abs().max() + 1e-8)
    vis_emb = vis_emb.detach().cpu().numpy()
    for c in range(1):

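# --- Added standalone example: Tensor.register_hook invokes the hook with the
# tensor's gradient during backward, which is exactly what log_hook relies on
# above. Minimal self-contained version:
import torch

x = torch.randn(3, requires_grad=True)
captured = {}

def capture(grad):
    captured['grad'] = grad.clone()  # returning None leaves the gradient unchanged

h = x.register_hook(capture)
(x ** 2).sum().backward()
assert torch.allclose(captured['grad'], 2 * x)
h.remove()  # detach the hook when done, like handle.remove() above
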
elif args.model == 'resnet18':
    model = resnet18(num_classes=10)
elif args.model == 'resnet34':
    model = resnet34(num_classes=10)
elif args.model == 'resnet50':
    model = resnet50(num_classes=10)
elif args.model == 'vgg16':
    model = VGG(vgg_name='vgg16', num_classes=10)
elif args.model == 'MLP':
    model = MLP()
else:
    raise ValueError('Unrecognized training model')

if args.optim == 'SGD':
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum, weight_decay=args.weight_decay)
elif args.optim == 'LBFGS':
    optimizer = optim.LBFGS(model.parameters(), lr=args.lr)

num_epochs = args.epoch
lr = args.lr
print_itr = args.print_frq
criterion = nn.CrossEntropyLoss()

start_epoch = 0
if args.resume:
    # Load checkpoint.
    print('==> Resuming from checkpoint..')
    assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
    checkpoint = torch.load('./checkpoint/{}/{}_best.pth'.format(

def train():
    if args.dataset == 'COCO':
        if args.dataset_root == VOC_ROOT:
            if not os.path.exists(COCO_ROOT):
                parser.error('Must specify dataset_root if specifying dataset')
            print("WARNING: Using default COCO dataset_root because "
                  "--dataset_root was not specified.")
            args.dataset_root = COCO_ROOT
        cfg = coco
        dataset = COCODetection(root=args.dataset_root,
                                transform=SSDAugmentation(cfg['min_dim'], MEANS))
    elif args.dataset == 'VOC':
        if args.dataset_root == COCO_ROOT:
            parser.error('Must specify dataset if specifying dataset_root')
        cfg = voc
        dataset = VOCDetection(root=args.dataset_root,
                               transform=SSDAugmentation(cfg['min_dim'], MEANS))

    if args.visdom:
        import visdom
        global viz
        viz = visdom.Visdom()

    ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes'])
    net = ssd_net

    if args.cuda:
        net = torch.nn.DataParallel(ssd_net)
        cudnn.benchmark = True

    if args.resume:
        print('Resuming training, loading {}...'.format(args.resume))
        ssd_net.load_weights(args.resume)
    else:
        vgg_weights = torch.load(args.save_folder + args.basenet)
        print('Loading base network...')
        ssd_net.vgg.load_state_dict(vgg_weights)

    if args.cuda:
        net = net.cuda()

    if not args.resume:
        print('Initializing weights...')
        # initialize newly added layers' weights with the xavier method
        ssd_net.extras.apply(weights_init)
        ssd_net.loc.apply(weights_init)
        ssd_net.conf.apply(weights_init)

    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5,
                             False, args.cuda)

    net.train()
    # loss counters
    loc_loss = 0
    conf_loss = 0
    epoch = 0
    print('Loading the dataset...')

    epoch_size = len(dataset) // args.batch_size
    print('Training SSD on:', dataset.name)
    print('Using the specified args:')
    print(args)

    step_index = 0

    if args.visdom:
        vis_title = 'SSD.PyTorch on ' + dataset.name
        vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss']
        iter_plot = create_vis_plot('Iteration', 'Loss', vis_title, vis_legend)
        epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend)

    data_loader = data.DataLoader(dataset, args.batch_size,
                                  num_workers=args.num_workers,
                                  shuffle=True, collate_fn=detection_collate,
                                  pin_memory=True)
    # create batch iterator
    batch_iterator = iter(data_loader)
    for iteration in range(args.start_iter, cfg['max_iter']):
        if args.visdom and iteration != 0 and (iteration % epoch_size == 0):
            update_vis_plot(epoch, loc_loss, conf_loss, epoch_plot, None,
                            'append', epoch_size)
            # reset epoch loss counters
            loc_loss = 0
            conf_loss = 0
            epoch += 1

        if iteration in cfg['lr_steps']:
            step_index += 1
            adjust_learning_rate(optimizer, args.gamma, step_index)

        # load train data; restart the iterator when an epoch ends
        # images, targets = next(batch_iterator)
        try:
            images, targets = next(batch_iterator)
        except StopIteration:
            batch_iterator = iter(data_loader)
            images, targets = next(batch_iterator)

        if args.cuda:
            images = Variable(images.cuda())
            targets = [Variable(ann.cuda(), volatile=True) for ann in targets]
        else:
            images = Variable(images)
            targets = [Variable(ann, volatile=True) for ann in targets]

        # forward
        t0 = time.time()
        out = net(images)
        # backprop
        optimizer.zero_grad()
        loss_l, loss_c = criterion(out, targets)
        loss = loss_l + loss_c
        loss.backward()
        optimizer.step()
        t1 = time.time()
        # loc_loss += loss_l.data[0]
        # conf_loss += loss_c.data[0]
        loc_loss += loss_l.item()
        conf_loss += loss_c.item()

        if iteration % 10 == 0:
            print('timer: %.4f sec.' % (t1 - t0))
            # print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.data[0]), end=' ')
            print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.item()), end=' ')

        if args.visdom:
            # update_vis_plot(iteration, loss_l.data[0], loss_c.data[0],
            update_vis_plot(iteration, loss_l.item(), loss_c.item(),
                            iter_plot, epoch_plot, 'append')

        if iteration != 0 and iteration % 5000 == 0:
            print('Saving state, iter:', iteration)
            torch.save(ssd_net.state_dict(),
                       'weights/ssd300_COCO_' + repr(iteration) + '.pth')
    torch.save(ssd_net.state_dict(),
               args.save_folder + '' + args.dataset + '.pth')

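# --- Added sketch: the try/except StopIteration pattern in the loop above can
# be folded into a small generator that restarts the loader (and its shuffle)
# forever:
def infinite_batches(loader):
    while True:
        for batch in loader:  # a fresh pass, reshuffled if shuffle=True
            yield batch

# usage: batch_iterator = infinite_batches(data_loader)
#        images, targets = next(batch_iterator)
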
y_test = np.zeros((25000,))
y_test[0:12500] = 1

# calling the model
vocab_size += 1
model = BOW_model(vocab_size, 500)
model.cuda()

# opt = 'sgd'
# LR = 0.01
opt = 'adam'
LR = 0.001
if opt == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=LR)
elif opt == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=LR, momentum=0.9)

batch_size = 200
no_of_epochs = 6
L_Y_train = len(y_train)
L_Y_test = len(y_test)

model.train()
train_loss = []
train_accu = []
test_accu = []

for epoch in range(no_of_epochs):
    # training

net = Net()

# 3. Define a Loss function and optimizer
# ---------------------------------------
#
# Let's use a Classification Cross-Entropy loss and SGD with momentum

# In[4]:

import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# 4. Train the network
# --------------------
#
# This is when things start to get interesting.
# We simply have to loop over our data iterator, and feed the inputs to the
# network and optimize

# In[11]:

start_time = time.time()
fileName = "NetworkConfiguration_1_2.txt"
file = open(fileName, 'w')
file.close()

def create_optimizer(net, lr, mom):
    optimizer = optim.SGD(net.parameters(), lr, mom)
    return optimizer

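# Example usage (added): lr and mom are passed positionally above, so this is
# the same as optim.SGD(net.parameters(), lr=0.01, momentum=0.9), assuming a
# `net` module exists in the surrounding script.
# optimizer = create_optimizer(net, 0.01, 0.9)
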
torch.set_num_threads(os.cpu_count())
print(f'Using {device}: {torch.get_num_threads()} threads')

# load data
train_loader, test_loader = get_data_loader(opt, im_size=32)

# model + loss function + optimizer + scheduler
net = VGGAttention(mode=opt.attention_mode)
criterion = nn.CrossEntropyLoss()
if torch.cuda.is_available():
    model = nn.DataParallel(
        net, device_ids=list(range(torch.cuda.device_count()))).to(device)
else:
    model = net.to(device)
criterion.to(device)
optimizer = optim.SGD(model.parameters(), lr=opt.lr, momentum=0.9, weight_decay=5e-4)
scheduler = lr_scheduler.LambdaLR(
    optimizer, lr_lambda=lambda epoch: np.power(0.5, int(epoch / 25)))

# time to train/validate
obj = AttentionNetwork(opt=opt, model=model, criterion=criterion,
                       optimizer=optimizer, scheduler=scheduler, device=device)
obj.train_validate(train_loader=train_loader, test_loader=test_loader)

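# --- Added note: the lr_lambda above multiplies the base LR by 0.5 ** (epoch // 25),
# i.e. it halves the learning rate every 25 epochs; under that reading it is
# equivalent to:
# scheduler = lr_scheduler.StepLR(optimizer, step_size=25, gamma=0.5)
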
def train():
    # initialize the model
    model_path = os.path.join(constant.MODEL_DIR, constant.PRETRAINED_MODEL)
    c3d = model.C3D(constant.NUM_CLASSES)
    print(model_path)
    device = get_default_device()
    if device == torch.device('cpu'):
        pretrained_param = torch.load(model_path, map_location='cpu')
    else:
        pretrained_param = torch.load(model_path)

    to_load = {}
    for key in pretrained_param.keys():
        if 'conv' in key:
            to_load[key] = pretrained_param[key]
        else:
            to_load[key] = c3d.state_dict()[key]
    c3d.load_state_dict(to_load)
    print(c3d.state_dict())

    train_params = [{'params': c3d.get_conv_1x_lr_param(), 'weight_decay': constant.WEIGHT_DECAY},
                    {'params': c3d.get_conv_2x_lr_param(), 'lr': constant.BASE_LR * 2},
                    {'params': c3d.get_fc_1x_lr_param(), 'weight_decay': constant.WEIGHT_DECAY},
                    {'params': c3d.get_fc_2x_lr_param(), 'lr': constant.BASE_LR * 2}]

    # import input data
    trainset = UCF101DataSet(framelist_file=constant.TRAIN_LIST,
                             clip_len=constant.CLIP_LENGTH,
                             crop_size=constant.CROP_SIZE,
                             split="training")
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=constant.TRAIN_BATCH_SIZE,
                                              shuffle=True, num_workers=10)

    c3d.to(device, non_blocking=True, dtype=torch.float)
    c3d.train()

    # define loss function (Cross Entropy loss)
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)

    # define optimizer
    optimizer = optim.SGD(train_params, lr=constant.BASE_LR,
                          momentum=constant.MOMENTUM, weight_decay=0)
    print(optimizer.state_dict())

    # define lr schedule
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=constant.LR_DECAY_STEP_SIZE,
                                          gamma=constant.LR_DECAY_GAMMA)
    writer = SummaryWriter()

    for epoch in range(constant.NUM_EPOCHES):
        running_loss = 0.0
        running_accuracy = 0.0
        scheduler.step()
        for i, data in enumerate(trainloader, 0):
            step = epoch * len(trainloader) + i
            inputs, labels = data['clip'].to(device, dtype=torch.float), \
                             data['label'].to(device=device, dtype=torch.int64)
            optimizer.zero_grad()

            outputs = c3d(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            print('Step %d, loss: %.3f' % (i, loss.item()))
            writer.add_scalar('Train/Loss', loss.item(), step)

            outputs = nn.Softmax(dim=1)(outputs)
            _, predict_label = outputs.max(1)
            correct = (predict_label == labels).sum().item()
            accuracy = float(correct) / float(constant.TRAIN_BATCH_SIZE)
            running_accuracy += accuracy
            writer.add_scalar('Train/Accuracy', accuracy, step)
            print("iteration %d, accuracy = %.3f" % (i, accuracy))

            if i % 100 == 99:
                print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 100))
                print('[%d, %5d] accuracy: %.3f' % (epoch + 1, i + 1, running_accuracy / 100))
                running_loss = 0.0
                running_accuracy = 0.0
            if step % 10000 == 9999:
                torch.save(c3d.state_dict(),
                           os.path.join(constant.MODEL_DIR, '%s-%s-%d' %
                                        (constant.TRAIN_MODEL_NAME, datetime.date.today(), step + 1)))

    print('Finished Training')
    writer.close()

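# --- Added note: the four param groups above follow the Caffe convention of a
# doubled learning rate (and no weight decay) for bias-like parameters. Any
# key missing from a group dict ('lr', 'momentum', 'weight_decay', ...) is
# inherited from the optim.SGD defaults, so each group only overrides the
# fields it names.
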
parser.add_argument('--dataRoot', type=str, default='data')
opt = parser.parse_args()

config = None
if opt.config == 'transformer':
    config = TransformerConfig()

if not os.path.exists('weight'):
    os.mkdir('weight')

d_model = config.d_model
n_layers = config.n_layers
heads = config.heads
dropout = config.dropout

rtnet = RTNet(d_model, n_layers, heads, dropout)
rtnet = rtnet.cuda()

opti = None
if opt.opti == 'SGD':
    opti = optim.SGD(rtnet.parameters(), lr=0.01)

# read the embedding vectors
f = open('config/embedding.txt', 'r')
op_embedding_str = f.read()
op_embedding_Dict = json.loads(op_embedding_str)
f.close()

# read the operation vectors
f = open('config/opDict.txt', 'r')
op_str = f.read()
op_Dict = json.loads(op_str)
f.close()

# loss function
# loss_func = F.cross_entropy

if args.dataset == 'mnist':
    net = fc(width=args.width, depth=args.depth, num_classes=num_classes).to(args.device)
elif args.dataset == 'cifar10':
    net = fc(width=args.width, depth=args.depth, num_classes=num_classes,
             input_dim=3 * 32 * 32).to(args.device)
elif args.model == 'alexnet':
    net = alexnet(ch=args.scale, num_classes=num_classes).to(args.device)

print(net)

opt = optim.SGD(net.parameters(), lr=args.lr, momentum=args.mom, weight_decay=args.wd)

if args.lr_schedule:
    milestone = int(args.iterations / 3)
    scheduler = optim.lr_scheduler.MultiStepLR(
        opt, milestones=[milestone, 2 * milestone], gamma=0.5)

if args.criterion == 'NLL':
    crit = nn.CrossEntropyLoss().to(args.device)
elif args.criterion == 'linear_hinge':
    crit = linear_hinge_loss

def cycle_loader(dataloader):
    while 1:
        for data in dataloader:
            yield data  # re-yield batches forever, restarting the loader each pass

def train_network():
    network = Stage1CountingNet()
    model_save_dir = './models_stage_1'
    model_save_path = os.path.join(model_save_dir, 'train2')
    if not os.path.exists(model_save_path):
        os.makedirs(model_save_path)
        os.makedirs(os.path.join(model_save_path, 'snapshots'))
    global f
    snapshot_path = os.path.join(model_save_path, 'snapshots')
    f = open(os.path.join(model_save_path, 'train0.log'), 'w')

    # -- Logging Parameters
    log(f, 'args: ' + str(args))
    log(f, 'model: ' + str(network), False)
    log(f, 'Stage1..')
    log(f, 'LR: %.12f.' % (args.lr))

    start_epoch = 0
    num_epochs = args.epochs
    valid_losses = {}
    train_losses = {}
    for metric in ['loss1', 'new_mae']:
        valid_losses[metric] = []
    for metric in ['loss1']:
        train_losses[metric] = []

    batch_size = args.batch_size
    num_train_images = len(dataset.data_files['train'])
    num_patches_per_image = args.patches
    num_batches_per_epoch = num_patches_per_image * num_train_images // batch_size

    optimizer = optim.SGD(filter(lambda p: p.requires_grad, network.parameters()),
                          lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)

    # -- Main Training Loop
    all_epoch_test_valid_accs = []
    all_epoch_test_valid_per_rot_accs = []
    for e_i, epoch in enumerate(range(start_epoch, num_epochs)):
        avg_loss = [0.0 for _ in range(1)]

        # b_i - batch index
        total_match_count = 0
        total_count = 0
        total_per_angle_count = np.zeros(num_rotations)
        total_per_angle_match_count = np.zeros(num_rotations)
        for b_i in range(num_batches_per_epoch):
            # Generate next training sample
            Xs, _ = dataset.train_get_data(batch_size=args.batch_size)

            # 1. Crop image to 112x112. Xs shape: (B,3,h,w)
            image_size = Xs.shape[-1]
            crop_start_loc = [image_size // 4, image_size // 4]
            Xs = Xs[:, :, crop_start_loc[0]: crop_start_loc[0] + image_new_crop_size,
                    crop_start_loc[1]: crop_start_loc[1] + image_new_crop_size]

            # 2. Randomly rotate each image
            new_images_input = np.zeros_like(Xs, dtype=Xs.dtype)  # (B,3,h',w')
            new_image_rotation_gt = np.zeros((Xs.shape[0],), dtype=np.int32)  # (B,)
            images = np.transpose(Xs, (0, 2, 3, 1))  # (B,h',w',3)
            for i in range(images.shape[0]):
                image = images[i]  # (h',w',3)
                chosen_index = np.random.choice(num_rotations, 1)[0]
                chosen_angle = rotation_angles[chosen_index]
                if chosen_angle != 0:
                    image = cv2.rotate(image, rotation_angles_cv2[chosen_index])
                new_images_input[i, :, :, :] = np.transpose(image, (2, 0, 1))
                new_image_rotation_gt[i] = chosen_index

            losses, matches, actual_angle_dist, matches_by_angle = train_function(
                new_images_input, new_image_rotation_gt, network, optimizer)
            total_match_count += matches
            total_count += args.batch_size
            assert total_match_count <= total_count
            total_per_angle_count += actual_angle_dist
            total_per_angle_match_count += matches_by_angle
            assert np.sum(total_per_angle_count) == total_count

            for scale_idx in range(1):
                avg_loss[scale_idx] = avg_loss[scale_idx] + losses[scale_idx]

            # Logging losses every 100 iterations.
            if b_i % 100 == 0:
                log(f, 'Epoch %d [%d]: %s loss: %s.' % (epoch, b_i, [network.name], losses))
                log(f, 'Epoch %d [%d]: %s rot acc: %s.' % (
                    epoch, b_i, [network.name], (total_match_count / total_count)))
                log(f, 'Epoch %d [%d]: %s rot acc(0,90,180,270): %s.' % (
                    epoch, b_i, [network.name],
                    (total_per_angle_match_count / total_per_angle_count)))

        # -- Stats update
        avg_loss = [al / num_batches_per_epoch for al in avg_loss]
        avg_loss = [av for av in avg_loss]
        train_losses['loss1'].append(avg_loss)

        torch.cuda.empty_cache()
        log(f, 'Validating...')

        epoch_val_losses, txt, rot_acc_valid, per_rot_acc_valid = test_network(
            dataset, 'test_valid', network, False)
        log(f, 'Valid epoch: ' + str(epoch) + ' ' + txt)
        log(f, 'Valid epoch: ' + str(epoch) + ' total rotation acc: ' + str(rot_acc_valid))
        log(f, 'Valid epoch: ' + str(epoch) + ' per rotation acc: ' + str(per_rot_acc_valid))
        all_epoch_test_valid_accs.append(rot_acc_valid)
        all_epoch_test_valid_per_rot_accs.append(per_rot_acc_valid)
        best_epoch = np.argmax(np.array(all_epoch_test_valid_accs))
        best_valid_test_acc = np.array(all_epoch_test_valid_accs).max()
        log(f, 'Best valid rot acc so far epoch : {} , acc : {}'.format(
            best_epoch, best_valid_test_acc))

        for metric in ['loss1', 'new_mae']:
            valid_losses[metric].append(epoch_val_losses[metric])

        min_valid_epoch = np.argmin(valid_losses['new_mae'])

        # Save networks
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': network.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, snapshot_path, get_filename(network.name, epoch + 1))

        print('saving graphs...')
        with open(os.path.join(snapshot_path, 'losses.pkl'), 'wb') as lossfile:
            pickle.dump((train_losses, valid_losses), lossfile, protocol=2)

        for metric in train_losses.keys():
            if "maxima_split" not in metric:
                if isinstance(train_losses[metric][0], list):
                    for i in range(len(train_losses[metric][0])):
                        plt.plot([a[i] for a in train_losses[metric]])
                        plt.savefig(os.path.join(snapshot_path, 'train_%s_%d.png' % (metric, i)))
                        plt.clf()
                        plt.close()
                plt.plot(train_losses[metric])
                plt.savefig(os.path.join(snapshot_path, 'train_%s.png' % metric))
                plt.clf()
                plt.close()

        for metric in valid_losses.keys():
            if isinstance(valid_losses[metric][0], list):
                for i in range(len(valid_losses[metric][0])):
                    plt.plot([a[i] for a in valid_losses[metric]])
                    plt.savefig(os.path.join(snapshot_path, 'valid_%s_%d.png' % (metric, i)))
                    plt.clf()
                    plt.close()
            plt.plot(valid_losses[metric])
            plt.savefig(os.path.join(snapshot_path, 'valid_%s.png' % metric))
            plt.clf()
            plt.close()

    all_epoch_test_valid_accs = np.array(all_epoch_test_valid_accs)
    best_epoch = np.argmax(all_epoch_test_valid_accs)
    best_valid_test_acc = all_epoch_test_valid_accs.max()
    log(f, 'Best valid rot acc epoch : {} , acc : {}'.format(best_epoch, best_valid_test_acc))

    # Plotting the valid accuracies
    plt.plot(np.array(all_epoch_test_valid_accs))
    for i in range(num_rotations):
        plt.plot(np.array(all_epoch_test_valid_per_rot_accs)[:, i])
    plt.legend(['overall acc', '0 deg acc', '90 deg acc', '180 deg acc', '270 deg acc'],
               loc='upper right')
    plt.savefig(os.path.join(snapshot_path, 'test_valid_all_rot_acc.png'))
    plt.clf()
    plt.close()

    # this is to be consistent with the file name written
    filename = get_filename(network.name, best_epoch + 1)
    with open(os.path.join(snapshot_path, 'unsup_vgg_best_model_meta.pkl'), 'wb') as unsup_file:
        pickle.dump(filename, unsup_file, protocol=2)

    log(f, 'Exiting train...')
    f.close()
    return

def main():
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    chunk_size = torch.tensor(0)
    dist.recv(chunk_size, src=0)
    num_data = chunk_size.item()

    is_train = True
    alpha = args.alpha
    pattern_list = ['random', 'lowbias', 'midbias', 'highbias']
    datanum_list = ['balance', 'lowimbalance', 'highimbalance']

    checkpoint_dir = '/data/jcliu/FL/RE-AFL/client_result/client_' + str(args.rank) + '/'
    print("Create client dir success")
    fl_utils.create_dir(checkpoint_dir)
    fig_dir = checkpoint_dir + 'figure/'
    fl_utils.create_dir(fig_dir)
    MODEL_PATH = checkpoint_dir + 'model/'
    fl_utils.create_dir(MODEL_PATH)

    LOAD_MODEL_PATH = MODEL_PATH + 'alpha_' + str(alpha) + '_model-type_' + args.model_type + '_dataset-type' + args.dataset_type + \
        '_batch-size' + str(args.batch_size) + '_tx2nums' + str(args.world_size) + '_' + pattern_list[args.pattern_idx] + 'data-pattern' + \
        datanum_list[args.datanum_idx] + 'data' + '_exit-loss' + str(exit_loss_threshold) + '_lr' + str(args.lr) + '_epoch' + str(args.epochs) + '_local' + str(args.local_iters) + '.pth'
    SAVE_MODEL_PATH = MODEL_PATH + 'alpha_' + str(alpha) + '_model-type_' + args.model_type + '_dataset-type' + args.dataset_type + \
        '_batch-size' + str(args.batch_size) + '_tx2nums' + str(args.world_size) + '_' + pattern_list[args.pattern_idx] + 'data-pattern' + \
        datanum_list[args.datanum_idx] + 'data' + '_exit-loss' + str(exit_loss_threshold) + '_lr' + str(args.lr) + '_epoch' + str(args.epochs) + '_local' + str(args.local_iters) + '.pth'
    LOG_ROOT_PATH = checkpoint_dir + 'log/' + '/alpha_' + str(alpha) + '/model-type_' + args.model_type + '_dataset-type' + args.dataset_type + \
        '_batch-size' + str(args.batch_size) + '_tx2nums' + str(args.world_size) + '_' + pattern_list[args.pattern_idx] + 'data-pattern' + \
        datanum_list[args.datanum_idx] + 'data' + '_exit-loss' + str(exit_loss_threshold) + '_lr' + str(args.lr) + '_epoch' + str(args.epochs) + '_local' + str(args.local_iters) + '/'
    fl_utils.create_dir(LOG_ROOT_PATH)
    LOG_PATH = LOG_ROOT_PATH + 'model_acc_loss.txt'
    log_out = open(LOG_PATH, 'w+')

    # if args.epoch_start == 0:
    #     log_out.write("%s\n" % LOG_PATH)
    # if not args.epoch_start == 0:
    #     model.load_state_dict(torch.load(LOAD_MODEL_PATH))
    # log_out = dict()
    # log_out["model_acc_loss"] = open(os.path.join(LOG_ROOT_PATH, "model_acc_loss.txt"), 'w+')

    # <-- Load datasets
    train_dataset, test_dataset = fl_datasets.load_datasets(args.dataset_type)
    # train_dataset, test_dataset = load_data()
    # train_loader = torch.utils.data.DataLoader(train_dataset,
    #     batch_size=args.batch_size, shuffle=True, **kwargs)
    # test_loader = torch.utils.data.DataLoader(test_dataset,
    #     batch_size=args.test_batch_size, shuffle=True, **kwargs)

    pattern_idx = args.pattern_idx
    datanum_idx = args.datanum_idx

    # <-- Create federated train/test loaders for virtual machines
    if pattern_idx == 0:  # random data (IID)
        if datanum_idx != 0:  # imbalanced data
            is_train = True
            tx2_train_loader = fl_utils.create_random_loader(
                args, kwargs, args.rank, num_data, is_train, train_dataset)
            is_train = False
            tx2_test_loader = fl_utils.create_random_loader(
                args, kwargs, args.rank, num_data, is_train, test_dataset)
        else:  # balanced data
            is_train = True
            tx2_train_loader = fl_utils.create_segment_loader(
                args, kwargs, args.world_size, args.rank, is_train, train_dataset)
            is_train = False
            tx2_test_loader = fl_utils.create_segment_loader(
                args, kwargs, args.world_size, args.rank, is_train, test_dataset)
    else:  # biased data partition (Non-IID)
        if pattern_idx == 1:  # lowbias
            label_clusters = ((0, 1, 2, 3, 4), (5, 6, 7, 8, 9))
        elif pattern_idx == 2:  # midbias
            label_clusters = ((0, 1), (2, 3), (4, 5), (6, 7), (8, 9))
        elif pattern_idx == 3:  # highbias
            label_clusters = ((0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,))

        class_num = len(train_dataset.classes)
        cluster_len = len(label_clusters)
        for idx in range(cluster_len):
            train_data_tmp, train_targets_tmp = fl_utils.create_bias_selected_data(
                args, label_clusters[idx], train_dataset)
            test_data_tmp, test_targets_tmp = fl_utils.create_bias_selected_data(
                args, label_clusters[idx], test_dataset)
            if idx == 0:
                train_data = train_data_tmp
                train_targets = train_targets_tmp
                test_data = test_data_tmp
                test_targets = test_targets_tmp
            else:
                train_data = np.vstack((train_data, train_data_tmp))
                train_targets = np.hstack((train_targets, train_targets_tmp))
                test_data = np.vstack((test_data, test_data_tmp))
                test_targets = np.hstack((test_targets, test_targets_tmp))

        new_train_dataset = fl_datasets.train_test_dataset(train_data, train_targets, class_num)
        new_test_dataset = fl_datasets.train_test_dataset(test_data, test_targets, class_num)

        is_train = True
        tx2_train_loader = fl_utils.create_segment_loader(
            args, kwargs, args.world_size, args.rank, is_train, new_train_dataset)
        is_train = False
        tx2_test_loader = fl_utils.create_segment_loader(
            args, kwargs, args.world_size, args.rank, is_train, new_test_dataset)

    del train_dataset
    del test_dataset

    # test loader
    # self.test_loader = fl_utils.create_ps_test_loader(
    #     args, kwargs, self.param_server, test_dataset)

    # pattern_list = ['bias', 'partition', 'random']
    # pattern_idx = args.pattern_idx
    # # <-- Create federated train/test loaders for virtual machines
    # if pattern_idx == 0:
    #     # class_num = len(train_dataset.classes)
    #     # step = np.int32(np.floor(class_num / args.vm_num))
    #     if args.world_size == 5:
    #         # <-- the number of items must equal args.vm_num
    #         self.selected_idxs = ((0, 1), (2, 3), (4, 5), (6, 7), (8, 9))
    #     elif args.world_size == 10:
    #         # <-- the number of items must equal args.vm_num
    #         self.selected_idxs = (
    #             (0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,))
    #     else:
    #         class_num = len(train_dataset.classes)
    #         step = np.int32(np.floor(class_num / args.vm_num))
    #         self.selected_idxs = [
    #             [idx + n for n in range(step)] for idx in range(0, class_num - step + 1, step)]
    #     is_train = True
    #     self.vm_train_loaders = fl_utils.create_bias_federated_loader(
    #         args, kwargs, self.vm_list, is_train, train_dataset, self.selected_idxs)
    #     is_train = False
    #     self.vm_test_loaders = fl_utils.create_bias_federated_loader(
    #         args, kwargs, self.vm_list, is_train, test_dataset, self.selected_idxs)
    # elif pattern_idx == 1:
    #     # <-- the number of items must equal args.vm_num
    #     partition_ratios = [1/2, 1/4, 1/8, 1/16, 1/16]
    #     is_train = True
    #     self.vm_train_loaders = fl_utils.create_labelwise_federated_loader(
    #         args, kwargs, self.vm_list, is_train, train_dataset, partition_ratios)
    #     is_train = False
    #     self.vm_test_loaders = fl_utils.create_labelwise_federated_loader(
    #         args, kwargs, self.vm_list, is_train, test_dataset, partition_ratios)
    # else:
    #     is_train = True
    #     self.vm_train_loaders = fl_utils.create_segment_federated_loader(
    #         args, kwargs, self.vm_list, is_train, train_dataset)
    #     is_train = False
    #     self.vm_test_loaders = fl_utils.create_segment_federated_loader(
    #         args, kwargs, self.vm_list, is_train, test_dataset)

    # <-- Create Neural Network model instance
    if args.dataset_type == 'FashionMNIST':
        if args.model_type == 'LR':
            model = fl_models.MNIST_LR_Net().to(device)
        else:
            model = fl_models.MNIST_Net().to(device)
    elif args.dataset_type == 'MNIST':
        if args.model_type == 'LR':
            model = fl_models.MNIST_LR_Net().to(device)
        else:
            model = fl_models.MNIST_Small_Net().to(device)
    elif args.dataset_type == 'CIFAR10':
        if args.model_type == 'Deep':
            model = fl_models.CIFAR10_Deep_Net().to(device)
            args.decay_rate = 0.98
        else:
            model = fl_models.CIFAR10_Net().to(device)
            args.decay_rate = 0.98
    elif args.dataset_type == 'Sent140':
        if args.model_type == 'LSTM':
            model = fl_models.Sent140_Net().to(device)
            args.decay_rate = 0.99
        else:
            model = fl_models.Sent140_Net().to(device)
            args.decay_rate = 0.99
    else:
        pass

    model_layers_num = len(list(model.named_parameters()))
    if not args.epoch_start == 0:
        model.load_state_dict(torch.load(LOAD_MODEL_PATH))
    print("Model and Dataset ok")

    # model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    # global_para = model.state_dict().copy()
    # global_para = list(model.state_dict())
    global_para = [para[1].data for para in model.named_parameters()]

    start = time.time()
    for j in range(len(global_para)):
        temp = global_para[j].to('cpu')
        dist.recv(temp, src=0)
        global_para[j] = temp.to('cuda')
    global_epoch = torch.tensor(0)
    dist.recv(global_epoch, src=0)
    # print("Received global para from the server")
    apply_global_para(model, global_para)

    for epoch in range(1, args.epochs + 1):
        print("Epoch %d" % epoch)
        # plt.ioff()
        train(args, start, model, device, tx2_train_loader, tx2_test_loader,
              optimizer, epoch, log_out)
        print("train ok")
        global_para = [para[1].data for para in model.named_parameters()]
        # local_para = [para[1].data for para in model.named_parameters()]
        for j in range(len(global_para)):
            dist.send(global_para[j].to('cpu'), dst=0)
        # print("Send para to the server")
        for j in range(len(global_para)):
            temp = global_para[j].to('cpu')
            dist.recv(temp, src=0)
            global_para[j] = temp.to('cuda')
        dist.recv(global_epoch, src=0)
        # print("received server epoch: ", global_epoch)
        if global_epoch == args.epochs:
            break
        apply_global_para(model, global_para)

# cudnn.enabled = False
cudnn.benchmark = True
kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}

train_dataset = datasets.MNIST('./data', train=True, download=True,
                               transform=transforms.Compose([
                                   transforms.ToTensor(),
                                   transforms.Normalize((0.1307,), (0.3081,))]))
test_dataset = datasets.MNIST('./data', train=False,
                              transform=transforms.Compose([
                                  transforms.ToTensor(),
                                  transforms.Normalize((0.1307,), (0.3081,))]))
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=args.batch_size, shuffle=True, **kwargs)

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

if not os.path.isdir(args.snapshot):
    os.mkdir(args.snapshot)
# else:
#     files = glob.glob(args.snapshot + '/*')
#     for f in files:
#         os.remove(f)

start_epoch = 1
if args.resume:
    if os.path.isfile(args.resume):
        print("Loading snapshot '{}'".format(args.resume))
        snapshot = torch.load(args.resume)

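# --- Added check (not in the original): the magic constants (0.1307,) and
# (0.3081,) are the MNIST training-set mean and std; they can be recomputed:
import torch
from torchvision import datasets, transforms

mnist = datasets.MNIST('./data', train=True, download=True,
                       transform=transforms.ToTensor())
pixels = torch.stack([img for img, _ in mnist])
print(pixels.mean().item(), pixels.std().item())  # ~0.1307, ~0.3081
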
def main(config):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_transform = transforms.Compose([
        transforms.Resize(256),  # transforms.Scale is a deprecated alias for Resize
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor()])

    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.ToTensor()])

    test_transform = transforms.Compose([
        transforms.ToTensor()])

    trainset = AVADataset(csv_file=config.train_csv_file, root_dir=config.train_img_path,
                          transform=train_transform)
    valset = AVADataset(csv_file=config.val_csv_file, root_dir=config.val_img_path,
                        transform=val_transform)

    train_loader = torch.utils.data.DataLoader(trainset, batch_size=config.train_batch_size,
                                               shuffle=True, num_workers=config.num_workers)
    val_loader = torch.utils.data.DataLoader(valset, batch_size=config.val_batch_size,
                                             shuffle=False, num_workers=config.num_workers)

    base_model = models.vgg16(pretrained=True)
    # base_model = models.resnet18(pretrained=True)
    # base_model = models.inception_v3(pretrained=True)
    model = NIMA(base_model)
    # model = NIMA()

    if config.warm_start:
        model.load_state_dict(torch.load(os.path.join(
            config.ckpt_path, 'epoch-%d.pkl' % config.warm_start_epoch)))
        print('Successfully loaded model epoch-%d.pkl' % config.warm_start_epoch)

    if config.multi_gpu:
        model.features = torch.nn.DataParallel(model.features, device_ids=config.gpu_ids)
        model = model.to(device)
    else:
        model = model.to(device)

    conv_base_lr = config.conv_base_lr
    dense_lr = config.dense_lr
    optimizer = optim.SGD([
        {'params': model.features.parameters(), 'lr': conv_base_lr},
        {'params': model.classifier.parameters(), 'lr': dense_lr}],
        momentum=0.6)

    criterion = torch.nn.L1Loss()

    # send hyperparams
    lrs.send({
        'title': 'EMD Loss',
        'train_batch_size': config.train_batch_size,
        'val_batch_size': config.val_batch_size,
        'optimizer': 'SGD',
        'conv_base_lr': config.conv_base_lr,
        'dense_lr': config.dense_lr,
        'momentum': 0.9
    })

    param_num = 0
    for param in model.parameters():
        param_num += int(np.prod(param.shape))
    print('Trainable params: %.2f million' % (param_num / 1e6))

    if config.train:
        # for early stopping
        count = 0
        init_val_loss = float('inf')
        train_losses = []
        val_losses = []
        for epoch in range(config.warm_start_epoch, config.epochs):
            lrs.send('epoch', epoch)
            batch_losses = []
            for i, data in enumerate(train_loader):
                images = data['image'].to(device)
                labels = data['annotations'].to(device).float()
                outputs = model(images)
                outputs = outputs.view(-1, 1, 1)

                optimizer.zero_grad()
                loss = criterion(outputs, labels)
                # loss = emd_loss(labels, outputs)
                batch_losses.append(loss.item())

                loss.backward()
                optimizer.step()

                lrs.send('train_emd_loss', loss.item())
                # print('Epoch: %d/%d | Step: %d/%d | Training EMD loss: %.4f' % (
                #     epoch + 1, config.epochs, i + 1,
                #     len(trainset) // config.train_batch_size + 1, loss.data[0]))

            avg_loss = sum(batch_losses) / (len(trainset) // config.train_batch_size + 1)
            train_losses.append(avg_loss)
            print('Epoch %d averaged training EMD loss: %.4f' % (epoch + 1, avg_loss))

            # exponential learning rate decay
            if (epoch + 1) % 10 == 0:
                conv_base_lr = conv_base_lr * config.lr_decay_rate ** ((epoch + 1) / config.lr_decay_freq)
                dense_lr = dense_lr * config.lr_decay_rate ** ((epoch + 1) / config.lr_decay_freq)
                optimizer = optim.SGD([
                    {'params': model.features.parameters(), 'lr': conv_base_lr},
                    {'params': model.classifier.parameters(), 'lr': dense_lr}],
                    momentum=0.6)

                # send decay hyperparams
config.conv_base_lr, 'dense_lr': config.dense_lr }) # do validation after each epoch batch_val_losses = [] for data in val_loader: images = data['image'].to(device) labels = data['annotations'].to(device).float() with torch.no_grad(): outputs = model(images) val_outputs = outputs.view(-1, 1, 1) val_loss = criterion(val_outputs, labels) # val_loss = emd_loss(labels, outputs) batch_val_losses.append(val_loss.item()) avg_val_loss = sum(batch_val_losses) / (len(valset) // config.val_batch_size + 1) val_losses.append(avg_val_loss) lrs.send('val_emd_loss', avg_val_loss) print('Epoch %d completed. Averaged MSE loss on val set: %.4f. Inital val loss : %.4f.' % (epoch + 1, avg_val_loss, init_val_loss)) # Use early stopping to monitor training if avg_val_loss < init_val_loss: init_val_loss = avg_val_loss # save model weights if val loss decreases print('Saving model...') torch.save(model.state_dict(), os.path.join(config.ckpt_path, 'epoch-%d.pkl' % (epoch + 1))) print('Done.\n') # reset count count = 0 elif avg_val_loss >= init_val_loss: count += 1 if count == config.early_stopping_patience: print('Val EMD loss has not decreased in %d epochs. Training terminated.' % config.early_stopping_patience) # break print('Training completed.') if config.save_fig: # plot train and val loss epochs = range(1, epoch + 2) plt.plot(epochs, train_losses, 'b-', label='train loss') plt.plot(epochs, val_losses, 'g-', label='val loss') plt.title('EMD loss') plt.legend() plt.savefig('./loss.png') if config.test: start.record() print('Testing') # compute mean score test_transform = test_transform#val_transform testset = AVADataset(csv_file=config.test_csv_file, root_dir=config.test_img_path, transform=val_transform) test_loader = torch.utils.data.DataLoader(testset, batch_size=config.test_batch_size, shuffle=False, num_workers=config.num_workers) mean_preds = np.zeros(45) mean_labels = np.zeros(45) # std_preds = [] count = 0 for data in test_loader: im_id = data['img_id'] image = data['image'].to(device) labels = data['annotations'].to(device).float() output = model(image) output = output.view(1, 1) bpred = output.to(torch.device("cpu")) cpred = bpred.data.numpy() blabel = labels.to(torch.device("cpu")) clabel = blabel.data.numpy() # predicted_mean, predicted_std = 0.0, 0.0 # for i, elem in enumerate(output, 1): # predicted_mean += i * elem # for j, elem in enumerate(output, 1): # predicted_std += elem * (i - predicted_mean) ** 2 mean_preds[count] = cpred mean_labels[count] = clabel print(im_id,mean_preds[count]) count= count+1 # std_preds.append(predicted_std) # Do what you want with predicted and std... end.record()
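# The training above logs 'EMD Loss' but actually optimizes L1 on mean scores;
# the commented-out emd_loss() appears to reference NIMA's squared EMD over the
# 10-bin score distribution. A minimal sketch of that loss, assuming (N, 10)
# probability distributions as inputs (an illustration, not this repo's code):
import torch

def emd_loss_sketch(p_target, p_estimate, r=2):
    # for 1-D distributions, EMD is a norm of the difference of the CDFs
    cdf_diff = torch.cumsum(p_target, dim=-1) - torch.cumsum(p_estimate, dim=-1)
    return cdf_diff.abs().pow(r).mean(dim=-1).pow(1.0 / r).mean()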
middle_shape = 30 num_data_point = 256 buffer_size = 256 num_epoch = 8000 batch_size = 256 k = 4 data_x = np.random.randn(num_data_point, data_shape) data_x = data_x / np.linalg.norm(data_x, axis=1, keepdims=True) data_y = np.random.rand(num_data_point, 1) * 2 - 1 buffer_index = np.random.choice(num_data_point, buffer_size, replace=False) buffer_y = data_y[buffer_index] repr_model = SimpleNet(num_data_point, data_shape, middle_shape) # repr_model_target = SimpleNet(num_data_point,data_shape, middle_shape) # training optimizer = optim.SGD(repr_model.parameters(), lr=1e-4, weight_decay=0) torch_data_x = torch.tensor(data_x, dtype=torch.float32) torch_data_y = torch.tensor(data_y, dtype=torch.float32) rep_buffer = np.array([ repr_model(onehot([x])).detach().numpy().reshape(-1) for x in range(len(data_x)) ]) for epoch in range(num_epoch): repr_model.train() rep_buffer = np.array([ repr_model(onehot([x])).detach().numpy().reshape(-1) for x in range(len(data_x)) ]) # if epoch % 12000 == 0: # plot(rep_buffer, buffer_y, "rep_buffer")
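# onehot() and SimpleNet are used above but defined elsewhere; a hypothetical
# onehot() consistent with how it is called (a list of indices in, float32
# one-hot rows of width num_data_point out) might look like this:
def onehot(indices, size=num_data_point):
    out = torch.zeros(len(indices), size)
    out[torch.arange(len(indices)), torch.tensor(indices)] = 1.0
    return out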
def final_training(log_dirs, config):
    for c, experiment in enumerate(config["experiments"]):
        log_dir = log_dirs[c]
        print("using logs from: ", log_dir)
        basic_settings = experiment["basic_settings"]
        # data_manager
        iD = basic_settings.get("iD", "Cifar10")
        OoD = basic_settings.get("OoD", ["Fashion_MNIST"])
        labelled_size = basic_settings.get("labelled_size", 3000)
        pool_size = basic_settings.get("pool_size", 20000)
        OOD_ratio = basic_settings.get("OOD_ratio", 0.0)

        # training settings
        epochs = 130  # basic_settings.get("epochs", 200)
        batch_size = basic_settings.get("batch_size", 128)
        weight_decay = basic_settings.get("weight_decay", 1e-4)
        lr = basic_settings.get("lr", 0.1)
        nesterov = basic_settings.get("nesterov", False)
        momentum = basic_settings.get("momentum", 0.9)
        num_classes = basic_settings.get("num_classes", 10)
        # criterion = basic_settings.get("criterion", "crossentropy")
        metric = basic_settings.get("metric", "accuracy")
        # logging
        verbose = basic_settings.get("verbose", 1)
        criterion = nn.CrossEntropyLoss()

        subclass = basic_settings.get("subclass", {"do_subclass": False})

        # write the result header once; the original wrote two conflicting
        # headers, the first of which was immediately truncated by the second.
        # This single header matches the rows appended below.
        with open(os.path.join(log_dir, "final_result.csv"), "w", encoding="utf-8") as result_file:
            result_file.write(
                "exp_name,trainsize,OOD_ratio,avg_train_acc,avg_train_loss,avg_test_acc,avg_test_loss\n"
            )

        for exp_setting in experiment["exp_settings"]:
            exp_name = exp_setting.get("exp_name", "standard_name")
            data_manager = Data_manager(
                iD_datasets=[iD],
                OoD_datasets=OoD,
                labelled_size=labelled_size,
                pool_size=pool_size,
                OoD_ratio=OOD_ratio,
                test_iD_size=None,
                subclass=subclass,
            )

            if not exp_setting.get("perform_experiment", True):
                continue
            else:
                print("performing final training for: ", exp_name)

            try:
                # data_manager.create_merged_data() TODO load the statusmanager from the path
                check_path = os.path.join(
                    log_dir, "status_manager_dir",
                    f"{exp_name}-result-statusmanager.csv")

                exp_type = exp_setting.get("exp_type", "baseline")
                max_disc = exp_type == "max_disc"

                print("loading statusmanager: ", check_path)
                if os.path.exists(check_path):
                    data_manager.status_manager = pd.read_csv(check_path, index_col=0)
                    # self.data_manager.reset_pool()
                    data_manager.iter = 19
                    print("loaded statusmanager from file")
                else:
                    print(f"couldn't load statusmanager, aborting: {exp_name}")
                    break

                result_tup = create_dataloader(data_manager, batch_size, 0.1, validation_source=None)
                train_loader = result_tup[0]
                test_loader = result_tup[1]
                # val_loader = result_tup[3]

                device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

                if not max_disc:
                    model = get_model("base", num_classes=num_classes)
                else:
                    model = get_model("maximum_discrepancy", num_classes=num_classes)
                model.to(device)

                optimizer = optim.SGD(
                    model.parameters(),
                    weight_decay=weight_decay,
                    lr=lr,
                    momentum=momentum,
                    nesterov=nesterov,
                )

                # device is a torch.device, so compare its .type, not the object itself
                if device.type == "cuda":
                    torch.backends.cudnn.benchmark = True

                model, avg_train_loss, avg_train_acc = train(
                    train_loader=train_loader,
                    val_loader=None,
                    optimizer=optimizer,
                    criterion=criterion,
                    device=device,
                    epochs=epochs,
                    model=model,
                    verbose=verbose,
                    max_disc=max_disc)

                avg_test_acc, avg_test_loss = test(model, test_loader, device, criterion, max_disc=max_disc)

                # report the number of samples, not len(train_loader) (batches)
                print(f"Experiment: {exp_name}, "
                      f"Final_trainingset size: {len(train_loader.dataset)}, "
                      f"OOD_ratio: {OOD_ratio}, "
                      f"Train-Accuracy: {avg_train_acc}, "
                      f"Train-Loss: {avg_train_loss}, "
                      f"Test-Accuracy: {avg_test_acc}, "
                      f"Test-Loss: {avg_test_loss}")

                with open(os.path.join(log_dir, "final_result.csv"), "a", encoding="utf-8") as result_file:
                    result_file.write(
                        f"{exp_name},{len(train_loader.dataset)},{OOD_ratio},{avg_train_acc},{avg_train_loss},{avg_test_acc},{avg_test_loss}\n"
                    )
            except Exception as e:
                print(f"{exp_name} failed with Exception {e}")
if device == 'cuda': net = torch.nn.DataParallel(net) cudnn.benchmark = True if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!' checkpoint = torch.load('./checkpoint/ckpt.pth') net.load_state_dict(checkpoint['net']) best_acc = checkpoint['acc'] start_epoch = checkpoint['epoch'] criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) # Training def train(epoch): print('\nEpoch: %d' % epoch) net.train() train_loss = 0 correct = 0 total = 0 for batch_idx, (inputs, targets) in enumerate(trainloader): inputs, targets = inputs.to(device), targets.to(device) optimizer.zero_grad() outputs = net(inputs) loss = criterion(outputs, targets)
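# The resume block above reads 'net', 'acc', and 'epoch' from ckpt.pth; the
# matching save side would look roughly like this (a sketch consistent with
# those keys, not necessarily the repo's exact code):
def save_checkpoint(net, acc, epoch):
    state = {'net': net.state_dict(), 'acc': acc, 'epoch': epoch}
    os.makedirs('checkpoint', exist_ok=True)
    torch.save(state, './checkpoint/ckpt.pth')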
valid_data_gen = torch.utils.data.DataLoader(valid, batch_size=64, num_workers=3)
dataset_sizes = {'train': len(train_data_gen.dataset), 'valid': len(valid_data_gen.dataset)}
dataloaders = {'train': train_data_gen, 'valid': valid_data_gen}

model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 2)

if torch.cuda.is_available():
    model_ft = model_ft.cuda()

# Loss and Optimizer
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.SGD(model_ft.parameters(), lr=learning_rate, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

import copy

def train_model(model, criterion, optimizer, scheduler, num_epochs=5):
    since = time.time()
    # deep-copy the weights; state_dict() alone returns references that keep
    # changing as training proceeds
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['train', 'valid']:
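# The function above is cut off at the phase loop; a sketch of how such a
# train/valid phase body is typically completed (an assumption about this
# file's continuation, following the standard torchvision transfer-learning
# recipe):
def run_phase(model, phase, criterion, optimizer):
    if phase == 'train':
        model.train()
    else:
        model.eval()
    running_loss, running_corrects = 0.0, 0
    for inputs, labels in dataloaders[phase]:
        if torch.cuda.is_available():
            inputs, labels = inputs.cuda(), labels.cuda()
        with torch.set_grad_enabled(phase == 'train'):
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if phase == 'train':
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        running_loss += loss.item() * inputs.size(0)
        running_corrects += (outputs.argmax(1) == labels).sum().item()
    return running_loss / dataset_sizes[phase], running_corrects / dataset_sizes[phase]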
def train(): # Model parameters g_input_size = 1 # Random noise dimension coming into generator, per output vector g_hidden_size = 5 # Generator complexity g_output_size = 1 # Size of generated output vector d_input_size = 500 # Minibatch size - cardinality of distributions d_hidden_size = 10 # Discriminator complexity d_output_size = 1 # Single dimension for 'real' vs. 'fake' classification minibatch_size = d_input_size d_learning_rate = 1e-3 g_learning_rate = 1e-3 sgd_momentum = 0.9 num_epochs = 5000 print_interval = 100 d_steps = 20 g_steps = 20 dfe, dre, ge = 0, 0, 0 d_real_data, d_fake_data, g_fake_data = None, None, None discriminator_activation_function = torch.sigmoid generator_activation_function = torch.tanh d_sampler = get_distribution_sampler(data_mean, data_stddev) gi_sampler = get_generator_input_sampler() G = Generator(input_size=g_input_size, hidden_size=g_hidden_size, output_size=g_output_size, f=generator_activation_function) D = Discriminator(input_size=d_input_func(d_input_size), hidden_size=d_hidden_size, output_size=d_output_size, f=discriminator_activation_function) criterion = nn.BCELoss( ) # Binary cross entropy: http://pytorch.org/docs/nn.html#bceloss d_optimizer = optim.SGD(D.parameters(), lr=d_learning_rate, momentum=sgd_momentum) g_optimizer = optim.SGD(G.parameters(), lr=g_learning_rate, momentum=sgd_momentum) for epoch in range(num_epochs): for d_index in range(d_steps): # 1. Train D on real+fake D.zero_grad() # 1A: Train D on real d_real_data = Variable(d_sampler(d_input_size)) d_real_decision = D(preprocess(d_real_data)) d_real_error = criterion(d_real_decision, Variable(torch.ones([1, 1]))) # ones = true d_real_error.backward( ) # compute/store gradients, but don't change params # 1B: Train D on fake d_gen_input = Variable(gi_sampler(minibatch_size, g_input_size)) d_fake_data = G(d_gen_input).detach( ) # detach to avoid training G on these labels d_fake_decision = D(preprocess(d_fake_data.t())) d_fake_error = criterion(d_fake_decision, Variable(torch.zeros([1, 1 ]))) # zeros = fake d_fake_error.backward() d_optimizer.step( ) # Only optimizes D's parameters; changes based on stored gradients from backward() dre, dfe = extract(d_real_error)[0], extract(d_fake_error)[0] for g_index in range(g_steps): # 2. Train G on D's response (but DO NOT train D on these labels) G.zero_grad() gen_input = Variable(gi_sampler(minibatch_size, g_input_size)) g_fake_data = G(gen_input) dg_fake_decision = D(preprocess(g_fake_data.t())) g_error = criterion(dg_fake_decision, Variable(torch.ones( [1, 1]))) # Train G to pretend it's genuine g_error.backward() g_optimizer.step() # Only optimizes G's parameters ge = extract(g_error)[0] if epoch % print_interval == 0: print( "Epoch %s: D (%s real_err, %s fake_err) G (%s err); Real Dist (%s), Fake Dist (%s) " % (epoch, dre, dfe, ge, stats( extract(d_real_data)), stats(extract(d_fake_data)))) if matplotlib_is_available: print("Plotting the generated distribution...") values = extract(g_fake_data) print(" Values: %s" % (str(values))) plt.hist(values, bins=50) plt.xlabel('Value') plt.ylabel('Count') plt.title('Histogram of Generated Distribution') plt.grid(True) plt.show()
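# get_distribution_sampler and get_generator_input_sampler are called above but
# defined elsewhere; sketches consistent with their call sites (real data drawn
# from a Gaussian, generator noise drawn uniformly) would be:
def get_distribution_sampler(mu, sigma):
    return lambda n: torch.Tensor(np.random.normal(mu, sigma, (1, n)))  # (1, n) real samples

def get_generator_input_sampler():
    return lambda m, n: torch.rand(m, n)  # uniform noise, shape (m, n)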
def adversarial_learning(best_cla_model_path):
    # Device configuration
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # print(device)
    parser = argparse.ArgumentParser("Image classification!")
    parser.add_argument('--input_dir_trainSet', type=str,
                        default='D:/python_workplace/resnet-AE/checkpoint/Joint_Training/ResNet18/cifar10/train/train.pkl',
                        help='data set dir path')
    parser.add_argument('--input_dir_testSet', type=str,
                        default='D:/python_workplace/resnet-AE/checkpoint/Joint_Training/ResNet18/cifar10/test/test.pkl',
                        help='data set dir path')
    parser.add_argument('--epochs', type=int, default=300, help='Epoch default:50.')
    parser.add_argument('--image_size', type=int, default=32, help='Image Size default:28.')
    parser.add_argument('--batch_size', type=int, default=512, help='Batch_size default:256.')
    parser.add_argument('--lr', type=float, default=0.01, help='learning_rate. Default=0.01')
    parser.add_argument('--num_classes', type=int, default=10, help='num classes')
    parser.add_argument('--model_path', type=str,
                        default='D:/python_workplace/resnet-AE/checkpoint/AdversarialLearning/ResNet18/cifar10/model/',
                        help='Save model path')
    parser.add_argument('--acc_file_path', type=str,
                        default='D:/python_workplace/resnet-AE/checkpoint/AdversarialLearning/ResNet18/cifar10/acc.txt',
                        help='Save accuracy file')
    parser.add_argument('--best_acc_file_path', type=str,
                        default='D:/python_workplace/resnet-AE/checkpoint/'
                                'AdversarialLearning/ResNet18/cifar10/best_acc.txt',
                        help='Save best accuracy file')
    parser.add_argument('--log_file_path', type=str,
                        default='D:/python_workplace/resnet-AE/checkpoint/AdversarialLearning/ResNet18/cifar10/log.txt',
                        help='Save log file')
    args = parser.parse_args()

    # Load model
    model = resnet_cifar.resnet18(pretrained=False)
    model.to(device)
    # summary(model, (3, 32, 32))
    # print(model)

    # Load pre-trained weights
    model.load_state_dict(torch.load(best_cla_model_path))
    model.to(device)

    # criterion
    criterion = nn.CrossEntropyLoss().to(device)

    # batch_shape
    batch_shape = [args.batch_size, 3, args.image_size, args.image_size]

    best_acc_clean = 0  # initialize best accuracy on the clean test set
    best_acc_adv = 0  # initialize best accuracy on the adversarial test set
    best_epoch = 0  # initialize best epoch
    time_k = time.time()

    print("Start Adversarial Training, Resnet-18!")
    with open(args.acc_file_path, "w") as f1:
        with open(args.log_file_path, "w") as f2:
            for epoch in range(0, args.epochs):
                # step-wise learning rate schedule
                if epoch + 1 <= 100:
                    args.lr = 0.1
                elif 100 < epoch + 1 <= 200:
                    args.lr = 0.01
                elif 200 < epoch + 1 <= 250:
                    args.lr = 0.001
                else:
                    args.lr = 0.0001
                # Optimization
                optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
                print('Epoch: %d' % (epoch + 1))
                sum_loss = 0.0
                correct = 0.0
                total = 0.0
                batchId = 1
                for batchSize, images_train, labels_train in load_train_set(args.input_dir_trainSet, batch_shape):
                    start = time.time()
                    # data prepare
                    images_train = torch.from_numpy(images_train).type(torch.FloatTensor).to(device)
                    labels_train = torch.from_numpy(labels_train).type(torch.LongTensor).to(device)
                    model.to(device)
                    model.train()
                    optimizer.zero_grad()

                    # forward + backward
                    outputs = model(images_train)
                    loss = criterion(outputs, labels_train)
                    loss.backward()
                    optimizer.step()

                    # print loss and accuracy after every batch
                    sum_loss += loss.item()
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels_train.size(0)
                    correct += predicted.eq(labels_train.data).cpu().sum().item()
                    # print(100. * correct / total)
                    end = time.time()
                    print('[Epoch:%d/%d] | [Batch:%d/%d] | Loss: %.03f | Acc: %.2f%% | Lr: %.04f | Time: %.03fs'
                          % (epoch + 1, args.epochs, batchId, (100000 // args.batch_size) + 1,
                             sum_loss / batchId, correct / total * 100, args.lr, (end - start)))
                    f2.write('[Epoch:%d/%d] | [Batch:%d/%d] | Loss: %.03f | Acc: %.2f%% | Lr: %.4f | Time: %.3fs'
                             % (epoch + 1, args.epochs, batchId, (100000 // args.batch_size) + 1,
                                sum_loss / batchId, correct / total * 100, args.lr, (end - start)))
                    f2.write('\n')
                    f2.flush()
                    batchId += 1

                # evaluate accuracy after each epoch of training
                if (epoch + 1) % 50 == 0:
                    print("Waiting for Testing!")
                    with torch.no_grad():
                        # evaluate on the clean test set
                        correct_clean = 0
                        total_clean = 0
                        for batchSize, images_test_clean, labels_test_clean in load_test_set_clean(args.input_dir_testSet, batch_shape):
                            model.eval()
                            # data prepare
                            images_test_clean = torch.from_numpy(images_test_clean).type(torch.FloatTensor).to(device)
                            labels_test_clean = torch.from_numpy(labels_test_clean).type(torch.LongTensor).to(device)
                            model.to(device)
                            outputs = model(images_test_clean)
                            # take the class with the highest score (index into outputs.data)
                            _, predicted = torch.max(outputs.data, 1)
                            total_clean += labels_test_clean.size(0)
                            correct_clean += (predicted == labels_test_clean).sum().item()
                        print('Clean Test Set Accuracy: %.2f%%' % (correct_clean / total_clean * 100))
                        acc_clean = correct_clean / total_clean * 100

                        # evaluate on the adversarial test set
                        correct_adv = 0
                        total_adv = 0
                        for batchSize, images_test_adv, labels_test_adv in load_test_set_adv(args.input_dir_testSet, batch_shape):
                            model.eval()
                            # data prepare
                            images_test_adv = torch.from_numpy(images_test_adv).type(torch.FloatTensor).to(device)
                            labels_test_adv = torch.from_numpy(labels_test_adv).type(torch.LongTensor).to(device)
                            model.to(device)
                            outputs = model(images_test_adv)
                            # take the class with the highest score (index into outputs.data)
                            _, predicted = torch.max(outputs.data, 1)
                            total_adv += labels_test_adv.size(0)
                            correct_adv += (predicted == labels_test_adv).sum().item()
                        print('Adv Test Set Accuracy: %.2f%%' % (correct_adv / total_adv * 100))
                        acc_adv = correct_adv / total_adv * 100

                        # write test set accuracies to acc.txt
                        f1.write("Epoch=%03d,Clean Test Set Accuracy= %.2f%%" % (epoch + 1, acc_clean))
                        f1.write('\n')
                        f1.write("Epoch=%03d,Adv Test Set Accuracy= %.2f%%" % (epoch + 1, acc_adv))
                        f1.write('\n')
                        f1.flush()

                        # track the best accuracies, write them to best_acc.txt,
                        # and keep only the best model on disk
                        if acc_clean > best_acc_clean and acc_adv > best_acc_adv:
                            if epoch != 49:
                                os.remove(args.model_path + "model_" + str(best_epoch) + ".pth")
                            best_acc_clean = acc_clean
                            best_acc_adv = acc_adv
                            print('Saving model!')
                            torch.save(model.state_dict(), '%s/model_%d.pth' % (args.model_path, epoch + 1))
                            print('Model saved!')
                            f3 = open(args.best_acc_file_path, "w")
                            f3.write("Epoch=%d,Best Accuracy of Clean Set = %.2f%%,Best Accuracy of Adv Set = %.2f%%"
                                     % (epoch + 1, best_acc_clean, best_acc_adv))
                            f3.close()
                            best_epoch = epoch + 1

    time_j = time.time()
    print("Training Finished, Total Epoch = %d, Best Epoch = %d, Best Accuracy of Clean Set = %.2f%%, "
          "Best Accuracy of Adv Set = %.2f%%, Total Time = %.2f"
          % (args.epochs, best_epoch, best_acc_clean, best_acc_adv, (time_j - time_k) / 3600))
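# Rebuilding the SGD optimizer every epoch (as above) also resets its momentum
# buffers; the same 0.1 -> 0.01 -> 0.001 -> 0.0001 schedule can be declared
# once with MultiStepLR instead. A sketch, not the original code:
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 200, 250], gamma=0.1)
for epoch in range(args.epochs):
    train_one_epoch()  # hypothetical stand-in for the batch loop above
    scheduler.step()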
        self.module = nn.Sequential(
            layer1,
            activation1,
            layer2
        )

    def forward(self, x):
        out = self.module(x)
        # return raw logits: nn.CrossEntropyLoss applies log-softmax itself,
        # so adding F.softmax here (as the original did) squashes the gradients
        return out

# setup
criterion = nn.CrossEntropyLoss()
learning_rate = 1e-5
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
num_epochs = 2
num_batches = len(train_loader)

for epoch in range(num_epochs):
    for i, data in enumerate(train_loader):
        x, x_labels = data  # x.size() = [batch, channel, x, y]

        # init grad
        optimizer.zero_grad()  # think of step() and zero_grad() as a pair

        # forward
        pred = model(x)

        # calculate loss
        loss = criterion(pred, x_labels)

        # backpropagation
        loss.backward()

        # weight update
        optimizer.step()
if __name__ == '__main__': #rootdir = '../../../data/office_caltech_10/' torch.manual_seed(1) i = 0 data_src = DataLoader(dataset = MyTrainData_src(i),batch_size=BATCH_SIZE[0],shuffle=True, drop_last= True) data_tar = DataLoader(dataset = MyTrainData_tar(i),batch_size=BATCH_SIZE[1],shuffle=True, drop_last= True) ''' data_src = data_loader.load_data( root_dir=rootdir, domain='amazon', batch_size=BATCH_SIZE[0]) data_tar = data_loader.load_test( root_dir=rootdir, domain='webcam', batch_size=BATCH_SIZE[1]) ''' model = DaNN.DaNN(n_input=2048, n_hidden=256, n_class=65) model = model.to(DEVICE) optimizer = optim.SGD( model.parameters(), lr=LEARNING_RATE, momentum=MOMEMTUN, weight_decay=L2_WEIGHT ) for e in tqdm(range(1, N_EPOCH + 1)): model = train(model=model, optimizer=optimizer, epoch=e, data_src=data_src, data_tar=data_tar) test(model, data_tar, e) torch.save(model, 'model_dann.pkl') log_train.close() log_test.close() res_train = np.asarray(RESULT_TRAIN) res_test = np.asarray(RESULT_TEST) np.savetxt('res_train_a-w.csv', res_train, fmt='%.6f', delimiter=',') np.savetxt('res_test_a-w.csv', res_test, fmt='%.6f', delimiter=',')
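# torch.save(model, ...) above pickles the whole module (class path included),
# which breaks if the source tree moves; saving the state_dict is the more
# portable pattern (a sketch under that assumption):
torch.save(model.state_dict(), 'model_dann_state.pkl')
# reload later with:
#   model = DaNN.DaNN(n_input=2048, n_hidden=256, n_class=65)
#   model.load_state_dict(torch.load('model_dann_state.pkl'))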
if __name__ == '__main__':
    # train_middle_shot(saved=True)
    # model_path = 'G:/model/20210319_model.pt'
    # model_eval(model_path)
    ground_dir = '../'
    video_list = ['01_From_Pole_to_Pole', '02_Mountains', '03_Ice_Worlds', '04_Great_Plains', '05_Jungles',
                  '06_Seasonal_Forests', '07_Fresh_Water', '08_Ocean_Deep', '09_Shallow_Seas', '10_Caves',
                  '11_Deserts']
    transcript_path = os.path.join(ground_dir, 'transcript')
    gt_path = os.path.join(ground_dir, 'annotations/scenes/annotator_1/')
    cuda = False
    check_file(video_list, ground_dir + 'bbc_dataset_video')
    device = torch.device('cuda' if cuda else 'cpu')
    model = MyTransformer(4096, 4, 6).to(device)  # move the model to the selected device
    lossfun = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    scheduler = optim.lr_scheduler.StepLR(optimizer, 5)
    epochs = 30
    eval_rate = 5
    nshots = 540
    f_score = 0
    for epoch in range(epochs):
        loss = 0
        # training, testing = train_test_split(video_list)
        print('Epoch :{}...'.format(epoch))
        for i in range(len(video_list)):
            video_name = video_list[i]
            model.train()
            visual_feature_dir = os.path.join(ground_dir, 'parse_data', video_name)
            print("{} Training Start...".format(video_name))
def main():
    start = time.time()
    parser = args.parse_args()

    # run some checks on arguments
    check_args(parser)

    # format logging
    log_name = os.path.join(
        parser.run_log,
        '{}_run_log_{}.log'.format(parser.experiment, dt.now().strftime("%Y%m%d_%H%M")))
    log.basicConfig(filename=log_name,
                    format='%(asctime)s | %(name)s -- %(message)s',
                    level=log.INFO)
    os.chmod(log_name, parser.access_mode)

    # use the GPU if available, otherwise fall back to CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Starting experiment {} VN -> EN NMT on {}.".format(parser.experiment, device))
    log.info("Starting experiment {} VN -> EN NMT on {}.".format(parser.experiment, device))

    # set seed for replication
    random.seed(parser.seed)
    np.random.seed(parser.seed)
    torch.manual_seed(parser.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(parser.seed)
    log.info("For reproducibility, the seed is set to {}.".format(parser.seed))

    # set file paths
    source_name = parser.source_name
    target_name = parser.target_name

    # get saved models dir
    base_saved_models_dir = parser.save_dir
    saved_models_dir = os.path.join(base_saved_models_dir, source_name + '2' + target_name)
    plots_dir = parser.plots_dir

    log.info("We will save the models in this directory: {}".format(saved_models_dir))
    log.info("We will save the plots in this directory: {}".format(plots_dir))

    # get data dir
    main_data_path = parser.data_dir
    path_to_train_data = {
        'source': main_data_path + 'train.tok.' + source_name,
        'target': main_data_path + 'train.tok.' + target_name
    }
    path_to_dev_data = {
        'source': main_data_path + 'dev.tok.' + source_name,
        'target': main_data_path + 'dev.tok.' + target_name
    }
    path_to_test_data = {
        'source': main_data_path + 'test.tok.' + source_name,
        'target': main_data_path + 'test.tok.' + target_name
    }

    # Configuration
    bs = parser.batch_size
    log.info("Batch size = {}.".format(bs))
    enc_emb = parser.enc_emb
    enc_hidden = parser.enc_hidden
    enc_layers = parser.enc_layers
    rnn_type = parser.rnn_type
    dec_emb = parser.dec_emb
    dec_hidden = parser.dec_hidden
    dec_layers = parser.dec_layers
    learning_rate = parser.learning_rate
    num_epochs = parser.epochs
    attn_flag = parser.attn
    log.info("The attention flag is set to {}.".format(attn_flag))
    beam_size = parser.beam_size
    log.info("We evaluate using beam size of {}.".format(beam_size))

    train, val, test, en_lang, vi_lang = dataset_helper.train_val_load("", main_data_path)

    # get vocab sizes
    log.info('English has vocab size of: {} words.'.format(en_lang.n_words))
    log.info('Vietnamese has vocab size of: {} words.'.format(vi_lang.n_words))

    # get max sentence length by 95% percentile
    MAX_LEN = int(train['en_len'].quantile(0.95))
    log.info('We will have a max sentence length of {} (95 percentile).'.format(MAX_LEN))

    # set data loaders
    bs_dict = {'train': bs, 'validate': 1, 'test': 1}
    shuffle_dict = {'train': True, 'validate': False, 'test': False}
    train_used = train
    val_used = val
    collate_fn_dict = {
        'train': partial(dataset_helper.vocab_collate_func, MAX_LEN=MAX_LEN),
        'validate': dataset_helper.vocab_collate_func_val,
        'test': dataset_helper.vocab_collate_func_val
    }
    transformed_dataset = {
        'train': dataset_helper.Vietnamese(train_used),
        'validate': dataset_helper.Vietnamese(val_used, val=True),
        'test': dataset_helper.Vietnamese(test, val=True)
    }
    dataloader = {
        x: DataLoader(transformed_dataset[x],
                      batch_size=bs_dict[x],
                      collate_fn=collate_fn_dict[x],
                      shuffle=shuffle_dict[x],
                      num_workers=0)
        for x in ['train', 'validate', 'test']
    }

    # instantiate encoder/decoder
    encoder_w_att = 
nnet_models.EncoderRNN(input_size=vi_lang.n_words, embed_dim=enc_emb, hidden_size=enc_hidden, n_layers=enc_layers, rnn_type=rnn_type).to(device) decoder_w_att = nnet_models.AttentionDecoderRNN( output_size=en_lang.n_words, embed_dim=dec_emb, hidden_size=dec_hidden, n_layers=dec_layers, attention=attn_flag).to(device) # instantiate optimizer if parser.optimizer == 'sgd': encoder_optimizer = optim.SGD(encoder_w_att.parameters(), lr=learning_rate, nesterov=True, momentum=0.99) decoder_optimizer = optim.SGD(decoder_w_att.parameters(), lr=learning_rate, nesterov=True, momentum=0.99) elif parser.optimizer == 'adam': encoder_optimizer = optim.Adam(encoder_w_att.parameters(), lr=5e-3) decoder_optimizer = optim.Adam(decoder_w_att.parameters(), lr=5e-3) else: raise ValueError('Invalid optimizer!') # instantiate scheduler enc_scheduler = ReduceLROnPlateau(encoder_optimizer, min_lr=1e-4, factor=0.5, patience=0) dec_scheduler = ReduceLROnPlateau(decoder_optimizer, min_lr=1e-4, factor=0.5, patience=0) criterion = nn.NLLLoss(ignore_index=global_variables.PAD_IDX) log.info( "Seq2Seq Model with the following parameters: batch_size = {}, learning_rate = {}, rnn_type = {}, enc_emb = {}, enc_hidden = {}, enc_layers = {}, dec_emb = {}, dec_hidden = {}, dec_layers = {}, num_epochs = {}, source_name = {}, target_name = {}" .format(bs, learning_rate, rnn_type, enc_emb, enc_hidden, enc_layers, dec_emb, dec_hidden, dec_layers, num_epochs, source_name, target_name)) # do we want to train again? train_again = False encoder_save = '{}_att_{}bs_{}hs_{}_{}beam_enc_{}_layer'.format( rnn_type, bs, enc_hidden, parser.optimizer, beam_size, enc_layers) decoder_save = '{}_att_{}bs_{}hs_{}_{}beam_dec_{}_layer'.format( rnn_type, bs, enc_hidden, parser.optimizer, beam_size, dec_layers) if os.path.exists(utils.get_full_filepath( saved_models_dir, encoder_save)) and os.path.exists( utils.get_full_filepath(saved_models_dir, decoder_save)) and (not train_again): log.info("Retrieving saved encoder from {}".format( utils.get_full_filepath(saved_models_dir, encoder_save))) log.info("Retrieving saved decoder from {}".format( utils.get_full_filepath(saved_models_dir, decoder_save))) encoder_w_att.load_state_dict( torch.load(utils.get_full_filepath(saved_models_dir, encoder_save))) decoder_w_att.load_state_dict( torch.load(utils.get_full_filepath(saved_models_dir, decoder_save))) else: log.info("Check if encoder path exists: {}".format( utils.get_full_filepath(saved_models_dir, encoder_save))) log.info("Check if decoder path exists: {}".format( utils.get_full_filepath(saved_models_dir, decoder_save))) log.info("Encoder and Decoder do not exist! 
Starting to train...") encoder_w_att, decoder_w_att, loss_hist, acc_hist = train_utilities.train_model( encoder_optimizer, decoder_optimizer, encoder_w_att, decoder_w_att, criterion, "attention", dataloader, en_lang, vi_lang, saved_models_dir, encoder_save, decoder_save, num_epochs=num_epochs, rm=0.95, enc_scheduler=enc_scheduler, dec_scheduler=dec_scheduler) log.info("Total time is: {} min : {} s".format( (time.time() - start) // 60, (time.time() - start) % 60)) log.info( "We will save the encoder/decoder in this directory: {}".format( saved_models_dir)) # BLEU with beam size bleu_no_unk, att_score_wo, pred_wo, src_wo = train_utilities.validation_beam_search( encoder_w_att, decoder_w_att, dataloader['validate'], en_lang, vi_lang, 'attention', beam_size, verbose=False) log.info("Bleu-{} Score (No UNK): {}".format(beam_size, bleu_no_unk)) print("Bleu-{} Score (No UNK): {}".format(beam_size, bleu_no_unk)) bleu_unk, att_score_wo, pred_wo, src_wo = train_utilities.validation_beam_search( encoder_w_att, decoder_w_att, dataloader['validate'], en_lang, vi_lang, 'attention', beam_size, verbose=False, replace_unk=True) log.info("Bleu-{} Score (UNK): {}".format(beam_size, bleu_unk)) print("Bleu-{} Score (UNK): {}".format(beam_size, bleu_unk)) # generate 5 random predictions indexes = range(len(pred_wo)) for i in np.random.choice(indexes, 5): print('Source: {} \nPrediction: {}\n---'.format(src_wo[i], pred_wo[i])) log.info('Source: {} \nPrediction: {}\n---'.format( src_wo[i], pred_wo[i])) log.info("Exported Binned Bleu Score Plot to {}!".format(plots_dir)) _, _, fig = utils.get_binned_bl_score( encoder=encoder_w_att, decoder=decoder_w_att, val_dataset=transformed_dataset['validate'], attn_flag=attn_flag, beam_size=beam_size, location=plots_dir, collate=collate_fn_dict['validate'], lang_en=en_lang, lang_vi=vi_lang)
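# enc_scheduler / dec_scheduler above are ReduceLROnPlateau instances, which
# must be stepped with a monitored metric once per epoch; inside train_model
# one would expect something along these lines (a sketch; evaluate() is a
# hypothetical helper, not a function from this codebase):
val_loss = evaluate(encoder_w_att, decoder_w_att, dataloader['validate'], criterion)
enc_scheduler.step(val_loss)
dec_scheduler.step(val_loss)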