def save_model(net, dev_loss, em, f1, global_steps,
               model_dir='/data/xuwenshen/workspace/squad/code/multi_task/models/'):
    """Persist a model's ``state_dict``, encoding its metrics in the filename.

    Args:
        net: module whose ``state_dict()`` is saved.
        dev_loss (float): dev-set loss at save time.
        em (float): exact-match score.
        f1 (float): F1 score.
        global_steps (int): training step counter.
        model_dir (str): directory to write into; defaults to the original
            hard-coded path for backward compatibility (must end with '/').
    """
    # BUG FIX: the original spec "{:3f}" means width-3 / default precision;
    # the intended spec is "{:.3f}" (three decimal places).
    filename = "loss-{:.3f}-em-{:.3f}-f1-{:.3f}-steps-{:d}-model.pkl".format(
        dev_loss, em, f1, global_steps)
    torch.save(net.state_dict(), model_dir + filename)
def save_models(netG, netD, outputDir, epoch):
    '''Saves model state dictionary for generator and discriminator networks.

    Inputs are the networks (netG, netD), the system path in which to
    save(outputDir) and the current 'epoch'.
    '''
    # Both networks follow the same naming scheme, so save them in one pass.
    for network, pattern in ((netG, '%s/netG_epoch_%d.pth'),
                             (netD, '%s/netD_epoch_%d.pth')):
        torch.save(network.state_dict(), pattern % (outputDir, epoch))
def save_checkpoint(self, model, optimizer, epoch, index=0):
    """Serialize model/optimizer state dicts plus the epoch to a numbered file.

    :params model: model (a list of modules is folded via list2sequential)
    :params optimizer: optimizer
    :params epoch: training epoch
    :params index: index of saved file, default: 0

    Only ``state_dict()`` values are stored: hooks attached with
    ``register_hook`` cannot be pickled, and saving the whole model also
    embeds GPU placement, which causes issues when the checkpoint is
    loaded on a different machine.
    """
    # Collapse a module list and unwrap DataParallel before extracting state.
    folded = self.list2sequential(model)
    if isinstance(folded, nn.DataParallel):
        folded = folded.module

    self.check_point_params['model'] = folded.state_dict()
    self.check_point_params['optimizer'] = optimizer.state_dict()
    self.check_point_params['epoch'] = epoch

    target = os.path.join(self.save_path, "checkpoint_%03d.pth" % index)
    torch.save(self.check_point_params, target)
def load_word_vectors(path):
    """Load word vectors, preferring the cached ``.pth``/``.vocab`` pair.

    On a cache miss, parses ``path + '.txt'`` (one word + vector per line,
    latin-1 encoded), writes the vocab file, caches the tensor, and returns
    ``(vocab, vectors)``.
    """
    if os.path.isfile(path + '.pth') and os.path.isfile(path + '.vocab'):
        print('==> File found, loading to memory')
        vectors = torch.load(path + '.pth')
        vocab = Vocab(filename=path + '.vocab')
        return vocab, vectors

    # saved file not found, read from txt file and create tensors for word vectors
    print('==> File not found, preparing, be patient')
    # FIX: the original counted lines via a bare open() that leaked the handle.
    with open(path + '.txt', 'r', encoding='latin-1') as f:
        count = sum(1 for _ in f)
    # FIX: this open previously omitted the latin-1 encoding used elsewhere.
    with open(path + '.txt', 'r', encoding='latin-1') as f:
        # Vector dimensionality is inferred from the first line.
        contents = f.readline().rstrip('\n').split(' ')
        dim = len(contents[1:])
    words = [None] * count
    vectors = torch.zeros(count, dim)
    with open(path + '.txt', 'r', encoding='latin-1') as f:
        for idx, line in enumerate(f):
            contents = line.rstrip('\n').split(' ')
            words[idx] = contents[0]
            vectors[idx] = torch.Tensor(list(map(float, contents[1:])))
    with open(path + '.vocab', 'w', encoding='latin-1') as f:
        for word in words:
            f.write(word + '\n')
    vocab = Vocab(filename=path + '.vocab')
    torch.save(vectors, path + '.pth')
    return vocab, vectors
def fit(self, train_loader, dev_loader, test_loader, epochs, interval, eta, file):
    """Train with Adam, evaluating all splits each epoch; keep the best-dev model.

    Args:
        train_loader/dev_loader/test_loader: data loaders for each split.
        epochs (int): maximum number of epochs.
        interval (int): early-stopping patience (epochs without dev improvement).
        eta (float): Adam learning rate.
        file: path the best model object is serialized to via ``torch.save``.
    """
    # Accumulated wall-clock time across epochs.
    total_time = timedelta()
    # Best dev accuracy seen so far and the epoch at which it occurred.
    max_e, max_acc = 0, 0.0
    # Use Adam as the optimizer.
    self.optimizer = optim.Adam(params=self.parameters(), lr=eta)
    for epoch in range(1, epochs + 1):
        start = datetime.now()
        # One pass of parameter updates over the training data.
        self.update(train_loader)
        print(f"Epoch: {epoch} / {epochs}:")
        loss, train_acc = self.evaluate(train_loader)
        print(f"{'train:':<6} Loss: {loss:.4f} Accuracy: {train_acc:.2%}")
        loss, dev_acc = self.evaluate(dev_loader)
        print(f"{'dev:':<6} Loss: {loss:.4f} Accuracy: {dev_acc:.2%}")
        loss, test_acc = self.evaluate(test_loader)
        print(f"{'test:':<6} Loss: {loss:.4f} Accuracy: {test_acc:.2%}")
        t = datetime.now() - start
        print(f"{t}s elapsed\n")
        total_time += t
        # Snapshot the whole model whenever dev accuracy improves; stop
        # early once `interval` epochs pass without improvement.
        if dev_acc > max_acc:
            torch.save(self, file)
            max_e, max_acc = epoch, dev_acc
        elif epoch - max_e >= interval:
            break
    print(f"max accuracy of dev is {max_acc:.2%} at epoch {max_e}")
    print(f"mean time of each epoch is {total_time / epoch}s\n")
def get_vanilla_vgg_features(cut_idx=-1):
    """Return pretrained VGG-19 feature layers (plus classifier layers when
    ``cut_idx > 36``), downloading and caching the weights on first use.

    Side effects: downloads the checkpoint via wget and writes
    'vgg_features.pth' / 'vgg_classifier.pth' into the working directory.
    """
    if not os.path.exists('vgg_features.pth'):
        os.system(
            'wget --no-check-certificate -N https://s3-us-west-2.amazonaws.com/jcjohns-models/vgg19-d01eb7cb.pth')
        vgg_weights = torch.load('vgg19-d01eb7cb.pth')
        # fix compatibility issues: the extra View() layer shifts the last
        # classifier indices by one relative to the checkpoint's keys.
        rename = {'classifier.6.weight': u'classifier.7.weight',
                  'classifier.6.bias': u'classifier.7.bias'}
        # FIX: dict.iteritems() is Python 2 only — use items(); also avoid
        # shadowing the builtin `map`.
        vgg_weights = OrderedDict(
            (rename.get(k, k), v) for k, v in vgg_weights.items())

        model = models.vgg19()
        model.classifier = nn.Sequential(View(), *model.classifier._modules.values())

        model.load_state_dict(vgg_weights)

        torch.save(model.features, 'vgg_features.pth')
        torch.save(model.classifier, 'vgg_classifier.pth')

    vgg = torch.load('vgg_features.pth')
    if cut_idx > 36:
        vgg_classifier = torch.load('vgg_classifier.pth')
        # FIX: odict_values objects cannot be concatenated with '+' on
        # Python 3 — materialize both as lists first.
        vgg = nn.Sequential(*(list(vgg._modules.values()) +
                              list(vgg_classifier._modules.values())))

    vgg.eval()

    return vgg
def download(self):
    """Fetch and extract the raw dataset if needed, then cache it as a
    single torch file at ``self.data_file``."""
    if self._check_datafile_exists():
        print('# Found cached data {}'.format(self.data_file))
        return

    if not self._check_downloaded():
        # download files
        url, filename, md5 = self.urls[self.name][:3]
        fpath = os.path.join(self.root, filename)

        download_url(url, self.root, filename, md5)

        print('# Extracting data {}\n'.format(self.data_down))

        import zipfile
        with zipfile.ZipFile(fpath, 'r') as z:
            z.extractall(self.data_dir)

        os.unlink(fpath)

    # process and save as torch files
    print('# Caching data {}'.format(self.data_file))

    dataset = (
        read_image_file(self.data_dir, self.image_ext, self.lens[self.name]),
        read_info_file(self.data_dir, self.info_file),
        read_matches_files(self.data_dir, self.matches_files),
    )

    with open(self.data_file, 'wb') as f:
        torch.save(dataset, f)
def save(self, path=None):
    """Save model parameters if model_file is set."""
    path = self.opt.get('model_file', None) if path is None else path
    # Nothing to do without a destination path or trained optimizers.
    if not (path and hasattr(self, 'optims')):
        return

    model = {'enc_lt': self.enc_lt.state_dict()}
    if self.opt['lookuptable'] not in ['enc_dec', 'all']:
        # dec_lt is not shared with enc_lt, so save it
        model['dec_lt'] = self.dec_lt.state_dict()
    if self.opt['decoder'] != 'shared':
        model['encoder'] = self.encoder.state_dict()
    model['decoder'] = self.decoder.state_dict()
    model['h2e'] = self.h2e.state_dict()
    model['e2o'] = self.e2o.state_dict()
    model['optims'] = {k: v.state_dict() for k, v in self.optims.items()}
    model['longest_label'] = self.longest_label
    model['opt'] = self.opt

    # Optional attention modules are saved only when present.
    for attn_name in ('attn', 'attn_v', 'attn_combine'):
        if hasattr(self, attn_name):
            model[attn_name] = getattr(self, attn_name).state_dict()

    with open(path, 'wb') as write:
        torch.save(model, write)
def save(self, save_optimizer=False, save_path=None, **kwargs):
    """Serialize the Faster R-CNN model plus config/visualization info.

    Args:
        save_optimizer (bool): whether save optimizer.state_dict().
        save_path (string): where to save model; if None, a path is built
            from a time string plus the kwargs values.

    Returns:
        save_path (str): the path the checkpoint was written to.
    """
    save_dict = {
        'model': self.faster_rcnn.state_dict(),
        'config': opt._state_dict(),
        'other_info': kwargs,
        'vis_info': self.vis.state_dict(),
    }
    if save_optimizer:
        save_dict['optimizer'] = self.optimizer.state_dict()

    if save_path is None:
        pieces = ['checkpoints/fasterrcnn_' + time.strftime('%m%d%H%M')]
        pieces.extend('_%s' % v_ for v_ in kwargs.values())
        save_path = ''.join(pieces)

    t.save(save_dict, save_path)
    self.vis.save([self.vis.env])
    return save_path
def test(epoch, best_acc):
    """Evaluate the global `model` on `test_loader`; checkpoint on improvement.

    Returns (test_loss, test_acc).

    NOTE(review): legacy Python 2 / PyTorch <= 0.3 idioms throughout — the
    print statement, `Variable(volatile=True)`, `.data[0]`, `size_average=`.
    Runs only on that stack; port before reuse. Relies on module globals:
    model, test_loader, args, model_name, get_slope.
    """
    # slope is an epoch-dependent value fed to the model alongside the data.
    slope = get_slope(epoch)
    model.eval()
    test_loss = 0.0
    correct = 0.0
    for data, target in test_loader:
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        # volatile=True disabled autograd history (pre-0.4 inference mode).
        data, target = Variable(data, volatile=True), Variable(target)
        output = model((data, slope))
        test_loss += F.nll_loss(output, target, size_average=False).data[0] # sum up batch loss
        pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability
        correct += pred.eq(target.data.view_as(pred)).cpu().sum()
    test_loss /= len(test_loader.dataset)
    test_acc = correct / len(test_loader.dataset)
    print 'Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, int(correct), len(test_loader.dataset), 100. * test_acc)
    # Persist weights whenever accuracy ties or beats the best seen so far.
    if test_acc >= best_acc:
        torch.save(model.state_dict(), os.path.join('models','{}.pth'.format(model_name)))
    return test_loss, test_acc
def train(train_iter, dev_iter, test_iter, model_lstm, text_field, label_field, args):
    """Train `model_lstm` with Adam/NLL loss, snapshotting the whole model
    whenever test accuracy improves; early-stops after 10 stale epochs.

    NOTE(review): model selection is done on *test* accuracy, mirroring the
    original code; selecting on dev accuracy would be methodologically cleaner.
    """
    loss_function = nn.NLLLoss()
    optimizer = optim.Adam(model_lstm.parameters(), lr=1e-3)
    best_test_acc = 0.0
    no_up = 0  # epochs since the last improvement
    for i in range(1, args.epochs + 1):
        print('epoch: %d start!' % i)
        train_epoch(model_lstm, train_iter, dev_iter, test_iter, loss_function, optimizer, i, args)
        dev_acc = evaluate(model_lstm, dev_iter, loss_function, 'dev')
        test_acc = evaluate(model_lstm, test_iter, loss_function, 'test')
        if test_acc > best_test_acc:
            print('New Best Test!!!')
            best_test_acc = test_acc
            if not os.path.isdir(args.save_dir):
                os.makedirs(args.save_dir)
            save_prefix = os.path.join(args.save_dir, 'snapshot')
            save_path = '{}epoch{}.pt'.format(save_prefix, i)
            torch.save(model_lstm, save_path)
            no_up = 0
        else:
            no_up += 1
            # BUG FIX: the original called exit() here, killing the whole
            # interpreter from inside a library function; break instead so
            # the caller regains control.
            if no_up >= 10:
                break
        print('now best test acc:', best_test_acc)
def save_checkpoint(state, track_list, filename):
    """Persist a training checkpoint as two files.

    Writes ``<filename>.json`` (the tracking list) and ``<filename>.model``
    (the torch-serialized state).
    """
    with open(filename + '.json', 'w') as fout:
        json.dump(track_list, fout)
    torch.save(state, filename + '.model')
def extract_features_targets(model, features_size, loader, path_data, cuda=False):
    """Run `model` over `loader` and cache (features, targets) at `path_data`.

    Returns the cached tensors immediately when the file already exists.

    Args:
        model: feature extractor applied to each image batch.
        features_size (int): per-sample feature dimensionality.
        loader: DataLoader whose batches are (img, _, target).
        path_data (str): cache file path.
        cuda (bool): move images to GPU before the forward pass.
    """
    if os.path.isfile(path_data):
        print('Load features from {}'.format(path_data))
        return torch.load(path_data)

    print('\nExtract features on {}set'.format(loader.dataset.set))

    features = torch.Tensor(len(loader.dataset), features_size)
    targets = torch.Tensor(len(loader.dataset), len(loader.dataset.classes))

    for batch_id, batch in enumerate(tqdm(loader)):
        img = batch[0]
        target = batch[2]
        current_bsize = img.size(0)
        from_ = int(batch_id * loader.batch_size)
        to_ = int(from_ + current_bsize)

        if cuda:
            # BUG FIX: `img.cuda(async=True)` is a SyntaxError on Python 3.7+
            # (`async` became a keyword); `non_blocking` is the replacement.
            img = img.cuda(non_blocking=True)

        inp = Variable(img, requires_grad=False)  # renamed: `input` shadowed the builtin
        output = model(inp)

        features[from_:to_] = output.data.cpu()
        targets[from_:to_] = target

    # FIX: use os.makedirs instead of shelling out to `mkdir -p`.
    os.makedirs(os.path.dirname(path_data), exist_ok=True)
    print('save ' + path_data)
    torch.save((features, targets), path_data)
    print('')
    return features, targets
def train(args):
    """Imitation-learning loop over an HDF5 dataset of recorded gameplay.

    Streams (screens, variables, action labels, terminals) minibatches from
    `data_generator`, trains a BaseModelLSTM with cross-entropy, and
    periodically serializes the whole model object to args.checkpoint_file.

    NOTE(review): uses pre-0.4 PyTorch idioms (`Variable`, `loss.data[0]`);
    only the loss of the last batch in each `episode_size` window is
    backpropagated — confirm this is intentional.
    """
    data_file = h5py.File(args.h5_path, 'r')
    screens = data_file['screens']
    variables = data_file['variables']
    labels = data_file['action_labels']
    print('Dataset size =', len(screens))
    action_sets = data_file['action_sets'][:]
    episodes = data_file['episodes'][:]
    input_shape = screens[0].shape
    train_generator = data_generator(args, screens, variables, labels, episodes)

    model = BaseModelLSTM(input_shape[0], len(action_sets), variables.shape[1])
    #source_model = torch.load('imitation_model_lstm_bn0.pth')
    #model.load_state_dict(source_model.state_dict())
    #del source_model
    if USE_CUDA:
        model.cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=5e-4)
    optimizer.zero_grad()

    running_loss = 0
    running_accuracy = 0
    batch_time = time.time()
    cp = 0  # checkpoint counter
    for batch, (screens, variables, labels, terminals) in enumerate(train_generator):
        screens, variables, labels = Variable(screens), Variable(variables), Variable(labels)
        outputs = model(screens, variables)
        loss = criterion(outputs, labels)
        # presumably flags episode ends so the recurrent state can be
        # handled — confirm against BaseModelLSTM.set_terminal.
        model.set_terminal(terminals)

        running_loss += loss.data[0]
        _, pred = outputs.data.max(1)
        accuracy = (pred == labels.data).float().mean()
        running_accuracy += accuracy

        # Optimizer step once per `episode_size` batches; stats are averaged
        # over the window and then reset.
        if batch % args.episode_size == args.episode_size - 1:
            loss.backward()
            optimizer.step()
            model.reset()
            optimizer.zero_grad()
            running_loss /= args.episode_size
            running_accuracy /= args.episode_size
            print(
                '[{:d}] loss: {:.3f}, accuracy: {:.3f}, time: {:.6f}'.format(
                    batch + 1, running_loss, running_accuracy, time.time()-batch_time
                )
            )
            running_loss = 0
            running_accuracy = 0
            batch_time = time.time()

        if batch % args.checkpoint_rate == args.checkpoint_rate - 1:
            cp += 1
            torch.save(model, args.checkpoint_file)
def torch_to_pytorch(model, t7_file, output):
    """Copy weights from a Torch7 checkpoint (`t7_file`) into the PyTorch
    `model`, then save its state_dict to `output`.

    Both networks are flattened into ordered layer lists that are walked in
    lockstep; a type mismatch between the Torch7 layer and the expected
    PyTorch class (per the module-level `layer_map`) raises RuntimeError.
    """
    py_layers = []
    for layer in list(model.children()):
        py_layer_serial(layer, py_layers)

    t7_data = torchfile.load(t7_file)
    t7_layers = []
    for layer in t7_data:
        torch_layer_serial(layer, t7_layers)

    j = 0
    for i, py_layer in enumerate(py_layers):
        py_name = type(py_layer).__name__
        t7_layer = t7_layers[j]
        t7_name = t7_layer[0].split('.')[-1]
        if layer_map[t7_name] != py_name:
            raise RuntimeError('%s does not match %s' % (py_name, t7_name))

        if py_name == 'LSTM':
            # One PyTorch LSTM consumes num_layers (x2 when bidirectional)
            # consecutive Torch7 entries.
            n_layer = 2 if py_layer.bidirectional else 1
            n_layer *= py_layer.num_layers
            t7_layer = t7_layers[j:j + n_layer]
            j += n_layer
        else:
            j += 1

        load_params(py_layer, t7_layer)

    torch.save(model.state_dict(), output)
def train(net):
    """Fixed-schedule training loop for a FaceBoxes-style face detector.

    Trains for 1000 epochs over VOCDetection data (LR dropped to 1e-4 after
    epoch 500) and writes the final weights to 'Final_FaceBoxes.pth'.
    Relies on module-level globals: `device`, `optimizer`, `criterion`,
    `adjust_learning_rate`, `detection_collate`.
    """
    net.train()
    # Anchor boxes are input-independent, so generate them once up front.
    priorbox = PriorBox()
    with torch.no_grad():
        priors = priorbox.forward()
        priors = priors.to(device)
    dataloader = DataLoader(VOCDetection(), batch_size=2,
                            collate_fn=detection_collate, num_workers=12)
    for epoch in range(1000):
        loss_ls, loss_cs = [], []
        load_t0 = time.time()
        if epoch > 500:
            adjust_learning_rate(optimizer, 1e-4)
        for images, targets in dataloader:
            images = images.to(device)
            targets = [anno.to(device) for anno in targets]
            out = net(images)
            optimizer.zero_grad()
            # Localization + classification losses; localization weighted 2x.
            loss_l, loss_c = criterion(out, priors, targets)
            loss = 2 * loss_l + loss_c
            loss.backward()
            optimizer.step()
            loss_cs.append(loss_c.item())
            loss_ls.append(loss_l.item())
        load_t1 = time.time()
        print(f'{np.mean(loss_cs)}, {np.mean(loss_ls)} time:{load_t1-load_t0}')
    torch.save(net.state_dict(), 'Final_FaceBoxes.pth')
def decompose_model_seq(model, layer_name, model_file):
    """Replace each conv layer whose name contains `layer_name` with a
    low-rank (CP or Tucker) decomposition, then pickle the whole model.

    Relies on the module-level `args` (args.cp selects CP decomposition)
    and on the model exposing a top-level 'sequential' submodule.
    Returns the modified model.
    """
    print(model)
    model.cpu()
    for i, (name, conv_layer) in enumerate(model.named_modules()):
        ## for sequential nets, 'in' is sufficient
        ## as long as there are not 2 homonimous layers
        if layer_name in name:
            print(name)
            if args.cp:
                rank = max(conv_layer.weight.data.shape) // 3
                rank, _ = choose_compression(
                    conv_layer, ranks=[rank, rank], compression_factor=5, flag='cpd')
                print('rank: ', rank)
                # NOTE(review): the rank chosen by choose_compression above
                # is immediately overwritten here, so that call's result is
                # effectively unused — confirm which rank source is intended.
                rank = cp_ranks(conv_layer)
                print('rank: ', rank)
                decomposed = cp_decomposition_conv_layer_BN(conv_layer, rank, matlab=False)
                # decomposed = cp_xavier_conv_layer(conv_layer, rank)
            else:
                decomposed = tucker_decomposition_conv_layer(conv_layer)
            # first modules return a sequential, then we need to call the proper layer
            model._modules['sequential']._modules[layer_name] = decomposed
    torch.save(model, model_file)
    return model
def train(train_loader, num_epochs=20):
    """Train a convolutional denoising autoencoder with BCE reconstruction
    loss and Adadelta, then save its weights to MODELS_DIR/autoencoder.pth.

    Args:
        train_loader: iterable of input batches (targets are the inputs).
        num_epochs (int): number of passes over the data.
    """
    autoencoder = deep_models.ConvolutionalDenoisingAutoencoder()
    criterion = nn.BCELoss()  # binary cross entropy loss
    optimizer = optim.Adadelta(autoencoder.parameters())
    for epoch in range(num_epochs):
        for i, data in enumerate(train_loader, 0):
            # zero the gradient params
            optimizer.zero_grad()
            # forward + backprop + optimize; the autoencoder reconstructs
            # its own input, so `data` is both input and target.
            output = autoencoder(data)
            loss = criterion(output, data)
            loss.backward()
            optimizer.step()
        # FIX: `loss.data[0]` was removed in modern PyTorch; use .item()
        # (consistent with the other training loops in this file).
        print(
            'epoch [{}/{}], loss:{:.4f}'
            .format(epoch + 1, num_epochs, loss.item())
        )
    torch.save(autoencoder.state_dict(), join(MODELS_DIR, 'autoencoder.pth'))
    return
def train_model(args):
    """Load the data, train the model, test the model, export / save the model.

    Uses plain SGD with momentum; after training, writes the state_dict to
    args.model_name and, when args.model_dir is set, uploads it to GCS.
    """
    torch.manual_seed(args.seed)

    # Open our dataset.
    train_loader, test_loader = data_utils.load_data(args.test_split,
                                                     args.batch_size)

    # Create the model and its optimizer (plain momentum, no Nesterov).
    net = model.SonarDNN().double()
    sgd = optim.SGD(net.parameters(), lr=args.lr,
                    momentum=args.momentum, nesterov=False)

    # Train / Test the model, one test pass per training epoch.
    for epoch in range(args.epochs):
        train(net, train_loader, sgd, epoch + 1)
        test(net, test_loader)

    # Export the trained model.
    torch.save(net.state_dict(), args.model_name)

    if args.model_dir:
        # Save the model to GCS.
        data_utils.save_model(args.model_dir, args.model_name)
def on_end_epoch(state):
    """Engine end-of-epoch hook: log train metrics, run the test pass,
    checkpoint the model, and log reconstruction visualizations.

    Uses closure/module globals: the meter and logger objects, `engine`,
    `model`, `processor`, `get_iterator`, `reset_meters`, and BATCH_SIZE.
    Meters are reset between the train and test passes, so the second set
    of reads below reports test-set values.
    """
    print('[Epoch %d] Training Loss: %.4f (Accuracy: %.2f%%)' % (
        state['epoch'], meter_loss.value()[0], meter_accuracy.value()[0]))

    train_loss_logger.log(state['epoch'], meter_loss.value()[0])
    train_error_logger.log(state['epoch'], meter_accuracy.value()[0])

    reset_meters()

    engine.test(processor, get_iterator(False))
    test_loss_logger.log(state['epoch'], meter_loss.value()[0])
    test_accuracy_logger.log(state['epoch'], meter_accuracy.value()[0])
    confusion_logger.log(confusion_meter.value())

    print('[Epoch %d] Testing Loss: %.4f (Accuracy: %.2f%%)' % (
        state['epoch'], meter_loss.value()[0], meter_accuracy.value()[0]))

    torch.save(model.state_dict(), 'epochs/epoch_%d.pt' % state['epoch'])

    # Reconstruction visualization.
    test_sample = next(iter(get_iterator(False)))

    # Scale raw byte images into [0, 1] before feeding the model.
    ground_truth = (test_sample[0].unsqueeze(1).float() / 255.0)
    _, reconstructions = model(Variable(ground_truth).cuda())
    reconstruction = reconstructions.cpu().view_as(ground_truth).data

    ground_truth_logger.log(
        make_grid(ground_truth, nrow=int(BATCH_SIZE ** 0.5), normalize=True,
                  range=(0, 1)).numpy())
    reconstruction_logger.log(
        make_grid(reconstruction, nrow=int(BATCH_SIZE ** 0.5), normalize=True,
                  range=(0, 1)).numpy())
def save_network(self, network, network_name, epoch_count, gpu_ids):
    """Write `network`'s CPU state_dict into save_dir as '<name>-<epoch>'.

    Afterwards the network is moved back onto the first listed GPU when
    CUDA is available and `gpu_ids` is non-empty.
    """
    target = os.path.join(self.save_dir,
                          '{}-{}'.format(network_name, epoch_count))
    torch.save(network.cpu().state_dict(), target)
    if len(gpu_ids) and torch.cuda.is_available():
        network.cuda(device=gpu_ids[0])
def test_serialization_built_vocab(self):
    """A Field with a built vocab must round-trip through torch.save/load:
    the loaded field compares equal and numericalizes identically."""
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True)
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)

    question_field.build_vocab(tsv_dataset)

    question_pickle_filename = "question.pl"
    question_pickle_path = os.path.join(self.test_dir, question_pickle_filename)
    torch.save(question_field, question_pickle_path)

    loaded_question_field = torch.load(question_pickle_path)

    assert loaded_question_field == question_field

    # Includes non-ASCII tokens and OOVs to exercise vocab lookups.
    test_example_data = [["When", "do", "you", "use", "シ",
                          "instead", "of", "し?"],
                         ["What", "is", "2+2", "<pad>", "<pad>",
                          "<pad>", "<pad>", "<pad>"],
                         ["Here", "is", "a", "sentence", "with",
                          "some", "oovs", "<pad>"]]

    # Test results of numericalization
    original_numericalization = question_field.numericalize(test_example_data)
    pickled_numericalization = loaded_question_field.numericalize(test_example_data)

    assert torch.all(torch.eq(original_numericalization, pickled_numericalization))
def save(net, filename):
    """Serialize a network's constructor args plus weights to `filename`.

    DataParallel wrappers are unwrapped first so state_dict keys carry no
    'module.' prefix.
    """
    target = net.module if isinstance(net, nn.DataParallel) else net
    payload = {'args': target.args, 'state_dict': target.state_dict()}
    torch.save(payload, filename)
def test_load_to_gpu_from_gpu(self):
    """GPU round-trip: save net2's CUDA weights, apply them to net1, and
    verify both nets remain on the GPU with identical linear_1 parameters."""
    # This test will make sure that the initializer works on the GPU
    self.net1.cuda(device=0)
    self.net2.cuda(device=0)

    # Verify the parameters are on the GPU
    assert self.net1.linear_1.weight.is_cuda is True
    assert self.net1.linear_1.bias.is_cuda is True
    assert self.net2.linear_1.weight.is_cuda is True
    assert self.net2.linear_1.bias.is_cuda is True

    # We need to manually save the parameters to a file because setUp()
    # only does it for the CPU
    temp_file = self.TEST_DIR / "gpu_weights.th"
    torch.save(self.net2.state_dict(), temp_file)

    applicator = self._get_applicator("linear_1.*", temp_file)
    applicator(self.net1)

    # Verify the parameters are still on the GPU
    assert self.net1.linear_1.weight.is_cuda is True
    assert self.net1.linear_1.bias.is_cuda is True
    assert self.net2.linear_1.weight.is_cuda is True
    assert self.net2.linear_1.bias.is_cuda is True

    # Make sure the weights are identical
    assert self._are_equal(self.net1.linear_1, self.net2.linear_1)
def save():
    """Build and fit a tiny regression net on the module-level (x, y) data,
    plot the fit, and demonstrate both save styles: whole net vs. params."""
    # save net1
    net1 = torch.nn.Sequential(
        torch.nn.Linear(1, 10),
        torch.nn.ReLU(),
        torch.nn.Linear(10, 1),
    )
    optimizer = torch.optim.SGD(net1.parameters(), lr=0.5)
    loss_func = torch.nn.MSELoss()

    for _ in range(100):
        prediction = net1(x)
        loss = loss_func(prediction, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # plot result
    plt.figure(1, figsize=(10, 3))
    plt.subplot(131)
    plt.title('Net1')
    plt.scatter(x.data.numpy(), y.data.numpy())
    plt.plot(x.data.numpy(), prediction.data.numpy(), 'r-', lw=5)

    # 2 ways to save the net
    torch.save(net1, 'net.pkl')                      # save entire net
    torch.save(net1.state_dict(), 'net_params.pkl')  # save only the parameters
def save_checkpoint(model, output_path):
    """Serialize the whole `model` object to `output_path` and log where it went."""
    torch.save(model, output_path)
    print("Checkpoint saved to {}".format(output_path))
def test(epoch):
    """Evaluate the global `net` on `testloader`; checkpoint on a new best
    accuracy (whole-module save to ./checkpoint/ckpt.t7).

    Relies on module globals: net, testloader, criterion, use_cuda,
    progress_bar, best_acc (updated in place).
    """
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(testloader):
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        #test_loss += loss.data[0]
        test_loss+=loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
        progress_bar(batch_idx, len(testloader),
                     'Loss: %.3f | Acc: %.3f%% (%d/%d)' %
                     (test_loss / (batch_idx + 1), 100. * correct / total, correct, total))

    # Save checkpoint.
    acc = 100. * correct / total
    if acc > best_acc:
        print('Saving..')
        # NOTE: the full module object is saved (unwrapped from DataParallel
        # when CUDA is in use), not just a state_dict.
        state = {
            'net': net.module if use_cuda else net,
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        torch.save(state, './checkpoint/ckpt.t7')
        best_acc = acc
def _comput_mean(self):
    """Compute (or load cached) per-channel mean/std over the training images.

    Statistics are cached at ./data/300W_LP/mean.pth.tar; on a cache miss
    they are accumulated image-by-image over the whole training set.
    Returns a (mean, std) pair of shape-(3,) tensors.
    """
    meanstd_file = './data/300W_LP/mean.pth.tar'
    if os.path.isfile(meanstd_file):
        ms = torch.load(meanstd_file)
    else:
        print("\tcomputing mean and std for the first time, it may takes a while, drink a cup of coffe...")
        mean = torch.zeros(3)
        std = torch.zeros(3)
        if self.is_train:
            for i in range(self.total):
                a = self.anno[i]
                # Image path is derived from the annotation filename
                # (subject dir prefix + basename minus its 8-char suffix).
                img_path = os.path.join(self.img_folder,
                                        self.anno[i].split('_')[0],
                                        self.anno[i][:-8] + '.jpg')
                img = load_image(img_path)
                mean += img.view(img.size(0), -1).mean(1)
                std += img.view(img.size(0), -1).std(1)
        mean /= self.total
        std /= self.total
        ms = {
            'mean': mean,
            'std': std,
        }
        torch.save(ms, meanstd_file)
    if self.is_train:
        print('\tMean: %.4f, %.4f, %.4f' % (ms['mean'][0], ms['mean'][1], ms['mean'][2]))
        print('\tStd: %.4f, %.4f, %.4f' % (ms['std'][0], ms['std'][1], ms['std'][2]))
    return ms['mean'], ms['std']
def save_model(self):
    """Write the model's state_dict to config.data_path, preferring the
    local ./data directory when it exists."""
    path = self.config.data_path
    if os.path.isdir('data'):
        path = 'data/{0}'.format(self.config.data_path)
    print('save model parameters to {0}'.format(path))
    torch.save(self.model.state_dict(), path)
def test_serialization(self):
    """A built NestedField must survive torch.save/torch.load: the loaded
    field compares equal and numericalizes identically."""
    nesting_field = data.Field(batch_first=True)
    field = data.NestedField(nesting_field)
    ex1 = data.Example.fromlist(["john loves mary"], [("words", field)])
    ex2 = data.Example.fromlist(["mary cries"], [("words", field)])
    dataset = data.Dataset([ex1, ex2], [("words", field)])
    field.build_vocab(dataset)

    # Char-level padded representation of the two examples above.
    examples_data = [
        [
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>"] + list("loves") + ["</w>"],
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ],
        [
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>"] + list("cries") + ["</w>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
            ["<cpad>"] * 7,
        ]
    ]

    field_pickle_filename = "char_field.pl"
    field_pickle_path = os.path.join(self.test_dir, field_pickle_filename)
    torch.save(field, field_pickle_path)

    loaded_field = torch.load(field_pickle_path)
    assert loaded_field == field

    original_numericalization = field.numericalize(examples_data)
    pickled_numericalization = loaded_field.numericalize(examples_data)

    assert torch.all(torch.eq(original_numericalization, pickled_numericalization))
def train(args, train_dataset, model, tokenizer):
    """ Train the model.

    Standard transformers fine-tuning loop: builds the dataloader, an AdamW
    optimizer with weight-decay groups and a linear warmup/decay schedule,
    optional apex fp16 / DataParallel / DistributedDataParallel wrapping,
    then runs gradient-accumulated training with periodic TensorBoard
    logging and checkpointing.

    Returns (global_step, average training loss per optimization step).
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # t_total is the number of optimizer steps (not batches); when max_steps
    # is set it wins and the epoch count is derived from it.
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproductibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3]
            }
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in [
                    'bert', 'xlnet'
                ] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            # Only step the optimizer once every accumulation window.
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
def save_network(network, epoch_label):
    """Save `network`'s CPU state_dict under ./model/<name>/net_<epoch_label>.pth.

    Relies on module-level globals `name` (model subdirectory) and
    `gpu_ids`; the network is moved back onto the first listed GPU
    afterwards when CUDA is available.
    """
    save_filename = 'net_%s.pth' % epoch_label
    save_path = os.path.join('./model', name, save_filename)
    torch.save(network.cpu().state_dict(), save_path)
    if torch.cuda.is_available():
        network.cuda(gpu_ids[0])
def train():
    """Command-line training loop for a CTC-CRF acoustic model.

    Parses hyper-parameters from argv, dumps the architecture config to
    ``<dir>/config.json``, builds train/CV dataloaders (pickle or HDF5
    backed), and runs an open-ended epoch loop: the learning rate is
    decayed by 10x whenever CV loss stops improving, and training stops
    once it falls below ``--stop_lr``.

    Side effects: writes TensorBoard logs to ``<dir>/board`` and model
    checkpoints (``best_model``, ``model.epoch.N``) into ``<dir>``.

    NOTE(review): ``Model``, ``SpeechDatasetMem*``, ``PadCollate``,
    ``adjust_lr``, ``TARGET_GPUS``, ``gpus`` and ``ctc_crf_base`` come
    from module scope outside this chunk.
    """
    parser = argparse.ArgumentParser(description="recognition argument")
    parser.add_argument("dir", default="models")
    parser.add_argument(
        "--arch",
        choices=[
            'BLSTM', 'LSTM', 'VGGBLSTM', 'VGGLSTM', 'LSTMrowCONV',
            'TDNN_LSTM', 'BLSTMN'
        ],
        default='BLSTM')
    parser.add_argument("--min_epoch", type=int, default=15)
    parser.add_argument("--output_unit", type=int)
    parser.add_argument("--lamb", type=float, default=0.1)
    parser.add_argument("--hdim", type=int, default=512)
    parser.add_argument("--layers", type=int, default=6)
    parser.add_argument("--dropout", type=float, default=0.5)
    parser.add_argument("--batch_size", type=int, default=256)
    parser.add_argument("--feature_size", type=int, default=120)
    parser.add_argument("--data_path")
    parser.add_argument("--lr", type=float, default=0.001)
    parser.add_argument("--stop_lr", type=float, default=0.00001)
    parser.add_argument("--resume", action="store_true")
    parser.add_argument("--pkl", action="store_true")
    parser.add_argument("--pretrained_model_path")
    args = parser.parse_args()

    os.makedirs(args.dir + '/board', exist_ok=True)
    writer = SummaryWriter(args.dir + '/board')

    # save configuration (so inference can rebuild the same architecture)
    with open(args.dir + '/config.json', "w") as fout:
        config = {
            "arch": args.arch,
            "output_unit": args.output_unit,
            "hdim": args.hdim,
            "layers": args.layers,
            "dropout": args.dropout,
            "feature_size": args.feature_size,
        }
        json.dump(config, fout)

    model = Model(args.arch, args.feature_size, args.hdim, args.output_unit,
                  args.layers, args.dropout, args.lamb)

    # Optionally warm-start from a previous state dict.
    if args.resume:
        print("resume from {}".format(args.pretrained_model_path))
        pretrained_dict = torch.load(args.pretrained_model_path)
        model.load_state_dict(pretrained_dict)

    device = torch.device("cuda:0")
    model.cuda()
    model = nn.DataParallel(model)
    model.to(device)

    lr = args.lr
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Datasets can be stored either as pickles or HDF5 files.
    if args.pkl:
        tr_dataset = SpeechDatasetMemPickle(args.data_path + "/tr.pkl")
    else:
        tr_dataset = SpeechDatasetMem(args.data_path + "/tr.hdf5")
    tr_dataloader = DataLoader(
        tr_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=0,
        collate_fn=PadCollate())

    if args.pkl:
        cv_dataset = SpeechDatasetMemPickle(args.data_path + "/cv.pkl")
    else:
        cv_dataset = SpeechDatasetMem(args.data_path + "/cv.hdf5")
    cv_dataloader = DataLoader(
        cv_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=0,
        collate_fn=PadCollate())

    prev_t = 0
    epoch = 0
    prev_cv_loss = np.inf
    model.train()
    while True:
        # training stage
        # NOTE(review): best_model is (re)written at the top of every epoch,
        # before CV has run — confirm this overwrite is intentional.
        torch.save(model.module.state_dict(), args.dir + "/best_model")
        epoch += 1
        for i, minibatch in enumerate(tr_dataloader):
            print("training epoch: {}, step: {}".format(epoch, i))
            logits, input_lengths, labels_padded, label_lengths, path_weights = minibatch
            sys.stdout.flush()
            model.zero_grad()
            optimizer.zero_grad()
            loss = model(logits, labels_padded, input_lengths, label_lengths)
            # `loss` appears to be a per-replica vector under DataParallel;
            # the reported value subtracts the path-weight correction term.
            partial_loss = torch.mean(loss.cpu())
            weight = torch.mean(path_weights)
            real_loss = partial_loss - weight
            # Backward with a ones vector sums the per-GPU losses.
            # NOTE(review): assumes loss has length len(TARGET_GPUS) — confirm.
            loss.backward(loss.new_ones(len(TARGET_GPUS)))
            optimizer.step()
            t2 = timeit.default_timer()
            writer.add_scalar('training loss', real_loss.item(),
                              (epoch - 1) * len(tr_dataloader) + i)
            prev_t = t2
        # save model (per-epoch snapshot, independent of best_model)
        torch.save(model.module.state_dict(),
                   args.dir + "/model.epoch.{}".format(epoch))
        # cv stage: size-weighted mean of the corrected loss over the CV set
        model.eval()
        cv_losses_sum = []
        count = 0
        for i, minibatch in enumerate(cv_dataloader):
            print("cv epoch: {}, step: {}".format(epoch, i))
            logits, input_lengths, labels_padded, label_lengths, path_weights = minibatch
            loss = model(logits, labels_padded, input_lengths, label_lengths)
            loss_size = loss.size(0)
            count = count + loss_size
            partial_loss = torch.mean(loss.cpu())
            weight = torch.mean(path_weights)
            real_loss = partial_loss - weight
            real_loss_sum = real_loss * loss_size
            cv_losses_sum.append(real_loss_sum.item())
            print("cv_real_loss: {}".format(real_loss.item()))
        cv_loss = np.sum(np.asarray(cv_losses_sum)) / count
        print("mean_cv_loss: {}".format(cv_loss))
        writer.add_scalar('mean_cv_loss', cv_loss, epoch)
        # Before min_epoch we always accept; afterwards only if CV improved,
        # otherwise decay the LR and stop once it drops below stop_lr.
        if epoch < args.min_epoch or cv_loss <= prev_cv_loss:
            torch.save(model.module.state_dict(), args.dir + "/best_model")
            prev_cv_loss = cv_loss
        else:
            print(
                "cv loss does not improve, decay the learning rate from {} to {}"
                .format(lr, lr / 10.0))
            adjust_lr(optimizer, lr / 10.0)
            lr = lr / 10.0
            if (lr < args.stop_lr):
                print("learning rate is too small, finish training")
                break
        model.train()

    ctc_crf_base.release_env(gpus)
# %%
# other example
plot_knn_examples(embeddings)

# %%
# What's next?

# You could use the pre-trained model and train a classifier on top.
pretrained_resnet_backbone = model.backbone

# you can also store the backbone and use it in another code
state_dict = {
    'resnet18_parameters': pretrained_resnet_backbone.state_dict()
}
torch.save(state_dict, 'model.pth')

# %%
# THIS COULD BE IN A NEW FILE (e.g. inference.py
#
# Make sure you place the `model.pth` file in the same folder as this code

# load the model in a new file for inference
resnet18_new = torchvision.models.resnet18()

# FIX: the following lines previously referenced an undefined name `resnet`;
# the freshly created model is `resnet18_new`.
last_conv_channels = list(resnet18_new.children())[-1].in_features

# note that we need to create exactly the same backbone in order to load the weights
backbone_new = nn.Sequential(
    *list(resnet18_new.children())[:-1],
    nn.Conv2d(last_conv_channels, num_ftrs, 1),
)
# NOTE(review): fragment begins mid-script; `znew`, the three nets/optimizers
# and the `check_results` / `save_model` flags are defined outside this chunk.
z = znew
# Push the same parameter vector into all three networks.
# NOTE(review): `put_trainable_values` is defined elsewhere — presumably it
# copies the flat vector z into each net's trainable parameters; confirm.
put_trainable_values(net1, z)
put_trainable_values(net2, z)
put_trainable_values(net3, z)

if check_results:
    verification_error_check(net1, net2, net3)

print('Finished Training')

# Persist each network with its optimizer state and running loss so training
# can be resumed from these files later.
if save_model:
    torch.save({
        'model_state_dict': net1.state_dict(),
        'epoch': epoch,
        'optimizer_state_dict': opt1.state_dict(),
        'running_loss': running_loss1,
    }, './s1.model')
    torch.save({
        'model_state_dict': net2.state_dict(),
        'epoch': epoch,
        'optimizer_state_dict': opt2.state_dict(),
        'running_loss': running_loss2,
    }, './s2.model')
    torch.save({
        'model_state_dict': net3.state_dict(),
        'epoch': epoch,
        'optimizer_state_dict': opt3.state_dict(),
        'running_loss': running_loss3,
    }, './s3.model')
# t+=1 # # if svm_predict == label: # # s+=1 # #print(t,s) # # here map is acc # knn_map = (t / float(len(test_loader))) #svm_map = (s / float(len(test_loader))) map = CalcHR.CalcMap(T,H_B,test_labels_onehot.numpy(),train_labels_onehot.numpy()) print('####################################') # print('knn_map:',knn_map) print('map:',map) #print('svm_map:',svm_map) #map = round(map,5) if map > max_map: max_map = map np.save(str(opt.bit)+"H_B.npy",H_B) np.save(str(opt.bit)+'test.npy',T) np.save('train_label.npy',train_labels_onehot.numpy()) np.save('test_label.npy',test_labels_onehot.numpy()) torch.save(G,'./G3_models.pt') torch.save(H,'./H3_models.pt')
optimizerD.step() # Update G with fake data netG.zero_grad() y_fake_r = netD(fake_maps) loss_G = criterion(y_fake_r, real_label) loss_G.backward() optimizerG.step() # print info about losses and save them to file print('Epoch {} loss_D_real: {:.4f} loss_D_fake: {:.4f} loss_G: {:.4f}'\ .format(epoch, loss_D_real.mean().item(), loss_D_fake.mean().item(), loss_G.mean().item())) f = open('%s/losses/loss_64.txt' % root_out, 'a') f.write('%d %.3e %.3e %.3e\n' % (epoch, loss_D_real.mean().item(), loss_D_fake.mean().item(), loss_G.mean().item())) f.close() # save images from generator to file with torch.no_grad(): viz_sample = netG(viz_noise)[:49] vutils.save_image(viz_sample, '%s/images_64/fake_samples_%d.png'\ %(root_out,epoch), normalize=True, nrow=7, range=(-1.0, 1.0)) # save networks torch.save(netG.state_dict(), '%s/models_64/Net_Gen_%d.pt' % (root_out, epoch)) torch.save(netD.state_dict(), '%s/models_64/Net_Dis_%d.pt' % (root_out, epoch))
def load_and_cache_examples(args, task, tokenizer, evaluate=False, filter_long_sequences=False):
    """Build (or load from cache) the TensorDataset for a GLUE task.

    Features are computed once by the rank-0 process and cached to disk;
    the two `torch.distributed.barrier()` calls fence the other ranks so
    they wait and then read the cache instead of recomputing.

    :param args: parsed CLI namespace (data_dir, model_name_or_path,
        max_seq_length, model_type, local_rank, overwrite_cache, ...)
    :param task: GLUE task name, key into the module-level `processors`
    :param tokenizer: pretrained tokenizer used to convert examples
    :param evaluate: True -> dev split, False -> train split
    :param filter_long_sequences: drop (instead of truncate) sequences
        longer than max_seq_length; also part of the cache-file name
    :return: TensorDataset of (input_ids, attention_mask, token_type_ids, labels)
    """
    # Non-primary ranks block here until rank 0 has built the cache.
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file; the cache name encodes
    # split, model, max length, task and the filtering mode.
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length), str(task),
            'filtered' if filter_long_sequences else 'all'))
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s",
                    args.data_dir)
        label_list = processor.get_labels()
        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        examples = processor.get_dev_examples(
            args.data_dir) if evaluate else processor.get_train_examples(
                args.data_dir)
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(
                args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token
                                                       ])[0],
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
            filter_long_sequences=filter_long_sequences,
        )
        # Only rank 0 (or single-process) writes the cache file.
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    # Rank 0 releases the other ranks now that the cache exists.
    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                      dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)
    # Label dtype depends on whether the task is classification or regression.
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features],
                                  dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features],
                                  dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_attention_mask,
                            all_token_type_ids, all_labels)
    return dataset
def main():
    """Train/evaluate a 3D hourglass pose network driven by CLI options.

    Builds the model (fresh or loaded from opt.loadModel), an RMSprop
    optimizer, and MPII/H36M validation loaders; in test mode runs a single
    validation pass, otherwise trains for opt.nEpochs on the Fusion dataset,
    logging scalars and checkpointing every opt.valIntervals epochs.

    NOTE(review): `opts`, `Logger`, `HourglassNet3D`, `ref`, the dataset
    classes and `train`/`val`/`adjust_learning_rate` come from module scope
    outside this chunk.
    """
    opt = opts().parse()
    now = datetime.datetime.now()
    logger = Logger(opt.saveDir + '/logs_{}'.format(now.isoformat()))

    # Either resume a fully pickled model or build a fresh network.
    if opt.loadModel != 'none':
        model = torch.load(opt.loadModel).cuda()
    else:
        model = HourglassNet3D(opt.nStack, opt.nModules, opt.nFeats,
                               opt.nRegModules).cuda()

    criterion = torch.nn.MSELoss().cuda()
    optimizer = torch.optim.RMSprop(model.parameters(),
                                    opt.LR,
                                    alpha=ref.alpha,
                                    eps=ref.epsilon,
                                    weight_decay=ref.weightDecay,
                                    momentum=ref.momentum)

    # Validation source depends on how much 3D data is mixed in:
    # near-zero ratio3D -> 2D MPII, otherwise 3D Human3.6M.
    if opt.ratio3D < ref.eps:
        val_loader = torch.utils.data.DataLoader(MPII(opt, 'val',
                                                      returnMeta=True),
                                                 batch_size=1,
                                                 shuffle=False,
                                                 num_workers=int(ref.nThreads))
    else:
        val_loader = torch.utils.data.DataLoader(H36M(opt, 'val'),
                                                 batch_size=1,
                                                 shuffle=False,
                                                 num_workers=int(ref.nThreads))

    # Test-only mode: one validation pass, then exit.
    if opt.test:
        val(0, opt, val_loader, model, criterion)
        return

    train_loader = torch.utils.data.DataLoader(
        Fusion(opt, 'train'),
        batch_size=opt.trainBatch,
        shuffle=True if opt.DEBUG == 0 else False,  # deterministic order when debugging
        num_workers=int(ref.nThreads))

    for epoch in range(1, opt.nEpochs + 1):
        loss_train, acc_train, mpjpe_train, loss3d_train = train(
            epoch, opt, train_loader, model, criterion, optimizer)
        logger.scalar_summary('loss_train', loss_train, epoch)
        logger.scalar_summary('acc_train', acc_train, epoch)
        logger.scalar_summary('mpjpe_train', mpjpe_train, epoch)
        logger.scalar_summary('loss3d_train', loss3d_train, epoch)
        # Periodic validation + checkpoint; the text log gets 8 columns on
        # validation epochs, 4 otherwise.
        if epoch % opt.valIntervals == 0:
            loss_val, acc_val, mpjpe_val, loss3d_val = val(
                epoch, opt, val_loader, model, criterion)
            logger.scalar_summary('loss_val', loss_val, epoch)
            logger.scalar_summary('acc_val', acc_val, epoch)
            logger.scalar_summary('mpjpe_val', mpjpe_val, epoch)
            logger.scalar_summary('loss3d_val', loss3d_val, epoch)
            # NOTE(review): pickles the whole model object rather than a
            # state_dict — loading requires the class definitions on the path.
            torch.save(model,
                       os.path.join(opt.saveDir,
                                    'model_{}.pth'.format(epoch)))
            logger.write(
                '{:8f} {:8f} {:8f} {:8f} {:8f} {:8f} {:8f} {:8f} \n'.format(
                    loss_train, acc_train, mpjpe_train, loss3d_train,
                    loss_val, acc_val, mpjpe_val, loss3d_val))
        else:
            logger.write('{:8f} {:8f} {:8f} {:8f} \n'.format(
                loss_train, acc_train, mpjpe_train, loss3d_train))
        adjust_learning_rate(optimizer, epoch, opt.dropLR, opt.LR)
    logger.close()
def main():
    """CLI entry point for GLUE fine-tuning: parse args, set up devices and
    distributed training, load model/tokenizer, then optionally train, save
    and evaluate.

    :return: dict of evaluation metrics (possibly across several
        checkpoints), empty when --do_eval is not set.
    :raises ValueError: if the output dir is non-empty without
        --overwrite_output_dir, or the task name is unknown.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(processors.keys()))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument(
        "--filter_long_sequences_train",
        action='store_true',
        help=
        "If set train sequences longer as max_seq_length are filtered instead of truncated."
    )
    parser.add_argument(
        "--filter_long_sequences_eval",
        action='store_true',
        help=
        "If set eval sequences longer as max_seq_length are filtered instead of truncated."
    )
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    args = parser.parse_args()

    # Refuse to clobber a non-empty output dir unless explicitly allowed.
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging (non-primary ranks log warnings only)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer.  The barrier pair ensures only
    # rank 0 downloads; the other ranks then read from the local cache.
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(
            args,
            args.task_name,
            tokenizer,
            evaluate=False,
            filter_long_sequences=args.filter_long_sequences_train)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation (primary process only); optionally walks every saved
    # checkpoint and prefixes each metric with its global step.
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                '/')[-1] if checkpoint.find('checkpoint') != -1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict(
                (k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
        100. * correct / len(test_loader.dataset)))  # tail of an accuracy print whose beginning lies outside this chunk

if __name__ == '__main__':
    batch_size = 100
    epochs = 50
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    transform = transforms.Compose([
        transforms.ToTensor(),
        # NOTE(review): (0.1307, 0.3081) are the canonical single-channel
        # MNIST statistics, but the data below is 3-channel CIFAR10 —
        # confirm this is intentional.
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_data = datasets.CIFAR10('./data',
                                  train=True,
                                  download=True,
                                  transform=transform)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               **kwargs)
    test_data = datasets.CIFAR10('./data', train=False, transform=transform)
    test_loader = torch.utils.data.DataLoader(test_data,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              **kwargs)
    model = Model().to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, epoch, optimizer, criterion)
        test(model, device, test_loader, criterion)
    # NOTE(review): filename says "mnist" although the dataset is CIFAR10 —
    # kept as-is since renaming would change behavior.
    torch.save(model.state_dict(), "mnist_cnn.pt")
# NOTE(review): fragment begins mid-iteration; G_loss/G_solver/z/netG/netD
# and the counters (it, cnt, batch_idx, train_size) are defined outside
# this chunk.
# NOTE(review): `G_loss.data[0]` and `retain_variables=` are pre-0.4 PyTorch
# APIs (modern equivalents: `.item()` and `retain_graph=`) — flagged only.
G_exp.add_scalar_value('G_loss', G_loss.data[0],
                       step=batch_idx + it * train_size)
G_loss.backward(retain_variables = True)
G_solver.step()

# Every other iteration: sample 16 images, plot a 4x4 grid, and checkpoint
# both networks.
if it % 2 == 0:
    z.data.resize_(mb_size, z_dim).normal_(0, 1)
    samples = netG(z).data.numpy()[:16]

    fig = plt.figure(figsize=(4, 4))
    gs = gridspec.GridSpec(4, 4)
    gs.update(wspace=0.05, hspace=0.05)
    for index, sample in enumerate(samples):
        ax = plt.subplot(gs[index])
        plt.axis('off')
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.set_aspect('equal')
        # assumes each sample flattens to a 28x28 image — TODO confirm
        plt.imshow(sample.reshape(28, 28), cmap='Greys_r')

    if not os.path.exists('out/'):
        os.makedirs('out/')
    plt.savefig('out/{}.png'.format(str(cnt).zfill(3)), bbox_inches='tight')
    # Checkpoint state dicts keyed by iteration number.
    torch.save(netG.state_dict(), '%s/netG_epoch_%d.pth' % ('./out', it))
    torch.save(netD.state_dict(), '%s/netD_epoch_%d.pth' % ('./out', it))
    cnt += 1
    plt.close(fig)
warmup_for=5,
                                  min_lr=args.learning_rate / args.epochs)  # tail of a scheduler constructor that opens outside this chunk
trainer = Trainer(net, train_loader, val_loader, loss, optimizer, scheduler)
if iscuda:
    trainer = trainer.cuda()

# Training loop
# max_val_error = 1e5
# NOTE(review): the initialization above is commented out — confirm
# `max_val_error` is bound earlier (outside this chunk), otherwise the first
# comparison below raises NameError.  The name also suggests a maximum while
# it actually tracks the *lowest* validation error seen so far.
for epoch in range(args.epochs):
    print(f"\n>> Starting epoch {epoch}...")
    trainer.train()
    val_error = trainer.validate()
    # New best (lowest) validation error: keep a dedicated checkpoint.
    if val_error < max_val_error:
        print(f"\n>> Saving model (the lowest validation error)")
        torch.save({'net': args.net, 'state_dict': net.state_dict()},
                   osp.join(args.save_path, "min_val_error_checkpoint.pt"))
        max_val_error = val_error
    # Unconditional per-epoch checkpoint.
    torch.save({'net': args.net, 'state_dict': net.state_dict()},
               osp.join(args.save_path, "checkpoint_" + str(epoch) + ".pt"))
    '''
    if epoch == 30:
        print(f"\n>> Saving model (30th epoch)")
        torch.save({'net': args.net, 'state_dict': net.state_dict()},
                   osp.join(args.save_path, "checkpoint_30th.pt"))
    '''

# Final snapshot after the loop completes.
print(f"\n>> Saving model to {args.save_path}")
torch.save({'net': args.net, 'state_dict': net.state_dict()},
           osp.join(args.save_path, "end_train_checkpoint.pt"))
# NOTE(review): fragment begins mid-epoch; the loss meters, `writer`,
# `model`, `opt`, `best_err` and `dataloader_test` are defined outside
# this chunk.
print('loss: %.7f' % avg_total_loss)
writer.add_scalar('data/total_loss', avg_total_loss, epoch)
avg_unsupervised_loss = unsupervised_loss.avg
print('unsupervised loss: %.7f' % avg_unsupervised_loss)
writer.add_scalar('data/unsupervised_loss', avg_unsupervised_loss, epoch)
avg_supervised_loss = supervised_loss.avg
print('supervised loss: %.7f' % avg_supervised_loss)
writer.add_scalar('data/supervised_loss', avg_supervised_loss, epoch)
print('end of display \n')

# Periodic "latest" checkpoint of the model's state dict.
if (epoch % opt.save_latest_freq == 0):
    print('saving the latest model (epoch %d, total_epoch %d)' %
          (epoch, opt.niter))
    torch.save(
        model.state_dict(),
        os.path.join('.', opt.checkpoints_dir, opt.name,
                     'sound_localization_latest' + str(epoch) + '.pth'))

# Periodic validation pass; model/opt mode are toggled around it.
if (epoch % opt.validation_freq == 0 and opt.validation_on):
    model.eval()
    opt.mode = 'val'
    print('Display validation results at (epoch %d, total_epoch %d)' %
          (epoch, opt.niter))
    val_err = evaluate(model, writer, epoch, dataloader_test, opt)
    print('end of display \n')
    model.train()
    opt.mode = 'train'
    #save the model that achieves the smallest validation error
    if val_err < best_err:
        best_err = val_err
        print(
def main():
    """End-to-end script flow: prepare data, set up the model and its
    output directory, then optionally train, and finally test.

    Reads the module-level ``args`` namespace (download, load_preproc,
    load_model, train, batch_size, lr, epochs, print_every,
    validate_every, name) and relies on module-level helpers
    ``download_data``, ``preprocess_data``, ``get_dataloader``, ``Net``,
    ``train_epoch`` and ``test_epoch``.
    """
    # prepare data
    if args.download:
        print("Downloading data . . .")
        download_data()
    if args.load_preproc:
        print("Loading preprocessed data . . .")
        data = torch.load('data/preproc_data')
    else:
        print("Preprocessing data . . .")
        data = preprocess_data()
        print("Saving preprocessed data . . .")
        torch.save(data, 'data/preproc_data')
    print("Making DataLoader . . .")
    train_loader, val_loader, test_loader = get_dataloader(data,
                                                           args.batch_size)

    # setup model & dirs
    if args.load_model != '':
        print("Loading model . . .")
        # FIX: this branch previously did nothing (`pass`), leaving `net`
        # unbound and crashing later with NameError.  The whole model was
        # saved with torch.save(net, ...) below, so torch.load restores it.
        net = torch.load(args.load_model)
    else:
        print("Initializing model . . .")
        net = Net()
    model_root = 'data/models/'
    if not os.path.exists(model_root):
        os.makedirs(model_root)
    # Find the first unused "name(i)" directory for this run.
    i = 0
    while True:
        model_name = args.name + "(%d)" % (i)
        model_dir = os.path.join(model_root, model_name)
        if os.path.exists(model_dir):
            i += 1
            continue
        else:
            os.makedirs(model_dir)
            break

    # train model
    if args.train:
        print("Training model . . .")
        net.train()
        optimizer = optim.Adam(net.parameters(), lr=args.lr)
        best_perf = None
        for epoch in range(1, args.epochs + 1):
            loss = train_epoch(net, train_loader, optimizer)
            if epoch % args.print_every == 0:
                print("(Epoch %d) Training loss: %4f" % (epoch, loss))
            if epoch % args.validate_every == 0:
                loss, perf = test_epoch(net, val_loader)
                print("Validation loss: %4f, accuracy: %g" % (loss, perf))
                # FIX: `is None` instead of `== None` (identity check is the
                # idiomatic and reliable comparison against the singleton).
                if best_perf is None or perf > best_perf:
                    # save best model (whole object, matching the load above)
                    best_perf = perf
                    fname = model_name + '_best_(epoch%d)' % (epoch)
                    torch.save(net, os.path.join(model_dir, fname))

    # test model
    print("Testing model . . .")
    net.eval()
    loss, perf = test_epoch(net, test_loader)
    print("Test loss: %4f, accuracy: %g" % (loss, perf))
marker='.') plt.scatter(num_epoch + 1, one_epoch_validation_loss, linewidths=0.5, c='blue', marker='.') plt.pause(0.00001) if num_epoch % FC_save_model_epochs_interval is 0: print('saving model...') model_save_path = FC_model_save_folder + '/epoch_' + str( num_epoch + 1) torch.save( { 'epoch': num_epoch + 1, 'FC_model_state_dict': FC_model.state_dict(), 'FC_optimizer_state_dict': FC_model_optimizer.state_dict(), 'loss': all_epochs_train_loss_hist_for_final_graph, }, model_save_path) plt.clf() shist = [h for h in all_epochs_train_loss_hist_for_final_graph] vhist = [h for h in all_epochs_validation_loss_hist_for_final_graph] plt.title('training_loss') plt.xlabel("Training Epochs") plt.ylabel("Error") plt.plot(range(1, total_epochs + 1), shist, label="train") plt.plot(range(1, total_epochs + 1), vhist, label="validation") # plt.ylim((0,1.)) # plt.xticks(np.arange(1, num_epochs+1, 1.0)) plt.legend()
def main():
    """CLI entry point for training/evaluating a document-attention sequence
    tagger (BERT backbone) on GENIA/LitBank/OntoNotes-style datasets.

    Modes: 'train' (train with early stopping, then test best checkpoint),
    'test' (evaluate a loaded checkpoint), 'predict' (unimplemented).
    Relies on module-level `device`, `metrics`, dataset constant dicts, and
    the helpers `read_tagset`, `load_model`, `get_batches`,
    `DocumentAttentionDataset`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', help='{train,test}', required=True)
    parser.add_argument('--output_dir', help='File to write model checkpoints and log files to', required=False)
    parser.add_argument('--dataset', help='{genia, genia_full, litbank, litbank_full, ontonotes}', required=True)
    parser.add_argument('--metric', help='{accuracy,fscore,span_fscore}', default='span_fscore', required=False)
    parser.add_argument("--batch_size", default=16, type=int, help="The batch size on GPU.")
    parser.add_argument("--gradient_accumulation_steps", default=1, type=int,
                        help="Number of updates steps to accumulate before performing a backward pass.")
    parser.add_argument('--freeze_bert', help='Whether to freeze BERT weights', action='store_true')
    parser.add_argument('--self_attention', help='Whether sequences should be allowed to attention to themsleves', action='store_true')
    parser.add_argument('--vanilla', help='Whether to add LSTM encoders to model', action='store_true')
    parser.add_argument("--lstm_dim", default=128, type=int, help="LSTM hidden dimension size.")
    parser.add_argument('--pretrained_dir', help='Directory to read custom fine-tuned (BERT) base model weights from', required=False)
    parser.add_argument('--checkpoint_file', help='File to read checkpointed model weights from (to resume training or test)', required=False)
    parser.add_argument('--model_type', help='Pretrained BERT configuration checkpoint, e.g. bert-base-cased', required=True)
    parser.add_argument('--lr', type=float, default=2e-5, required=False)
    parser.add_argument('--num_epochs', type=int, default=100, required=False)
    parser.add_argument('--k', type=int, help='How many context sequences to attend over', default=10, required=False)
    args = parser.parse_args()
    os.makedirs(args.output_dir, exist_ok=True)
    # log both to a file in output_dir and to the console
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
        handlers=[
            logging.FileHandler(os.path.join(args.output_dir, "training.log" if args.mode == 'train' else 'test.log'), mode='w+'),
            logging.StreamHandler()
        ]
    )
    logging.info("Running on: {}".format(device))
    # silence noisy transformers loggers
    logging.getLogger("transformers.configuration_utils").setLevel(logging.ERROR)
    logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)
    # map dataset name -> module-level dataset config dict
    if args.dataset == 'genia':
        dataset = GENIA
    elif args.dataset == 'genia_full':
        dataset = GENIA_FULL
    elif args.dataset == 'litbank':
        dataset = LITBANK
    elif args.dataset == 'litbank_full':
        dataset = LITBANK_FULL
    elif args.dataset == 'ontonotes':
        dataset = ONTONOTES
    else:
        raise ValueError("Invalid dataset")
    # custom Google checkpoints use the stock uncased tokenizer
    if 'google' in args.model_type:
        tokenizer_type = 'bert-base-uncased'
    else:
        tokenizer_type = args.model_type
    do_lower_case = 'uncased' in tokenizer_type
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_type, do_lower_case=do_lower_case, do_basic_tokenize=False)
    metric = None
    if args.metric.lower() == "fscore":
        metric = metrics.check_f1_two_lists
    elif args.metric.lower() == "accuracy":
        metric = metrics.get_accuracy
    elif args.metric.lower() == "span_fscore":
        metric = metrics.check_span_f1_two_lists
    tagset = read_tagset(dataset['tagset'])
    model = load_model(args.model_type, args.pretrained_dir, args.checkpoint_file, len(tagset),
                       args.freeze_bert, args.lstm_dim, args.vanilla)
    model.to(device)
    mode = args.mode
    if mode == 'train':
        # no weight decay on biases / LayerNorm weights (standard BERT recipe)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.01,
            },
            {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-6)
        loss_function = nn.CrossEntropyLoss(ignore_index=-100)
        # batch_size=1 at the DataLoader level: one *document* per step;
        # identity collate_fn keeps the document object untouched
        train_documents = DocumentAttentionDataset(dataset['train_dir'], tokenizer, args.k, args.self_attention)
        train_data_loader = DataLoader(train_documents, batch_size=1, shuffle=True, num_workers=1, collate_fn=lambda x: x)
        dev_documents = DocumentAttentionDataset(dataset['dev_dir'], tokenizer, args.k, args.self_attention)
        dev_data_loader = DataLoader(dev_documents, batch_size=1, shuffle=True, num_workers=1, collate_fn=lambda x: x)
        global_steps = 0  # number of backward passes
        steps = 0  # number of forward passes (not divided by accummulation)
        total_loss = 0
        best_val = -1
        best_idx = 0
        patience = 3
        for epoch in range(args.num_epochs):
            logging.info("*** TRAINING ****\n")
            logging.info("Epoch: {}".format(epoch))
            total_loss = 0
            for i, document_batch in enumerate(train_data_loader):
                document = document_batch[0]
                document_loss = 0
                batches = get_batches(document, args.batch_size, tagset, args.k)
                # context representation computed once per document, reused
                # for every intra-document batch
                context = model.get_static_context_representation(batches, args.batch_size)
                num_batches = len(batches['inputs'])
                num_labeled = sum([len(batches['inputs'][b]) for b in range(num_batches)])
                num_attn = sum([np.prod(batches['attn_sentence_idx'][b].shape) for b in range(num_batches)])
                logging.info("Document {}/{}: (len={}, attn={}, batches={})".format(
                    i + 1, len(train_documents), num_labeled, num_attn, num_batches))
                model.train()
                for b in range(num_batches):
                    inputs = batches['inputs'][b].to(device)
                    transforms = batches['transforms'][b].to(device)
                    masks = batches['masks'][b].to(device)
                    labels = batches['labels'][b].to(device)
                    attn_sentence_idx = batches['attn_sentence_idx'][b].to(device)
                    attn_word_idx = batches['attn_word_idx'][b].to(device)
                    attn_dists = batches['attn_dists'][b].to(device)
                    attn_mask = batches['attn_masks'][b].to(device)
                    logits = model.forward(inputs=inputs, masks=masks, transforms=transforms,
                                           full_context=context,
                                           attn_sentence_idx=attn_sentence_idx,
                                           attn_word_idx=attn_word_idx,
                                           attn_dists=attn_dists, attn_mask=attn_mask)
                    loss = loss_function(logits.view(-1, model.num_labels), labels.view(-1))
                    # pre-divide so accumulated gradients average correctly
                    loss /= args.gradient_accumulation_steps
                    document_loss += loss.item()
                    steps += 1
                    if steps % args.gradient_accumulation_steps == 0:
                        # NOTE(review): total_loss is incremented both here and
                        # via `total_loss += document_loss` below — the epoch
                        # total appears to double-count; confirm intent.
                        total_loss += loss.item()
                        loss.backward()
                        optimizer.step()
                        model.zero_grad()
                        global_steps += 1
                        if global_steps % 100 == 0:
                            logging.info("Global step: {}".format(global_steps))
                logging.info("Loss: {}".format(document_loss / num_labeled))
                total_loss += document_loss
            logging.info("Epoch total loss: {}".format(total_loss))
            logging.info("*** EVALUATING ***\n")
            value = model.evaluate(dev_data_loader, args.batch_size, metric, tagset, args.k)
            logging.info("DEV {}: {}".format(args.metric, value))
            if value > best_val:
                # new best dev score: checkpoint the model
                best_idx = epoch
                best_val = value
                model_dir = os.path.join(args.output_dir, "checkpoint-{}.bin".format(best_idx))
                logging.info("Saving model @ {}".format(model_dir))
                torch.save(model.state_dict(), model_dir)
            elif (epoch - best_idx) > patience:
                logging.info("Aborting training after {} epochs of patience".format(patience))
                logging.info("Best model @ epoch {} with {}={}".format(best_idx, args.metric, best_val))
                del model  # allows torch to free gpu memory before loading best model from disk
                break
        # final evaluation: reload the best dev checkpoint and test
        logging.info("*** TESTING ***\n")
        best_model_dir = os.path.join(args.output_dir, "checkpoint-{}.bin".format(best_idx))
        logging.info("Loading best model from {}".format(best_model_dir))
        best_model = load_model(args.model_type, None, best_model_dir, len(tagset),
                                args.freeze_bert, args.lstm_dim, args.vanilla)
        best_model.to(device)
        test_documents = DocumentAttentionDataset(dataset['test_dir'], tokenizer, args.k, args.self_attention)
        test_data_loader = DataLoader(test_documents, batch_size=1, shuffle=True, num_workers=1, collate_fn=lambda x: x)
        value = best_model.evaluate(test_data_loader, args.batch_size, metric, tagset, args.k)
        logging.info("TEST {}: {}".format(args.metric, value))
        return
    elif mode == 'predict':
        prediction_file = os.path.join(args.output_dir, "predictions.txt")
        # TODO
        pass
    elif mode == 'test':
        logging.info("*** TESTING ***\n")
        test_documents = DocumentAttentionDataset(dataset['test_dir'], tokenizer, args.k, args.self_attention)
        test_data_loader = DataLoader(test_documents, batch_size=1, shuffle=True, num_workers=1, collate_fn=lambda x: x)
        value = model.evaluate(test_data_loader, args.batch_size, metric, tagset, args.k)
        logging.info("TEST {}: {}".format(args.metric, value))
def train_net(model, data_path, pre_model, save_dir, batch_size, lr, log_after, cuda, device, one_hot=False):
    """Train `model` for up to 500 epochs with exponentially decaying LR,
    keeping a rolling window of the five most recent checkpoints in
    `save_dir` and validating after every epoch.

    :param model: network to train (moved to GPU when `cuda` is true)
    :param data_path: path passed to get_dataloaders (.npy data)
    :param pre_model: checkpoint number (string) to resume from, or falsy to start fresh
    :param save_dir: directory for 'model-<epoch>.pt' checkpoints
    :param batch_size: mini-batch size
    :param lr: initial learning rate (decays to lr_final over num_epochs)
    :param log_after: log every `log_after` batches
    :param cuda: whether to run on GPU `device`
    :param device: CUDA device index
    :param one_hot: labels are one-hot encoded (argmax'd before accuracy)
    """
    if not pre_model:
        print(model)
    writer = SummaryWriter()
    if cuda:
        print('GPU')
        model.cuda(device=device)
    print('log: training started on device: {}'.format(device))
    # define loss and optimizer
    optimizer = Adam(model.parameters(), lr=lr)
    lr_final = 0.0000003
    num_epochs = 500
    # per-epoch multiplicative decay that reaches lr_final after num_epochs
    LR_decay = (lr_final / lr) ** (1. / num_epochs)
    scheduler = lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=LR_decay)
    criterion = nn.CrossEntropyLoss()
    train_loader, val_dataloader, test_loader = get_dataloaders(path_to_nparray=data_path,
                                                               batch_size=batch_size,
                                                               normalize=True)
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    i = 1
    m_loss, m_accuracy = [], []
    if pre_model:
        model.load_state_dict(torch.load(os.path.join(save_dir, "model-" + pre_model + '.pt')))
        print('log: resumed model {} successfully!'.format(pre_model))
        print(model)
        # starting point: resume the epoch counter from the checkpoint number
        model_number = int(pre_model)
        i = i + model_number - 1
    else:
        print('log: starting anew...')
    while i < num_epochs:
        i += 1
        net_loss = []
        # new model path
        save_path = os.path.join(save_dir, 'model-{}.pt'.format(i))
        # keep only the five most recent checkpoints
        del_this = os.path.join(save_dir, 'model-{}.pt'.format(i - 5))
        if os.path.exists(del_this):
            os.remove(del_this)
            print('log: removed {}'.format(del_this))
        if i > 1 and not os.path.exists(save_path):
            torch.save(model.state_dict(), save_path)
            print('log: saved {}'.format(save_path))
        correct_count, total_count = 0, 0
        for idx, data in enumerate(train_loader):
            model.train()  # train mode at each epoch, just in case...
            test_x, label = data
            if cuda:
                test_x = test_x.cuda(device=device)
                label = label.cuda(device=device)
            # forward
            out_x, pred = model(test_x)
            loss = criterion(out_x, label)
            net_loss.append(loss.item())
            # get accuracy metric
            if one_hot:
                batch_correct = (torch.argmax(label, dim=1).eq(pred.long())).double().sum().item()
            else:
                batch_correct = (label.eq(pred.long())).double().sum().item()
            correct_count += batch_correct
            # FIX: np.float was removed in NumPy 1.24; the builtin float is
            # what it aliased anyway.
            total_count += float(pred.size(0))
            if idx % log_after == 0 and idx > 0:
                print('{}. ({}/{}) image size = {}, loss = {}: accuracy = {}/{}'.format(
                    i, idx, len(train_loader), out_x.size(), loss.item(), batch_correct, pred.size(0)))
            # three steps for backprop
            model.zero_grad()
            loss.backward()
            # perform gradient clipping between loss backward and optimizer step
            clip_grad_norm_(model.parameters(), 0.05)
            optimizer.step()
        # per-epoch LR decay (must stay in the epoch loop, not the batch loop)
        scheduler.step()
        mean_accuracy = correct_count / total_count * 100
        mean_loss = np.asarray(net_loss).mean()
        m_loss.append((i, mean_loss))
        m_accuracy.append((i, mean_accuracy))
        writer.add_scalar(tag='train loss', scalar_value=mean_loss, global_step=i)
        writer.add_scalar(tag='train over_all accuracy', scalar_value=mean_accuracy, global_step=i)
        print('####################################')
        print('epoch {} -> total loss = {:.5f}, total accuracy = {:.5f}% (lr: {})'.format(
            i, mean_loss, mean_accuracy, optimizer.param_groups[0]['lr']))
        print('####################################')
        # validate model after each epoch
        with torch.no_grad():
            eval_net(model=model, writer=writer, criterion=criterion,
                     val_loader=val_dataloader, denominator=batch_size,
                     cuda=cuda, device=device, global_step=i, one_hot=one_hot)
    pass
########################### netG.zero_grad() label.fill_( real_label) # fake labels are real for generator cost output = netD(fake) errG = criterion(output, label) errG.backward() D_G_z2 = output.mean().item() optimizerG.step() print( '[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f' % (epoch, opt.niter, i, len(dataloader), errD.item(), errG.item(), D_x, D_G_z1, D_G_z2)) if i % 1 == 0: vutils.save_image(real_cpu, '%s/real_samples.png' % opt.outDict, normalize=True) fake = netG(fixed_noise) vutils.save_image(fake.detach(), '%s/fake_samples_epoch_%03d_%06d.png' % (opt.outDict, epoch, i), normalize=True) # ---DCGAN Training End--- # do checkpointing torch.save(netG.state_dict(), '%s/netG_epoch_%d.pth' % (opt.outDict, epoch)) torch.save(netD.state_dict(), '%s/netD_epoch_%d.pth' % (opt.outDict, epoch))
def snapshot(self, model, name):
    """Serialize *model*'s ``state_dict`` to ``<self.output_dir>/<name>.pth``
    and report the destination through ``self.logger``.
    """
    # filename = model.name + '_iter_{:d}'.format(iter) + '.pth'
    target = os.path.join(self.output_dir, f"{name}.pth")
    torch.save(model.state_dict(), target)
    self.logger('Wrote snapshot to: {:s}'.format(target))
def train_model(model, train_dataloader, test_dataloader, criterion, optimizer,
                scheduler, num_epochs=25, inference=False):
    """Train or evaluate CRNet.

    When ``inference`` is False: train for ``num_epochs``, then save the
    state_dict to ./model/crnet.pth. When True: load ./model/crnet.pth,
    run the test set, print MAE/RMSE/Pearson and write ./output.xlsx.

    Relies on module-level `cfg`, `mkdirs_if_not_exist`, `nn`, `np`, `pd`.
    """
    model = model.float()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)
    model = model.to(device)
    if not inference:
        print('Start training CRNet...')
        for epoch in range(num_epochs):
            model.train()
            running_loss = 0.0
            for i, data in enumerate(train_dataloader, 0):
                inputs, scores, classes = data['image'], data['score'], data['class']
                inputs = inputs.to(device)
                scores = scores.to(device)
                classes = classes.to(device)
                optimizer.zero_grad()
                inputs = inputs.float()
                scores = scores.float().view(cfg['batch_size'], 1)
                # classes = classes.int().view(cfg['batch_size'], 3)
                reg_out, cls_out = model(inputs)
                loss = criterion(cls_out, classes, reg_out, scores)
                loss.backward()
                optimizer.step()
                # print statistics
                running_loss += loss.item()
                if i % 10 == 9:  # print every 10 mini-batches
                    print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 10))
                    running_loss = 0.0
            # FIX: scheduler.step() must follow the epoch's optimizer steps
            # (PyTorch >= 1.1); calling it first skipped the initial LR value.
            scheduler.step()
        print('Finished training CRNet...\n')
        print('Saving trained model...')
        model_path_dir = './model'
        mkdirs_if_not_exist(model_path_dir)
        torch.save(model.state_dict(), os.path.join(model_path_dir, 'crnet.pth'))
        print('CRNet has been saved successfully~')
    else:
        print('Loading pre-trained model...')
        # NOTE(review): if the checkpoint was written from a DataParallel
        # wrapper, keys carry a 'module.' prefix — confirm load compatibility.
        model.load_state_dict(torch.load(os.path.join('./model/crnet.pth')))
        model.eval()
        print('Start testing CRNet...')
        predicted_labels = []
        gt_labels = []
        filenames = []
        for data in test_dataloader:
            images, scores, classes, filename = data['image'], data['score'], data['class'], data['filename']
            images = images.to(device)
            reg_out, cls_out = model.forward(images)
            predicted_labels += reg_out.to("cpu").detach().numpy().tolist()
            gt_labels += scores.to("cpu").detach().numpy().tolist()
            filenames += filename
        from sklearn.metrics import mean_absolute_error, mean_squared_error
        mae_lr = round(mean_absolute_error(np.array(gt_labels),
                                           np.array(predicted_labels).ravel()), 4)
        # FIX: np.math was removed in NumPy 2.0; plain exponentiation avoids
        # the private alias entirely.
        rmse_lr = round(mean_squared_error(np.array(gt_labels),
                                           np.array(predicted_labels).ravel()) ** 0.5, 4)
        pc = round(np.corrcoef(np.array(gt_labels),
                               np.array(predicted_labels).ravel())[0, 1], 4)
        print('===============The Mean Absolute Error of CRNet is {0}===================='
              .format(mae_lr))
        print('===============The Root Mean Square Error of CRNet is {0}===================='
              .format(rmse_lr))
        print('===============The Pearson Correlation of CRNet is {0}===================='
              .format(pc))
        col = ['filename', 'gt', 'pred']
        df = pd.DataFrame([[filenames[i], gt_labels[i], predicted_labels[i][0]]
                           for i in range(len(gt_labels))], columns=col)
        df.to_excel("./output.xlsx", sheet_name='Output', index=False)
        print('Output Excel has been generated~')
def train(train_data, val_data, model, args):
    '''
        Train the model
        Use val_data to do early stopping

        Args:
            model (dict): {'ebd': embedding, 'clf': classifier}
    '''
    # creating a tmp directory to save the models
    out_dir = os.path.abspath(os.path.join(
        os.path.curdir, "tmp-runs", str(int(time.time() * 1e7))))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # Write results
    # write_acc_tr = 'acc_base.csv'
    # init_csv(write_acc_tr)
    # write_acc_val = 'val_acc_base.csv'
    # init_csv(write_acc_val)
    best_acc = 0
    sub_cycle = 0  # epochs since the last validation improvement
    best_path = None
    # grad_param generates the learnable parameters from the classifier
    params_to_opt = grad_param(model, ['ebd', 'clf'])
    opt = torch.optim.Adam(params_to_opt, lr=args.lr)
    # LR drops by 10x when val acc plateaus for patience//2 epochs
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        opt, 'max', patience=args.patience//2, factor=0.1, verbose=True)
    print("{}, Start training".format(
        datetime.datetime.now().strftime('%02y/%02m/%02d %H:%M:%S')), flush=True)
    # episodic samplers: train episodes for updates, val episodes for eval
    train_gen = ParallelSampler(train_data, args, args.train_episodes)
    train_gen_val = ParallelSampler(train_data, args, args.val_episodes)
    val_gen = ParallelSampler(val_data, args, args.val_episodes)
    for ep in range(args.train_epochs):
        sampled_tasks = train_gen.get_epoch()
        grad = {'clf': [], 'ebd': []}  # collects per-task grad norms for logging
        if not args.notqdm:
            sampled_tasks = tqdm(sampled_tasks, total=train_gen.num_episodes,
                                 ncols=80, leave=False,
                                 desc=colored('Training on train', 'yellow'))
        for task in sampled_tasks:
            if task is None:
                break
            train_one(task, model, opt, args, grad)
        # periodically report accuracy on held-out episodes from train data
        if ep % 10 == 0:
            acc, std = test(train_data, model, args, args.val_episodes, False,
                            train_gen_val.get_epoch())
            print("{}, {:s} {:2d}, {:s} {:s}{:>7.4f} ± {:>6.4f} ".format(
                datetime.datetime.now().strftime('%02y/%02m/%02d %H:%M:%S'),
                "ep", ep,
                colored("train", "red"),
                colored("acc:", "blue"), acc, std,
                ), flush=True)
            # write_csv(write_acc_tr, acc, std, ep)
        # Evaluate validation accuracy
        cur_acc, cur_std = test(val_data, model, args, args.val_episodes, False,
                                val_gen.get_epoch())
        print(("{}, {:s} {:2d}, {:s} {:s}{:>7.4f} ± {:>6.4f}, "
               "{:s} {:s}{:>7.4f}, {:s}{:>7.4f}").format(
               datetime.datetime.now().strftime('%02y/%02m/%02d %H:%M:%S'),
               "ep", ep,
               colored("val  ", "cyan"),
               colored("acc:", "blue"), cur_acc, cur_std,
               colored("train stats", "cyan"),
               colored("ebd_grad:", "blue"), np.mean(np.array(grad['ebd'])),
               colored("clf_grad:", "blue"), np.mean(np.array(grad['clf'])),
               ), flush=True)
        # if ep % 10 == 0: write_csv(write_acc_val, cur_acc, cur_std, ep)
        # Update the current best model if val acc is better
        if cur_acc > best_acc:
            best_acc = cur_acc
            best_path = os.path.join(out_dir, str(ep))
            # save current model
            print("{}, Save cur best model to {}".format(
                datetime.datetime.now().strftime('%02y/%02m/%02d %H:%M:%S'),
                best_path))
            torch.save(model['ebd'].state_dict(), best_path + '.ebd')
            torch.save(model['clf'].state_dict(), best_path + '.clf')
            sub_cycle = 0
        else:
            sub_cycle += 1
        # Break if the val acc hasn't improved in the past patience epochs
        if sub_cycle == args.patience:
            break
    print("{}, End of training. Restore the best weights".format(
        datetime.datetime.now().strftime('%02y/%02m/%02d %H:%M:%S')), flush=True)
    # restore the best saved model
    model['ebd'].load_state_dict(torch.load(best_path + '.ebd'))
    model['clf'].load_state_dict(torch.load(best_path + '.clf'))
    if args.save:
        # save the current model
        out_dir = os.path.abspath(os.path.join(
            os.path.curdir, "saved-runs", str(int(time.time() * 1e7))))
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        best_path = os.path.join(out_dir, 'best')
        print("{}, Save best model to {}".format(
            datetime.datetime.now().strftime('%02y/%02m/%02d %H:%M:%S'),
            best_path), flush=True)
        torch.save(model['ebd'].state_dict(), best_path + '.ebd')
        torch.save(model['clf'].state_dict(), best_path + '.clf')
        # record the full hyper-parameter configuration next to the weights
        with open(best_path + '_args.txt', 'w') as f:
            for attr, value in sorted(args.__dict__.items()):
                f.write("{}={}\n".format(attr, value))
    return
def get_gqn_dataset_with_name(name,
                              train_batch_size,
                              eval_batch_size,
                              kwargs,
                              allow_empty_context=False,
                              target_sample_method='remaining',
                              max_cond_size=20,
                              max_target_size=20,
                              num_data=None):
    """Build train/val/test DataLoaders for a GQN scene dataset.

    The official train split is further divided into train/val using a
    cached random permutation (the last 20000 samples become validation),
    so the same split is reused across runs.

    :param name: dataset name under data/gqn-datasets
    :param train_batch_size: batch size for train and val loaders
    :param eval_batch_size: batch size for the test loader
    :param kwargs: extra DataLoader kwargs (num_workers, pin_memory, ...)
    :param num_data: optional cap on the number of training samples
    :return: (train_loader, val_loader, test_loader, info dict)
    """
    # init dataset (train / val)
    train_dataset = SceneDataset(
        root='data/gqn-datasets',
        name=name,
        train=True,
        img_size=64,
        allow_empty_context=allow_empty_context,
        target_sample_method=target_sample_method,
        max_cond_size=max_cond_size,
        max_target_size=max_target_size,
    )
    val_dataset = SceneDataset(
        root='data/gqn-datasets',
        name=name,
        train=True,
        img_size=64,
        allow_empty_context=allow_empty_context,
        target_sample_method='remaining',  # 'full'
        max_cond_size=max_cond_size,
        max_target_size=max_target_size,
    )
    test_dataset = SceneDataset(
        root='data/gqn-datasets',
        name=name,
        train=False,
        img_size=64,
        allow_empty_context=allow_empty_context,
        target_sample_method='remaining',  # 'full'
        max_cond_size=max_cond_size,
        max_target_size=max_target_size,
    )
    # set num data; the suffix keys the cached split file to the subset size
    if num_data is not None:
        num_data = min(len(train_dataset.samples), num_data)
        suffix = num_data
        train_dataset.samples = train_dataset.samples[:num_data]
        val_dataset.samples = val_dataset.samples[:num_data]
    else:
        num_data = len(train_dataset.samples)
        suffix = None
    # split train and val
    # FIX: portable directory creation instead of shelling out to `mkdir -p`
    os.makedirs('cache/gqn-datasets/{}'.format(name), exist_ok=True)
    split_filename = os.path.join(
        'cache/gqn-datasets/{}'.format(name),
        'split-{}.pt'.format(suffix) if suffix is not None else 'split.pt'
    )
    if os.path.exists(split_filename):
        indices = torch.load(split_filename)
    else:
        indices = torch.from_numpy(np.random.permutation(num_data))
        # FIX: pass the path directly — the previous open(..., 'wb') handle
        # was never closed (resource leak; risks a truncated file).
        torch.save(indices, split_filename)
    train_dataset.samples = [train_dataset.samples[index] for index in indices[:num_data - 20000]]
    val_dataset.samples = [val_dataset.samples[index] for index in indices[num_data - 20000:]]
    # init dataloader
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=train_batch_size,
                                               shuffle=True,
                                               collate_fn=collate_fn,
                                               **kwargs)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=train_batch_size,
                                             shuffle=False,
                                             collate_fn=collate_fn,
                                             **kwargs)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=eval_batch_size,
                                              shuffle=False,
                                              collate_fn=collate_fn,
                                              **kwargs)
    # init info
    info = {}
    info['nviews'] = train_dataset.num_views
    info['max_cond_size'] = train_dataset.max_cond_size
    info['max_target_size'] = train_dataset.max_target_size
    info['allow_empty_context'] = train_dataset.allow_empty_context
    info['target_sample_method'] = train_dataset.target_sample_method
    return train_loader, val_loader, test_loader, info
def trainEpoches(encoder, decoder, criterion, print_every=10, learning_rate=0.001, l2=0.0001):
    """Run one pass over the mini-batches of the module-level `train_datasets`,
    updating encoder and decoder with Adam, logging an averaged loss every
    `print_every` batches and checkpointing both whole models every 10 epochs.

    :param encoder: encoder network
    :param decoder: decoder network
    :param criterion: loss passed through to `train(...)`
    :param print_every: batch interval between loss printouts
    :param learning_rate: Adam learning rate
    :param l2: weight_decay (L2 regularization) for both optimizers
    """
    start = time.time()  # NOTE(review): captured but never used below
    out_losses = []
    print_loss_total = 0  # Reset every print_every
    # plot_loss_total = 0  # Reset every plot_every
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate, weight_decay=l2)  # SGD
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate, weight_decay=l2)
    # training_pairs = [tensorsFromPair(random.choice(pairs))
    #                   for i in range(n_iters)]
    # for iter in range(1, n_iters + 1):
    #     training_pair = training_pairs[iter - 1]
    # for epoch in range(epoches):
    # i = 0
    mini_batches = get_minibatches(train_datasets, BATCH)
    batches_size = len(train_datasets[0]) // BATCH  # len(list(mini_batches))
    for i, data in enumerate(mini_batches):
        # stop after one full epoch's worth of batches
        if i == batches_size:
            break
        # for i, data in enumerate(train_dataloader, 1):
        sentences, tags = data
        # pad variable-length sequences to a rectangular batch
        input_tensor, input_length = padding_sequence(sentences, pad_token=EMBEDDING_SIZE)
        target_tensor, target_length = padding_sequence(tags, pad_token=TAG_SIZE)
        # NOTE(review): torch.autograd.Variable and torch.cuda.LongTensor are
        # legacy (pre-0.4) APIs; tensors + .to(device) are the modern form.
        if torch.cuda.is_available():
            input_tensor = Variable(
                torch.cuda.LongTensor(input_tensor, device=device)).cuda()
            target_tensor = Variable(
                torch.cuda.LongTensor(target_tensor, device=device)).cuda()
        else:
            input_tensor = Variable(
                torch.LongTensor(input_tensor, device=device))
            target_tensor = Variable(
                torch.LongTensor(target_tensor, device=device))
        loss = train(input_tensor, target_tensor, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion)  # , input_length, target_length
        out_losses.append(loss)
        print_loss_total += loss
        # plot_loss_total += loss
        if i % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(' (%d %d%%) %.4f' % (i, float(i) / batches_size * 100, print_loss_avg))
            # print('%s (%d %d%%) %.4f' % (timeSince(start, float(i) / batches_size),
            #                              i, float(i) / batches_size * 100, print_loss_avg))
        # plot_loss_avg = plot_loss_total / plot_every
        # plot_losses.append(plot_loss_avg)
        # plot_loss_total = 0
        # i += 1
    np.save("loss", out_losses)
    # NOTE(review): `epoch` is not defined in this function (the epoch loop
    # above is commented out) — this works only if `epoch` exists as a
    # global; otherwise it raises NameError. Confirm against the caller.
    if epoch % 10 == 0:
        # saves the *entire* model objects (not state_dicts), which pins the
        # checkpoint to this exact class/module layout
        model_name = "./model/model_encoder_epoch" + str(epoch) + ".pkl"
        torch.save(encoder, model_name)
        model_name = "./model/model_decoder_epoch" + str(epoch) + ".pkl"
        torch.save(decoder, model_name)
        print("Model has been saved")
def cache(self, name, cache, url=None):
    """Load word vectors named `name` from the `cache` directory, building a
    fast .pt cache on first use.

    If `<cache>/<name>.pt` exists it is loaded directly. Otherwise the raw
    text vectors are located (downloading and extracting from `url` if
    necessary), parsed, and the result is stored both on `self`
    (itos/stoi/vectors/dim) and as a .pt file for future calls.

    :param name: base filename of the raw vector file
    :param cache: directory holding raw and cached vector files
    :param url: optional download URL for the raw vectors (.zip or .gz)
    :raises RuntimeError: when no raw vector file can be found, or a vector
        line has an inconsistent dimensionality
    """
    path = os.path.join(cache, name)
    path_pt = path + '.pt'
    if not os.path.isfile(path_pt):
        if not os.path.isfile(path) and url:
            logger.info('Downloading vectors from {}'.format(url))
            if not os.path.exists(cache):
                os.makedirs(cache)
            dest = os.path.join(cache, os.path.basename(url))
            if not os.path.isfile(dest):
                with tqdm(unit='B', unit_scale=True, miniters=1, desc=dest) as t:
                    urlretrieve(url, dest, reporthook=reporthook(t))
            logger.info('Extracting vectors into {}'.format(cache))
            ext = os.path.splitext(dest)[1][1:]
            if ext == 'zip':
                with zipfile.ZipFile(dest, "r") as zf:
                    zf.extractall(cache)
            elif ext == 'gz':
                with tarfile.open(dest, 'r:gz') as tar:
                    tar.extractall(path=cache)
        if not os.path.isfile(path):
            raise RuntimeError('no vectors found at {}'.format(path))
        # str call is necessary for Python 2/3 compatibility, since
        # argument must be Python 2 str (Python 3 bytes) or
        # Python 3 str (Python 2 unicode)
        itos, vectors, dim = [], array.array(str('d')), None
        # Try to read the whole file with utf-8 encoding.
        binary_lines = False
        try:
            with io.open(path, encoding="utf8") as f:
                lines = [line for line in f]
        # If there are malformed lines, read in binary mode
        # and manually decode each word from utf-8.
        # FIX: catch only the decode failure this fallback is designed for —
        # a bare `except:` also swallowed I/O errors and KeyboardInterrupt.
        except UnicodeDecodeError:
            logger.warning("Could not read {} as UTF8 file, "
                           "reading file as bytes and skipping "
                           "words with malformed UTF8.".format(path))
            with open(path, 'rb') as f:
                lines = [line for line in f]
            binary_lines = True
        logger.info("Loading vectors from {}".format(path))
        for line in tqdm(lines, total=len(lines)):
            # Explicitly splitting on " " is important, so we don't
            # get rid of Unicode non-breaking spaces in the vectors.
            entries = line.rstrip().split(" ")
            word, entries = entries[0], entries[1:]
            if dim is None and len(entries) > 1:
                # first real vector line fixes the dimensionality
                dim = len(entries)
            elif len(entries) == 1:
                logger.warning("Skipping token {} with 1-dimensional "
                               "vector {}; likely a header".format(word, entries))
                continue
            elif dim != len(entries):
                raise RuntimeError(
                    "Vector for token {} has {} dimensions, but previously "
                    "read vectors have {} dimensions. All vectors must have "
                    "the same number of dimensions.".format(word, len(entries), dim))
            if binary_lines:
                try:
                    if isinstance(word, six.binary_type):
                        word = word.decode('utf-8')
                # FIX: narrowed from bare `except:` — only a decode failure
                # means "skip this token".
                except UnicodeDecodeError:
                    logger.info("Skipping non-UTF8 token {}".format(repr(word)))
                    continue
            vectors.extend(float(x) for x in entries)
            itos.append(word)
        self.itos = itos
        self.stoi = {word: i for i, word in enumerate(itos)}
        self.vectors = torch.Tensor(vectors).view(-1, dim)
        self.dim = dim
        logger.info('Saving vectors to {}'.format(path_pt))
        torch.save((self.itos, self.stoi, self.vectors, self.dim), path_pt)
    else:
        logger.info('Loading vectors from {}'.format(path_pt))
        self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt)
def saveModel(FOLD, model_dict):
    """Persist the ``state_dict`` of every network in *model_dict* under
    ``./new_trained_models/fold<FOLD>/``, one file per dict key.
    """
    target_dir = f'./new_trained_models/fold{FOLD}/'
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    for file_name, net in model_dict.items():
        torch.save(net.state_dict(), f'{target_dir}/{file_name}')
def main():
    """BERT pre-training driver (distributed, fp16-capable).

    Cycles over shard files of pre-tokenized training data, prefetching the
    next shard's dataset in a background process while the current shard is
    trained; performs gradient accumulation, periodic logging, and rolling
    checkpoints (last 3 kept). Relies on module-level helpers:
    `parse_my_arguments`, `args_from_yaml`, `setup_training`,
    `prepare_model_and_optimizer`, `is_main_process`, `pretraining_dataset`,
    `create_pretraining_dataset`, `take_optimizer_step`, `logger`, `amp`.
    """
    config_yaml, local_rank = parse_my_arguments()
    args = args_from_yaml(config_yaml)
    args.local_rank = local_rank
    # args = parse_arguments()
    # seed all RNGs for reproducibility
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    device, args = setup_training(args)
    # Prepare optimizer
    model, optimizer, checkpoint, global_step = prepare_model_and_optimizer(args, device)
    if is_main_process():
        print("SEED {}".format(args.seed))
    if args.do_train:
        if is_main_process():
            logger.info("***** Running training *****")
            # logger.info("  Num examples = %d", len(train_data))
            logger.info("  Batch size = %d", args.train_batch_size)
            print("  LR = ", args.learning_rate)
            print("Training. . .")
        model.train()
        most_recent_ckpts_paths = []  # rolling window of saved checkpoints
        average_loss = 0.0  # averaged loss every args.log_freq steps
        epoch = 0
        training_steps = 0  # forward passes (not divided by accumulation)
        # single background worker used to prefetch the next shard
        pool = ProcessPoolExecutor(1)
        # Note: We loop infinitely over epochs, termination is handled via iteration count
        while True:
            # Fresh epoch (or phase2): enumerate and shuffle shard files.
            # Otherwise resume the shard list recorded in the checkpoint.
            if not args.resume_from_checkpoint or epoch > 0 or args.phase2:
                files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if
                         os.path.isfile(os.path.join(args.input_dir, f)) and 'training' in f]
                files.sort()
                num_files = len(files)
                random.shuffle(files)
                f_start_id = 0
            else:
                f_start_id = checkpoint['files'][0]
                files = checkpoint['files'][1:]
                args.resume_from_checkpoint = False
                num_files = len(files)
            shared_file_list = {}
            # choose this rank's first shard; `remainder` spreads ranks when
            # there are more workers than shards
            if torch.distributed.is_initialized() and torch.distributed.get_world_size() > num_files:
                remainder = torch.distributed.get_world_size() % num_files
                data_file = files[(f_start_id * torch.distributed.get_world_size() +
                                   torch.distributed.get_rank() + remainder * f_start_id) % num_files]
            else:
                data_file = files[(f_start_id * torch.distributed.get_world_size() +
                                   torch.distributed.get_rank()) % num_files]
            previous_file = data_file
            train_data = pretraining_dataset(data_file, args.max_predictions_per_seq)
            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                          batch_size=args.train_batch_size * args.n_gpu,
                                          num_workers=4, pin_memory=True)
            # shared_file_list["0"] = (train_dataloader, data_file)
            overflow_buf = None
            if args.allreduce_post_accumulation:
                overflow_buf = torch.cuda.IntTensor([0])
            for f_id in range(f_start_id + 1, len(files)):
                # pick the *next* shard for this rank and prefetch it
                if torch.distributed.get_world_size() > num_files:
                    data_file = files[(f_id * torch.distributed.get_world_size() +
                                       torch.distributed.get_rank() + remainder * f_id) % num_files]
                else:
                    data_file = files[(f_id * torch.distributed.get_world_size() +
                                       torch.distributed.get_rank()) % num_files]
                logger.info("file no %s file %s" % (f_id, previous_file))
                previous_file = data_file
                dataset_future = pool.submit(create_pretraining_dataset, data_file,
                                             args.max_predictions_per_seq, shared_file_list, args)
                train_iter = tqdm(train_dataloader, desc="Iteration") if is_main_process() else train_dataloader
                for step, batch in enumerate(train_iter):
                    training_steps += 1
                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
                    loss = model(input_ids=input_ids, token_type_ids=segment_ids,
                                 attention_mask=input_mask, masked_lm_labels=masked_lm_labels,
                                 next_sentence_label=next_sentence_labels,
                                 checkpoint_activations=args.checkpoint_activations)
                    if args.n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                    divisor = args.gradient_accumulation_steps
                    if args.gradient_accumulation_steps > 1:
                        if not args.allreduce_post_accumulation:
                            # this division was merged into predivision
                            loss = loss / args.gradient_accumulation_steps
                            divisor = 1.0
                    if args.fp16:
                        with amp.scale_loss(loss, optimizer,
                                            delay_overflow_check=args.allreduce_post_accumulation) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                    average_loss += loss.item()
                    # optimizer step only once per accumulation window
                    if training_steps % args.gradient_accumulation_steps == 0:
                        global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)
                    if global_step >= args.max_steps:
                        # final logging: average remaining loss across ranks
                        last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq
                        last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
                        average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda()
                        average_loss = average_loss / (last_num_steps * divisor)
                        if torch.distributed.is_initialized():
                            average_loss /= torch.distributed.get_world_size()
                            torch.distributed.all_reduce(average_loss)
                        if is_main_process():
                            logger.info("Total Steps:{} Final Loss = {}".format(
                                training_steps / args.gradient_accumulation_steps, average_loss.item()))
                    elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
                        if is_main_process():
                            print("Step:{} Average Loss = {} Step Loss = {} LR {}".format(
                                global_step,
                                average_loss / (args.log_freq * divisor),
                                loss.item() * args.gradient_accumulation_steps / divisor,
                                optimizer.param_groups[0]['lr']))
                        average_loss = 0
                    if global_step >= args.max_steps or training_steps % (
                            args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0:
                        if is_main_process():
                            # Save a trained model
                            logger.info("** ** * Saving fine - tuned model ** ** * ")
                            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
                            if args.resume_step < 0 or not args.phase2:
                                output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step))
                            else:
                                # phase2 checkpoints continue phase1's step numbering
                                output_save_file = os.path.join(args.output_dir,
                                                                "ckpt_{}.pt".format(global_step + args.phase1_end_step))
                            if args.do_train:
                                # checkpoint carries the remaining shard list so
                                # training can resume mid-epoch
                                torch.save({'model': model_to_save.state_dict(),
                                            'optimizer': optimizer.state_dict(),
                                            'master params': list(amp.master_params(optimizer)),
                                            'files': [f_id] + files}, output_save_file)
                                most_recent_ckpts_paths.append(output_save_file)
                                if len(most_recent_ckpts_paths) > 3:
                                    ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
                                    os.remove(ckpt_to_be_removed)
                        if global_step >= args.max_steps:
                            del train_dataloader
                            # thread.join()
                            return args
                del train_dataloader
                # thread.join()
                # Make sure pool has finished and switch train_dataloader
                # NOTE: Will block until complete
                train_dataloader, data_file = dataset_future.result(timeout=None)
            epoch += 1
def main():
    """Adversarial-training entry point.

    Builds the network selected by ``args.network``, trains it with the
    auxiliary Proximity / Con_Proximity losses, evaluates white-box
    robustness each epoch, and periodically checkpoints model/optimizer
    state. Relies on module-level globals: ``args``, ``device``,
    ``train_loader``, ``test_loader``, ``stats_dir``, ``model_dir``.
    """
    # init model, ResNet18() can be also used here for training
    # model = WideResNet().to(device)
    if args.network == 'smallCNN':
        model = SmallCNN().to(device)
    elif args.network == 'wideResNet':
        model = WideResNet().to(device)
    elif args.network == 'resnet':
        model = ResNet().to(device)
    else:
        # fall back: treat args.network as a VGG variant name
        model = VGG(args.network, num_classes=10).to(device)

    # redirect stdout so everything printed below is also logged to file
    sys.stdout = Logger(os.path.join(args.log_dir, args.log_file))
    print(model)

    # auxiliary losses; first arg 10 is presumably the number of classes
    # (matches num_classes=10 above) — TODO confirm against Proximity's signature
    criterion_prox = Proximity(10, args.feat_size, True)
    criterion_conprox = Con_Proximity(10, args.feat_size, True)

    # each auxiliary criterion has learnable parameters, hence its own optimizer
    optimizer_prox = optim.SGD(criterion_prox.parameters(), lr=args.lr_prox)
    optimizer_conprox = optim.SGD(criterion_conprox.parameters(), lr=args.lr_conprox)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    if args.fine_tune:
        # resume model + optimizer from a previously saved checkpoint pair
        base_dir = args.base_dir
        state_dict = torch.load("{}/{}_ep{}.pt".format(base_dir, args.base_model, args.checkpoint))
        opt = torch.load("{}/opt-{}_ep{}.tar".format(base_dir, args.base_model, args.checkpoint))
        model.load_state_dict(state_dict)
        optimizer.load_state_dict(opt)

    # per-epoch error histories (despite the "_acc" names these hold the
    # error totals returned by eval_adv_test_whitebox)
    natural_acc = []
    robust_acc = []

    for epoch in range(1, args.epochs + 1):
        # adjust learning rate for SGD (all three optimizers decay together)
        adjust_learning_rate(optimizer, epoch)
        adjust_learning_rate(optimizer_prox, epoch)
        adjust_learning_rate(optimizer_conprox, epoch)

        start_time = time.time()

        # adversarial training
        train(model, device, train_loader, optimizer, criterion_prox, optimizer_prox, criterion_conprox, optimizer_conprox, epoch)

        # evaluation on natural examples
        print('================================================================')
        print("Current time: {}".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
        # eval_train(model, device, train_loader)
        # eval_test(model, device, test_loader)
        natural_err_total, robust_err_total = eval_adv_test_whitebox(model, device, test_loader)
        # append this epoch's errors to the running stats text file
        with open(os.path.join(stats_dir, '{}.txt'.format(args.save_model)), "a") as f:
            f.write("{} {} {}\n".format(epoch, natural_err_total, robust_err_total))
        print('using time:', datetime.timedelta(seconds=round(time.time() - start_time)))

        natural_acc.append(natural_err_total)
        robust_acc.append(robust_err_total)
        # full history is re-saved every epoch (file name includes the epoch)
        file_name = os.path.join(stats_dir, '{}_stat{}.npy'.format(args.save_model, epoch))
        # np.save(file_name, np.stack((np.array(self.train_loss), np.array(self.test_loss),
        #                              np.array(self.train_acc), np.array(self.test_acc),
        #                              np.array(self.elasticity), np.array(self.x_grads),
        #                              np.array(self.fgsms), np.array(self.pgds),
        #                              np.array(self.cws))))
        np.save(file_name, np.stack((np.array(natural_acc), np.array(robust_acc))))

        # save checkpoint (model .pt + optimizer .tar, the pair fine_tune reloads)
        if epoch % args.save_freq == 0:
            torch.save(model.state_dict(), os.path.join(model_dir, '{}_ep{}.pt'.format(args.save_model, epoch)))
            torch.save(optimizer.state_dict(), os.path.join(model_dir, 'opt-{}_ep{}.tar'.format(args.save_model, epoch)))
            print("Ep{}: Model saved as {}.".format(epoch, args.save_model))
        print('================================================================')
def strip_optimizer(f='weights/last.pt'):  # from utils.utils import *; strip_optimizer()
    """Strip the optimizer state from a ``*.pt`` checkpoint in place.

    Loads the checkpoint at ``f`` onto the CPU, replaces its
    ``'optimizer'`` entry with ``None``, and writes the result back to
    the same path, producing a lighter file (reduced by ~2/3 in size).
    """
    checkpoint = torch.load(f, map_location=torch.device('cpu'))
    checkpoint['optimizer'] = None
    torch.save(checkpoint, f)
label.fill_(real_label)#-- fake labels are real for generator cost errGA = criterion.forward(output_A,label) lossG += errGA.item() #errGA = errGA * 1/2 errGA.backward() errG = (errGA + errGD)/2 optimizerG.step() # print('batch_step',i+len(train_dataloader)*epoch) if (i==0) & (epoch == 0): start_time=time.time() if (i == 536) & (epoch == 0): end_time = time.time() print('1_epoch takes {} seconds'.format(end_time-start_time)) if (i+len(train_dataloader)*epoch) % 67==0: print('epoch={},batch={}, lossG={}, lossA={}, lossD={}'.format(epoch,i,lossG/2,lossA/3,lossD/3)) writer.add_scalar('data_adam/lossA', errA, i+len(train_dataloader)*epoch) writer.add_scalar('data_adam/lossD', errD, i+len(train_dataloader)*epoch) writer.add_scalar('data_adam/lossG', errG, i+len(train_dataloader)*epoch) if (i+len(train_dataloader)*epoch) % 134==0 : fake = (fake +1)/2 ass_label = (ass_label +1)/2 writer.add_image('data/generator_adam_loss_{} input_picture'.format(i+len(train_dataloader)*epoch),input_img[:8],i+len(train_dataloader)*epoch) writer.add_image('data/generator_adam_loss_{} fake_picture'.format(i+len(train_dataloader)*epoch),fake[:8],i+len(train_dataloader)*epoch) writer.add_image('data/generator_adam_loss_{} ground_truth'.format(i+len(train_dataloader)*epoch),ass_label[:8],i+len(train_dataloader)*epoch) # do checkpointing if (epoch+1) % 1 ==0: torch.save(netG.state_dict(), '%s/adam_netG_epoch_%d.pth' % (opt.outf, epoch+1)) torch.save(netD.state_dict(), '%s/adam_netD_epoch_%d.pth' % (opt.outf, epoch+1)) torch.save(netA.state_dict(), '%s/adam_netA_epoch_%d.pth' % (opt.outf, epoch+1)) writer.close()