def upload_experiment():
    """Open a Comet experiment and upload the project folders and the result file.

    The experiment is built from the module-level ``COMET_ML_KEY`` mapping.
    The ``./datasets``, ``./models`` and ``./knapsack`` folders are uploaded
    in that order, followed by the single file named by ``RESULT_FILE``.
    """
    run = Experiment(**COMET_ML_KEY)
    # Folders are uploaded in a fixed order, then the single result file.
    for asset_dir in ('./datasets', './models', './knapsack'):
        run.log_asset_folder(asset_dir)
    run.log_asset(RESULT_FILE)
break # Save training history history_file = os.path.join(output_dir, experiment_name + "_history.npz") save_history(history_file, history) experiment.log_asset(history_file) end_time = time.time() print("Training took " + str(('%.3f' % (end_time - start_time))) + " seconds for " + str(num_epochs) + " epochs") print("------------------------------------") print("Saving model...") checkpointer.save(global_step) experiment.log_asset_folder(checkpoint_dir) if testing: # Test the model print("------------------------------------") print("Testing model...") # Load if best weights exists best_weights_file = checkpointer.get_best_weights() if load_model and best_weights_file and os.path.exists( best_weights_file): model.load_weights(best_weights_file) print("Loaded model weights from: " + best_weights_file) start_time = time.time() print("Testing started: " +
best_sel_acc = val_acc[1][1] best_sel_idx = i + 1 torch.save( model.sel_pred.state_dict(), 'saved_model/epoch%d.sel_model%s' % (i + 1, args.suffix)) torch.save(model.sel_pred.state_dict(), sel_m) if args.train_emb: torch.save( model.sel_embed_layer.state_dict(), 'saved_model/epoch%d.sel_embed%s' % (i + 1, args.suffix)) torch.save(model.sel_embed_layer.state_dict(), sel_e) if TRAIN_COND: if val_acc[1][2] > best_cond_acc: best_cond_acc = val_acc[1][2] best_cond_idx = i + 1 torch.save( model.cond_pred.state_dict(), 'saved_model/epoch%d.cond_model%s' % (i + 1, args.suffix)) torch.save(model.cond_pred.state_dict(), cond_m) if args.train_emb: torch.save( model.cond_embed_layer.state_dict(), 'saved_model/epoch%d.cond_embed%s' % (i + 1, args.suffix)) torch.save(model.cond_embed_layer.state_dict(), cond_e) experiment.log_asset_folder('saved_model')
parse_args=False, disabled=True) else: experiment = Experiment(api_key="jYkp7GiEE17RfR1iGGvF2rMTB", project_name="mvbchallenge", workspace="johnzhang1999", parse_args=False) name = args.arch + '_' + args.sources[0] + '_' + str(args.lr) + '_' + str( args.batch_size) experiment.set_name(name) if args.resume: experiment.add_tag('resume') experiment.log_parameters(args.__dict__) # asset logging is BUG-gy! experiment.log_asset_folder(osp.expanduser('log/')) experiment.log_asset_folder(osp.expanduser('runs/')) def build_datamanager(args): if args.app == 'image': return torchreid.data.ImageDataManager(**imagedata_kwargs(args)) else: return torchreid.data.VideoDataManager(**videodata_kwargs(args)) def build_engine(args, datamanager, model, optimizer, scheduler,
実験2 """ if '2' in args.experiments: exp2_result_df = model.robustness_experiments(experiment, args.data_name, training_rates_list=[0.03]) exp2_result_df.to_csv(result_dir / 'exp2.csv') """ 実験3 """ if '3' in args.experiments: exp3_result_df = model.robustness_experiments( experiment, args.data_name, training_rates_list=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]) exp3_result_df.to_csv(result_dir / 'exp3.csv') """ 実験4 """ if '4' in args.experiments: exp4_result_df = model.inductive_learning_eval( args.exp4_select, experiment, args.data_name, rate_list=args.exp4_rate_list, iter_num=30) exp4_result_df.to_csv(result_dir / 'exp4.csv') # comet-mlに保存 experiment.log_asset_folder(result_dir)
class ModelTrainer:
    """Train, validate and test a classifier while logging to a Comet experiment.

    The trainer owns the optimizer, an optional learning-rate scheduler and
    the Comet ``Experiment``. ``dataloader`` is indexed with the keys
    ``'train'``, ``'val'`` and ``'test'``.
    """

    def __init__(self, model, dataloader, args):
        """Set up device, optimizer, scheduler and the Comet experiment.

        Args:
            model: Network to train; it is moved to the selected device.
            dataloader: Mapping of split name to loader, or ``None`` when the
                trainer is only used for inference.
            args: Parsed options. Reads ``metric``, ``frq_log``, ``optimizer``,
                ``lr``, ``momentum``, ``weight_decay``, ``beta1``,
                ``scheduler``, ``scheduler_factor``, ``comet_key``,
                ``comet_project``, ``comet_workspace`` and ``name``.

        Raises:
            ValueError: If ``args.optimizer`` is neither ``'sgd'`` nor ``'adam'``.
        """
        self.model = model
        self.args = args
        self.data = dataloader
        self.metric = args.metric
        # Default keeps the attribute defined even without a dataloader.
        self.frq_log = 1
        if dataloader is not None:
            self.frq_log = self._log_interval(len(dataloader['train']),
                                              args.frq_log)
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        model.to(self.device)

        if args.optimizer == 'sgd':
            self.optimizer = optim.SGD(model.parameters(),
                                       lr=args.lr,
                                       momentum=args.momentum,
                                       weight_decay=args.weight_decay)
        elif args.optimizer == 'adam':
            self.optimizer = optim.Adam(model.parameters(),
                                        lr=args.lr,
                                        betas=(args.beta1, 0.999),
                                        weight_decay=args.weight_decay)
        else:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError('--optimizer should be one of {sgd, adam}')

        # Stays None for any other scheduler mode; train() then never steps it.
        self.scheduler = None
        if args.scheduler == 'set':
            # Multiplies the base lr by 10**(epoch / factor) each epoch.
            self.scheduler = optim.lr_scheduler.LambdaLR(
                self.optimizer,
                lambda epoch: 10**(epoch / args.scheduler_factor))
        elif args.scheduler == 'auto':
            self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                self.optimizer,
                mode='min',
                factor=args.scheduler_factor,
                patience=5,
                verbose=True,
                threshold=0.0001,
                threshold_mode='rel',
                cooldown=0,
                min_lr=0,
                eps=1e-08)

        self.experiment = Experiment(api_key=args.comet_key,
                                     project_name=args.comet_project,
                                     workspace=args.comet_workspace,
                                     auto_weight_logging=True,
                                     auto_metric_logging=False,
                                     auto_param_logging=False)
        self.experiment.set_name(args.name)
        self.experiment.log_parameters(vars(args))
        self.experiment.set_model_graph(str(self.model))

    @staticmethod
    def _log_interval(num_batches, logs_per_epoch):
        """Return the number of batches between console logs, never below 1.

        The floor division yields 0 when ``logs_per_epoch`` exceeds
        ``num_batches``; that 0 would later be used as a modulo divisor.
        """
        return max(1, num_batches // logs_per_epoch)

    def _weight_dir(self):
        """Return the directory that holds this run's checkpoint."""
        return os.path.join(self.args.outf, self.args.name, 'weights')

    def _improved(self, score, best):
        """Tell whether ``score`` beats ``best`` for the configured metric.

        ``'loss'`` is minimised; any other metric is maximised. ``best`` is
        ``None`` until the first validation result has been recorded.
        """
        if best is None:
            return True
        if self.metric == 'loss':
            return score < best
        return score > best

    def train_one_epoch(self, epoch):
        """Run one pass over the training split and log per-batch metrics.

        Args:
            epoch: Zero-based epoch index, used to derive the Comet step.

        Returns:
            dict with the per-sample mean ``'loss'`` and the ``'acc'``
            percentage over the whole training dataset.
        """
        self.model.train()
        train_loader = self.data['train']
        train_loss = 0
        correct = 0
        comet_offset = epoch * len(train_loader)
        for batch_idx, (data, target) in tqdm(enumerate(train_loader),
                                              leave=True,
                                              total=len(train_loader)):
            data, target = data.to(self.device), target.to(self.device)
            self.optimizer.zero_grad()
            output = self.model(data)
            # Summed loss so the epoch mean can be taken over samples below.
            loss = F.cross_entropy(output, target, reduction='sum')
            loss.backward()
            self.optimizer.step()

            pred = output.argmax(dim=1, keepdim=True)
            batch_correct = pred.eq(target.view_as(pred)).sum().item()
            batch_loss_sum = loss.item()
            train_loss += batch_loss_sum
            correct += batch_correct

            batch_loss = batch_loss_sum / len(data)
            batch_acc = 100. * batch_correct / len(data)
            comet_step = comet_offset + batch_idx
            self.experiment.log_metric('batch_loss', batch_loss,
                                       step=comet_step, epoch=epoch)
            self.experiment.log_metric('batch_acc', batch_acc,
                                       step=comet_step, epoch=epoch)
            if (batch_idx + 1) % self.frq_log == 0:
                self.experiment.log_metric('log_loss', batch_loss,
                                           step=comet_step, epoch=epoch)
                self.experiment.log_metric('log_acc', batch_acc,
                                           step=comet_step, epoch=epoch)
                print('Epoch: {} [{}/{}]\tLoss: {:.6f}\tAcc: {:.2f}%'.format(
                    epoch + 1, (batch_idx + 1) * len(data),
                    len(train_loader.dataset), batch_loss, batch_acc))

        num_samples = len(train_loader.dataset)
        train_loss /= num_samples
        acc = 100. * correct / num_samples
        # Epoch summary is logged at the step of the last batch.
        comet_step = comet_offset + len(train_loader) - 1
        self.experiment.log_metric('loss', train_loss,
                                   step=comet_step, epoch=epoch)
        self.experiment.log_metric('acc', acc, step=comet_step, epoch=epoch)
        print(
            'Epoch: {} [Done]\tLoss: {:.4f}\tAccuracy: {}/{} ({:.2f}%)'.format(
                epoch + 1, train_loss, correct, num_samples, acc))
        return {'loss': train_loss, 'acc': acc}

    def train(self):
        """Train for ``args.nepoch`` epochs, checkpointing the best model.

        After every epoch the validation split is evaluated and the weights
        are saved when ``args.metric`` improves (lower for ``'loss'``, higher
        otherwise). The weights folder is uploaded to Comet when training
        stops, whether it finished or raised.
        """
        self.log_cmd()
        best = None
        history = {'lr': [], 'train_loss': []}
        try:
            print(">> Training %s" % self.model.name)
            for epoch in range(self.args.nepoch):
                with self.experiment.train():
                    train_res = self.train_one_epoch(epoch)
                with self.experiment.validate():
                    print("\nvalidation...")
                    comet_offset = (epoch + 1) * len(self.data['train']) - 1
                    res = self.val(self.data['val'], comet_offset, epoch)

                score = res[self.metric]
                if self._improved(score, best):
                    best = score
                    self.save_weights(epoch)

                if self.args.scheduler == 'set':
                    # Record the lr used for this epoch before it is changed.
                    lr = self.optimizer.param_groups[0]['lr']
                    history['lr'].append(lr)
                    history['train_loss'].append(train_res['loss'])
                    self.scheduler.step(epoch + 1)
                    lr = self.optimizer.param_groups[0]['lr']
                    print('learning rate changed to: %.10f' % lr)
                elif self.args.scheduler == 'auto':
                    self.scheduler.step(train_res['loss'])
        finally:
            print(">> Training model %s. [Stopped]" % self.model.name)
            self.experiment.log_asset_folder(self._weight_dir(),
                                             step=None,
                                             log_file_name=False,
                                             recursive=False)
            if self.args.scheduler == 'set':
                # Loss against learning rate, on a log-scaled x axis.
                plt.semilogx(history['lr'], history['train_loss'])
                plt.grid(True)
                self.experiment.log_figure(figure=plt)
                plt.show()

    def val(self, val_loader, comet_offset=-1, epoch=-1):
        """Evaluate the model on ``val_loader`` and log the results to Comet.

        Args:
            val_loader: Loader yielding ``(data, target)`` batches.
            comet_offset: Comet step the metrics are logged at.
            epoch: Epoch index used for logging and the confusion-matrix name.

        Returns:
            dict with the per-sample mean ``'loss'`` and the ``'acc'`` percentage.
        """
        self.model.eval()
        test_loss = 0
        correct = 0
        labels = list(range(self.args.nclass))
        cm = np.zeros((len(labels), len(labels)))
        with torch.no_grad():
            for data, target in tqdm(val_loader,
                                     leave=True,
                                     total=len(val_loader)):
                data, target = data.to(self.device), target.to(self.device)
                output = self.model(data)
                test_loss += F.cross_entropy(output, target,
                                             reduction='sum').item()
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()
                pred = pred.view_as(target).data.cpu().numpy()
                target = target.data.cpu().numpy()
                # Accumulate per-batch matrices over the fixed label set.
                cm += confusion_matrix(target, pred, labels=labels)

        num_samples = len(val_loader.dataset)
        test_loss /= num_samples
        accuracy = 100. * correct / num_samples
        print('Evaluation: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'.
              format(test_loss, correct, num_samples, accuracy))
        res = {'loss': test_loss, 'acc': accuracy}
        self.experiment.log_metrics(res, step=comet_offset, epoch=epoch)
        self.experiment.log_confusion_matrix(
            matrix=cm,
            labels=[ClassDict.getName(x) for x in labels],
            title='confusion matrix after epoch %03d' % epoch,
            file_name="confusion_matrix_%03d.json" % epoch)
        return res

    def test(self):
        """Load the saved weights and evaluate on the test split.

        Returns:
            The metrics dict produced by :meth:`val`.
        """
        self.load_weights()
        with self.experiment.test():
            print('\ntesting....')
            res = self.val(self.data['test'])
        return res

    def log_cmd(self):
        """Log an equivalent ``main.py`` command line to Comet as text.

        Options that are ``None``, empty strings or ``False`` are omitted;
        ``True`` flags are emitted without a value.
        """
        tab = ' '
        parts = ['!python main.py \\\n']
        for key, value in vars(self.args).items():
            is_flag = isinstance(value, bool)
            if value is None or value == '' or (is_flag and value is False):
                continue
            if is_flag:
                arg = '--{} \\\n'.format(key)
            else:
                arg = '--{} {} \\\n'.format(key, value)
            parts.append(tab + arg)
        self.experiment.log_text(''.join(parts))

    def save_weights(self, epoch: int):
        """Write the model state dict and ``epoch`` to ``model.pth``."""
        weight_dir = self._weight_dir()
        os.makedirs(weight_dir, exist_ok=True)
        torch.save({
            'epoch': epoch,
            'state_dict': self.model.state_dict()
        }, os.path.join(weight_dir, 'model.pth'))

    def load_weights(self):
        """Load a state dict from ``args.weights_path`` or this run's checkpoint."""
        path_g = self.args.weights_path
        if path_g is None:
            path_g = os.path.join(self._weight_dir(), 'model.pth')
        print('>> Loading weights...')
        weights_g = torch.load(path_g, map_location=self.device)['state_dict']
        self.model.load_state_dict(weights_g)
        print(' Done.')

    def predict(self, x):
        """Return softmax class probabilities for a single NumPy input.

        Args:
            x: NumPy array; it is divided by ``2**15`` before inference
                (presumably 16-bit PCM scaling -- TODO confirm with caller).

        Returns:
            NumPy array of probabilities with a leading batch axis of size 1.
        """
        x = x / 2**15
        self.model.eval()
        with torch.no_grad():
            x = torch.from_numpy(x).float()
            # No ``transform`` is defined in this class; apply one only if a
            # subclass or caller attached it, instead of raising AttributeError.
            transform = getattr(self, 'transform', None)
            if transform is not None:
                x = transform(x)
            # The model lives on self.device, so the input must as well.
            x = x.unsqueeze(0).to(self.device)
            x = self.model(x)
            x = F.softmax(x, dim=1)
        # .numpy() only works on CPU tensors.
        return x.cpu().numpy()
opt) # regular setup: load and print networks; create schedulers # create a website web_dir = os.path.join( opt.results_dir, opt.name, '%s_%s' % (opt.phase, opt.epoch)) # define the website directory webpage = html.HTML( web_dir, 'Experiment = %s, Phase = %s, Epoch = %s' % (opt.name, opt.phase, opt.epoch)) # test with eval mode. This only affects layers like batchnorm and dropout. # For [pix2pix]: we use batchnorm and dropout in the original pix2pix. You can experiment it with and without eval() mode. # For [CycleGAN]: It should not affect CycleGAN as CycleGAN uses instancenorm without dropout. if opt.eval: model.eval() for i, data in enumerate(dataset): if i >= opt.num_test: # only apply our model to opt.num_test images. break model.set_input(data) # unpack data from data loader model.test() # run inference visuals = model.get_current_visuals() # get image results img_path = model.get_image_paths() comet_exp.log_image(img_path[0]) # get image paths if i % 5 == 0: # save images to an HTML file print('processing (%04d)-th image... %s' % (i, img_path)) save_images(webpage, visuals, img_path, aspect_ratio=opt.aspect_ratio, width=opt.display_winsize) webpage.save() # save the HTML comet_exp.log_asset_folder(webpage.get_image_dir())