parser.add_argument('-p', '--checkpoint_path', type=str, default=None, required=False,
                    help="path of checkpoint pt file")
args = parser.parse_args()

hp = HParam(args.config)
graphs = [
    read_graph(hp.model.graph0),
    read_graph(hp.model.graph1),
    read_graph(hp.model.graph2),
]

print('Loading model from checkpoint...')
model = RandWire(hp, graphs).cuda()
checkpoint = torch.load(args.checkpoint_path)
model.load_state_dict(checkpoint['model'])
step = checkpoint['step']

# pick the dataloader factory for the configured dataset
dataset = hp.data.type
switcher = {
    'MNIST': MNIST_dataloader,
    'CIFAR10': CIFAR10_dataloader,
    'ImageNet': create_dataloader,
}
assert dataset in switcher, 'Dataset type currently not supported'
dl_func = switcher[dataset]
valset = dl_func(hp, args, False)  # False -> validation split

print('Validating...')
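The script above stops just before the actual validation call. For reference, here is a minimal validate() compatible with how the training loops below use it (validate(model, valset, writer, step) returning (test_loss, accuracy)). This is an illustrative sketch, not the repo's implementation: it assumes the model outputs log-probabilities (every training loop here uses F.nll_loss) and omits whatever per-evaluation logging the real writer object does.

    # Sketch only; NOT the repo's validate(). Assumes log-probability outputs.
    import torch
    import torch.nn.functional as F

    def validate(model, valset, writer, step):
        model.eval()
        test_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for data, target in valset:
                data, target = data.cuda(), target.cuda()
                output = model(data)  # log-probabilities
                test_loss += F.nll_loss(output, target, reduction='sum').item()
                correct += (output.argmax(dim=1) == target).sum().item()
                total += target.size(0)
        test_loss /= total
        accuracy = correct / total
        # the real implementation presumably logs these through `writer` here
        model.train()
        return test_loss, accuracy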
import math
import os
import traceback

import adabound
import numpy as np
import torch
import torch.nn.functional as F
# RandWire, validate, and the writer/logger objects come from the rest of the repo.


def train(out_dir, chkpt_path, trainset, valset, writer, logger, hp, hp_str, graphs, in_channels=3):
    model = RandWire(hp, graphs, in_channels=in_channels).cuda()
    # print(net)
    # print(count_parameters(model))
    # with torch.no_grad():
    #     x = torch.randn(1, 1, 28, 28).cuda()
    #     y = model(x)
    #     # Save onnx model for visualization
    #     torch.onnx.export(model, x, "onnx/CElegans.onnx")
    #     print('Model saved to disk')

    if hp.train.optimizer == 'adam':
        # note: learning rate is 10x the configured hp.train.adam value
        optimizer = torch.optim.Adam(model.parameters(), lr=hp.train.adam * 10)
    elif hp.train.optimizer == 'adabound':
        optimizer = adabound.AdaBound(model.parameters(), lr=hp.train.adabound.initial,
                                      final_lr=hp.train.adabound.final)
    else:
        raise Exception("Optimizer not supported: %s" % hp.train.optimizer)

    step = 0
    if chkpt_path is not None:
        logger.info("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint = torch.load(chkpt_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        step = checkpoint['step']
        if hp_str != checkpoint['hp_str']:
            logger.warning("New hparams are different from checkpoint.")
            logger.warning("Will use new hparams.")
            # hp = load_hparam_str(hp_str)
    else:
        logger.info("Starting new training run")
        logger.info("Writing graph to tensorboardX...")
        # writer.write_graph(model, torch.randn(1, 1, 28, 28).cuda())
        logger.info("Finished.")

    try:
        model.train()
        epoch = 0
        best_acc = 0
        while epoch < 20:  # TODO Change this to one epoch but make sure logging still works
            print(type(model.dagly3.weighted_adj))
            print(model.dagly3.weighted_adj)
            np.savetxt('adj/unweighted.txt', model.dagly3.unweighted_adj)
            np.savetxt('adj/after_{}_epochs.txt'.format(epoch), model.dagly3.weighted_adj)
            for data, target in trainset:
                data, target = data.cuda(), target.cuda()
                optimizer.zero_grad()
                output = model(data)
                loss = F.nll_loss(output, target)
                loss.backward()
                optimizer.step()
                step += 1

                loss = loss.item()
                if loss > 1e8 or math.isnan(loss):
                    logger.error("Loss exploded to %.02f at step %d!" % (loss, step))
                    raise Exception("Loss exploded")

                if step % hp.train.summary_interval == 0:
                    writer.log_training(loss, step)
                    logger.info("Wrote summary at step %d, epoch %d" % (step, epoch))

                if step % len(trainset) == 0:  # step % hp.train.checkpoint_interval == 0:
                    save_path = os.path.join(out_dir, 'chkpt_%07d.pt' % step)
                    torch.save({
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'step': step,
                        'hp_str': hp_str,
                    }, save_path)
                    logger.info("Saved checkpoint to: %s" % save_path)

                if step % len(trainset) == 0:  # step % hp.train.evaluation_interval == 0:
                    test_loss, accuracy = validate(model, valset, writer, step)
                    if accuracy > best_acc:
                        best_acc = accuracy
                    logger.info("Evaluation saved at step %d, epoch %d | test_loss: %.5f | accuracy: %.4f"
                                % (step, epoch, test_loss, accuracy))

                if step % hp.train.decay.step == 0:
                    temp = optimizer.state_dict()
                    temp['param_groups'][0]['lr'] *= hp.train.decay.gamma
                    optimizer.load_state_dict(temp)
            epoch += 1

        writer.log_best_acc(best_acc)
        print(type(model.dagly3.weighted_adj))
        print(model.dagly3.weighted_adj)
        np.savetxt('adj/after_{}_epochs.txt'.format(epoch), model.dagly3.weighted_adj)
    except Exception as e:
        logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
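The decay block above (and the identical one in the next variant) round-trips the whole optimizer state dict just to scale the learning rate. A lighter, behavior-equivalent sketch uses a stock scheduler, assuming the decay is meant to fire every hp.train.decay.step optimizer steps:

    # Sketch: stock StepLR in place of the manual state_dict round-trip above.
    from torch.optim.lr_scheduler import StepLR

    scheduler = StepLR(optimizer, step_size=hp.train.decay.step, gamma=hp.train.decay.gamma)
    # inside the inner loop, right after optimizer.step():
    #     scheduler.step()

Mutating optimizer.param_groups in place (for g in optimizer.param_groups: g['lr'] *= gamma) would also work and avoids rebuilding the state dict.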
import math
import os
import traceback

import adabound
import torch
import torch.nn.functional as F
# RandWire and validate come from the rest of the repo.


def train(out_dir, chkpt_path, trainset, valset, writer, logger, hp, hp_str, graphs):
    model = RandWire(hp, graphs).cuda()

    if hp.train.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=hp.train.adam)
    elif hp.train.optimizer == 'adabound':
        optimizer = adabound.AdaBound(model.parameters(), lr=hp.train.adabound.initial,
                                      final_lr=hp.train.adabound.final)
    elif hp.train.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=hp.train.sgd.lr,
                                    momentum=hp.train.sgd.momentum,
                                    weight_decay=hp.train.sgd.weight_decay)
    else:
        raise Exception("Optimizer not supported: %s" % hp.train.optimizer)

    step = 0
    if chkpt_path is not None:
        logger.info("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint = torch.load(chkpt_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        step = checkpoint['step']
        if hp_str != checkpoint['hp_str']:
            logger.warning("New hparams are different from checkpoint.")
            logger.warning("Will use new hparams.")
            # hp = load_hparam_str(hp_str)
    else:
        logger.info("Starting new training run")
        logger.info("Writing graph to tensorboardX...")
        writer.write_graph(model, torch.randn(7, hp.model.input_maps, 224, 224).cuda())
        logger.info("Finished.")

    try:
        model.train()
        while True:
            for data, target in trainset:
                data, target = data.cuda(), target.cuda()
                optimizer.zero_grad()
                output = model(data)
                loss = F.nll_loss(output, target)
                loss.backward()
                optimizer.step()
                step += 1

                loss = loss.item()
                if loss > 1e8 or math.isnan(loss):
                    logger.error("Loss exploded to %.02f at step %d!" % (loss, step))
                    raise Exception("Loss exploded")

                if step % hp.train.summary_interval == 0:
                    writer.log_training(loss, step)
                    logger.info("Wrote summary at step %d" % step)

                if step % hp.train.checkpoint_interval == 0:
                    save_path = os.path.join(out_dir, 'chkpt_%07d.pt' % step)
                    torch.save({
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'step': step,
                        'hp_str': hp_str,
                    }, save_path)
                    logger.info("Saved checkpoint to: %s" % save_path)

                if step % hp.train.evaluation_interval == 0:
                    test_loss, accuracy = validate(model, valset, writer, step)
                    logger.info("Evaluation saved at step %d | test_loss: %.5f | accuracy: %.4f"
                                % (step, test_loss, accuracy))

                if step % hp.train.decay.step == 0:
                    temp = optimizer.state_dict()
                    temp['param_groups'][0]['lr'] *= hp.train.decay.gamma
                    optimizer.load_state_dict(temp)
    except Exception as e:
        logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
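All the train() variants here persist and restore state through the same checkpoint dict layout. A self-contained demonstration of that round-trip, using a toy model so it runs on its own (the file name and hp_str value are placeholders):

    # Self-contained round-trip of the checkpoint dict layout used above.
    import torch

    model = torch.nn.Linear(4, 2)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    torch.save({
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'step': 123,
        'hp_str': 'yaml dump of the run hparams goes here',
    }, 'chkpt_0000123.pt')

    checkpoint = torch.load('chkpt_0000123.pt')
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    assert checkpoint['step'] == 123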
import math
import os
import traceback

import torch
import torch.nn.functional as F
from adabound import AdaBound
# RandWire and validate come from the rest of the repo.


def train(out_dir, chkpt_path, trainset, valset, writer, logger, hp, hp_str):
    model = RandWire(hp).cuda()

    if hp.train.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=hp.train.adam)
    elif hp.train.optimizer == 'adabound':
        optimizer = AdaBound(model.parameters(), lr=hp.train.adabound.initial,
                             final_lr=hp.train.adabound.final)
    else:
        raise Exception("Optimizer not supported: %s" % hp.train.optimizer)

    step = 0
    if chkpt_path is not None:
        logger.info("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint = torch.load(chkpt_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        step = checkpoint['step']
        if hp_str != checkpoint['hp_str']:
            logger.warning("New hparams are different from checkpoint.")
            logger.warning("Will use new hparams.")
            # hp = load_hparam_str(hp_str)
    else:
        logger.info("Starting new training run")

    try:
        model.train()
        while True:
            # unpack the (data, target) batch before moving tensors to the GPU;
            # calling .cuda() on the tuple itself would fail
            for data, target in trainset:
                data, target = data.cuda(), target.cuda()
                optimizer.zero_grad()  # clear gradients left over from the previous step
                output = model(data)
                loss = F.nll_loss(output, target)
                loss.backward()
                optimizer.step()
                step += 1

                loss = loss.item()
                if loss > 1e8 or math.isnan(loss):
                    logger.error("Loss exploded to %.02f at step %d!" % (loss, step))
                    raise Exception("Loss exploded")

                if step % hp.train.summary_interval == 0:
                    writer.log_training(loss, step)
                    logger.info("Wrote summary at step %d" % step)

                if step % hp.train.checkpoint_interval == 0:
                    save_path = os.path.join(out_dir, 'chkpt_%07d.pt' % step)
                    torch.save({
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'step': step,
                        'hp_str': hp_str,
                    }, save_path)
                    logger.info("Saved checkpoint to: %s" % save_path)

                if step % hp.train.evaluation_interval == 0:
                    test_loss, accuracy = validate(model, valset, writer, step)
                    logger.info("Evaluation saved at step %d | test_loss: %.5f | accuracy: %.2f%%"
                                % (step, test_loss, accuracy))
    except Exception as e:
        logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
# The opening of read_graph() was cut off; this header is reconstructed from
# its usage below (read_graph(path)) and the parsing code that survived.
def read_graph(path):
    with open(path) as f:
        num_nodes = int(f.readline().strip())
        num_edges = int(f.readline().strip())
        edges = []
        for _ in range(num_edges):
            s, e = map(int, f.readline().strip().split())
            edges.append((s, e))
    temp = dict()
    temp['num_nodes'] = num_nodes
    temp['edges'] = edges
    return temp


if __name__ == '__main__':
    hp = HParam('config/test.yaml')
    graphs = [
        read_graph(hp.model.graph0),
        read_graph(hp.model.graph1),
        read_graph(hp.model.graph2),
    ]
    print('Building Network...')
    model = RandWire(hp, graphs)

    x = torch.randn(16, 3, 224, 224)  # RGB-channel 224x224 image with batch_size=16
    print('Input shape:')
    print(x.shape)
    y = model(x)
    print('Output shape:')
    print(y.shape)  # [16, 1000]
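Given read_graph() above, the files referenced by hp.model.graph0/1/2 are plain text: one line with the node count, one with the edge count, then one whitespace-separated "src dst" pair per edge. A hypothetical four-node example (the numbers are made up purely for illustration):

    4
    3
    0 1
    0 2
    1 3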
import itertools
import math
import os
import traceback

import adabound
import torch
import torch.nn.functional as F
import tqdm
# RandWire and validate come from the rest of the repo.


def train(out_dir, chkpt_path, trainset, valset, writer, logger, hp, hp_str, graphs):
    model = RandWire(hp, graphs).cuda()

    if hp.train.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=hp.train.adam)
    elif hp.train.optimizer == 'adabound':
        optimizer = adabound.AdaBound(model.parameters(), lr=hp.train.adabound.initial,
                                      final_lr=hp.train.adabound.final)
    elif hp.train.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=hp.train.sgd.lr,
                                    momentum=hp.train.sgd.momentum,
                                    weight_decay=hp.train.sgd.weight_decay)
    else:
        raise Exception("Optimizer not supported: %s" % hp.train.optimizer)

    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, hp.train.epoch)

    init_epoch = -1
    step = 0
    if chkpt_path is not None:
        logger.info("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint = torch.load(chkpt_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        step = checkpoint['step']
        init_epoch = checkpoint['epoch']
        if hp_str != checkpoint['hp_str']:
            logger.warning("New hparams are different from checkpoint.")
            logger.warning("Will use new hparams.")
            # hp = load_hparam_str(hp_str)
    else:
        logger.info("Starting new training run")
        logger.info("Writing graph to tensorboardX...")
        # print(model)
        # parameters = 0
        # for p in list(model.parameters()):
        #     nn = 1
        #     for s in list(p.size()):
        #         nn = nn * s
        #     parameters += nn
        # print("Parameters", parameters)
        # print("model", hp.model)
        # summary(model, (1, 224, 224))
        # dummy input: (batch, input_channels, height, width)
        writer.write_graph(model, torch.randn(7, hp.model.input_maps, 224, 224).cuda())
        logger.info("Finished.")

    try:
        model.train()
        for epoch in itertools.count(init_epoch + 1):
            loader = tqdm.tqdm(trainset, desc='Train data loader')
            for data, target in loader:
                data, target = data.cuda(), target.cuda()
                optimizer.zero_grad()
                output = model(data)
                loss = F.nll_loss(output, target)
                loss.backward()
                optimizer.step()

                loss = loss.item()
                if loss > 1e8 or math.isnan(loss):
                    logger.error("Loss exploded to %.02f at step %d!" % (loss, step))
                    raise Exception("Loss exploded")

                writer.log_training(loss, step)
                loader.set_description('Loss %.02f at step %d' % (loss, step))
                step += 1

            save_path = os.path.join(out_dir, 'chkpt_%03d.pt' % epoch)
            torch.save({
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'step': step,
                'epoch': epoch,
                'hp_str': hp_str,
            }, save_path)
            logger.info("Saved checkpoint to: %s" % save_path)

            validate(model, valset, writer, epoch)
            lr_scheduler.step()
    except Exception as e:
        logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
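This last variant replaces the manual decay of the earlier versions with CosineAnnealingLR, stepped once per epoch, so the learning rate anneals from its initial value toward zero over hp.train.epoch epochs. A standalone sketch of what that schedule does, with a dummy parameter and T_max=100 as a made-up stand-in for hp.train.epoch:

    # Standalone illustration of the cosine schedule used above.
    import torch

    param = torch.nn.Parameter(torch.zeros(1))
    optimizer = torch.optim.SGD([param], lr=0.1)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)

    for epoch in range(100):
        # ... one training epoch would go here ...
        optimizer.step()   # step the optimizer before the scheduler
        scheduler.step()
        if epoch % 20 == 0:
            print(epoch, scheduler.get_last_lr()[0])  # LR decays along a cosine curve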