def train(args):
    """Run one full training session and synchronise results with the host.

    Sets up the export roots, trains, and records the run state in
    ``status.txt`` inside the local export root.  On worker machines
    (``MACHINE_IS_HOST`` false) the export root is uploaded to the remote
    host after a successful run, or after a recoverable failure.

    Raises:
        Exception: any error raised by ``trainer.train()`` is re-raised
            after the recovery bookkeeping below.
    """
    local_export_root, remote_export_root, communicator = setup_train(args, MACHINE_IS_HOST)
    # Invariant from setup_train: hosts get no communicator, workers always do.
    assert (communicator is None and MACHINE_IS_HOST) or (communicator is not None and not MACHINE_IS_HOST)
    if communicator:
        # Close the station connection; it might drop during a long training run.
        communicator.close()

    train_loader, val_loader, test_loader = dataloader_factory(args)
    model = model_factory(args)
    trainer = trainer_factory(args, model, train_loader, val_loader, test_loader, local_export_root)

    status_file = os.path.join(local_export_root, 'status.txt')
    error_log_file = os.path.join(local_export_root, 'error_log.txt')
    _write_text(status_file, STATUS_RUNNING)
    try:
        trainer.train()
        _write_text(status_file, STATUS_FINISHED)
        _upload_results(args, local_export_root, remote_export_root)
    except Exception as err:
        # Recovery path: persist the failure state so the run can be resumed.
        if args.experiment_group == 'test':
            raise
        if not os.path.exists(os.path.join(local_export_root, 'tables', 'val_log.csv')):
            # Nothing useful was produced; drop the directory entirely.
            print('Removing empty local export root')
            shutil.rmtree(local_export_root)
            raise
        _write_text(status_file, STATUS_RECOVERY)
        _write_text(error_log_file, str(err))
        if not MACHINE_IS_HOST and args.experiment_group != 'test':
            print('Uploading recovery file')
        _upload_results(args, local_export_root, remote_export_root)
        raise


def _write_text(path, text):
    """Overwrite *path* with *text*, closing the handle promptly.

    The original used ``open(path, 'w').write(text)``, which leaks the
    file handle until garbage collection.
    """
    with open(path, 'w') as f:
        f.write(text)


def _upload_results(args, local_export_root, remote_export_root):
    """Upload the local export root to the host (workers only, non-test runs)."""
    if not MACHINE_IS_HOST and args.experiment_group != 'test':
        communicator = Communicator(HOST, PORT, USERNAME, PASSWORD)
        communicator.upload_dir(local_export_root, remote_export_root)
        communicator.close()
def train_test_net(run, user_options):
    """Train and save a network according to user options.

    Args:
        run (int): the current independent run (used in filenames)
        user_options (argparser): user specified options
    """
    # get logger — assign the return value so the .info() calls below use it
    # (the original discarded it and relied on a module-level `logger`)
    logger = logging.getLogger('train')
    # initialize model
    net = models.model_factory(user_options.arch, dataset=user_options.dataset, init=user_options.init)
    if torch.cuda.device_count() > 1:
        logger.info("Running on {} GPUs".format(torch.cuda.device_count()))
        net = NamedDataParallel(net)
    # move net to device
    net = net.to(device=device)
    # get data loader for the specified dataset
    train_loader, test_loader = data_loaders.load_dataset(user_options.dataset, user_options.dataset_path,
                                                          user_options.noisy, user_options.batch_size)
    # define loss
    criterion = load_criterion(user_options)
    criterion = criterion.to(device)
    # optimizer/scheduler are needed whether or not we resume, so build them once
    # (the original duplicated this call in both branches)
    optimizer, scheduler = load_optimizer(user_options, net)
    start_epoch = 0
    # resume training from snapshot if specified
    if os.path.isfile(user_options.resume_from):
        net, optimizer, scheduler, start_epoch = snapshot.load_snapshot(net, optimizer, scheduler,
                                                                        user_options.resume_from, device)
        start_epoch = start_epoch + 1  # continue from the epoch after the snapshot
    # print model configuration
    logger.info("Running trial {} of {}".format(run+1, user_options.runs))
    utils.print_model_config(user_options, start_epoch)
    # save the untouched initialization so the run can be reproduced
    if start_epoch == 0:
        filename = net.__name__ + '_' + str(start_epoch) + '_' + str(user_options.init) + '.pt'
        logger.info("Saving model initialization to {}".format(filename))
        snapshot.save_model(net, filename, snapshot_dirname)
    # train the model
    net, converged = train(net, user_options.epochs, train_loader, optimizer, criterion, scheduler,
                           device, start_epoch, snapshot_every=user_options.snapshot_every,
                           test_loader=test_loader, kill_plateaus=user_options.kill_plateaus,
                           init_scheme=user_options.init)
    if test_loader is not None:
        val_loss, accuracy = scores.test(net, test_loader, criterion, device)
        utils.print_val_loss(user_options.epochs, val_loss, accuracy)
        net = net.train()
    # save final model only if training converged
    if converged:
        filename = net.__name__ + '_' + str(user_options.epochs) + '_' + user_options.init + '.pt'
        snapshot.save_model(net, filename, snapshot_dirname)
def train(): from models import model_factory from dataloaders import dataloader_factory from trainers import trainer_factory from pruners import pruner_factory from utils import * from utils import scatterplot from torch.utils.tensorboard import SummaryWriter from torchvision import datasets, transforms export_root = setup_train(args) test_result_root = 'experiments/testresults' test_result_title = export_root[12:] test_result_title += '.txt' model = model_factory(args) train_loader, val_loader, test_loader = dataloader_factory(args) pruner = pruner_factory(args, model) trainer = trainer_factory(args, model, train_loader, val_loader, test_loader, export_root, pruner) #load_pretrained_weights(model, './experiments/ml-1m.pth') trainer.train() trainer.test() if args.prune: trainer.prune() #pruner.print_mask(model) #pruner.print_percentage(model) i = 0 test_result = trainer.test() save_test_result(export_root, test_result) save_test_result(test_result_root, test_result, test_result_title) print(test_result_root)
def train():
    """Build the data pipeline, model and trainer from the global args, then train."""
    export_root = setup_train(args)
    loaders = dataloader_factory(args)  # (train, val, test) loaders
    model = model_factory(args)
    trainer = trainer_factory(args, model, *loaders, export_root)
    trainer.train()
def load_model(model_name, dataset, path, device):
    """Load a model from file for inference.

    Keyword arguments:
    model_name (str) -- name of the model architecture
    dataset (int) -- dataset (used to infer input dimensionality)
    path (str) -- path to the saved model
    device (torch.device) -- where to move the model after loading
    """
    logger = logging.getLogger('train')
    net = models.model_factory(model_name, dataset)
    # load parameters
    logger.info('Loading model {} from {}'.format(net.__name__, path))
    # BUG FIX: map_location is a torch.load() keyword, not load_state_dict();
    # the original call raised TypeError on every invocation.
    net.load_state_dict(torch.load(path, map_location=device))
    # move to device
    net = net.to(device=device)
    # set model to inference mode
    net = net.eval()
    return net
def __init__(self, *args, **kwargs):
    """Build the model/layers from kwargs and mirror every kwarg as an attribute.

    Expects at least ``model_name``, ``distributed_backend``, ``num_workers``
    and ``gpus`` in **kwargs.
    """
    super().__init__()
    self.model, self.layers = model_factory(kwargs['model_name'])
    self.init_loss()
    # Mirror every keyword argument as an attribute.
    for key, value in kwargs.items():
        setattr(self, key, value)
    self.hparams = kwargs
    self.is_ddp = self.distributed_backend in ('ddp', 'ddp2')
    # BUG FIX: the original `cond and a or b` idiom silently returned the
    # un-divided num_workers whenever num_workers // gpus evaluated to 0.
    if self.is_ddp:
        self.num_workers = self.num_workers // self.gpus
def train(model_args):
    """Train (only when mode == 'train') and always evaluate the model."""
    export_root = setup_train(model_args)
    loaders = dataloader_factory(model_args)  # (train, val, test) loaders
    model = model_factory(model_args)
    trainer = trainer_factory(model_args, model, *loaders, export_root)
    if model_args.mode == 'train':
        trainer.train()
    trainer.test()
def test_with_factory(self):
    """A factory-built DeepLab-v3/Xception model maps a 513x513 RGB input
    to a (1, 25, 513, 513) output."""
    config = edict({
        'seg_model': 'deeplab_v3',
        'backbone': 'xception'
    })
    model = model_factory(config)
    model.eval()
    batch = torch.randn(1, 3, 513, 513)
    with torch.no_grad():
        prediction = model(batch)
    self.assertTupleEqual((1, 25, 513, 513), prediction.size())
def train():
    """Train a model, then interactively offer to run the test split."""
    export_root = setup_train(args)
    loaders = dataloader_factory(args)  # (train, val, test) loaders
    model = model_factory(args)
    trainer = trainer_factory(args, model, *loaders, export_root)
    trainer.train()
    answer = input('Test model with test dataset? y/[n]: ')
    if answer == 'y':
        trainer.test()
def validate(args, mode='val'):
    """Run validation (or another split, per *mode*) on a model, optionally
    loading pretrained weights first."""
    local_export_root, remote_export_root, communicator = setup_train(args, MACHINE_IS_HOST)
    if communicator:
        # Drop the station connection; it is not needed for validation.
        communicator.close()
    loaders = dataloader_factory(args)  # (train, val, test) loaders
    model = model_factory(args)
    weights = args.pretrained_weights
    if weights is not None:
        model.load(weights)
    trainer = trainer_factory(args, model, *loaders, local_export_root)
    trainer.just_validate(mode)
def main(args):
    """Wire up data, model, TensorBoard loggers and optimisation, then train."""
    export_root, args = setup_experiments(args)
    device = args.device
    checkpoint_dir = os.path.join(export_root, 'models')

    dataloaders = dataloaders_factory(args)
    model = model_factory(args)
    writer = SummaryWriter(os.path.join(export_root, 'logs'))

    # Training curves go to the 'Train' group, validation to 'Validation'.
    train_loggers = [
        MetricGraphPrinter(writer, key='ce_loss', graph_name='ce_loss', group_name='Train'),
        MetricGraphPrinter(writer, key='epoch', graph_name='Epoch', group_name='Train'),
    ]
    val_loggers = [
        MetricGraphPrinter(writer, key='mean_iou', graph_name='mIOU', group_name='Validation'),
        MetricGraphPrinter(writer, key='acc', graph_name='Accuracy', group_name='Validation'),
        RecentModelLogger(checkpoint_dir),
        BestModelLogger(checkpoint_dir, metric_key='mean_iou'),
    ]

    criterion = nn.CrossEntropyLoss()
    optimizer = create_optimizer(model, args)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=args.decay_step, gamma=args.gamma)

    trainer = Trainer(model, dataloaders, optimizer, criterion, args.epoch, args,
                      num_classes=42, log_period_as_iter=args.log_period_as_iter,
                      train_loggers=train_loggers, val_loggers=val_loggers,
                      lr_scheduler=scheduler, device=device)
    trainer.train()
    writer.close()
def main():
    """Predict frequencies for every post in the input file and write results.

    Reads posts and POS tags from ``args.input``, runs the model on each
    (post, pos_tags) pair, and writes the predictions to ``args.output``.
    """
    args = parse_args()
    model = model_factory(args)
    word_id_lst, post_lsts, _, _, _, pos_lsts = read_data(args.input)
    # Ensure the output directory exists.  The original used os.mkdir, which
    # fails when intermediate directories are missing and races if another
    # process creates the directory between the is_dir() check and mkdir().
    odir = Path(args.output).parent
    odir.mkdir(parents=True, exist_ok=True)
    model.load()
    freqs_lst = [model.predict(post, pos_tags)
                 for post, pos_tags in zip(post_lsts, pos_lsts)]
    write_results(word_id_lst, post_lsts, freqs_lst, args.output)
    print("Output file created successfully")
def test(args, device):
    """Evaluate the robustness of the best checkpoint under adversarial attacks.

    Args:
        args (dict): config dict with dataset/model/output-path entries.
        device (torch.device): device to run the evaluation on.
    """
    print(args)
    model = model_factory(args['dataset'], args['training_type'], args['var_type'],
                          args['feature_dim'], args['num_classes'])
    model.to(device)
    model.load(os.path.join(args['output_path']['models'], 'ckpt_best'))
    model.eval()
    # Third positional argument is presumably train=False — TODO confirm
    # against get_data_loader's signature.
    test_loader = get_data_loader(args['dataset'], args['batch_size'], False,
                                  shuffle=False, drop_last=False)
    attack_names = ['FGSM', 'PGD']  # 'BIM', 'C&W', 'Few-Pixel'
    print('Adversarial testing.')
    for attack in attack_names:  # index was unused; dropped enumerate()
        print('Attack: {}'.format(attack))
        if attack == 'Few-Pixel':
            if args['dataset'] == 'cifar10':
                # Standard CIFAR-10 per-channel normalisation constants.
                preproc = {
                    'mean': [0.4914, 0.4822, 0.4465],
                    'std': [0.2023, 0.1994, 0.2010]
                }
            else:
                raise NotImplementedError(
                    'Only CIFAR-10 supported for the one-pixel attack.')
            one_pixel_attack(model, test_loader, preproc, device, pixels=1,
                             targeted=False, maxiter=1000, popsize=400,
                             verbose=False)
        else:
            # Hoist the repeated nested lookup into one local.
            attack_config = attack_to_dataset_config[attack][args['dataset']]
            eps_names = attack_config['eps_names']
            eps_values = attack_config['eps_values']
            robust_accuracy = test_attack(model, test_loader, attack,
                                          eps_values, args, device)
            # eps_values entries were zipped but never used; report per name.
            for eps_name, accuracy in zip(eps_names, robust_accuracy):
                print('Attack Strength: {}, Accuracy: {:.3f}'.format(
                    eps_name, accuracy.item()))
    print('Finished testing.')
def train():
    """Build the full training stack, dump the model and optimizer state
    dicts to stdout, and save the freshly initialised model.

    Note: actual training is intentionally disabled; this is a
    state-inspection/initialisation utility.
    """
    export_root = setup_train(args)
    model = model_factory(args)
    train_loader, val_loader, test_loader = dataloader_factory(args)
    pruner = pruner_factory(args, model)
    trainer = trainer_factory(args, model, train_loader, val_loader, test_loader, export_root, pruner)

    print("Model's state_dict:")
    # Hoist state_dict(): the original rebuilt the whole dict on every
    # iteration (once in the loop header and once per lookup in the body).
    model_state = model.bert.state_dict()
    for param_tensor, value in model_state.items():
        print(param_tensor, "\t", value.size())

    # Print optimizer's state_dict
    print("Optimizer's state_dict:")
    optimizer_state = trainer.optimizer.state_dict()
    for var_name, value in optimizer_state.items():
        print(var_name, "\t", value)

    torch.save(model, './initmodel.pth')
def evaluate():
    """Evaluate a pretrained model on the test split and print averaged metrics."""
    export_root = setup_train(args)
    meta, train_loader, val_loader, test_loader = dataloader_factory(args)
    model = model_factory(args, meta)
    trainer = trainer_factory(args, model, train_loader, val_loader, test_loader, export_root)
    load_pretrained_weights(model, args.eval_model_path)
    meters = AverageMeterSet()
    for batch in test_loader:
        with torch.no_grad():
            batch = [tensor.to(trainer.device) for tensor in batch]
            for name, value in trainer.calculate_metrics(batch).items():
                meters.update(name, value)
    print(meters.averages())
def train(args, device):
    """Train a model according to the configured training type.

    Args:
        args (dict): config with 'dataset', 'batch_size', 'training_type',
            'var_type', 'feature_dim', 'num_classes', 'pretrained' and
            'output_path' entries.
        device (torch.device): device to train on.

    Raises:
        ValueError: if 'pretrained' names an unknown checkpoint.
        NotImplementedError: if 'training_type' is not one of the three
            supported modes.
    """
    print(args)
    # Make sure the output directories exist before anything is written.
    os.makedirs(args['output_path']['stats'], exist_ok=True)
    os.makedirs(args['output_path']['models'], exist_ok=True)
    train_loader = get_data_loader(args['dataset'], args['batch_size'],
                                   train=True, shuffle=True, drop_last=True)
    test_loader = get_data_loader(args['dataset'], args['batch_size'],
                                  train=False, shuffle=False, drop_last=False)
    model = model_factory(args['dataset'], args['training_type'],
                          args['var_type'], args['feature_dim'],
                          args['num_classes'])
    model.to(device)
    # Optionally warm-start from a previously saved checkpoint.
    if args['pretrained'] is not None:
        if args['pretrained'] not in ('ckpt_best', 'ckpt_last', 'ckpt_robust'):
            raise ValueError(
                'Pre-trained model name must be: [ckpt_best|ckpt_last|ckpt_robust]'
            )
        model.load(
            os.path.join(args['output_path']['models'], args['pretrained']))
    # Dispatch to the requested training routine.
    if args['training_type'] == 'vanilla':
        print('Vanilla training.')
        train_vanilla(model, train_loader, test_loader, args, device=device)
    elif args['training_type'] == 'stochastic':
        print('Stochastic training.')
        train_stochastic(model, train_loader, test_loader, args, device=device)
    elif args['training_type'] == 'stochastic+adversarial':
        print('Adversarial stochastic training.')
        train_stochastic_adversarial(model, train_loader, test_loader, args,
                                     device=device)
    else:
        raise NotImplementedError(
            'Training "{}" not implemented. Supported: [vanilla|stochastic|stochastic+adversarial].'
            .format(args['training_type']))
    print('Finished training.')
def main(args):
    """Set up datasets, model, loggers and optimisation, then run training.

    Args:
        args: experiment configuration namespace (re-bound by
            setup_experiments, which may also create the export root).
    """
    export_root, args = setup_experiments(args)
    device = args.device
    model_checkpoint_path = os.path.join(export_root, 'models')
    # Separate transform pipelines for the train and validation splits.
    train_dataset = dataset_factory(args.train_transform_type, is_train=True)
    val_dataset = dataset_factory(args.val_transform_type, is_train=False)
    dataloaders = dataloaders_factory(train_dataset, val_dataset,
                                      args.batch_size, args.test)
    model = model_factory(args)
    writer = SummaryWriter(os.path.join(export_root, 'logs'))
    # Scalar curves for the training loop.
    train_loggers = [
        MetricGraphPrinter(writer, key='loss', graph_name='loss', group_name='Train'),
        MetricGraphPrinter(writer, key='epoch', graph_name='Epoch', group_name='Train'),
    ]
    # Validation metrics plus checkpointing and sample-image logging.
    val_loggers = [
        MetricGraphPrinter(writer, key='mean_iou', graph_name='mIOU', group_name='Validation'),
        MetricGraphPrinter(writer, key='acc', graph_name='Accuracy', group_name='Validation'),
        RecentModelLogger(model_checkpoint_path),
        BestModelLogger(model_checkpoint_path, metric_key='mean_iou'),
        ImagePrinter(writer, train_dataset, log_prefix='train'),
        ImagePrinter(writer, val_dataset, log_prefix='val')
    ]
    criterion = create_criterion(args)
    optimizer = create_optimizer(model, args)
    # Optional warm start and resume hooks.
    if args.pretrained_weights:
        load_pretrained_weights(args, model)
    if args.resume_training:
        setup_to_resume(args, model, optimizer)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=args.decay_step,
                                          gamma=args.gamma)
    trainer = Trainer(model, dataloaders, optimizer, criterion, args.epoch, args,
                      num_classes=args.classes,
                      log_period_as_iter=args.log_period_as_iter,
                      train_loggers=train_loggers, val_loggers=val_loggers,
                      lr_scheduler=scheduler, device=device)
    trainer.train()
    writer.close()
#preprocessing step steps = ['encode', 'add', 'regu'] for k in steps: dataset.preprocess(type=k) if log: dataset.y = dataset.y.apply(lambda x: np.log(x + 1)) # get data x, y, test, test_index = dataset.get_data() # set model parameters # a little bit complicated because it is modified from my another asgn # parameters in model params #i f not exist, then grid search model = ['mlp' for i in range(10)] # ten mlps model_stack = models.model_factory() for k in model: model_stack.add_model(k) model_stack.set_parameters(x, y) # model fusion part, use stacking mods = model_stack.get_models() sclf = StackingRegressor(regressors=mods, use_features_in_secondary=True, meta_regressor=mods[0], verbose=0) sclf.fit(x, y) result = sclf.predict(test) # map back the prediction
start = time.time() if not os.path.isfile('vocab_%s.pkl' % args.exp_name): print("Building vocabulary") text_field.build_vocab(train_dataset, val_dataset, min_freq=5) pickle.dump(text_field.vocab, open('vocab_%s.pkl' % args.exp_name, 'wb')) else: text_field.vocab = pickle.load( open('vocab_%s.pkl' % args.exp_name, 'rb')) print('build vocab time') print(time.time() - start) start = time.time() # Model and dataloaders Transformer, TransformerEncoder, TransformerDecoderLayer, ScaledDotProductAttention = model_factory( args) encoder = TransformerEncoder(args.n_layer, 0, attention_module=ScaledDotProductAttention, d_in=args.dim_feats, d_k=args.d_k, d_v=args.d_v, h=args.head, d_model=args.d_model) decoder = TransformerDecoderLayer(len(text_field.vocab), 54, args.n_layer, text_field.vocab.stoi['<pad>'], d_k=args.d_k, d_v=args.d_v, h=args.head,
def student_train_test(user_options):
    """Train a student network by knowledge distillation from a saved teacher.

    Args:
        user_options (argparser): user specified options; ``resume_from``
            must point to a saved teacher snapshot.
    """
    # get logger
    # NOTE(review): the return value is discarded; the logger.info calls
    # below rely on a module-level `logger` — this likely was meant to be
    # `logger = logging.getLogger('train')`. Confirm before changing.
    logging.getLogger('train')
    # load teacher model
    teacher = models.model_factory(user_options.arch, dataset=user_options.dataset,
                                   init=user_options.init)
    if torch.cuda.device_count() > 1:
        logger.info("Running teacher network on {} GPUs".format(torch.cuda.device_count()))
        teacher = NamedDataParallel(teacher)
        tdevice = device
    else:
        # Single-GPU/CPU setup: keep the teacher on the CPU.
        tdevice = torch.device('cpu')
    # move net to device
    teacher = teacher.to(device=tdevice)
    # load teacher network from file (mandatory for distillation)
    if os.path.isfile(user_options.resume_from):
        teacher, _, _, _ = snapshot.load_snapshot(teacher, None, None,
                                                  user_options.resume_from, tdevice)
        teacher = teacher.eval()
    else:
        raise ValueError('Missing teacher model definition. Specify it with --resume-from [FILENAME]')
    # get data loader for the specified dataset
    train_loader, test_loader = data_loaders.load_dataset(user_options.dataset,
                                                          user_options.dataset_path,
                                                          user_options.noisy,
                                                          user_options.batch_size)
    # load student
    student = models.student_factory(user_options.arch, user_options.dataset,
                                     init=user_options.init)
    if torch.cuda.device_count() > 1:
        logger.info("Running student network on {} GPUs".format(torch.cuda.device_count()))
        student = NamedDataParallel(student)
    student = student.to(device=device)
    # load optimizer, scheduler (for the student only)
    optimizer, scheduler = load_optimizer(user_options, student)
    # define loss
    criterion = load_criterion(user_options)
    # print model configuration
    start_epoch = 0
    utils.print_student_config(user_options)
    # save model at initialization; file name derives from the teacher snapshot
    teacher_name = os.path.basename(user_options.resume_from)
    teacher_name = os.path.splitext(teacher_name)[0]  # remove file extension
    teacher_name = teacher_name.split('_')[0]  # keep only the architecture prefix
    filename = 'Student_' + teacher_name + '_' + str(start_epoch) + '.pt'
    snapshot.save_model(student, filename, snapshot_dirname)
    # train the model (distillation: teacher on tdevice, student on device)
    student, converged = distill(student, teacher, user_options.epochs, train_loader,
                                 optimizer, criterion, scheduler, tdevice, device,
                                 start_epoch, snapshot_every=user_options.epochs,
                                 kill_plateaus=user_options.kill_plateaus)
    if test_loader is not None:
        # Evaluate with plain cross-entropy, not the distillation loss.
        test_criterion = nn.CrossEntropyLoss()
        val_loss, accuracy = scores.test(student, test_loader, test_criterion, device)
        utils.print_val_loss(user_options.epochs, val_loss, accuracy)
    # save final model only if training converged
    if converged:
        teacher_name = os.path.basename(user_options.resume_from)
        teacher_name = os.path.splitext(teacher_name)[0]  # remove file extension
        filename = 'Student_' + teacher_name + '.pt'
        snapshot.save_model(student, filename, snapshot_dirname)
description='Test for Cifar10 w/ or w/o trt') parser.add_argument('--gpu', '-p', action='store_true', help='Trained on GPU', default='true') parser.add_argument('--model', '-m', default='alexnet', type=str, help='Name of Network') args = parser.parse_args() model_name = args.model model = model_factory(model_name) print("Testing model: %s" % model_name) if args.gpu and torch.cuda.is_available(): # CuDNN must be enabled for FP16 training. torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = True model = model.cuda() else: print("-p is a must when executing this script. Exiting...") exit() model.load_state_dict(torch.load('./weights/' + model_name + '.pt')['net']) accbefore = torch.load('./weights/' + model_name + '.pt')['acc']