def main(args):
    seed = util.prepare(args)
    if not cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)
    np.random.seed(seed)
    random.seed(seed)
    manual_seed(seed)
    cuda.manual_seed(seed)
    cuda.set_device(args.gpu)
    cudnn.benchmark = False
    cudnn.deterministic = True
    log_config(args)
    start = time.time()
    if args.type == 'search':
        search(args)
    elif args.type == 'train':
        train(args)
    elif args.type == 'test':
        pass
    else:
        raise ValueError('unknown run type: %s' % args.type)
    tot_time = time.time() - start
    m, s = divmod(tot_time, 60)
    h, m = divmod(m, 60)
    logging.info("total time %d:%02d:%02d" % (h, m, s))

def fix_random_seed(seed=1234):
    # Ref.: https://github.com/bentrevett/pytorch-image-classification/blob/master/5_resnet.ipynb
    random.seed(seed)
    np.random.seed(seed)
    manual_seed(seed)
    cuda.manual_seed(seed)
    backends.cudnn.deterministic = True

def activate(self):
    random.seed(self.seed)
    np_random.seed(self.seed)
    manual_seed(self.seed)
    cuda.manual_seed(self.seed)
    cudnn.deterministic = True
    cudnn.benchmark = False

def set_seed(seed: int) -> None:
    torch.manual_seed(seed)
    cuda.manual_seed(seed)
    cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    cudnn.benchmark = False
    cudnn.deterministic = True

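# Note (added): per the torch.manual_seed() documentation, it seeds the RNG on
# all devices in current PyTorch releases, so the explicit cuda.manual_seed()/
# cuda.manual_seed_all() calls in helpers like the one above are defensive
# duplication rather than strictly required.
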
def __init__(self, args, criterion: nn.CrossEntropyLoss, optimizer):
    seed = random.randint(0, 1000)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    self.args = args
    self.model_name = args.net
    self.config = self._parse_args(args.config)
    net_module = importlib.import_module(f"net.{self.model_name}")
    self.model_class = getattr(net_module, self.model_name)
    self.model = self.model_class(**self._parse_model_args())
    self.criterion = criterion
    self.base_lr = self.config.get("lr", 0.01)
    self.optimizer = self._get_optimizer(optimizer)
    self.iters = self.config.get("iter", 5000)
    self.power = self.config.get("power", 0.9)
    self.numclass = self.config['numclass']
    self.batch_size = self.config['batch_size']
    self.print_freq = self.config['print_freq']
    self.save_freq = self.config['save_freq']
    self.gpu = self.config.get('gpus')
    print(f"gpus: {self.gpu}")
    if self.gpu:
        self.gpu = [self.gpu] if isinstance(self.gpu, int) else list(self.gpu)
    else:
        self.device = torch.device("cpu")
    self.train_dataloader = get_data_loader(
        self.config['train_data_path'],
        self.config['train_annot_path'],
        self.numclass,
        img_size=self.config['img_size'],
        batch_size=self.batch_size,
        name=self.config['dataset_name'])
    self.val_dataloader = get_data_loader(
        self.config['val_data_path'],
        self.config['val_annot_path'],
        self.numclass,
        img_size=self.config['img_size'],
        batch_size=self.batch_size,
        name=self.config['dataset_name'],
        mode='eval')
    self.metricer = Metrics(self.numclass)
    logdir = self._get_log_dir()
    self.writer = SummaryWriter(log_dir=logdir)
    if self.gpu:
        print(torch.cuda.device_count())
        self.model = nn.DataParallel(self.model,
                                     device_ids=self.gpu).cuda(self.gpu[0])
        # self.criterion = self.criterion.cuda(self.gpu[0])
        cudnn.benchmark = False  # disable cuDNN autotuning for determinism
        cudnn.deterministic = True

def fix_random_state(seed_value):
    """Fix the random seed of each library."""
    random.seed(seed_value)
    np.random.seed(seed_value)
    if torch.cuda.is_available():
        cuda.manual_seed(seed_value)
        cuda.manual_seed_all(seed_value)
    torch.manual_seed(seed_value)
    torch.random.manual_seed(seed_value)

def set_seed(seed: int, cudnn_deterministic: bool = False, cudnn_benchmark: bool = True):
    """Set all relevant seeds for torch, numpy and python.

    Args:
        seed: int seed
        cudnn_deterministic: set True for deterministic training.
        cudnn_benchmark: set False for deterministic training.
    """
    th.manual_seed(seed)
    cuda.manual_seed(seed)
    cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    if cudnn_deterministic:
        cudnn.deterministic = True
    cudnn.benchmark = cudnn_benchmark

def set_seed(seed: int, set_deterministic: bool = True):
    """Set all relevant seeds for torch, numpy and python.

    Args:
        seed: int seed
        set_deterministic: Guarantee deterministic training, possibly at the cost of performance.
    """
    th.manual_seed(seed)
    cuda.manual_seed(seed)
    cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    if set_deterministic:
        cudnn.benchmark = False
        cudnn.deterministic = True
    elif cudnn.benchmark or not cudnn.deterministic:
        print(
            f"WARNING: Despite fixed seed {seed}, training may not be deterministic with {cudnn.benchmark=} "
            f"(must be False for deterministic training) and {cudnn.deterministic=} "
            f"(must be True for deterministic training)")

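# Usage sketch (added; `build_model` and `build_loaders` are hypothetical
# placeholders): seed once at process start, before constructing anything that
# draws random numbers (weight init, data shuffling, augmentation):
#
#     set_seed(42, set_deterministic=True)
#     model = build_model()
#     train_loader, val_loader = build_loaders()
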
def prepare_seed(rand_seed):
    random.seed(rand_seed)
    np.random.seed(rand_seed)
    torch.manual_seed(rand_seed)
    cuda.manual_seed(rand_seed)
    cuda.manual_seed_all(rand_seed)

def main(args):
    init_process_group(backend='nccl')
    with open(args.config) as file:
        config = json.load(file)
    config.update(vars(args))
    config = apply_dict(Dict, config)
    backends.cudnn.benchmark = True
    backends.cudnn.fastest = True
    global_rank = distributed.get_rank()
    local_rank = global_rank % cuda.device_count()
    np.random.seed(global_rank)
    torch.manual_seed(global_rank)
    cuda.manual_seed(global_rank)
    cuda.set_device(local_rank)
    train_dataset = datasets.CIFAR10(
        root=config.train_root,
        train=True,
        transform=transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.49139968, 0.48215827, 0.44653124),
                                 std=(0.24703233, 0.24348505, 0.26158768))
        ]),
        download=True)
    train_train_dataset, train_val_dataset = utils.data.random_split(
        dataset=train_dataset,
        lengths=[
            int(len(train_dataset) * config.split_ratio),
            int(len(train_dataset) * (1 - config.split_ratio))
        ])
    val_dataset = datasets.CIFAR10(
        root=config.val_root,
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.49139968, 0.48215827, 0.44653124),
                                 std=(0.24703233, 0.24348505, 0.26158768))
        ]),
        download=True)
    train_train_sampler = utils.data.distributed.DistributedSampler(train_train_dataset)
    train_val_sampler = utils.data.distributed.DistributedSampler(train_val_dataset)
    val_sampler = utils.data.distributed.DistributedSampler(val_dataset)
    # Note: these were misnamed `*_data_loaders` in the original, which would
    # raise a NameError at the trainer construction below.
    train_train_data_loader = utils.data.DataLoader(
        dataset=train_train_dataset,
        batch_size=config.local_batch_size,
        sampler=train_train_sampler,
        num_workers=config.num_workers,
        pin_memory=True)
    train_val_data_loader = utils.data.DataLoader(
        dataset=train_val_dataset,
        batch_size=config.local_batch_size,
        sampler=train_val_sampler,
        num_workers=config.num_workers,
        pin_memory=True)
    val_data_loader = utils.data.DataLoader(
        dataset=val_dataset,
        batch_size=config.local_batch_size,
        sampler=val_sampler,
        num_workers=config.num_workers,
        pin_memory=True)
    generator = DARTSGenerator(
        latent_size=128,
        min_resolution=4,
        out_channels=3,
        operations=dict(
            sep_conv_3x3=functools.partial(SeparableConvTranspose2d, kernel_size=3, padding=1),
            sep_conv_5x5=functools.partial(SeparableConvTranspose2d, kernel_size=5, padding=2),
            dil_conv_3x3=functools.partial(DilatedConvTranspose2d, kernel_size=3, padding=2, dilation=2),
            dil_conv_5x5=functools.partial(DilatedConvTranspose2d, kernel_size=5, padding=4, dilation=2),
            identity=functools.partial(IdentityTranspose),
            # zero=functools.partial(ZeroTranspose)
        ),
        num_nodes=6,
        num_input_nodes=2,
        num_cells=9,
        reduction_cells=[2, 5, 8],
        num_predecessors=2,
        num_channels=16,
    ).cuda()
    discriminator = DARTSDiscriminator(
        in_channels=3,
        min_resolution=4,
        num_classes=10,
        operations=dict(
            sep_conv_3x3=functools.partial(SeparableConv2d, kernel_size=3, padding=1),
            sep_conv_5x5=functools.partial(SeparableConv2d, kernel_size=5, padding=2),
            dil_conv_3x3=functools.partial(DilatedConv2d, kernel_size=3, padding=2, dilation=2),
            dil_conv_5x5=functools.partial(DilatedConv2d, kernel_size=5, padding=4, dilation=2),
            identity=functools.partial(Identity),
            # zero=functools.partial(Zero)
        ),
        num_nodes=6,
        num_input_nodes=2,
        num_cells=9,
        reduction_cells=[2, 5, 8],
        num_predecessors=2,
        num_channels=128).cuda()
    criterion = CrossEntropyLoss(config.label_smoothing)
    config.global_batch_size = config.local_batch_size * distributed.get_world_size()
    config.network_optimizer.lr *= config.global_batch_size / config.global_batch_denom
    config.architecture_optimizer.lr *= config.global_batch_size / config.global_batch_denom
    generator_network_optimizer = optim.Adam(
        params=generator.network.parameters(),
        lr=config.generator_network_optimizer.lr,
        betas=config.generator_network_optimizer.betas,
        weight_decay=config.generator_network_optimizer.weight_decay)
    generator_architecture_optimizer = optim.Adam(
        params=generator.architecture.parameters(),
        lr=config.generator_architecture_optimizer.lr,
        betas=config.generator_architecture_optimizer.betas,
        weight_decay=config.generator_architecture_optimizer.weight_decay)
    discriminator_network_optimizer = optim.Adam(
        params=discriminator.network.parameters(),
        lr=config.discriminator_network_optimizer.lr,
        betas=config.discriminator_network_optimizer.betas,
        weight_decay=config.discriminator_network_optimizer.weight_decay)
    discriminator_architecture_optimizer = optim.Adam(
        params=discriminator.architecture.parameters(),
        lr=config.discriminator_architecture_optimizer.lr,
        betas=config.discriminator_architecture_optimizer.betas,
        weight_decay=config.discriminator_architecture_optimizer.weight_decay)
    trainer = DARTSGANTrainer(
        generator=generator,
        generator_networks=[generator.network],
        generator_architectures=[generator.architecture],
        discriminator=discriminator,
        discriminator_networks=[discriminator.network],
        discriminator_architectures=[discriminator.architecture],
        generator_network_optimizer=generator_network_optimizer,
        generator_architecture_optimizer=generator_architecture_optimizer,
        discriminator_network_optimizer=discriminator_network_optimizer,
        discriminator_architecture_optimizer=discriminator_architecture_optimizer,
        train_train_data_loader=train_train_data_loader,
        train_val_data_loader=train_val_data_loader,
        val_data_loader=val_data_loader,
        train_train_sampler=train_train_sampler,
        train_val_sampler=train_val_sampler,
        val_sampler=val_sampler,
        log_dir=os.path.join('log', config.name))
    if config.checkpoint:
        trainer.load(config.checkpoint)
    if config.training:
        for epoch in range(trainer.epoch, config.num_epochs):
            trainer.step(epoch)
            trainer.train()
            trainer.log_architectures()
            trainer.log_histograms()
            trainer.save()
    elif config.validation:
        trainer.validate()

def main():
    seed = util.prepare(args)
    if not cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)
    CIFAR_CLASSES = 10
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    cuda.manual_seed(seed)
    cuda.set_device(args.gpu)
    cudnn.benchmark = False
    cudnn.deterministic = True
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)
    logging.info('hidden_layers:{:}'.format(args.hidden_layers))
    logging.info('first_neurons:{:}'.format(args.first_neurons))
    logging.info('change:{:}'.format(args.change))
    logging.info('activate_func:{:}'.format(args.activate_func))
    logging.info('opt:{:}'.format(args.opt))
    logging.info('cross_link:{:}'.format(args.cross_link))
    genotype = getattr(genotypes, args.arch)  # attribute lookup instead of eval
    model = Network(args.init_channels, CIFAR_CLASSES, args.layers,
                    args.auxiliary, genotype, args)
    model = model.cuda()
    logging.info("param size = %fMB", util.count_parameters_in_MB(model))
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    train_transform, valid_transform = util.get_data_transforms_cifar10(args)
    train_data = datasets.CIFAR10(root=args.data, train=True, download=False,
                                  transform=train_transform)
    valid_data = datasets.CIFAR10(root=args.data, train=False, download=False,
                                  transform=valid_transform)
    train_queue = DataLoader(train_data, batch_size=args.batch_size,
                             shuffle=True, pin_memory=True, num_workers=1)
    valid_queue = DataLoader(valid_data, batch_size=args.batch_size,
                             shuffle=False, pin_memory=True, num_workers=1)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs)
    best_acc = 0
    for epoch in range(args.epochs):
        logging.info('epoch %d lr %.6f', epoch, scheduler.get_lr()[0])
        model.drop_path_prob = args.drop_path_prob * epoch / args.epochs
        epoch_str = '[{:03d}/{:03d}]'.format(epoch, args.epochs)
        train_acc, train_obj = train(train_queue, model, criterion, optimizer, epoch_str)
        logging.info('train_acc %.2f', train_acc)
        valid_acc, valid_obj = infer(valid_queue, model, criterion, epoch_str)
        logging.info('valid_acc %.2f', valid_acc)
        if valid_acc > best_acc:
            logging.info('find the best model. Save it to {:}'.format(
                os.path.join(args.save, 'best.pt')))
            util.save(model, os.path.join(args.save, 'best.pt'))
            best_acc = valid_acc
        scheduler.step()
    logging.info('best acc is {:}'.format(best_acc))

use_cuda = cuda.is_available()
best_accuracy = 0  # best testing accuracy
best_epoch = 0  # epoch with the best testing accuracy
start_epoch = 0  # start from epoch 0 or last checkpoint epoch
if args.env_name is None:
    args.env_name = "Log:%s-Train:%s-Test:%s" % (
        args.enable_log_transform,
        args.enable_disturb_illumination_train,
        args.enable_disturb_illumination_test)
args.save_directory = osp.join(args.save_directory, args.env_name)

# Init seed
print('==> Init seed..')
torch.manual_seed(args.seed)
if use_cuda:
    cuda.manual_seed(args.seed)

# Calculate mean and std
print('==> Prepare mean and std..')
print("\t Log : %s" % args.enable_log_transform)
if not args.enable_log_transform:
    # mean_log, std_log = (0.50707543, 0.48655024, 0.44091907), (0.26733398, 0.25643876, 0.27615029)
    mean_log, std_log = calculate_mean_and_std(enable_log_transform=False)
else:
    # mean_log, std_log = (6.69928741, 6.65900993, 6.40947819), (1.2056427, 1.15127575, 1.31597221)
    mean_log, std_log = calculate_mean_and_std(enable_log_transform=True)
print('\tmean_log = ', mean_log)
print('\tstd_log = ', std_log)
data_mean = torch.FloatTensor(mean_log)
data_std = torch.FloatTensor(std_log)

def get_save_index(save_dir):
    # The original excerpt begins mid-function; the header and the initial
    # index value are reconstructed from the call site below.
    save_index = 0
    while True:
        if Path(os.path.join(save_dir, 'run.%d' % (save_index,))).exists():
            save_index += 1
        else:
            break
    return save_index


opt = parser.parse_args()
opt.save_path = os.path.join(opt.save_dir,
                             'run.%d' % (get_save_index(opt.save_dir),))
Path(opt.save_path).mkdir_p()
print(opt.save_path)

torch.manual_seed(123)
cuda.set_device(opt.gpus[0])
cuda.manual_seed(123)
print(opt)
js.dump(opt.__dict__,
        open(os.path.join(opt.save_path, 'opt.json'), 'w'),
        sort_keys=True,
        indent=2)


def NMTCriterion(vocabSize):
    weight = torch.ones(vocabSize)
    weight[onmt.Constants.PAD] = 0
    crit = nn.NLLLoss(weight, size_average=False)
    if opt.gpus:
        crit.cuda()
    return crit

def main(args):
    init_process_group(backend='nccl')
    with open(args.config) as file:
        config = json.load(file)
    config.update(vars(args))
    config = apply_dict(Dict, config)
    backends.cudnn.benchmark = True
    backends.cudnn.fastest = True
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    cuda.manual_seed(config.seed)
    cuda.set_device(distributed.get_rank() % cuda.device_count())
    train_dataset = datasets.CIFAR10(
        root=config.train_root,
        train=True,
        transform=transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.49139968, 0.48215827, 0.44653124),
                                 std=(0.24703233, 0.24348505, 0.26158768)),
            Cutout(size=(16, 16))
        ]),
        download=True)
    val_dataset = datasets.CIFAR10(
        root=config.val_root,
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.49139968, 0.48215827, 0.44653124),
                                 std=(0.24703233, 0.24348505, 0.26158768))
        ]),
        download=True)
    train_sampler = utils.data.distributed.DistributedSampler(train_dataset)
    val_sampler = utils.data.distributed.DistributedSampler(val_dataset)
    train_data_loader = utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=config.local_batch_size,
        sampler=train_sampler,
        num_workers=config.num_workers,
        pin_memory=True)
    val_data_loader = utils.data.DataLoader(
        dataset=val_dataset,
        batch_size=config.local_batch_size,
        sampler=val_sampler,
        num_workers=config.num_workers,
        pin_memory=True)
    model = DARTS(
        operations=dict(
            sep_conv_3x3=functools.partial(SeparableConv2d, kernel_size=3, padding=1),
            sep_conv_5x5=functools.partial(SeparableConv2d, kernel_size=5, padding=2),
            dil_conv_3x3=functools.partial(DilatedConv2d, kernel_size=3, padding=2, dilation=2),
            dil_conv_5x5=functools.partial(DilatedConv2d, kernel_size=5, padding=4, dilation=2),
            avg_pool_3x3=functools.partial(AvgPool2d, kernel_size=3, padding=1, postnormalization=False),
            max_pool_3x3=functools.partial(MaxPool2d, kernel_size=3, padding=1, postnormalization=False),
            identity=functools.partial(Identity),
            # zero=functools.partial(Zero)
        ),
        stem=[
            functools.partial(Conv2d, kernel_size=3, padding=1, stride=1,
                              affine=True, preactivation=False),
            functools.partial(Conv2d, kernel_size=3, padding=1, stride=1,
                              affine=True, preactivation=True)
        ],
        num_nodes=6,
        num_input_nodes=2,
        num_cells=20,
        reduction_cells=[6, 13],
        num_predecessors=2,
        num_channels=36,
        num_classes=10,
        drop_prob_fn=lambda epoch: config.drop_prob * (epoch / config.num_epochs),
        temperature_fn=lambda epoch: config.temperature ** (epoch / config.num_epochs))
    checkpoint = Dict(torch.load('log/checkpoints/epoch_0'))
    model.architecture.load_state_dict(checkpoint.architecture_state_dict)
    model.build_discrete_dag()
    model.build_discrete_network()
    for parameter in model.architecture.parameters():
        parameter.requires_grad_(False)
    criterion = CrossEntropyLoss(config.label_smoothing)
    config.global_batch_size = config.local_batch_size * distributed.get_world_size()
    config.lr *= config.global_batch_size / config.global_batch_denom
    optimizer = optim.SGD(
        params=model.parameters(),
        lr=config.lr,
        momentum=config.momentum,
        weight_decay=config.weight_decay)
    lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer=optimizer,
        T_max=config.num_epochs)
    trainer = ClassifierTrainer(
        model=model,
        criterion=criterion,
        train_sampler=train_sampler,
        val_sampler=val_sampler,
        train_data_loader=train_data_loader,
        val_data_loader=val_data_loader,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        log_dir=os.path.join('log', config.name))
    if config.checkpoint:
        trainer.load(config.checkpoint)
    if config.training:
        for epoch in range(trainer.epoch, config.num_epochs):
            trainer.train()
            trainer.validate()
            trainer.save()
            trainer.step()
    elif config.validation:
        trainer.validate()

def run(config):
    """Run training and testing."""
    # wandb
    if config["wandb"]:
        wandb.init(config=config,
                   project=config["project"],
                   group=config["group"],
                   name=config["run_name"])

    # Set random seeds
    manual_seed(config["seed"])
    cuda.manual_seed(config["seed"])

    # override device
    use_cuda = cuda.is_available()
    dev = device("cuda" if use_cuda else "cpu")
    config["device"] = dev

    # load datasets
    transformations = transforms.Compose([transforms.ToTensor()])
    train_dataset = datasets.MNIST(config["data_path"], train=True,
                                   download=True, transform=transformations)
    test_dataset = datasets.MNIST(config["data_path"], train=False,
                                  download=True, transform=transformations)

    # filter single label
    if config["use_single_label"]:
        idx = train_dataset.targets == config["single_label"]
        train_dataset.targets = train_dataset.targets[idx]
        train_dataset.data = train_dataset.data[idx]
        # test dataset
        idx = test_dataset.targets == config["single_label"]
        test_dataset.targets = test_dataset.targets[idx]
        test_dataset.data = test_dataset.data[idx]

    # define batchers
    d_kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_batcher = DataLoader(train_dataset, batch_size=config["batch_size"],
                               shuffle=True, **d_kwargs)
    test_batcher = DataLoader(test_dataset, batch_size=config["batch_size"],
                              shuffle=False, **d_kwargs)
    config["train_batcher"] = train_batcher
    config["test_batcher"] = test_batcher

    # create encoder
    n_classes = config["n_classes"]
    enc_kwargs = {"input_dim": config["input_dim"],
                  "hidden_dim": config["encoder_hidden_dim"],
                  "z_dim": config["z_dim"],
                  "act_func": getattr(torch, config["encoder_act_func"]),
                  "n_classes": n_classes}
    encoder = EncoderFactory.create(config["encoder_name"])
    encoder = encoder(**enc_kwargs)

    # create decoder
    dec_kwargs = {"z_dim": config["z_dim"],
                  "hidden_dim": config["decoder_hidden_dim"],
                  "output_dim": config["input_dim"],
                  "act_func": getattr(torch, config["decoder_act_func"]),
                  "pred_func": getattr(torch, config["decoder_pred_func"]),
                  "n_classes": n_classes}
    decoder = DecoderFactory.create(config["decoder_name"])
    decoder = decoder(**dec_kwargs)

    # assemble VAE
    reconstruction_loss = partial(getattr(functional, config["rec_loss"]),
                                  reduction="sum")
    vae_kwargs = {"encoder": encoder,
                  "decoder": decoder,
                  "recon_loss_func": reconstruction_loss,
                  "beta": config["beta"]}

    # select VAE model
    vae = VAEFactory.create(config["vae_name"])
    model = vae(**vae_kwargs)

    # send model to device and store it
    model = model.to(dev)
    config["model"] = model

    # print model summary
    print("----------------------------------------------------------------")
    print(f"Model: {model.name}")
    # summary(model, (1, config["input_dim"] + config["n_classes"]))

    # wandb
    if config["wandb"]:
        wandb.watch(model, log="all")

    # create the optimizer
    optimizer = getattr(optim, config["optimizer_name"])
    optimizer = optimizer(model.parameters(), lr=config["lr"])
    config["optimizer"] = optimizer

    # train and test
    print("Training...")
    print("----------------------------------------------------------------")

    # current date and time
    start = time()
    print(f"Start datetime: {datetime.now()}")
    print("----------------------------------------------------------------")

    # log control image
    test_losses = test(config)
    _, _, _, x, x_hat = test_losses
    num_img = config["test_num_img"]
    x_cat = cat((x[:num_img], x_hat[:num_img]), dim=0)
    grid = image_grid(x_cat, nrow=num_img)
    if config["wandb"]:
        wandb.log({"Test Example": wandb.Image(grid, caption="Epoch: 0")})

    for e in range(config["epochs"] + 1):
        train_losses = train(config)
        train_losses = [loss / len(train_batcher.dataset) for loss in train_losses]
        train_loss, train_recon_loss, train_kld_loss = train_losses

        # average (only the scalar losses are normalized; the image tensors
        # x and x_hat must not be divided by the dataset size)
        test_loss, test_recon_loss, test_kld_loss, x, x_hat = test(config)
        test_loss /= len(test_batcher.dataset)
        test_recon_loss /= len(test_batcher.dataset)
        test_kld_loss /= len(test_batcher.dataset)

        # print stuff
        print(f'Epoch {e}, Train Loss: {train_loss:.2f}, Test Loss: {test_loss:.2f}')

        # log images
        num_img = config["test_num_img"]
        x_cat = cat((x[:num_img], x_hat[:num_img]), dim=0)
        grid = image_grid(x_cat, nrow=num_img)
        # show_image_grid(x_cat, nrow=5)  # BUG: only works in test()?

        if not config["wandb"]:
            continue

        # wandb - merge into one logging operation
        wandb.log({"Train Loss - Total": train_loss,
                   "Train Loss - Reconstruction": train_recon_loss,
                   "Train Loss - KL Divergence": train_kld_loss,
                   "Test Loss - Total": test_loss,
                   "Test Loss - Reconstruction": test_recon_loss,
                   "Test Loss - KL Divergence": test_kld_loss,
                   "Test Example": wandb.Image(grid, caption=f"Epoch: {e}")})

    # save model with torch to wandb run dir (uploads after training is complete)
    # TODO: Save intermediary checkpoints instead
    if config["wandb"] and config["save_model"]:
        save({"model_name": model.name,
              "beta": config["beta"],
              "epoch": e,
              "model_state_dict": model.state_dict(),
              "optimizer_state_dict": optimizer.state_dict(),
              "z_dim": config["z_dim"],
              "train_loss": train_loss,
              "test_loss": test_loss},
             os.path.join(wandb.run.dir, "model_state.pt"))

    # current time
    print("----------------------------------------------------------------")
    print(f"End datetime: {datetime.now()}")
    print(f"Elapsed time: {round((time() - start) / 60.0, 2)} minutes")
    print("----------------------------------------------------------------")

def reload_model(config):
    """Rebuild the VAE from config and load its checkpointed weights."""
    # Set random seeds
    manual_seed(config["seed"])
    cuda.manual_seed(config["seed"])

    # override device
    use_cuda = cuda.is_available()
    dev = device("cuda" if use_cuda else "cpu")
    config["device"] = dev

    # create encoder
    n_classes = config["n_classes"]
    enc_kwargs = {"input_dim": config["input_dim"],
                  "hidden_dim": config["encoder_hidden_dim"],
                  "z_dim": config["z_dim"],
                  "act_func": getattr(torch, config["encoder_act_func"]),
                  "n_classes": n_classes}
    encoder = EncoderFactory.create(config["encoder_name"])
    encoder = encoder(**enc_kwargs)

    # create decoder
    dec_kwargs = {"z_dim": config["z_dim"],
                  "hidden_dim": config["decoder_hidden_dim"],
                  "output_dim": config["input_dim"],
                  "act_func": getattr(torch, config["decoder_act_func"]),
                  "pred_func": getattr(torch, config["decoder_pred_func"]),
                  "n_classes": n_classes}
    decoder = DecoderFactory.create(config["decoder_name"])
    decoder = decoder(**dec_kwargs)

    # assemble VAE
    reconstruction_loss = partial(getattr(functional, config["rec_loss"]),
                                  reduction="sum")
    vae_kwargs = {"encoder": encoder,
                  "decoder": decoder,
                  "recon_loss_func": reconstruction_loss,
                  "beta": config["beta"]}

    # select VAE model
    vae = VAEFactory.create(config["vae_name"])
    model = vae(**vae_kwargs)

    # load checkpoint and state dict
    checkpoint = load(config["checkpoint_path"], map_location=dev)
    model.load_state_dict(checkpoint["model_state_dict"])

    # send model to device
    model = model.to(dev)

    # print model summary
    # summary(model, (1, config["input_dim"]))

    print("----------------------------------------------------------------")
    print("Model Loaded")
    print("----------------------------------------------------------------")
    return model

def main(args):
    init_process_group(backend='nccl')
    with open(args.config) as file:
        config = json.load(file)
    config.update(vars(args))
    config = apply_dict(Dict, config)
    backends.cudnn.benchmark = True
    backends.cudnn.fastest = True
    world_size = distributed.get_world_size()
    global_rank = distributed.get_rank()
    device_count = cuda.device_count()
    local_rank = global_rank % device_count
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    cuda.manual_seed(config.seed)
    cuda.set_device(local_rank)
    train_dataset = datasets.MNIST(
        root=config.train_root,
        train=True,
        transform=transforms.Compose([
            transforms.Resize((32, 32)),
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,))
        ]),
        download=True)
    val_dataset = datasets.MNIST(
        root=config.val_root,
        train=False,
        transform=transforms.Compose([
            transforms.Resize((32, 32)),
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,))
        ]),
        download=True)
    train_sampler = utils.data.distributed.DistributedSampler(train_dataset)
    val_sampler = utils.data.distributed.DistributedSampler(val_dataset)
    train_data_loader = utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=config.local_batch_size,
        sampler=train_sampler,
        num_workers=config.num_workers,
        pin_memory=True)
    val_data_loader = utils.data.DataLoader(
        dataset=val_dataset,
        batch_size=config.local_batch_size,
        sampler=val_sampler,
        num_workers=config.num_workers,
        pin_memory=True)
    generator = Generator(
        latent_size=128,
        mapping_layers=2,
        min_resolution=4,
        max_resolution=32,
        max_channels=128,
        min_channels=16,
        out_channels=1).cuda()
    discriminator = Discriminator(
        in_channels=1,
        min_channels=16,
        max_channels=128,
        max_resolution=32,
        min_resolution=4,
        num_classes=1).cuda()
    inverter = Discriminator(
        in_channels=1,
        min_channels=16,
        max_channels=128,
        max_resolution=32,
        min_resolution=4,
        num_classes=128).cuda()
    config.global_batch_size = config.local_batch_size * world_size
    config.generator_optimizer.lr *= config.global_batch_size / config.global_batch_denom
    config.discriminator_optimizer.lr *= config.global_batch_size / config.global_batch_denom
    config.inverter_optimizer.lr *= config.global_batch_size / config.global_batch_denom
    generator_optimizer = optim.Adam(generator.parameters(), **config.generator_optimizer)
    discriminator_optimizer = optim.Adam(discriminator.parameters(), **config.discriminator_optimizer)
    inverter_optimizer = optim.Adam(inverter.parameters(), **config.inverter_optimizer)
    trainer = GANTrainer(
        latent_size=128,
        generator=generator,
        discriminator=discriminator,
        inverter=inverter,
        generator_optimizer=generator_optimizer,
        discriminator_optimizer=discriminator_optimizer,
        inverter_optimizer=inverter_optimizer,
        train_data_loader=train_data_loader,
        val_data_loader=val_data_loader,
        train_sampler=train_sampler,
        val_sampler=val_sampler,
        log_dir=os.path.join('log', config.name))
    if config.checkpoint:
        trainer.load(config.checkpoint)
    if config.training:
        for epoch in range(trainer.epoch, config.num_epochs):
            trainer.step(epoch)
            trainer.train()
            # trainer.validate()
            trainer.save()
    elif config.validation:
        trainer.validate()

import torch
import torch.nn as nn
import torch.cuda as cuda
import extractinputs as ei
import support_functions as sf
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import csv
import time

torch.manual_seed(2020)
cuda.manual_seed(2020)
np.random.seed(2020)

path = os.getcwd()
device = torch.device("cuda:0" if cuda.is_available() else "cpu")

# Import the DMS dataset
data = pd.read_csv(path + '/Input/DMS_CCS_ML_Dataset_Class.csv')
data.head()

# Un-normalized data
inputs_raw = data.drop(labels=['Combined_CCS', 'Compound'], axis='columns')
target = data['Combined_CCS']
names = data['Compound']

def main(arg):
    ##################################
    for key in arg:
        parse[key] = arg[key]
    global args
    args = SimpleNamespace(**parse)
    '''
    print('seed:{:}'.format(args.seed))
    print('dataset:{:}'.format(args.dataset))
    print('hidden_layers:{:}'.format(args.hidden_layers))
    print('first_neurons:{:}'.format(args.first_neurons))
    print('cross_link:{:}'.format(args.cross_link))
    print('fully_cross:{:}'.format(args.fully_cross))
    print()
    exit(0)
    '''
    ##################################
    seed = util.prepare(args)
    if not cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    cuda.manual_seed(seed)
    cuda.set_device(args.gpu)
    cudnn.benchmark = False
    cudnn.deterministic = True
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)
    logging.info('hidden_layers:{:}'.format(args.hidden_layers))
    logging.info('first_neurons:{:}'.format(args.first_neurons))
    logging.info('change:{:}'.format(args.change))
    logging.info('activate_func:{:}'.format(args.activate_func))
    logging.info('opt:{:}'.format(args.opt))
    logging.info('cross_link:{:}'.format(args.cross_link))
    logging.info('fully_cross:{:}'.format(args.fully_cross))
    model = Network(args)
    model = model.cuda()
    logging.info("param size = %fMB", util.count_parameters_in_MB(model))
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.7)
    train_data, valid_data = dataset.get_dataset(args.data, args.dataset)
    train_queue, valid_queue = dataset.get_data_loader(train_data, valid_data, 2)
    early_stop = util.EarlyStop(patience=10, delta=0.0001,
                                save_path=args.save + '/best.pt')
    for epoch in range(args.epochs):
        logging.info('epoch %d lr %.6f', epoch, scheduler.get_lr()[0])
        epoch_str = '[{:03d}/{:03d}]'.format(epoch, args.epochs)
        train_acc, train_obj = train(train_queue, model, criterion, optimizer, epoch_str)
        logging.info('train_acc %.2f', train_acc)
        valid_acc, valid_obj = infer(valid_queue, model, criterion, epoch_str)
        logging.info('valid_acc %.2f', valid_acc)
        if early_stop.check(train_obj, valid_acc, model):
            logging.info('Early stopping at {:}'.format(epoch))
            break
        scheduler.step()

def load_words_embed(pretrained_embed_model, vocab) -> torch.FloatTensor:
    l = len(vocab)
    embeds = torch.randn(l, 300)
    for i in range(0, l):
        try:
            embeds[i, :] = torch.from_numpy(
                pretrained_embed_model[vocab[i]]).view(1, 300)
        except KeyError:
            # out-of-vocabulary word: keep a random embedding
            embeds[i, :] = torch.randn(1, 300)
    return embeds


if __name__ == '__main__':
    manual_seed(100)
    epochs = 1000
    data, vocab, tags = load_data("../data")
    fasttext = FastText.load("../data/wiki.ar.gensim")
    embeds = load_words_embed(fasttext, vocab)
    net = BiLSTMWithCRF(len(vocab), tags, 300, 8, preinit_embedding=embeds)
    # bilstmcrf = bilstmcrf.cuda()
    opt = Adam(net.parameters(), lr=0.01, weight_decay=1e-3)
    print("Begin training")
    for epoch in range(epochs):
        for sentence, tgs in data:
            opt.zero_grad()
            sentence_in = prepare_sequence(sentence, vocab)
            targets = torch.LongTensor([tags[t] for t in tgs])
            neg_log_likelihood = net.neg_log_likelihood(sentence_in, targets)
            neg_log_likelihood.backward()
            opt.step()  # parameter update (assumed; the excerpt ends at backward())

def init_randseed(RANDOM_SEED=20190421):
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    torch.manual_seed(RANDOM_SEED)
    if cuda.is_available():
        cuda.manual_seed(RANDOM_SEED)

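# Added sketch: the helpers above fix the main-process RNGs, but DataLoader
# worker processes derive their own state. The pattern below, from PyTorch's
# reproducibility notes, propagates a deterministic seed into each worker
# (imports as in the surrounding snippets).
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2 ** 32  # base_seed + worker_id, set by DataLoader
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# usage (illustrative): DataLoader(dataset, num_workers=4, worker_init_fn=seed_worker)
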
import yaml

from torchvision import datasets  # added: datasets.MNIST is used below
from torchvision import transforms
from torch.utils.data import DataLoader
from torch import device
from torch import cuda
from torch import manual_seed

from neu_vae.training import reload_model

with open("../training/config.yaml") as f:
    config = yaml.safe_load(f)

# Set random seeds
if config["seed"]:
    manual_seed(config["seed"])
    cuda.manual_seed(config["seed"])

# override device
use_cuda = cuda.is_available()
dev = device("cuda" if use_cuda else "cpu")
config["device"] = dev

# load test dataset and filter single label
transformations = transforms.Compose([transforms.ToTensor()])
test_dataset = datasets.MNIST(config["data_path"], train=False,
                              download=True, transform=transformations)
idx = test_dataset.targets == config["single_label"]
test_dataset.targets = test_dataset.targets[idx]
test_dataset.data = test_dataset.data[idx]  # assumed, mirroring the filtering in run()

def print_max_nbd(t):
    "Print values in a neighborhood of the max value in t"
    x, y = [v[0] for v in argmax(t)]
    print(f'max coords at {(x, y)}')
    print(t[max(0, x - 3):min(x + 3, t.size(0)),
            max(0, y - 3):min(y + 3, t.size(1))])


class Flatten(Module):
    "Simply flattens all but the batch (first) dimension of the input"

    def forward(self, i):
        return i.view(i.size(0), -1)


TP.manual_seed(0)  # Note: Messing with the current RNG state.

# Shape arguments passed to `InformationDropoutLayer`'s.
INFO_ARGS = [
    dict(output_size=(32, 38, 38), in_channels=2, out_channels=32,
         kernel_size=2, stride=2, max_alpha=0.),  # 0
    dict(output_size=(64, 10, 10), in_channels=32, out_channels=64,
         kernel_size=2, stride=2, max_alpha=0.),  # 1

def main(args):
    init_process_group(backend='nccl')
    with open(args.config) as file:
        config = apply_dict(Dict, json.load(file))
    config.update(vars(args))
    config.update(dict(
        world_size=distributed.get_world_size(),
        global_rank=distributed.get_rank(),
        device_count=cuda.device_count(),
        local_rank=distributed.get_rank() % cuda.device_count()))
    print(f'config: {config}')
    backends.cudnn.benchmark = True
    backends.cudnn.fastest = True
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    cuda.manual_seed(config.seed)
    cuda.set_device(config.local_rank)
    train_dataset = ImageNet(
        root=config.train_root,
        meta=config.train_meta,
        transform=transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(brightness=0.4, contrast=0.4,
                                   saturation=0.4, hue=0.2),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                 std=(0.229, 0.224, 0.225))
        ]))
    val_dataset = ImageNet(
        root=config.val_root,
        meta=config.val_meta,
        transform=transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                 std=(0.229, 0.224, 0.225)),
        ]))
    train_sampler = utils.data.distributed.DistributedSampler(train_dataset)
    val_sampler = utils.data.distributed.DistributedSampler(val_dataset)
    train_data_loader = utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=config.local_batch_size,
        sampler=train_sampler,
        num_workers=config.num_workers,
        pin_memory=True)
    val_data_loader = utils.data.DataLoader(
        dataset=val_dataset,
        batch_size=config.local_batch_size,
        sampler=val_sampler,
        num_workers=config.num_workers,
        pin_memory=True)
    model = SuperMobileNetV2(
        first_conv_param=Dict(in_channels=3, out_channels=32, kernel_size=3, stride=2),
        middle_conv_params=[
            Dict(in_channels=32, out_channels=16, expand_ratio_list=[3, 6],
                 kernel_size_list=[3, 5], blocks=1, stride=1),
            Dict(in_channels=16, out_channels=24, expand_ratio_list=[3, 6],
                 kernel_size_list=[3, 5], blocks=2, stride=2),
            Dict(in_channels=24, out_channels=32, expand_ratio_list=[3, 6],
                 kernel_size_list=[3, 5], blocks=3, stride=2),
            Dict(in_channels=32, out_channels=64, expand_ratio_list=[3, 6],
                 kernel_size_list=[3, 5], blocks=4, stride=2),
            Dict(in_channels=64, out_channels=96, expand_ratio_list=[3, 6],
                 kernel_size_list=[3, 5], blocks=3, stride=1),
            Dict(in_channels=96, out_channels=160, expand_ratio_list=[3, 6],
                 kernel_size_list=[3, 5], blocks=3, stride=2),
            Dict(in_channels=160, out_channels=320, expand_ratio_list=[3, 6],
                 kernel_size_list=[3, 5], blocks=1, stride=1),
        ],
        last_conv_param=Dict(in_channels=320, out_channels=1280, kernel_size=1, stride=1),
        drop_prob=config.drop_prob,
        num_classes=1000).cuda()
    for tensor in model.state_dict().values():
        distributed.broadcast(tensor, 0)
    criterion = CrossEntropyLoss(config.label_smoothing)
    config.global_batch_size = config.local_batch_size * config.world_size
    config.lr = config.lr * config.global_batch_size / config.global_batch_denom
    optimizer = torch.optim.RMSprop(
        params=model.weights(),
        lr=config.lr,
        alpha=config.alpha,
        eps=config.eps,
        weight_decay=config.weight_decay,
        momentum=config.momentum)
    lr_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer=optimizer,
        milestones=config.milestones,
        gamma=config.gamma)
    last_epoch = -1
    global_step = 0
    if config.checkpoint:
        checkpoint = Dict(torch.load(config.checkpoint))
        model.load_state_dict(checkpoint.model_state_dict)
        optimizer.load_state_dict(checkpoint.optimizer_state_dict)
        last_epoch = checkpoint.last_epoch
        global_step = checkpoint.global_step
    elif config.global_rank == 0:
        if os.path.exists(config.checkpoint_directory):
            shutil.rmtree(config.checkpoint_directory)
        if os.path.exists(config.event_directory):
            shutil.rmtree(config.event_directory)
        os.makedirs(config.checkpoint_directory)
        os.makedirs(config.event_directory)
    if config.global_rank == 0:
        summary_writer = SummaryWriter(config.event_directory)
    if config.training:
        for epoch in range(last_epoch + 1, config.num_epochs):
            train_sampler.set_epoch(epoch)
            lr_scheduler.step(epoch)
            model.train()
            for local_step, (images, targets) in enumerate(train_data_loader):
                step_begin = time.time()
                images = images.cuda(non_blocking=True)
                targets = targets.cuda(non_blocking=True)
                logits = model(images)
                loss = criterion(logits, targets) / config.world_size
                optimizer.zero_grad()
                loss.backward()
                for parameter in model.parameters():
                    distributed.all_reduce(parameter.grad)
                optimizer.step()
                predictions = torch.argmax(logits, dim=1)
                accuracy = torch.mean((predictions == targets).float()) / config.world_size
                for tensor in [loss, accuracy]:
                    distributed.all_reduce(tensor)
                step_end = time.time()
                if config.global_rank == 0:
                    summary_writer.add_scalars(
                        main_tag='loss',
                        tag_scalar_dict=dict(train=loss),
                        global_step=global_step)
                    summary_writer.add_scalars(
                        main_tag='accuracy',
                        tag_scalar_dict=dict(train=accuracy),
                        global_step=global_step)
                    print(f'[training] epoch: {epoch} global_step: {global_step} local_step: {local_step} '
                          f'loss: {loss:.4f} accuracy: {accuracy:.4f} [{step_end - step_begin:.4f}s]')
                global_step += 1
            if config.global_rank == 0:
                torch.save(
                    dict(model_state_dict=model.state_dict(),
                         optimizer_state_dict=optimizer.state_dict(),
                         last_epoch=epoch,
                         global_step=global_step),
                    f'{config.checkpoint_directory}/epoch_{epoch}')
            if config.validation:
                model.eval()
                with torch.no_grad():
                    average_loss = 0
                    average_accuracy = 0
                    for local_step, (images, targets) in enumerate(val_data_loader):
                        images = images.cuda(non_blocking=True)
                        targets = targets.cuda(non_blocking=True)
                        logits = model(images)
                        loss = criterion(logits, targets) / config.world_size
                        predictions = torch.argmax(logits, dim=1)
                        accuracy = torch.mean((predictions == targets).float()) / config.world_size
                        for tensor in [loss, accuracy]:
                            distributed.all_reduce(tensor)
                        average_loss += loss
                        average_accuracy += accuracy
                    average_loss /= (local_step + 1)
                    average_accuracy /= (local_step + 1)
                if config.global_rank == 0:
                    summary_writer.add_scalars(
                        main_tag='loss',
                        tag_scalar_dict=dict(val=average_loss),
                        global_step=global_step)
                    summary_writer.add_scalars(
                        main_tag='accuracy',
                        tag_scalar_dict=dict(val=average_accuracy),
                        global_step=global_step)
                    print(f'[validation] epoch: {epoch} loss: {average_loss:.4f} '
                          f'accuracy: {average_accuracy:.4f}')
    elif config.validation:
        model.eval()
        with torch.no_grad():
            average_loss = 0
            average_accuracy = 0
            for local_step, (images, targets) in enumerate(val_data_loader):
                images = images.cuda(non_blocking=True)
                targets = targets.cuda(non_blocking=True)
                logits = model(images)
                loss = criterion(logits, targets) / config.world_size
                predictions = torch.argmax(logits, dim=1)
                accuracy = torch.mean((predictions == targets).float()) / config.world_size
                for tensor in [loss, accuracy]:
                    distributed.all_reduce(tensor)
                average_loss += loss
                average_accuracy += accuracy
            average_loss /= (local_step + 1)
            average_accuracy /= (local_step + 1)
        if config.global_rank == 0:
            print(f'[validation] epoch: {last_epoch} loss: {average_loss:.4f} '
                  f'accuracy: {average_accuracy:.4f}')
    if config.global_rank == 0:
        summary_writer.close()

print('==> Init variables..')
use_cuda = cuda.is_available()
best_accuracy = 0  # best testing accuracy
best_epoch = 0  # epoch with the best testing accuracy
start_epoch = 0  # start from epoch 0 or last checkpoint epoch
# netArch = "resnet34"
netArch = "resnet50"
save_directory = os.path.join("checkpoint", netArch)
if not os.path.isdir(save_directory):
    os.makedirs(save_directory)

# Init seed
print('==> Init seed..')
torch.manual_seed(args.seed)  # Sets the seed for generating random numbers
if use_cuda:
    cuda.manual_seed(args.seed)  # Sets the seed for generating random numbers for the current GPU

# Calculate mean and std
print('==> Prepare mean and std..')
# data_mean, data_std = getMeanStdByBatch(datapath, args.train_batch_size)  # fengxi
data_mean = [0.331948, 0.33171957, 0.29903654]
data_std = [0.28179781, 0.27919075, 0.27801905]
print('\tdata_mean = ', data_mean)
print('\tdata_std = ', data_std)

# Prepare training transform
print('==> Prepare training transform..')
training_transform = transforms.Compose([
    # torchvision.transforms.RandomAffine(degrees, translate=None, scale=None, shear=None, resample=False, fillcolor=0)

def main():
    # Control the random seeds
    torch.manual_seed(SEED)
    cuda.manual_seed(SEED)
    cuda.manual_seed_all(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.
    backends.cudnn.benchmark = False
    backends.cudnn.deterministic = True
    print(">> Set random seed: {}".format(SEED))

    # Write data filenames and labels to a txt file.
    read_filenames_and_labels_to_txt(CANDIDATE_ROOT, "gt.txt")

    # Conduct data augmentation first
    create_patches(CANDIDATE_ROOT, PATCH_ROOT)

    # Get the size of the unlabeled data pool to build a list of indices
    indices = list(range(get_sample_num(PATCH_ROOT)))

    # Randomly select K samples in the first cycle
    random.shuffle(indices)
    labeled_indices = indices[:K]
    unlabeled_indices = indices[K:]

    # Load training and testing data
    filenames, labels = load_train_data(CANDIDATE_ROOT, PATCH_ROOT, labeled_indices)
    train_dataset = MyDataset(filenames, labels, transform=image_transform)
    train_loader = DataLoader(train_dataset, batch_size=BATCH,
                              shuffle=True, pin_memory=True)
    print("Current training dataset size: {}".format(len(train_dataset)))

    filenames, labels = load_test_data(TEST_ROOT)
    test_dataset = MyDataset(filenames, labels, transform=image_transform)
    test_loader = DataLoader(test_dataset,
                             batch_size=BATCH,
                             sampler=SequentialSampler(range(len(test_dataset))),
                             pin_memory=True)
    dataloaders = {'train': train_loader, 'test': test_loader}

    # Set the device for running the network
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build the network structure
    classifier_network = ResNet18(num_classes=23)
    classifier_network.to(device)
    loss_network = LossNet()
    loss_network.to(device)

    # Load pre-trained weight of the classifier network
    classifier_dict = classifier_network.state_dict()
    pretrained_dict = torch.load("resnet18.pth")
    parameter_dict = {k: v for k, v in pretrained_dict.items()
                      if k in classifier_dict}
    classifier_dict.update(parameter_dict)
    classifier_network.load_state_dict(classifier_dict)

    # Integration
    model = {'classifier': classifier_network, 'module': loss_network}

    # Set the loss criterion of the training procedure
    criterion = nn.CrossEntropyLoss(reduction='none')

    print(">> Start active learning!")
    for cycle in range(CYCLES):
        # for each cycle, we need new optimizers and learning rate schedulers
        optim_classifier = optim.SGD(model['classifier'].parameters(),
                                     lr=LR_classifier,
                                     momentum=MOMENTUM,
                                     weight_decay=WDECAY)
        optim_loss = optim.SGD(model['module'].parameters(),
                               lr=LR_loss,
                               momentum=MOMENTUM,
                               weight_decay=WDECAY)
        optimizers = {'classifier': optim_classifier, 'loss': optim_loss}
        scheduler_classifier = lr_scheduler.MultiStepLR(optim_classifier,
                                                        milestones=MILESTONE)
        scheduler_loss = lr_scheduler.MultiStepLR(optim_loss, milestones=MILESTONE)
        schedulers = {'classifier': scheduler_classifier, 'module': scheduler_loss}

        # Training
        train(model, criterion, optimizers, schedulers, dataloaders, EPOCH, device)
        acc = test(model, dataloaders, device, mode='test')
        print('Cycle {}/{} || Label set size {}: Test acc {}'.format(
            cycle + 1, CYCLES, len(labeled_indices), acc))

        # Random subset sampling to explore the data pool
        random.shuffle(unlabeled_indices)
        subset_indices = unlabeled_indices[:SUBSET]

        # Choose the active learning strategy
        selected_indices = active_sampling(strategy="hybrid", model=model,
                                           indices=subset_indices)

        # Add new labeled samples to the labeled dataset
        labeled_indices.extend(selected_indices)

        # Remove labeled samples from the unlabeled data pool
        for i in selected_indices:
            unlabeled_indices.remove(i)

        # Update the training dataset
        filenames, labels = load_train_data(CANDIDATE_ROOT, PATCH_ROOT, labeled_indices)
        train_dataset = MyDataset(filenames, labels, transform=image_transform)
        print("Training data number: ", len(train_dataset))
        dataloaders['train'] = DataLoader(train_dataset,
                                          batch_size=BATCH,
                                          pin_memory=True,
                                          shuffle=True)

        # Save the model of the current cycle
        torch.save(model["classifier"].state_dict(),
                   'checkpoints/active_resnet18_cycle{}.pth'.format(cycle))