def _setup_gpus(self, seed: float, detect_anomaly: bool):
    """Configure CUDA/cuDNN for this process and log the GPU/host environment.

    Args:
        seed: random seed forwarded to ``utils.setup_cuda`` along with this
            process's ``local_rank`` (per-rank seeding).
        detect_anomaly: enable ``torch.autograd`` anomaly detection
            (useful for debugging NaNs; slows training down).
    """
    utils.setup_cuda(seed, self.local_rank)

    torch.autograd.set_detect_anomaly(detect_anomaly)
    self._log_info({'set_detect_anomaly': detect_anomaly,
                    'is_anomaly_enabled': torch.is_anomaly_enabled()})

    self._log_info({
        'gpu_names': utils.cuda_device_names(),
        'gpu_count': torch.cuda.device_count(),
        # single-lookup idiom; replaces the original
        # "key in os.environ" double lookup
        'CUDA_VISIBLE_DEVICES': os.environ.get('CUDA_VISIBLE_DEVICES',
                                               'NotSet'),
        'cudnn.enabled': cudnn.enabled,
        'cudnn.benchmark': cudnn.benchmark,
        'cudnn.deterministic': cudnn.deterministic,
        'cudnn.version': cudnn.version()
    })

    # host-side resources, for run-reproducibility records
    self._log_info({'memory': str(psutil.virtual_memory())})
    self._log_info({'CPUs': str(psutil.cpu_count())})
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from archai.algos.darts.mixed_op import MixedOp
from archai.common import utils, timing

import torch
import torch.backends.cudnn as cudnn
import numpy as np

# Micro-benchmark: time 1000 forward passes of the DARTS MixedOp on GPU
# and dump the per-op timings collected by archai.common.timing.
utils.setup_cuda(2, local_rank=0)  # seed=2, single-process run

device = torch.device('cuda')

mop = MixedOp(16, 1).to(device=device)
# architecture weights (one per candidate op) and a batch of feature maps
a = torch.randn(8, requires_grad=True).to(device=device)
x = torch.randn((64, 16, 32, 32), requires_grad=True).to(device=device)

for i in range(1000):
    y = mop(x, a)

timing.print_all_timings()

# NOTE(review): this results literal was unterminated in the original
# (syntax error); closed here. Numbers preserved verbatim.
"""
Without cudnn setup, requires_grad=False:
3: 0.90ms for 1000 calls [stddev: 9.08, min: 0.49, max: 287.68]
4: 0.57ms for 1000 calls [stddev: 0.16, min: 0.48, max: 3.89]
6: 0.32ms for 1000 calls [stddev: 0.07, min: 0.27, max: 1.22]
5: 0.32ms for 1000 calls [stddev: 0.06, min: 0.27, max: 0.56]
0: 0.29ms for 1000 calls [stddev: 0.09, min: 0.19, max: 1.87]
1: 0.19ms for 1000 calls [stddev: 0.05, min: 0.16, max: 1.19]
"""
def main():
    """CLI entry point: train one Nasbench101 model on cifar with plain PyTorch.

    Parses arguments, resolves data/output directories (honoring the
    PT_DATA_DIR / PT_OUTPUT_DIR environment overrides used on cluster runs),
    sets up logging, trains the selected model and logs final metrics.
    """
    parser = argparse.ArgumentParser(description='Pytorch cifar training')
    parser.add_argument('--experiment-name', '-n', default='train_pytorch')
    parser.add_argument('--experiment-description', '-d',
                        default='Train cifar using pure PyTorch code')  # fixed typo "usin"
    parser.add_argument('--epochs', '-e', type=int, default=108)
    parser.add_argument('--model-name', '-m', default='5')
    parser.add_argument('--device', default='',
                        help='"cuda" or "cpu" or "" in which case use cuda if available')
    parser.add_argument('--train-batch-size', '-b', type=int, default=256)
    parser.add_argument('--test-batch-size', type=int, default=256)
    parser.add_argument('--seed', '-s', type=float, default=42)
    # --half with no value means True; "--half true/false" also accepted
    parser.add_argument('--half', type=lambda x: x.lower() == 'true',
                        nargs='?', const=True, default=False)
    parser.add_argument('--cutout', type=int, default=0)
    parser.add_argument('--grad-clip', type=float, default=5.0)
    parser.add_argument('--datadir', default='',
                        help='where to find dataset files, default is ~/torchvision_data_dir')
    parser.add_argument('--outdir', default='',
                        help='where to put results, default is ~/logdir')
    parser.add_argument('--loader-workers', type=int, default=-1,
                        help='number of thread/workers for data loader (-1 means auto)')

    args = parser.parse_args()

    if not args.datadir:
        args.datadir = common.default_dataroot()
    # nasbench pickle dir may be overridden independently of the cifar datadir
    nsds_dir = args.datadir
    if os.environ.get('PT_DATA_DIR', ''):
        nsds_dir = os.environ.get('PT_DATA_DIR')
    if not args.outdir:
        args.outdir = os.environ.get('PT_OUTPUT_DIR', '')
    if not args.outdir:
        args.outdir = os.path.join('~/logdir', 'nasbench101',
                                   args.experiment_name)

    assert isinstance(nsds_dir, str)

    expdir = utils.full_path(args.outdir)
    os.makedirs(expdir, exist_ok=True)

    utils.setup_cuda(args.seed)

    datadir = utils.full_path(args.datadir)
    os.makedirs(datadir, exist_ok=True)

    utils.create_logger(filepath=os.path.join(expdir, 'logs.log'))

    # log config for reference
    logging.info(f'exp_name="{args.experiment_name}", '
                 f'exp_desc="{args.experiment_description}"')
    logging.info(f'model_name="{args.model_name}", seed={args.seed}, '
                 f'epochs={args.epochs}')
    logging.info(f'half={args.half}, cutout={args.cutout}')
    logging.info(f'datadir="{datadir}"')
    logging.info(f'expdir="{expdir}"')
    logging.info(f'train_batch_size={args.train_batch_size}')

    if args.device:
        device = torch.device(args.device)
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    nsds = Nasbench101Dataset(
        os.path.join(nsds_dir, 'nasbench_ds', 'nasbench_full.pkl'))

    # load data just before train start so any errors so far is not delayed
    train_dl, val_dl, test_dl = get_data(
        datadir=datadir,
        train_batch_size=args.train_batch_size,
        test_batch_size=args.test_batch_size,
        train_num_workers=args.loader_workers,
        test_num_workers=args.loader_workers,
        cutout=args.cutout)

    model_id = int(args.model_name)  # 5, 401, 4001, 40001, 400001
    epochs = args.epochs

    net = create_model(nsds, model_id, device, args.half)
    crit = create_crit(device, args.half)
    optim, sched, sched_on_epoch = optim_sched_darts(
        net, epochs)  # optim_sched_darts optim_sched_paper

    train_metrics = train(epochs, train_dl, val_dl, net, device, crit, optim,
                          sched, sched_on_epoch, args.half, False,
                          grad_clip=args.grad_clip)
    test_acc = test(net, test_dl, device, args.half)
    log_metrics(expdir, f'metrics_{model_id}', train_metrics, test_acc, args,
                nsds, model_id)
from archai.darts.mixed_op import MixedOp
from archai.common import utils, timing

import torch
import torch.backends.cudnn as cudnn
import numpy as np

# Micro-benchmark: time 1000 forward passes of the DARTS MixedOp on GPU
# and dump the per-op timings collected by archai.common.timing.
utils.setup_cuda(2)  # seed=2

device = torch.device('cuda')

mop = MixedOp(16, 1).to(device=device)
# architecture weights (one per candidate op) and a batch of feature maps
a = torch.randn(8, requires_grad=True).to(device=device)
x = torch.randn((64, 16, 32, 32), requires_grad=True).to(device=device)

for i in range(1000):
    y = mop(x, a)

timing.print_all_timings()

# NOTE(review): this results literal was unterminated in the original
# (syntax error); closed here. Numbers preserved verbatim.
"""
Without cudnn setup, requires_grad=False:
3: 0.90ms for 1000 calls [stddev: 9.08, min: 0.49, max: 287.68]
4: 0.57ms for 1000 calls [stddev: 0.16, min: 0.48, max: 3.89]
6: 0.32ms for 1000 calls [stddev: 0.07, min: 0.27, max: 1.22]
5: 0.32ms for 1000 calls [stddev: 0.06, min: 0.27, max: 0.56]
0: 0.29ms for 1000 calls [stddev: 0.09, min: 0.19, max: 1.87]
1: 0.19ms for 1000 calls [stddev: 0.05, min: 0.16, max: 1.19]
7: 0.09ms for 1000 calls [stddev: 0.02, min: 0.07, max: 0.16]
2: 0.05ms for 1000 calls [stddev: 0.01, min: 0.04, max: 0.11]
"""