def get_env_info():
    """Print Python/PyTorch/CUDA/cuDNN version information and, when a GPU
    is present, device and memory statistics.

    Side effect: when CUDA is available, cuDNN is configured for
    deterministic, non-benchmark execution (reproducible results at some
    efficiency cost).
    """
    import sys
    print('Python version={}'.format(sys.version))
    print('PyTorch version={}'.format(torch.__version__))
    flag = torch.cuda.is_available()
    print('torch.cuda.is_available()={}'.format(flag))
    if flag:
        from torch.backends import cudnn
        cudnn.enabled = True
        # benchmark=False + deterministic=True: fixes the chosen algorithms
        # so repeated runs give identical results.
        cudnn.benchmark = False
        cudnn.deterministic = True
        print('torch.cuda.current_device()={}'.format(torch.cuda.current_device()))
        print('torch.cuda.device_count()={}'.format(torch.cuda.device_count()))
        print('torch.cuda.get_device_name(0)={}'.format(torch.cuda.get_device_name(0)))
        print('torch.backends.cudnn.version()={}'.format(cudnn.version()))
        print('torch.version.cuda={}'.format(torch.version.cuda))
        print('Memory Usage:')
        print('Allocated:', round(torch.cuda.memory_allocated(0) / 1024**3, 1), 'GB')
        # memory_cached() was deprecated (and later removed) in favor of
        # memory_reserved(), which reports the same quantity.
        print('Cached: ', round(torch.cuda.memory_reserved(0) / 1024**3, 1), 'GB')
def dump_system_info(file_path: str):
    """Write a full environment report to *file_path*, replacing any
    existing file: torch collect_env, platform info, per-GPU details,
    CUDA/cuDNN versions, and the installed package list.
    """
    if os.path.isfile(file_path):
        os.remove(file_path)
    with open(file_path, 'w+') as out:
        # PyTorch's own environment collector first.
        out.write(headline('torch collect_env'))
        out.write(collect_env.get_pretty_env_info())
        # Host OS and interpreter.
        out.write(headline('system info'))
        out.write('platform: %s\n' % platform.platform())
        out.write('python: %s\n' % platform.python_version())
        # Per-GPU details; GPUtil raises ValueError when it cannot query.
        out.write(headline('gpus'))
        try:
            for idx, gpu in enumerate(GPUtil.getGPUs()):
                out.write('gpu %d\n' % idx)
                for attr in ('id', 'driver', 'name', 'memoryTotal'):
                    out.write('\t%s=%s\n' % (attr, gpu.__dict__[attr]))
        except ValueError as e:
            out.write("%s" % repr(e))
        # CUDA toolkit version probed three different ways, then cuDNN.
        out.write(headline('cuda / cudnn'))
        out.write('cuda via cat: %s\n' % get_command_result('cat /usr/local/cuda/version.txt'))
        out.write('cuda via dpkg: %s\n' % get_command_result('dpkg -l | grep cuda-toolkit'))
        out.write('cuda via nvcc: %s\n' % get_command_result('nvcc --version'))
        out.write('cudnn version: %s\n' % cudnn.version())
        # Installed packages, local environment only.
        out.write(headline('pip freeze'))
        for requirement in freeze(local_only=True):
            out.write('%s\n' % requirement)
def _setup_gpus(self, seed: float, detect_anomaly: bool):
    """Configure CUDA for this run (seed, anomaly detection) and log the
    GPU, cuDNN, and host environment state."""
    utils.setup_cuda(seed, self.local_rank)
    torch.autograd.set_detect_anomaly(detect_anomaly)
    self._log_info({'set_detect_anomaly': detect_anomaly,
                    'is_anomaly_enabled': torch.is_anomaly_enabled()})
    gpu_info = {
        'gpu_names': utils.cuda_device_names(),
        'gpu_count': torch.cuda.device_count(),
        'CUDA_VISIBLE_DEVICES': os.environ.get('CUDA_VISIBLE_DEVICES', 'NotSet'),
        'cudnn.enabled': cudnn.enabled,
        'cudnn.benchmark': cudnn.benchmark,
        'cudnn.deterministic': cudnn.deterministic,
        'cudnn.version': cudnn.version(),
    }
    self._log_info(gpu_info)
    self._log_info({'memory': str(psutil.virtual_memory())})
    self._log_info({'CPUs': str(psutil.cpu_count())})
def run(args):
    """Benchmark driver: print environment info, time the GPU and/or CPU
    runs via time_speeds(), and append the total elapsed time to
    args.logfile."""
    print("OS: {}, pytorch version: {}".format(os.name, torch.__version__))
    if torch.cuda.is_available():
        from torch.backends import cudnn
        device_name = torch.cuda.get_device_name(torch.cuda.current_device())
        print("Device: {}, CUDA: {}, CuDNN: {}".format(device_name, cudnn.cuda, cudnn.version()))
    print("Test setup: ({},{},{})->({},{},{})".format(
        args.batch_len, args.batch_size, args.dim_in,
        args.batch_len, args.batch_size, args.dim_out))
    t_begin = time.time()
    if torch.cuda.is_available() and not args.no_gpu:
        print("GPU Results")
        time_speeds(args, cuda=True, number=args.gpu_number)
    if not args.no_cpu:
        print("CPU Results")
        time_speeds(args, cuda=False, number=args.cpu_number)
    elapsed = time.time() - t_begin
    res = "Testing took {} sec".format(elapsed)
    print(res)
    with open(args.logfile, 'a') as log_file:
        log_file.write(res)
        log_file.write('\n')
def main():
    """Open the per-seed log file, record run/environment information, and
    dispatch the training procedure for the selected dataset."""
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    log = open(os.path.join(args.save_path, 'seed-{:}-log.txt'.format(args.manualSeed)), 'w')
    print_log('Save Path : {:}'.format(args.save_path), log)
    print_log(dict(args._get_kwargs()), log)
    print_log("Random Seed : {:}".format(args.manualSeed), log)
    print_log("Python version : {:}".format(sys.version.replace('\n', ' ')), log)
    print_log("Torch version : {:}".format(torch.__version__), log)
    print_log("CUDA version : {:}".format(torch.version.cuda), log)
    print_log("cuDNN version : {:}".format(cudnn.version()), log)
    print_log("Num of GPUs : {:}".format(torch.cuda.device_count()), log)
    args.dataset = args.dataset.lower()
    config = load_config(args.model_config)
    genotype = models[args.arch]
    print_log('configuration : {:}'.format(config), log)
    print_log('genotype : {:}'.format(genotype), log)
    # clear GPU cache before the heavy work starts
    torch.cuda.empty_cache()
    if args.dataset == 'imagenet':
        main_procedure_imagenet(config, args.data_path, args, genotype,
                                args.init_channels, args.layers, None, log)
    else:
        main_procedure(config, args.dataset, args.data_path, args, genotype,
                       args.init_channels, args.layers, None, log)
    log.close()
def train(args):
    """Train GLKS on Holl-E: build vocabulary/embeddings/dataset (using
    cached .pkl files when present), then run 20 epochs, switching the
    embedding layer to trainable at epoch 5 and serializing each epoch."""
    if torch.cuda.is_available():
        torch.distributed.init_process_group(backend='NCCL', init_method='env://')
    cudnn.enabled = True
    cudnn.benchmark = True
    cudnn.deterministic = True
    print(torch.__version__)
    print(torch.version.cuda)
    print(cudnn.version())
    init_seed(123456)

    batch_size = 32
    output_path = data_path + base_output_path
    vocab2id, id2vocab, id2freq = load_vocab(
        data_path + 'holl_input_output.' + version + '.vocab', t=min_vocab_freq)

    glove_file = data_path + 'glove.6B.300d.txt'
    if not os.path.exists(glove_file + '.dat'):
        prepare_embeddings(glove_file)
    emb_matrix = load_embeddings(glove_file, id2vocab, embedding_size)

    # Prefer the pre-built dataset pickle; fall back to parsing the json.
    train_cache = data_path + 'holl-train.' + version + '.pkl'
    if os.path.exists(train_cache):
        train_dataset = torch.load(train_cache)
    else:
        train_dataset = GLKSDataset([data_path + 'holl-train.' + version + '.json'],
                                    vocab2id, min_window_size, num_windows, knowledge_len)

    model = GLKS(min_window_size, num_windows, embedding_size, hidden_size,
                 vocab2id, id2vocab, max_dec_len=70, beam_width=1,
                 emb_matrix=emb_matrix)
    init_params(model, escape='embedding')
    model_optimizer = optim.Adam(model.parameters())
    trainer = DefaultTrainer(model, args.local_rank)

    for epoch in range(20):
        if epoch == 5:
            # unfreeze/fine-tune the embedding layer from epoch 5 onwards
            train_embedding(model)
        trainer.train_epoch('ds_mle_mcc_train', train_dataset, collate_fn,
                            batch_size, epoch, model_optimizer)
        trainer.serialize(epoch, output_path=output_path)
def test(args):
    """Evaluate every saved GLKS checkpoint (epochs 0-19) on the Holl-E dev
    and test splits; dev results use epoch index i, test results 100+i."""
    cudnn.enabled = True
    cudnn.benchmark = True
    cudnn.deterministic = True
    print(torch.__version__)
    print(torch.version.cuda)
    print(cudnn.version())
    init_seed(123456)

    batch_size = 64
    output_path = data_path + base_output_path
    vocab2id, id2vocab, id2freq = load_vocab(
        data_path + 'holl_input_output.' + version + '.vocab', t=min_vocab_freq)

    # Prefer the pre-built dataset pickles; fall back to parsing the json.
    dev_cache = data_path + 'holl-dev.' + version + '.pkl'
    if os.path.exists(dev_cache):
        dev_dataset = torch.load(dev_cache)
    else:
        dev_dataset = GLKSDataset([data_path + 'holl-dev.' + version + '.json'],
                                  vocab2id, min_window_size, num_windows, knowledge_len)

    test_cache = data_path + 'holl-test.' + version + '.pkl'
    if os.path.exists(test_cache):
        test_dataset = torch.load(test_cache)
    else:
        test_dataset = GLKSDataset([data_path + 'holl-test.' + version + '.json'],
                                   vocab2id, min_window_size, num_windows, knowledge_len)

    for i in range(20):
        print('epoch', i)
        file = output_path + 'model/' + str(i) + '.pkl'
        if not os.path.exists(file):
            continue  # checkpoint for this epoch was never written
        model = GLKS(min_window_size, num_windows, embedding_size, hidden_size,
                     vocab2id, id2vocab, max_dec_len=70, beam_width=1)
        model.load_state_dict(torch.load(file))
        trainer = DefaultTrainer(model, None)
        trainer.test('test', dev_dataset, collate_fn, batch_size, i,
                     output_path=output_path)
        trainer.test('test', test_dataset, collate_fn, batch_size, 100 + i,
                     output_path=output_path)
def init_torch_seeds(seed=0):
    """Seed the torch RNG and pick a cuDNN mode.

    seed == 0 selects the reproducible configuration (deterministic
    algorithms, no benchmarking); any other seed trades reproducibility
    for speed. Prints the resulting version/flag summary.
    """
    import torch.backends.cudnn as cudnn
    torch.manual_seed(seed)
    reproducible = seed == 0
    # slower but repeatable when reproducible, faster otherwise
    cudnn.benchmark = not reproducible
    cudnn.deterministic = reproducible
    print('PyTorch version {}'.format(torch.__version__))
    print('CUDA version {}'.format(torch.version.cuda))
    print('cuDNN version {}'.format(cudnn.version()))
    print('cuDNN deterministic {}'.format(cudnn.deterministic))
    print('cuDNN benchmark {}'.format(cudnn.benchmark))
def backward_weight(fn, input, hx, output, weight, grad_weight):
    # Legacy cuDNN RNN binding: compute the gradient w.r.t. the RNN weights
    # via cudnnRNNBackwardWeights, then copy it into grad_weight.
    # NOTE(review): relies on descriptors/buffers prepared by the matching
    # forward() call (fn.rnn_desc, fn.x_descs, fn.reserve, fn.workspace_size).
    with torch.cuda.device_of(input):
        is_input_packed = fn.batch_sizes is not None
        handle = cudnn.get_handle()
        # For LSTMs the hidden-state argument is an (h, c) pair.
        if fn.mode == cudnn.CUDNN_LSTM:
            hx, cx = hx
        else:
            cx = None

        # cuDNN expects time-major layout; undo batch_first for padded input.
        if fn.batch_first and not is_input_packed:
            input = input.transpose(0, 1)
            output = output.transpose(0, 1)

        input_size = _input_size(fn, input)
        hidden_size = _hidden_size(fn)

        if not fn.requires_grad:
            raise RuntimeError(
                'backward_weight can only be called when the function requires grad!'
            )
        # RNN dropout requires cuDNN >= 5.1 (version code 5103).
        if fn.dropout != 0 and cudnn.version() < 5103:
            raise RuntimeError(
                'dropout supported only in cudnn v 5.1 and above')
        if tuple(input.size()) != input_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                input_size, tuple(input.size())))
        if tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                hidden_size, hx.size()))

        assert hx.is_contiguous()
        assert cx is None or cx.is_contiguous()
        x = input.contiguous()
        y = output
        # Zeroed gradient buffer shaped like the flattened weight buffer.
        dw = fn.weight_buf.new().resize_as_(fn.weight_buf).zero_()

        with torch.cuda.device_of(input):
            workspace = torch.cuda.ByteTensor(fn.workspace_size)
            check_error(
                cudnn.lib.cudnnRNNBackwardWeights(
                    handle, fn.rnn_desc, fn.seq_length, fn.x_descs,
                    ctypes.c_void_p(x.data_ptr()), fn.hx_desc,
                    ctypes.c_void_p(hx.data_ptr()), fn.y_descs,
                    ctypes.c_void_p(y.data_ptr()),
                    ctypes.c_void_p(workspace.data_ptr()), workspace.size(0),
                    fn.w_desc, ctypes.c_void_p(dw.data_ptr()),
                    ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)))

        # copy the weights from the weight_buf into grad_weight
        grad_params = get_parameters(fn, handle, dw)
        _copyParams(grad_params, grad_weight)
        return grad_weight
def setting(cfg: argparse.Namespace):
    """Prepare the run: enable cudnn autotuning, seed the RNGs (falling
    back to a time-derived seed when cfg.seed is missing or negative),
    and log the environment plus current git revision."""
    cudnn.benchmark = True  # let cuDNN pick the fastest algorithms
    log = get_logger()
    log('==> args: {}'.format(cfg))
    log('==> the results path: {}'.format(cfg.output))
    seed_unset = not hasattr(cfg, 'seed') or cfg.seed < 0
    if seed_unset:
        cfg.seed = int(time.time())
    random.seed(cfg.seed)
    torch.manual_seed(cfg.seed)
    log('==> seed: {}'.format(cfg.seed))
    log('==> PyTorch version: {}, cudnn version: {}'.format(torch.__version__, cudnn.version()))
    # newest commit line, without its trailing newline
    head_commit = os.popen('git log --pretty=oneline | head -n 1').readline()[:-1]
    log('==> git version: {}'.format(head_commit))
    return
def backward_weight(fn, input, hx, output, weight, grad_weight):
    # Duplicate of the legacy cuDNN RNN weight-gradient binding: calls
    # cudnnRNNBackwardWeights and copies the result into grad_weight.
    # NOTE(review): assumes forward() already populated fn.rnn_desc,
    # fn.x_descs, fn.workspace_size and fn.reserve.
    with torch.cuda.device_of(input):
        is_input_packed = fn.batch_sizes is not None
        handle = cudnn.get_handle()
        # LSTM hidden state is an (h, c) pair; other modes have no cell.
        if fn.mode == cudnn.CUDNN_LSTM:
            hx, cx = hx
        else:
            cx = None

        # Convert padded batch_first input to the time-major layout cuDNN uses.
        if fn.batch_first and not is_input_packed:
            input = input.transpose(0, 1)
            output = output.transpose(0, 1)

        input_size = _input_size(fn, input)
        hidden_size = _hidden_size(fn)

        if not fn.requires_grad:
            raise RuntimeError('backward_weight can only be called when the function requires grad!')
        # dropout in cuDNN RNNs needs version >= 5.1 (code 5103)
        if fn.dropout != 0 and cudnn.version() < 5103:
            raise RuntimeError('dropout supported only in cudnn v 5.1 and above')
        if tuple(input.size()) != input_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                input_size, tuple(input.size())))
        if tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                hidden_size, hx.size()))

        assert hx.is_contiguous()
        assert cx is None or cx.is_contiguous()
        x = input.contiguous()
        y = output
        # Fresh zeroed buffer shaped like weight_buf to receive the gradient.
        dw = fn.weight_buf.new().resize_as_(fn.weight_buf).zero_()

        with torch.cuda.device_of(input):
            workspace = torch.cuda.ByteTensor(fn.workspace_size)
            check_error(cudnn.lib.cudnnRNNBackwardWeights(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.x_descs, ctypes.c_void_p(x.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                ctypes.c_void_p(workspace.data_ptr()), workspace.size(0),
                fn.w_desc, ctypes.c_void_p(dw.data_ptr()),
                ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
            ))

        # copy the weights from the weight_buf into grad_weight
        grad_params = get_parameters(fn, handle, dw)
        _copyParams(grad_params, grad_weight)
        return grad_weight
def _update_output(self, input, weight, bias):
    # Convolution forward: use the fused cuDNN kernel when cuDNN accepts
    # this input, otherwise fall back to the THNN implementation.
    self.use_cudnn = cudnn.is_acceptable(input)
    # cuDNN builds older than 6000 do not support dilated convolutions.
    if self.use_cudnn and cudnn.version() < 6000:
        self.use_cudnn = not self.is_dilated()
    if self.use_cudnn:
        # Pre-allocate the output; the C call fills it in place and returns
        # an opaque info object that the backward pass reuses.
        output = input.new(*self._output_size(input, weight))
        if self.transposed:
            self._cudnn_info = (
                torch._C._cudnn_convolution_transpose_full_forward(
                    input, weight, bias, output, self.padding, self.stride,
                    self.dilation, self.groups, cudnn.benchmark))
        else:
            self._cudnn_info = torch._C._cudnn_convolution_full_forward(
                input, weight, bias, output, self.padding, self.stride,
                self.dilation, self.groups, cudnn.benchmark)
        return output
    # THNN fallback: one scratch-buffer list per convolution group.
    self._bufs = [[] for g in range(self.groups)]
    return self._thnn('update_output', input, weight, bias)
def main():
    """Create the per-seed output directory, log run and environment
    information, then launch the NAS training procedure."""
    # All artifacts for this run live under save_path/seed-<seed>/.
    args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed))
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    log_name = 'log-seed-{:}-{:}.txt'.format(args.manualSeed, time_file_str())
    log = open(os.path.join(args.save_path, log_name), 'w')
    print_log('save path : {:}'.format(args.save_path), log)
    print_log(dict(args._get_kwargs()), log)
    print_log("Random Seed: {}".format(args.manualSeed), log)
    print_log("Python version : {}".format(sys.version.replace('\n', ' ')), log)
    print_log("Torch version : {}".format(torch.__version__), log)
    print_log("CUDA version : {}".format(torch.version.cuda), log)
    print_log("cuDNN version : {}".format(cudnn.version()), log)
    print_log("Num of GPUs : {}".format(torch.cuda.device_count()), log)
    print_log("Num of CPUs : {}".format(multiprocessing.cpu_count()), log)
    config = load_config(args.config_path)
    genotype = Networks[args.arch]
    main_procedure(config, genotype, args.save_path, args.print_freq, log)
    log.close()
def log_basic_info(logger: Logger, config: Any) -> None:
    """Log library versions, GPU/CUDA/cuDNN details, the run configuration,
    and — when running distributed — the cluster topology.

    Parameters
    ----------
    logger
        Logger instance used for all output
    config
        configuration object whose attributes are logged
    """
    import ignite

    info = logger.info
    info("PyTorch version: %s", torch.__version__)
    info("Ignite version: %s", ignite.__version__)
    if torch.cuda.is_available():
        # explicitly import cudnn as
        # torch.backends.cudnn can not be pickled with hvd spawning procs
        from torch.backends import cudnn

        info("GPU device: %s", torch.cuda.get_device_name(idist.get_local_rank()))
        info("CUDA version: %s", torch.version.cuda)
        info("CUDNN version: %s", cudnn.version())
    info("Configuration: %s", pformat(vars(config)))
    if idist.get_world_size() > 1:
        info("distributed configuration: %s", idist.model_name())
        info("backend: %s", idist.backend())
        info("device: %s", idist.device().type)
        info("hostname: %s", idist.hostname())
        info("world size: %s", idist.get_world_size())
        info("rank: %s", idist.get_rank())
        info("local rank: %s", idist.get_local_rank())
        info("num processes per node: %s", idist.get_nproc_per_node())
        info("num nodes: %s", idist.get_nnodes())
        info("node rank: %s", idist.get_node_rank())
def forward(fn, input, hx, weight, output, hy):
    # Legacy cuDNN RNN forward (supports packed sequences): fills `output`,
    # `hy` (and the LSTM cell output) in place, and lazily builds the flat
    # weight buffer on fn the first time it runs.
    with torch.cuda.device_of(input):
        lib = cudnn.lib
        handle = cudnn.get_handle()
        fn.datatype = cudnn._typemap[input.type()]
        is_input_packed = fn.batch_sizes is not None

        # LSTMs carry (h, c) pairs for both the input and output hidden state.
        if fn.mode == cudnn.CUDNN_LSTM:
            hx, cx = hx
            hy, cy = hy
        else:
            cx, cy = None, None

        # cuDNN wants time-major layout; packed input is already time-major.
        if fn.batch_first and not is_input_packed:
            input = input.transpose(0, 1)

        # RNN dropout requires cuDNN >= 5.1 (version code 5103).
        if fn.dropout != 0 and cudnn.version() < 5103:
            raise RuntimeError('dropout supported only in cudnn v5.1 and above')

        if is_input_packed:
            fn.seq_length = len(fn.batch_sizes)
            fn.mini_batch = fn.batch_sizes[0]
            fn.input_size = input.size(-1)
        else:
            fn.seq_length, fn.mini_batch, fn.input_size = input.size()
        hidden_size = _hidden_size(fn)
        output_size = _output_size(fn, input)

        assert hx.is_contiguous()
        assert cx is None or cx.is_contiguous()
        x = input.contiguous()
        output.resize_(*output_size)
        hy.resize_(*hidden_size)
        if cy is not None:
            cy.resize_(*hidden_size)
        y = output

        # init descriptors
        fn.rnn_desc = init_rnn_descriptor(fn, handle)
        if is_input_packed:
            fn.x_descs = cudnn.descriptor_sequence(x, fn.batch_sizes)
            fn.y_descs = cudnn.descriptor_sequence(y, fn.batch_sizes)
        else:
            fn.x_descs = cudnn.descriptor(x[0], fn.seq_length)
            fn.y_descs = cudnn.descriptor(y[0], fn.seq_length)
        fn.hx_desc = cudnn.descriptor(hx)
        fn.hy_desc = cudnn.descriptor(hx)
        fn.cx_desc = cudnn.descriptor(cx) if cx is not None else None
        fn.cy_desc = cudnn.descriptor(cx) if cx is not None else None

        # create the weight buffer and copy the weights into it
        if fn.weight_buf is None:
            num_weights = get_num_weights(
                handle, fn.rnn_desc, fn.x_descs[0], fn.datatype)
            fn.weight_buf = x.new(num_weights)
            fn.w_desc = init_weight_descriptor(fn, fn.weight_buf)
            w = fn.weight_buf
            # this zero might not seem necessary, but it is in the case
            # where biases are disabled; then they won't be copied and must be zero'd.
            # Alternatively, _copyParams could be written more carefully.
            w.zero_()
            params = get_parameters(fn, handle, w)
            _copyParams(weight, params)
        else:
            # buffer already exists from an earlier call; just re-describe it
            fn.w_desc = init_weight_descriptor(fn, fn.weight_buf)
            w = fn.weight_buf

        if cx is not None and tuple(cx.size()) != hidden_size:
            raise RuntimeError('Expected cell size {}, got {}'.format(
                hidden_size, tuple(cx.size())))

        # query the scratch workspace size for this configuration
        workspace_size = ctypes.c_long()
        check_error(lib.cudnnGetRNNWorkspaceSize(
            handle,
            fn.rnn_desc,
            fn.seq_length,
            fn.x_descs,
            ctypes.byref(workspace_size)
        ))
        fn.workspace_size = workspace_size.value
        with torch.cuda.device_of(input):
            workspace = torch.cuda.ByteTensor(fn.workspace_size)
        if fn.requires_grad:
            # training path also needs a reserve buffer kept alive for backward
            reserve_size = ctypes.c_long()
            check_error(lib.cudnnGetRNNTrainingReserveSize(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.x_descs,
                ctypes.byref(reserve_size)
            ))
            fn.reserve = torch.cuda.ByteTensor(reserve_size.value)

            check_error(lib.cudnnRNNForwardTraining(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.x_descs, ctypes.c_void_p(x.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                fn.cx_desc, ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                fn.w_desc, ctypes.c_void_p(w.data_ptr()),
                fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                fn.hy_desc, ctypes.c_void_p(hy.data_ptr()),
                fn.cy_desc, ctypes.c_void_p(cy.data_ptr()) if cx is not None else None,
                ctypes.c_void_p(workspace.data_ptr()), workspace.size(0),
                ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
            ))
        else:  # inference
            check_error(lib.cudnnRNNForwardInference(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.x_descs, ctypes.c_void_p(x.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                fn.cx_desc, ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                fn.w_desc, ctypes.c_void_p(w.data_ptr()),
                fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                fn.hy_desc, ctypes.c_void_p(hy.data_ptr()),
                fn.cy_desc, ctypes.c_void_p(cy.data_ptr()) if cx is not None else None,
                ctypes.c_void_p(workspace.data_ptr()), workspace.size(0)
            ))

        # restore batch-first layout for the caller
        if fn.batch_first and not is_input_packed:
            output.transpose_(0, 1)
def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_input, grad_hx):
    # Legacy cuDNN RNN data-gradient binding: computes grad_input and
    # grad_hx (and the LSTM cell gradient) via cudnnRNNBackwardData.
    # NOTE(review): requires the reserve/workspace state left behind by the
    # matching forward() training call.
    with torch.cuda.device_of(input):
        is_input_packed = fn.batch_sizes is not None
        handle = cudnn.get_handle()
        # LSTM states (and their gradients) are (h, c) pairs.
        if fn.mode == cudnn.CUDNN_LSTM:
            hx, cx = hx
            grad_hx, grad_cx = grad_hx
            grad_hy, grad_cy = grad_hy
        else:
            cx, grad_cx, grad_cy = None, None, None

        # Convert padded batch_first tensors to the time-major layout.
        if fn.batch_first and not is_input_packed:
            input = input.transpose(0, 1)
            grad_output = grad_output.transpose(0, 1)
            output = output.transpose(0, 1)

        input_size = _input_size(fn, input)
        hidden_size = _hidden_size(fn)
        output_size = _output_size(fn, input)

        assert hx.is_contiguous()
        assert cx is None or cx.is_contiguous()
        x = input.contiguous()
        dy = grad_output.contiguous()
        y = output
        w = fn.weight_buf
        # Output gradient buffers resized in place to the expected shapes.
        dx = grad_input.resize_as_(input)
        dhy = grad_hy.contiguous().view(*hidden_size)
        dcy = grad_cy.contiguous().view(*hidden_size) if grad_cy is not None else None
        dhx = grad_hx.resize_(*hidden_size)
        dcx = grad_cx.resize_(*hidden_size) if grad_cx is not None else None

        # RNN dropout requires cuDNN >= 5.1 (version code 5103).
        if fn.dropout != 0 and cudnn.version() < 5103:
            raise RuntimeError('dropout supported only in cudnn v 5.1 and above')
        if not fn.requires_grad:
            raise RuntimeError('backward_grad can only be called when the function requires grad!')
        if tuple(input.size()) != input_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                input_size, tuple(input.size())))
        if tuple(output.size()) != output_size:
            raise RuntimeError('Expected output size {}, got {}'.format(
                output_size, output.size()))
        if hx is not None and tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected hidden size {}, got {}'.format(
                hidden_size, hx.size()))
        if cx is not None and tuple(cx.size()) != hidden_size:
            raise RuntimeError('Expected cell size {}, got {}'.format(
                hidden_size, cx.size()))
        if dhy is not None and tuple(dhy.size()) != hidden_size:
            raise RuntimeError('Expected d_hidden size {}, got {}'.format(
                hidden_size, dhy.size()))
        if dcy is not None and tuple(dcy.size()) != hidden_size:
            raise RuntimeError('Expected d_cell size {}, got {}'.format(
                hidden_size, dcy.size()))
        # cuDNN can only consume device tensors.
        if not dhy.is_cuda or not dy.is_cuda or (dcy is not None and not dcy.is_cuda):
            raise RuntimeError('Gradients aren\'t CUDA tensors')

        with torch.cuda.device_of(input):
            workspace = torch.cuda.ByteTensor(fn.workspace_size)
            check_error(cudnn.lib.cudnnRNNBackwardData(
                handle,
                fn.rnn_desc,
                fn.seq_length,
                fn.y_descs, ctypes.c_void_p(y.data_ptr()),
                fn.y_descs, ctypes.c_void_p(dy.data_ptr()),
                fn.hy_desc, ctypes.c_void_p(dhy.data_ptr()),
                fn.cy_desc, ctypes.c_void_p(dcy.data_ptr()) if cx is not None else None,
                fn.w_desc, ctypes.c_void_p(w.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(hx.data_ptr()),
                fn.cx_desc, ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                fn.x_descs, ctypes.c_void_p(dx.data_ptr()),
                fn.hx_desc, ctypes.c_void_p(dhx.data_ptr()),
                fn.cx_desc, ctypes.c_void_p(dcx.data_ptr()) if cx is not None else None,
                ctypes.c_void_p(workspace.data_ptr()), workspace.size(0),
                ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
            ))

        # hand the gradient back in the caller's batch-first layout
        if fn.batch_first and not is_input_packed:
            grad_input = grad_input.transpose_(0, 1)
from parser import create_parser from torch.backends import cudnn from dataset.dataloader import create_dataset from utils.evaluation_metric import confusion_matrix, mIoU, per_cls_iou from utils.data_visualization import visualize_segmap from utils.file_op import mkdir from PIL import Image np.random.seed(0) torch.manual_seed(0) torch.cuda.manual_seed_all(0) torch.set_default_tensor_type(torch.FloatTensor) TORCH_VERSION = torch.__version__ TORCH_CUDA_VERSION = torch.version.cuda CUDNN_VERSION = str(cudnn.version()) DEVICE_NAME = torch.cuda.get_device_name() # cudnn.benchmark = True cudnn.deterministic = True os.environ['CUDA_VISIBLE_DEVICES'] = '1' device = torch.device("cuda" if torch.cuda.is_available() else "cpu") parser = create_parser() dataset_name = 'Cityscapes' ckpt_name = 'Semantic_Segmentation_Cityscapes_27.pth.tar' def evaluation(model, dataloader): # result dir result_dir = parser.result_dir # label dir
def dual_train(args):
    """Dual-game fine-tuning for DukeNet.

    Resumes from the latest checkpoint under
    <base_output_path>/<name>/model/ and, per batch, performs three updates:
    (A) REINFORCE update of the knowledge shifter rewarded by the posterior
    tracker, (B) REINFORCE update of the posterior tracker rewarded by the
    shifter, and (ALL) a supervised joint update of the whole model.
    A checkpoint tagged "d_<epoch>" is saved after every epoch.
    """
    cudnn.enabled = True
    cudnn.benchmark = True
    cudnn.deterministic = True
    print("torch_version:{}".format(torch.__version__))
    print("CUDA_version:{}".format(torch.version.cuda))
    print("cudnn_version:{}".format(cudnn.version()))
    init_seed(123456)

    data_path = args.base_data_path + args.dataset + '/'

    print("Load BERT vocab")
    tokenizer, vocab2id, id2vocab = bert_tokenizer()
    print('--Vocabulary size', len(vocab2id))

    print("Load dataset")
    # load dataset
    query = torch.load(data_path + 'query_DukeNet.pkl')
    train_samples = torch.load(data_path + 'train_DukeNet.pkl')
    passage = torch.load(data_path + 'passage_DukeNet.pkl')
    print("--The number of train_samples:", len(train_samples))

    print("Establish model and load parameters")
    saved_model_path = os.path.join(args.base_output_path + args.name + "/", 'model/')
    # checkpoints.json tracks the history of saved epoch tags; resume from the last.
    with open(saved_model_path + "checkpoints.json", 'r', encoding='utf-8') as r:
        checkpoints = json.load(r)
    last_epoch = checkpoints["time"][-1]
    fuse_dict = torch.load(
        os.path.join(saved_model_path, '.'.join([str(last_epoch), 'pkl'])))
    model = DukeNet(vocab2id, id2vocab, args)
    model.load_state_dict(fuse_dict["model"])
    # freeze the parameters of the encoder, reducing the cost of GPU memory.
    freeze_params(model, "enc")
    if torch.cuda.is_available():
        model = model.cuda()
    else:
        model = model  # no-op: keep the CPU model as-is
    model.train()
    print('--Loading success, last_epoch is {}'.format(last_epoch))

    print("Create optimizer")
    # Separate optimizers: A for the shifter, B for the posterior tracker,
    # and one over all parameters for the joint supervised step.
    A_optimizer = optim.Adam(model.shifter.parameters(), args.A_lr)
    B_optimizer = optim.Adam(model.posterior_tracker.parameters(), args.B_lr)
    All_optimizer = optim.Adam(model.parameters(), args.ALL_lr)
    A_optimizer.zero_grad()
    B_optimizer.zero_grad()
    All_optimizer.zero_grad()

    print("Define loss")
    loss_nll = torch.nn.NLLLoss(reduction='none')
    KLDLoss = nn.KLDivLoss(reduction='batchmean')

    # Checkpoint tags look like "d_<k>" once dual training has started; a
    # plain int tag means only pre-training epochs exist, so start from 0.
    if isinstance(last_epoch, int):
        last_epoch = -1
    else:
        last_epoch = int(last_epoch.split('_')[1])

    # epoch start ======================================================================================================
    for epoch in range(last_epoch + 1, args.epoches):
        print("Epoch:", epoch)
        print("Create dataloader")
        train_dataset = Dataset("train", train_samples, query, passage, vocab2id,
                                args.max_knowledge_pool_when_train,
                                args.max_knowledge_pool_when_inference,
                                args.context_len, args.knowledge_sentence_len,
                                args.max_dec_length)
        train_loader = torch.utils.data.DataLoader(
            train_dataset, collate_fn=collate_fn,
            batch_size=args.dual_train_batch_size, shuffle=True)

        # each example
        for j, data in enumerate(train_loader, 0):
            if torch.cuda.is_available():
                # move every tensor in the batch onto the GPU
                data_cuda = dict()
                for key, value in data.items():
                    if isinstance(value, torch.Tensor):
                        data_cuda[key] = value.cuda()
                    else:
                        data_cuda[key] = value
                data = data_cuda

            # A(shifting)============================================================================================
            # (N,K)
            # print("Start from shifting")
            encoded_state = model.encoding_layer(data)

            # get label shifting knowledge
            N, K, H = encoded_state['knowledge_tracking_pool_encoded'][1].size()
            offsets = torch.arange(
                N, device=data["knowledge_tracking_label"].device
            ) * K + data["knowledge_tracking_label"]  # N
            # knowledge_tracking_pool_use (N K E)->(N*K,E)
            flatten_knowledge_tracking_pool_use = encoded_state[
                'knowledge_tracking_pool_encoded'][1].view(N * K, -1)
            label_tracked_knowledge_use = flatten_knowledge_tracking_pool_use[
                offsets]  # (N E)

            # N K
            knowledge_shifting_score, _, _, _, _ = model.shifter(
                encoded_state['contexts_encoded'],
                label_tracked_knowledge_use,  #
                encoded_state['knowledge_shifting_pool_encoded'],
                encoded_state['knowledge_shifting_pool_mask'],
                data["shifting_ck_mask"],
                data["knowledge_shifting_label"],
                data['knowledge_shifting_pool'],
                mode="inference")
            knowledge_shifting_prob = F.softmax(knowledge_shifting_score, -1)
            # sample a shifting action from the predicted distribution
            logist = Categorical(knowledge_shifting_prob)
            inferred_shifted_knowledge_index = logist.sample()  # N

            N, K, H = encoded_state['knowledge_shifting_pool_encoded'][
                1].size()  # (N K E)
            offsets = torch.arange(
                N, device=inferred_shifted_knowledge_index.device
            ) * K + inferred_shifted_knowledge_index  # N
            # knowledge_shifting_pool_use (N K E)->(N*K,E)
            flatten_knowledge_shifting_pool_use = encoded_state[
                'knowledge_shifting_pool_encoded'][1].view(N * K, -1)
            inferred_shifted_knowledge_use = flatten_knowledge_shifting_pool_use[
                offsets]  # (N E)

            # action prob
            # N
            action_prob_loss = loss_nll(
                torch.log(knowledge_shifting_prob + 1e-10),
                inferred_shifted_knowledge_index)

            # B
            with torch.no_grad():
                # (N,K) — the tracker only provides the reward here, no grads needed
                knowledge_tracking_score, _, _, _ = model.posterior_tracker(
                    encoded_state['contexts_encoded'],
                    inferred_shifted_knowledge_use,
                    encoded_state['knowledge_tracking_pool_encoded'],
                    encoded_state['knowledge_tracking_pool_mask'],
                    data['tracking_ck_mask'],
                    data["knowledge_tracking_label"],
                    mode="inference")
            # N — reward is the (negated) NLL of the true tracking label
            reward = -loss_nll(
                F.log_softmax(knowledge_tracking_score, 1),
                data["knowledge_tracking_label"])
            # standardize the reward across the batch (baseline subtraction)
            norm_reward = (reward - torch.mean(reward)) / torch.std(reward)
            A_loss = torch.mean(action_prob_loss * norm_reward)
            A_optimizer.zero_grad()
            A_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.shifter.parameters(), 0.4)
            A_optimizer.step()
            print_A_loss = A_loss.cpu().item()

            # B tracking===========================================================================================
            # print("Start from tracking")
            encoded_state = model.encoding_layer(data)

            # get label shifting knowledge
            N, K, H = encoded_state['knowledge_shifting_pool_encoded'][
                1].size()
            offsets = torch.arange(
                N, device=data["knowledge_shifting_label"].device
            ) * K + data["knowledge_shifting_label"]  # N
            # knowledge_shifting_pool_use (N K E)->(N*K,E)
            flatten_knowledge_shifting_pool_use = encoded_state[
                'knowledge_shifting_pool_encoded'][1].view(N * K, -1)
            label_shifted_knowledge_use = flatten_knowledge_shifting_pool_use[
                offsets]  # (N E)

            # N K
            knowledge_tracking_score, _, _, _ = model.posterior_tracker(
                encoded_state['contexts_encoded'],
                label_shifted_knowledge_use,
                encoded_state['knowledge_tracking_pool_encoded'],
                encoded_state['knowledge_tracking_pool_mask'],
                data['tracking_ck_mask'],
                data["knowledge_tracking_label"],
                mode="inference")
            # N K
            knowledge_tracking_prob = F.softmax(knowledge_tracking_score, -1)
            logist = Categorical(knowledge_tracking_prob)
            # N — sample a tracking action
            inferred_tracked_knowledge_index = logist.sample()

            # batch
            N, K, H = encoded_state['knowledge_tracking_pool_encoded'][
                1].size()  # (N K E)
            offsets = torch.arange(
                N, device=inferred_tracked_knowledge_index.device
            ) * K + inferred_tracked_knowledge_index  # N
            flatten_knowledge_tracking_pool_use = encoded_state[
                'knowledge_tracking_pool_encoded'][1].view(N * K, -1)
            inferred_tracked_knowledge_use = flatten_knowledge_tracking_pool_use[
                offsets]  # (N E)

            # action prob
            # N
            action_prob_loss = loss_nll(
                torch.log(knowledge_tracking_prob + 1e-10),
                inferred_tracked_knowledge_index)

            with torch.no_grad():
                # the shifter only provides the reward here, no grads needed
                knowledge_shifting_score, _, _, _, _ = model.shifter(
                    encoded_state['contexts_encoded'],
                    inferred_tracked_knowledge_use,  # label tracked knowledge
                    encoded_state['knowledge_shifting_pool_encoded'],
                    encoded_state['knowledge_shifting_pool_mask'],
                    data["shifting_ck_mask"],
                    data["knowledge_shifting_label"],
                    data['knowledge_shifting_pool'],
                    mode="inference")
            # reward is the (negated) NLL of the true shifting label
            reward = -loss_nll(
                F.log_softmax(knowledge_shifting_score, -1),
                data["knowledge_shifting_label"])
            norm_reward = (reward - torch.mean(reward)) / torch.std(reward)
            B_loss = torch.mean(action_prob_loss * norm_reward)
            B_optimizer.zero_grad()
            B_loss.backward()
            torch.nn.utils.clip_grad_norm_(
                model.posterior_tracker.parameters(), 0.4)
            B_optimizer.step()
            print_B_loss = B_loss.cpu().item()

            # ALL====================================================================================================
            # joint supervised pass over the full model
            encoded_state = model.encoding_layer(data)
            interaction_outputs = model.dual_knowledge_interaction_layer(
                data, encoded_state)
            rg = model.decoding_layer(data, interaction_outputs)

            # batch accuracies for logging only (detached)
            _, pri_tracking_pred = interaction_outputs[
                'prior_knowledge_tracking_score'].detach().max(1)
            pri_tracking_acc = (
                pri_tracking_pred == data['knowledge_tracking_label']
            ).float().mean()

            _, pos_tracking_pred = interaction_outputs[
                'posterior_knowledge_tracking_score'].detach().max(1)
            pos_tracking_acc = (
                pos_tracking_pred == data['knowledge_tracking_label']
            ).float().mean()

            _, shifting_pred = interaction_outputs[
                'knowledge_shifting_score'].detach().max(1)
            shifting_acc = (
                shifting_pred == data['knowledge_shifting_label']
            ).float().mean()

            # As with NLLLoss, the input given is expected to contain log-probabilities
            # The targets are given as probabilities (i.e. without taking the logarithm).
            pri_2_pos = KLDLoss(
                F.log_softmax(
                    interaction_outputs['prior_knowledge_tracking_score'], 1),
                F.softmax(
                    interaction_outputs[
                        'posterior_knowledge_tracking_score'], 1).detach())

            loss_pos_tracking = F.nll_loss(
                F.log_softmax(
                    interaction_outputs[
                        'posterior_knowledge_tracking_score'], -1),
                data['knowledge_tracking_label'].view(-1))

            loss_shifting = F.nll_loss(
                F.log_softmax(
                    interaction_outputs['knowledge_shifting_score'], -1),
                data['knowledge_shifting_label'].view(-1))

            # generation loss; index 0 is the padding token
            loss_g = F.nll_loss(
                (rg[0] + 1e-8).log().reshape(-1, rg[0].size(-1)),
                data['response'].reshape(-1),
                ignore_index=0)

            ALL_loss = pri_2_pos + (loss_pos_tracking + loss_shifting + loss_g) * 0.5
            All_optimizer.zero_grad()
            ALL_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.4)
            All_optimizer.step()

            if j % 10 == 0:
                print('Training: %s' % "Dual_Game_DukeNet")
                print(
                    "Epoch:{}, Batch:{}, Loss_A:{}, Loss_B:{}, KLDLoss:{}, Pos_Tra_Loss:{}, Shi_Loss:{}, Gen_Loss:{}, Pri_T_ACC:{}, Pos_T_ACC:{}, Shi_ACC:{}"
                    .format(epoch, j, rounder(print_A_loss, 4),
                            rounder(print_B_loss, 4),
                            rounder(pri_2_pos.cpu().item(), 4),
                            rounder(loss_pos_tracking.cpu().item(), 4),
                            rounder(loss_shifting.cpu().item(), 4),
                            rounder(loss_g.cpu().item(), 4),
                            rounder(pri_tracking_acc.cpu().item(), 2),
                            rounder(pos_tracking_acc.cpu().item(), 2),
                            rounder(shifting_acc.cpu().item(), 2)))

        # save model====================================================================================================
        fuse_dict = {"model": model.state_dict()}
        torch.save(
            fuse_dict,
            os.path.join(saved_model_path, '.'.join(["d_" + str(epoch), 'pkl'])))
        print("Saved epoch {} model".format("d_" + str(epoch)))
        # append the new checkpoint tag to checkpoints.json
        with open(saved_model_path + "checkpoints.json", 'r', encoding='utf-8') as r:
            checkpoints = json.load(r)
        checkpoints["time"].append("d_" + str(epoch))
        with open(saved_model_path + "checkpoints.json", 'w', encoding='utf-8') as w:
            json.dump(checkpoints, w)
def forward(fn, input, hx, weight, output, hy):
    """Run the cuDNN RNN forward pass (training or inference).

    Fills ``output`` and ``hy`` (plus the cell state ``cy`` for LSTM) in
    place through the cuDNN C API.  ``fn`` carries the RNN configuration
    (mode, sizes, dropout, train flag) and receives the descriptors and
    buffers created here (``weight_buf``, ``workspace``, ``reserve``,
    the x/y/h/c descriptors), which the backward pass reuses.

    :param fn: RNN function/config object (mutated).
    :param input: input sequence tensor, 3-D after batch_first transpose.
    :param hx: initial hidden state (``(h, c)`` tuple for LSTM).
    :param weight: per-layer weight tensors copied into the flat buffer.
    :param output: pre-allocated output tensor, resized here.
    :param hy: output hidden state (``(h, c)`` tuple for LSTM), resized here.
    :raises RuntimeError: on dimension/size mismatches or unsupported
        dropout with old cuDNN.
    """
    with torch.cuda.device_of(input):
        lib = cudnn.lib
        handle = cudnn.get_handle()
        fn.datatype = cudnn._typemap[input.type()]

        # LSTM carries (hidden, cell) pairs; other modes have no cell state.
        if fn.mode == cudnn.CUDNN_LSTM:
            hx, cx = hx
            hy, cy = hy
        else:
            cx, cy = None, None

        if fn.batch_first:
            input = input.transpose(0, 1)

        if input.dim() != 3:
            raise RuntimeError('input must have 3 dimensions, got {}'.format(
                input.dim()))
        if fn.input_size != input.size(2):
            # BUG FIX: the format string has two placeholders but only one
            # argument was supplied, so raising this error crashed with an
            # IndexError instead. Supply the observed size as well.
            raise RuntimeError(
                'input.size(2) must be equal to input_size. Expected {}, got {}'
                .format(fn.input_size, input.size(2)))
        if fn.dropout != 0 and cudnn.version() < 5103:
            raise RuntimeError(
                'dropout supported only in cudnn v5.1 and above')

        fn.seq_length, fn.mini_batch, fn.input_size = input.size()
        hidden_size = _hidden_size(fn)
        output_size = _output_size(fn)

        assert hx.is_contiguous()
        assert cx is None or cx.is_contiguous()
        x = input.contiguous()
        output.resize_(*output_size)
        hy.resize_(*hidden_size)
        if cy is not None:
            cy.resize_(*hidden_size)
        y = output

        # init descriptors
        if ('desc' not in fn.dropout_state) or (fn.dropout_state['desc'].get()
                                                is None):
            fn.dropout_state['desc'] = Unserializable(
                init_dropout_descriptor(fn, handle))
        fn.rnn_desc = init_rnn_descriptor(fn, handle)
        fn.x_descs = cudnn.descriptor(x[0], fn.seq_length)
        fn.y_descs = cudnn.descriptor(y[0], fn.seq_length)
        # hy/cy share shape with hx/cx, so the same descriptors are reused.
        fn.hx_desc = cudnn.descriptor(hx)
        fn.hy_desc = cudnn.descriptor(hx)
        fn.cx_desc = cudnn.descriptor(cx) if cx is not None else None
        fn.cy_desc = cudnn.descriptor(cx) if cx is not None else None

        # create the weight buffer and copy the weights into it
        num_weights = get_num_weights(handle, fn.rnn_desc, fn.x_descs[0],
                                      fn.datatype)
        fn.weight_buf = input.new(num_weights)
        fn.w_desc = init_weight_descriptor(fn, fn.weight_buf)
        w = fn.weight_buf
        # this zero might not seem necessary, but it is in the case
        # where biases are disabled; then they won't be copied and must be
        # zero'd. Alternatively, _copyParams could be written more carefully.
        w.zero_()
        params = get_parameters(fn, handle, w)
        _copyParams(weight, params)

        if tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected hidden size {}, got {}'.format(
                hidden_size, tuple(hx.size())))
        if cx is not None and tuple(cx.size()) != hidden_size:
            raise RuntimeError('Expected cell size {}, got {}'.format(
                hidden_size, tuple(cx.size())))

        workspace_size = ctypes.c_long()
        check_error(
            lib.cudnnGetRNNWorkspaceSize(handle, fn.rnn_desc, fn.seq_length,
                                         fn.x_descs,
                                         ctypes.byref(workspace_size)))
        fn.workspace = torch.cuda.ByteTensor(workspace_size.value)
        if fn.train:
            # Training additionally needs a reserve buffer that
            # backward_grad/backward_weight will read.
            reserve_size = ctypes.c_long()
            check_error(
                lib.cudnnGetRNNTrainingReserveSize(handle, fn.rnn_desc,
                                                   fn.seq_length, fn.x_descs,
                                                   ctypes.byref(reserve_size)))
            fn.reserve = torch.cuda.ByteTensor(reserve_size.value)
            check_error(
                lib.cudnnRNNForwardTraining(
                    handle, fn.rnn_desc, fn.seq_length, fn.x_descs,
                    ctypes.c_void_p(x.data_ptr()), fn.hx_desc,
                    ctypes.c_void_p(hx.data_ptr()), fn.cx_desc,
                    ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                    fn.w_desc, ctypes.c_void_p(w.data_ptr()), fn.y_descs,
                    ctypes.c_void_p(y.data_ptr()), fn.hy_desc,
                    ctypes.c_void_p(hy.data_ptr()), fn.cy_desc,
                    ctypes.c_void_p(cy.data_ptr()) if cx is not None else None,
                    ctypes.c_void_p(fn.workspace.data_ptr()),
                    fn.workspace.size(0),
                    ctypes.c_void_p(fn.reserve.data_ptr()),
                    fn.reserve.size(0)))
        else:  # inference
            check_error(
                lib.cudnnRNNForwardInference(
                    handle, fn.rnn_desc, fn.seq_length, fn.x_descs,
                    ctypes.c_void_p(x.data_ptr()), fn.hx_desc,
                    ctypes.c_void_p(hx.data_ptr()), fn.cx_desc,
                    ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                    fn.w_desc, ctypes.c_void_p(w.data_ptr()), fn.y_descs,
                    ctypes.c_void_p(y.data_ptr()), fn.hy_desc,
                    ctypes.c_void_p(hy.data_ptr()), fn.cy_desc,
                    ctypes.c_void_p(cy.data_ptr()) if cx is not None else None,
                    ctypes.c_void_p(fn.workspace.data_ptr()),
                    fn.workspace.size(0)))

        if fn.batch_first:
            output = output.transpose_(0, 1)
def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy,
                  grad_input, grad_hx):
    """Run the cuDNN RNN backward pass w.r.t. inputs and hidden state.

    Writes gradients in place into ``grad_input`` and ``grad_hx`` (and the
    cell-state gradient for LSTM) via ``cudnnRNNBackwardData``.  Relies on
    the descriptors, ``fn.weight_buf``, ``fn.workspace`` and ``fn.reserve``
    created by the preceding forward pass, so it must only run after a
    training-mode forward.

    :raises RuntimeError: if not in training mode, on any size mismatch,
        or if gradient tensors are not on CUDA.
    """
    with torch.cuda.device_of(input):
        handle = cudnn.get_handle()

        # LSTM packs (hidden, cell) pairs; unpack them and their gradients.
        if fn.mode == cudnn.CUDNN_LSTM:
            hx, cx = hx
            grad_hx, grad_cx = grad_hx
            grad_hy, grad_cy = grad_hy
        else:
            cx, grad_cx, grad_cy = None, None, None

        # Forward ran in seq-major layout; mirror that here.
        if fn.batch_first:
            input = input.transpose(0, 1)
            grad_output = grad_output.transpose(0, 1)
            output = output.transpose(0, 1)

        input_size = _input_size(fn)
        hidden_size = _hidden_size(fn)
        output_size = _output_size(fn)

        assert hx.is_contiguous()
        assert cx is None or cx.is_contiguous()
        x = input.contiguous()
        dy = grad_output.contiguous()
        y = output
        # Flat weight buffer populated by the forward pass.
        w = fn.weight_buf
        dx = grad_input.resize_as_(input)
        dhy = grad_hy.contiguous().view(*hidden_size)
        dcy = grad_cy.contiguous().view(
            *hidden_size) if grad_cy is not None else None
        dhx = grad_hx.resize_(*hidden_size)
        dcx = grad_cx.resize_(*hidden_size) if grad_cx is not None else None

        if fn.dropout != 0 and cudnn.version() < 5103:
            raise RuntimeError(
                'dropout supported only in cudnn v 5.1 and above')
        # cuDNN only records the reserve buffer in training mode.
        if not fn.train:
            raise RuntimeError(
                'backward_grad can only be called when training!')
        if tuple(input.size()) != input_size:
            raise RuntimeError('Expected input size {}, got {}'.format(
                input_size, tuple(input.size())))
        if tuple(output.size()) != _output_size(fn):
            raise RuntimeError('Expected output size {}, got {}'.format(
                output_size, output.size()))
        if hx is not None and tuple(hx.size()) != hidden_size:
            raise RuntimeError('Expected hidden size {}, got {}'.format(
                hidden_size, hx.size()))
        if cx is not None and tuple(cx.size()) != hidden_size:
            raise RuntimeError('Expected cell size {}, got {}'.format(
                hidden_size, cx.size()))
        if dhy is not None and tuple(dhy.size()) != hidden_size:
            raise RuntimeError('Expected d_hidden size {}, got {}'.format(
                hidden_size, dhy.size()))
        if dcy is not None and tuple(dcy.size()) != hidden_size:
            raise RuntimeError('Expected d_cell size {}, got {}'.format(
                hidden_size, dcy.size()))
        if not dhy.is_cuda or not dy.is_cuda or (dcy is not None
                                                 and not dcy.is_cuda):
            raise RuntimeError('Gradients aren\'t CUDA tensors')

        # Argument order follows the cudnnRNNBackwardData signature:
        # y, dy, dhy, dcy, w, hx, cx -> dx, dhx, dcx, workspace, reserve.
        check_error(
            cudnn.lib.cudnnRNNBackwardData(
                handle, fn.rnn_desc, fn.seq_length, fn.y_descs,
                ctypes.c_void_p(y.data_ptr()), fn.y_descs,
                ctypes.c_void_p(dy.data_ptr()), fn.hy_desc,
                ctypes.c_void_p(dhy.data_ptr()), fn.cy_desc,
                ctypes.c_void_p(dcy.data_ptr()) if cx is not None else None,
                fn.w_desc, ctypes.c_void_p(w.data_ptr()), fn.hx_desc,
                ctypes.c_void_p(hx.data_ptr()), fn.cx_desc,
                ctypes.c_void_p(cx.data_ptr()) if cx is not None else None,
                fn.x_descs, ctypes.c_void_p(dx.data_ptr()), fn.hx_desc,
                ctypes.c_void_p(dhx.data_ptr()), fn.cx_desc,
                ctypes.c_void_p(dcx.data_ptr()) if cx is not None else None,
                ctypes.c_void_p(fn.workspace.data_ptr()),
                fn.workspace.size(0),
                ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)))

        if fn.batch_first:
            grad_input = grad_input.transpose_(0, 1)
def main():
    """
    --------------------------------------------- MAIN --------------------------------------------------------
    Loads the data and executes the grid search on depth and width scaling
    factors.

    Parses CLI args into module-level globals, builds the loss criterion and
    the train/val dataloaders for the selected dataset (CIFAR100, ImageNet,
    MNIST or CIFAR10), then calls ``grid_search`` once per (depth, width)
    coefficient pair raised to ``args.phi``.
    """
    # Manual seed for reproducibility
    torch.manual_seed(363636)
    # Global instances — consumed by grid_search and helpers elsewhere in the file
    global args, use_cuda, device
    # Instantiating the parser
    args = parser.parse_args()
    # Global CUDA flag
    use_cuda = args.cuda and torch.cuda.is_available()
    # Defining device and device's map location
    device = torch.device("cuda" if use_cuda else "cpu")
    print('chosen device: ', device)
    # Defining loss function and printing CUDA information (if available)
    if use_cuda:
        print("PyTorch version: ")
        print(torch.__version__)
        print("CUDA Version: ")
        print(torch.version.cuda)
        print("cuDNN version is: ")
        print(cudnn.version())
        # benchmark mode: autotune conv algorithms (non-deterministic, faster)
        cudnn.benchmark = True
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = nn.CrossEntropyLoss()
    # Dataloaders for CIFAR, ImageNet and MNIST
    if args.dataset == 'CIFAR100':
        # Per-channel stats expressed on the 0-255 scale, rescaled to [0, 1]
        normalize = transforms.Normalize(
            mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
            std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
        kwargs = {
            'num_workers': args.workers,
            'pin_memory': True
        } if use_cuda else {}
        # Train-time augmentation: flip, pad-crop, color jitter, cutout
        train_loader = torch.utils.data.DataLoader(datasets.CIFAR100(
            root=args.data_path,
            train=True,
            transform=transforms.Compose([
                transforms.RandomHorizontalFlip(),
                transforms.RandomCrop(32, 4),
                transforms.ColorJitter(brightness=0.3,
                                       contrast=0.3,
                                       saturation=0.3,
                                       hue=0.075),
                transforms.ToTensor(),
                normalize,
                Cutout(n_holes=1, length=16),
            ]),
            download=True),
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   **kwargs)
        # Validation: normalization only
        val_loader = torch.utils.data.DataLoader(
            datasets.CIFAR100(root=args.data_path,
                              train=False,
                              transform=transforms.Compose([
                                  transforms.ToTensor(),
                                  normalize,
                              ])),
            batch_size=args.val_batch_size,
            shuffle=False,
            **kwargs)
    elif args.dataset == 'ImageNet':
        traindir = os.path.join(args.data_path, 'train')
        valdir = os.path.join(args.data_path, 'val')
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        train_dataset = datasets.ImageFolder(
            traindir,
            transforms.Compose([
                transforms.RandomResizedCrop(args.image_size),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]))
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True)
        image_size = args.image_size
        val_dataset = datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(image_size,
                                  interpolation=PIL.Image.BICUBIC),
                transforms.CenterCrop(image_size),
                transforms.ToTensor(),
                normalize,
            ]))
        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=args.val_batch_size,
            shuffle=False,
            num_workers=args.workers,
            pin_memory=True)
    elif args.dataset == 'MNIST':
        kwargs = {
            'num_workers': args.workers,
            'pin_memory': True
        } if use_cuda else {}
        train_loader = torch.utils.data.DataLoader(datasets.MNIST(
            args.data_path,
            train=True,
            download=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ])),
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   **kwargs)
        # NOTE(review): MNIST is the only dataset whose validation loader is
        # shuffled (shuffle=True) — possibly unintended; confirm.
        val_loader = torch.utils.data.DataLoader(
            datasets.MNIST(args.data_path,
                           train=False,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307, ), (0.3081, ))
                           ])),
            batch_size=args.val_batch_size,
            shuffle=True,
            **kwargs)
    elif args.dataset == 'CIFAR10':
        normalize = transforms.Normalize(
            mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
            std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
        kwargs = {
            'num_workers': args.workers,
            'pin_memory': True
        } if use_cuda else {}
        train_loader = torch.utils.data.DataLoader(datasets.CIFAR10(
            root=args.data_path,
            train=True,
            transform=transforms.Compose([
                transforms.RandomHorizontalFlip(),
                transforms.RandomCrop(32, 4),
                transforms.ToTensor(),
                normalize,
            ]),
            download=True),
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   **kwargs)
        val_loader = torch.utils.data.DataLoader(
            datasets.CIFAR10(root=args.data_path,
                             train=False,
                             transform=transforms.Compose([
                                 transforms.ToTensor(),
                                 normalize,
                             ])),
            batch_size=args.val_batch_size,
            shuffle=False,
            **kwargs)
    # original grid = [(1.0, 1.0), (1.9, 1.0), (1.7, 1.1), (1.6, 1.1), (1.4, 1.2), (1.2, 1.3), (1.0, 1.4)]
    # args.grid is a flat list; pair up consecutive entries as (depth, width)
    grid = [(args.grid[i], args.grid[i + 1])
            for i in range(0, len(args.grid), 2)]
    for coeff in grid:
        # Compound scaling: raise each coefficient to the power phi
        alpha = coeff[0]**args.phi
        beta = coeff[1]**args.phi
        grid_search(train_loader, val_loader, criterion, alpha, beta)
    def compress(self, w, pi, delta, trainloader, testloader, valloader,
                 loss_fn):
        """
        Main L-C compression method.

        Alternates L-steps (SGD on the augmented Lagrangian of ``w``) with
        C-steps (compressing a copy ``theta`` via ``pi``) for ``self.steps``
        iterations, tracking Lagrange multipliers in ``lm`` and the penalty
        coefficient ``mu``.  Returns the best compressed model (selected on
        val loss, else test loss, else the most recent) plus a statistics
        dict.

        :param w: Input model.
        :type w: torch.nn.Module
        :param pi: Compression function.
        :param delta: Decompression function.
        :param trainloader: Training dataloader.
        :param testloader: Test dataloader (may be None).
        :param valloader: Validation dataloader (may be None).
        :param loss_fn: Loss criterion.
        """
        statistics = {}
        # Save engine configuration
        statistics.update(self._engine_config)

        # Optional hooks supplied through debugging_flags.
        _model_stat_fn = self.debugging_flags['custom_model_statistics']\
            if 'custom_model_statistics' in self.debugging_flags\
            else util.empty_stat_fn
        _disable_train_stats = self.debugging_flags['disable_train_stats']\
            if 'disable_train_stats' in self.debugging_flags\
            else False

        timer_lc = EventTimer()
        if self.use_cuda:
            cudnn.benchmark = True
            logger.debug("[Condensa] cuDNN VERSION: {}".format(
                cudnn.version()))

        validate = (valloader is not None)
        test = (testloader is not None)

        # Copy model to GPU0 memory
        if self.use_cuda:
            w = w.cuda(0)

        # Mark all compressible modules in w
        with record_mode():
            pi(w)

        # theta: compressed copy; lm: Lagrange multipliers; both zeroed.
        with torch.no_grad():
            theta = deepcopy(w)
            self.zero_(theta)
        with torch.no_grad():
            lm = deepcopy(w)
            self.zero_(lm)
        with torch.no_grad():
            best_model = deepcopy(w)

        # Enable data-parallelism in L step
        if self.use_cuda and self.distributed:
            ngpus = torch.cuda.device_count()
            logger.info('[Condensa] {} GPUs enabled for L-step'.format(ngpus))
            w = torch.nn.DataParallel(w)

        # Penalty coefficient; stays 0 until _update_mu raises it, which
        # gates the mu-dependent theta/lm updates below.
        mu = 0.
        learning_rate = self.lr
        optimizer = self.l_optimizer(w,
                                     lr=learning_rate,
                                     **self.l_optimizer_params)
        optimizer.reset_state()

        # Context passed to custom stat functions.
        # NOTE(review): the key 'learing_rate' is misspelled but is a runtime
        # dict key — external stat hooks may depend on it, so it is kept.
        context = {
            'iteration': -1,
            'learing_rate': learning_rate,
            'mu': mu,
            'theta': theta,
        }
        # Baseline statistics for the uncompressed model w.
        if not _disable_train_stats:
            try:
                w_train_loss, w_train_stats = _model_stat_fn(w,
                                                             loss_fn,
                                                             trainloader,
                                                             loader='train',
                                                             context=context)
            except TypeError:
                # Older-style stat fn without keyword parameters.
                w_train_loss, w_train_stats = _model_stat_fn(
                    w, loss_fn, trainloader)
            logger.info('[Condensa] w TRAIN\tloss={:.5f}, {}'.format(
                w_train_loss, ', '.join(
                    ['{}:{}'.format(k, v) for k, v in w_train_stats.items()])))
        if validate:
            try:
                w_val_loss, w_val_stats = _model_stat_fn(w,
                                                         loss_fn,
                                                         valloader,
                                                         loader='val',
                                                         context=context)
            except TypeError:
                w_val_loss, w_val_stats = _model_stat_fn(w, loss_fn, valloader)
            logger.info('[Condensa] w VAL\tloss={:.5f}, {}'.format(
                w_val_loss, ', '.join(
                    ['{}:{}'.format(k, v) for k, v in w_val_stats.items()])))
        if test:
            try:
                w_test_loss, w_test_stats = _model_stat_fn(w,
                                                           loss_fn,
                                                           testloader,
                                                           loader='test',
                                                           context=context)
            except TypeError:
                w_test_loss, w_test_stats = _model_stat_fn(
                    w, loss_fn, testloader)
            logger.info('[Condensa] w TEST\tloss={:.5f}, {}'.format(
                w_test_loss, ', '.join(
                    ['{}:{}'.format(k, v) for k, v in w_test_stats.items()])))

        best_loss = sys.float_info.max
        train_losses = []
        if validate: val_losses = []
        if test: test_losses = []

        # Outer (per-LC-iteration) LR schedule: exponential decay or
        # milestone-based, depending on configuration.
        outer_lr_scheduler = None
        if self.lr_decay is not None:
            outer_lr_scheduler = ExpDecayedLR(self.lr, self.lr_decay)
        elif self.lr_schedule is not None:
            outer_lr_scheduler = DecayedLR(self.lr, self.lr_schedule,
                                           self.lr_multiplier)

        for j in range(0, self.steps):
            # Iteration 0 skips the L-step entirely; iteration 1 may run a
            # longer first L-step.
            n_sgd_iter = (self.mb_iterations_first_l
                          if j == 1 else self.mb_iterations_per_l)
            # Set up outer learning rate
            learning_rate = self.lr
            if outer_lr_scheduler is not None:
                learning_rate = outer_lr_scheduler.learning_rate
            logger.info(
                '[Condensa] LC Iteration {}:\tmu={:.5f}, lr={:.5f}'.format(
                    j, mu, learning_rate))
            # Inner (per-minibatch) LR interpolation toward lr_end.
            inner_lr_scheduler = None
            if self.lr_end is not None:
                inner_lr_scheduler = IntervalLR(learning_rate, self.lr_end,
                                                n_sgd_iter)
            # L step
            # Switch to training mode
            i = 0
            w.train()
            iterator = iter(trainloader)
            if logger.isEnabledFor(logging.INFO) and j > 0:
                pbar = tqdm(total=n_sgd_iter, ascii=True)
            while True:
                if j == 0:
                    logger.info('[Condensa] Skipping first L-step')
                    break
                if j == 1 and i >= self.mb_iterations_first_l: break
                if j > 1 and i >= self.mb_iterations_per_l: break
                try:
                    inputs, targets = next(iterator)
                except StopIteration:
                    # Epoch boundary: restart the dataloader.
                    iterator = iter(trainloader)
                    inputs, targets = next(iterator)
                if self.use_cuda:
                    if not inputs.is_cuda: inputs = inputs.cuda()
                    if not targets.is_cuda:
                        targets = targets.cuda(non_blocking=True)
                outputs = w(inputs)
                loss = loss_fn(outputs, targets)
                optimizer.zero_grad()
                loss.backward()
                # Custom optimizer step includes the augmented-Lagrangian
                # terms via theta and lm.
                optimizer.step(learning_rate, mu, theta, lm)
                if inner_lr_scheduler is not None:
                    inner_lr_scheduler.step()
                    learning_rate = inner_lr_scheduler.learning_rate
                if logger.isEnabledFor(logging.INFO):
                    pbar.update()
                i += 1
            if logger.isEnabledFor(logging.INFO) and j > 0:
                pbar.close()
                logger.info('')
            if self.use_cuda:
                torch.cuda.synchronize()
            w.eval()

            # C step and theta update
            # (DataParallel wraps the model in .module; plain models raise
            # AttributeError, hence the try/except pattern throughout.)
            try:
                theta.load_state_dict(w.module.state_dict())
            except AttributeError:
                theta.load_state_dict(w.state_dict())
            if mu > 0:
                try:
                    wmodules = w.module.modules()
                except AttributeError:
                    wmodules = w.modules()
                with record_mode():
                    pi(theta)
                with torch.no_grad():
                    # theta <- w - lm/mu on every compressible parameter.
                    for w_m, theta_m, lm_m in zip(wmodules, theta.modules(),
                                                  lm.modules()):
                        if hasattr(theta_m, 'condense'):
                            for pname in theta_m.condense:
                                getattr(theta_m, pname).data = (
                                    getattr(w_m, pname).detach() -
                                    getattr(lm_m, pname).data / mu)
            pi(theta)
            context['iteration'] = j
            context['learing_rate'] = learning_rate
            context['theta'] = theta
            # Statistics for the compressed model theta.
            if not _disable_train_stats:
                try:
                    nested_train_loss, nested_train_stats = _model_stat_fn(
                        theta,
                        loss_fn,
                        trainloader,
                        loader='train',
                        context=context)
                except TypeError:
                    nested_train_loss, nested_train_stats = _model_stat_fn(
                        theta, loss_fn, trainloader)
                train_losses.append(nested_train_loss)
                logger.info(
                    '[Condensa] Nested (theta) TRAIN\tloss={:.5f}, {}'.format(
                        nested_train_loss, ', '.join([
                            '{}:{}'.format(k, v)
                            for k, v in nested_train_stats.items()
                        ])))
            if validate:
                try:
                    nested_val_loss, nested_val_stats = _model_stat_fn(
                        theta, loss_fn, valloader, loader='val',
                        context=context)
                except TypeError:
                    nested_val_loss, nested_val_stats = _model_stat_fn(
                        theta, loss_fn, valloader)
                val_losses.append(nested_val_loss)
                logger.info(
                    '[Condensa] Nested (theta) VAL\tloss={:.5f}, {}'.format(
                        nested_val_loss, ', '.join([
                            '{}:{}'.format(k, v)
                            for k, v in nested_val_stats.items()
                        ])))
            if test:
                try:
                    nested_test_loss, nested_test_stats = _model_stat_fn(
                        theta,
                        loss_fn,
                        testloader,
                        loader='test',
                        context=context)
                except TypeError:
                    nested_test_loss, nested_test_stats = _model_stat_fn(
                        theta, loss_fn, testloader)
                test_losses.append(nested_test_loss)
                logger.info(
                    '[Condensa] Nested (theta) TEST\tloss={:.5f}, {}'.format(
                        nested_test_loss, ', '.join([
                            '{}:{}'.format(k, v)
                            for k, v in nested_test_stats.items()
                        ])))
            # Best-model selection priority: val loss > test loss > latest.
            if validate:
                if nested_val_loss < best_loss:
                    logger.info('[Condensa] Saving model based on VAL')
                    best_loss = nested_val_loss
                    # Deep-copy required here to preserve dtypes
                    best_model = deepcopy(theta)
            elif test:
                if nested_test_loss < best_loss:
                    logger.info('[Condensa] Saving model based on TEST')
                    best_loss = nested_test_loss
                    # Deep-copy required here to preserve dtypes
                    best_model = deepcopy(theta)
            else:
                logger.info('[Condensa] Saving model based on most recent')
                best_model = deepcopy(theta)
            # theta <- delta(theta)
            delta(theta)
            # LM update
            if mu > 0:
                try:
                    wmodules = w.module.modules()
                except AttributeError:
                    wmodules = w.modules()
                # lm <- lm - mu * (w - theta)
                for w_m, theta_m, lm_m in zip(wmodules, theta.modules(),
                                              lm.modules()):
                    if hasattr(theta_m, 'condense'):
                        for pname in theta_m.condense:
                            getattr(
                                lm_m,
                                pname).data = (getattr(lm_m, pname).data -
                                               mu *
                                               (getattr(w_m, pname).detach() -
                                                getattr(theta_m, pname).data))
            optimizer.reset_state()
            # Update mu
            mu = self._update_mu(mu, self.mu_init, self.mu_multiplier,
                                 self.mu_cap)
            # Update LR schedule
            if outer_lr_scheduler is not None:
                outer_lr_scheduler.step()

        statistics['elapsed_lc'] = timer_lc.elapsed_seconds
        statistics['train_losses'] = train_losses
        if test: statistics['test_losses'] = test_losses
        if validate: statistics['val_losses'] = val_losses
        return best_model, statistics
def inference(args):
    """Run DukeNet inference, polling for checkpoints produced by training.

    Loads (or builds and caches) the preprocessed dataset pickles, then
    either evaluates one appointed epoch (``args.appoint_epoch != -1``) or
    loops: reading ``checkpoints.json`` written by the trainer, running
    inference for every epoch not yet recorded in
    ``finished_inference.json``, and sleeping until all ``args.epoches``
    epochs are done.
    """
    cudnn.enabled = True
    cudnn.benchmark = True
    cudnn.deterministic = True
    print("torch_version:{}".format(torch.__version__))
    print("CUDA_version:{}".format(torch.version.cuda))
    print("cudnn_version:{}".format(cudnn.version()))
    init_seed(123456)

    data_path = args.base_data_path + args.dataset + '/'
    tokenizer, vocab2id, id2vocab = bert_tokenizer()
    detokenizer = bert_detokenizer()
    print('Vocabulary size', len(vocab2id))

    if os.path.exists(data_path + 'dev_DukeNet.pkl'):
        # Cached preprocessed data found — load pickles directly.
        query = torch.load(data_path + 'query_DukeNet.pkl')
        passage = torch.load(data_path + 'passage_DukeNet.pkl')
        dev_samples = torch.load(data_path + 'dev_DukeNet.pkl')
        print("The number of dev_samples:", len(dev_samples))
        if args.dataset == "wizard_of_wikipedia":
            test_seen_samples = torch.load(data_path + 'test_seen_DukeNet.pkl')
            test_unseen_samples = torch.load(data_path +
                                             'test_unseen_DukeNet.pkl')
            print("The number of test_seen_samples:", len(test_seen_samples))
            print("The number of test_unseen_samples:",
                  len(test_unseen_samples))
        elif args.dataset == "holl_e":
            test_samples = torch.load(data_path + 'test_DukeNet.pkl')
            print("The number of test_samples:", len(test_samples))
    else:
        # No cache — preprocess raw dataset files and persist the splits.
        samples, query, passage = load_default(
            args.dataset, data_path + args.dataset + '.answer',
            data_path + args.dataset + '.passage',
            data_path + args.dataset + '.pool',
            data_path + args.dataset + '.qrel',
            data_path + args.dataset + '.query', tokenizer)
        if args.dataset == "wizard_of_wikipedia":
            train_samples, dev_samples, test_seen_samples, test_unseen_samples = split_data(
                args.dataset, data_path + args.dataset + '.split', samples)
            print("The number of test_seen_samples:", len(test_seen_samples))
            print("The number of test_unseen_samples:",
                  len(test_unseen_samples))
            torch.save(test_seen_samples, data_path + 'test_seen_DukeNet.pkl')
            torch.save(test_unseen_samples,
                       data_path + 'test_unseen_DukeNet.pkl')
        elif args.dataset == "holl_e":
            train_samples, dev_samples, test_samples, = split_data(
                args.dataset, data_path + args.dataset + '.split', samples)
            print("The number of test_samples:", len(test_samples))
            torch.save(test_samples, data_path + 'test_DukeNet.pkl')
        print("The number of train_samples:", len(train_samples))
        print("The number of dev_samples:", len(dev_samples))
        torch.save(query, data_path + 'query_DukeNet.pkl')
        torch.save(passage, data_path + 'passage_DukeNet.pkl')
        torch.save(train_samples, data_path + 'train_DukeNet.pkl')
        torch.save(dev_samples, data_path + 'dev_DukeNet.pkl')

    # Build evaluation Dataset objects for the configured benchmark.
    if args.dataset == "wizard_of_wikipedia":
        dev_dataset = Dataset(args.mode, dev_samples, query, passage,
                              vocab2id, args.max_knowledge_pool_when_train,
                              args.max_knowledge_pool_when_inference,
                              args.context_len, args.knowledge_sentence_len,
                              args.max_dec_length)
        test_seen_dataset = Dataset(args.mode, test_seen_samples, query,
                                    passage, vocab2id,
                                    args.max_knowledge_pool_when_train,
                                    args.max_knowledge_pool_when_inference,
                                    args.context_len,
                                    args.knowledge_sentence_len,
                                    args.max_dec_length)
        test_unseen_dataset = Dataset(args.mode, test_unseen_samples, query,
                                      passage, vocab2id,
                                      args.max_knowledge_pool_when_train,
                                      args.max_knowledge_pool_when_inference,
                                      args.context_len,
                                      args.knowledge_sentence_len,
                                      args.max_dec_length)
    elif args.dataset == "holl_e":
        test_dataset = Dataset(args.mode, test_samples, query, passage,
                               vocab2id, args.max_knowledge_pool_when_train,
                               args.max_knowledge_pool_when_inference,
                               args.context_len, args.knowledge_sentence_len,
                               args.max_dec_length)

    saved_model_path = os.path.join(args.base_output_path + args.name + "/",
                                    'model/')

    # NOTE(review): this nested helper shadows the enclosing function name.
    # It silently does nothing when the checkpoint file is missing.
    def inference(dataset, epoch=None):
        # Checkpoint path for the requested epoch.
        file = saved_model_path + str(epoch) + '.pkl'
        if os.path.exists(file):
            model = DukeNet(vocab2id, id2vocab, args)
            model.load_state_dict(torch.load(file)["model"])
            trainer = CumulativeTrainer(args.name, model, tokenizer,
                                        detokenizer, None)
            if dataset == "wizard_of_wikipedia":
                print('inference {}'.format("dev_dataset"))
                trainer.test('inference', dev_dataset, collate_fn,
                             args.inference_batch_size, 'dev', str(epoch),
                             output_path=args.base_output_path + args.name +
                             "/")
                print('inference {}'.format("test_seen_dataset"))
                trainer.test('inference', test_seen_dataset, collate_fn,
                             args.inference_batch_size, 'test_seen',
                             str(epoch),
                             output_path=args.base_output_path + args.name +
                             "/")
                print('inference {}'.format("test_unseen_dataset"))
                trainer.test('inference', test_unseen_dataset, collate_fn,
                             args.inference_batch_size, 'test_unseen',
                             str(epoch),
                             output_path=args.base_output_path + args.name +
                             "/")
            elif dataset == "holl_e":
                print('inference {}'.format("test_dataset"))
                trainer.test('inference', test_dataset, collate_fn,
                             args.inference_batch_size, 'test', str(epoch),
                             output_path=args.base_output_path + args.name +
                             "/")

    # Initialize the bookkeeping file of completed epochs.
    if not os.path.exists(saved_model_path + "finished_inference.json"):
        finished_inference = {"time": []}
        w = open(saved_model_path + "finished_inference.json",
                 'w',
                 encoding='utf-8')
        json.dump(finished_inference, w)
        w.close()

    # One-shot mode: evaluate exactly the appointed epoch, record it, exit.
    if args.appoint_epoch != -1:
        print('Start inference at epoch', args.appoint_epoch)
        inference(args.dataset, args.appoint_epoch)
        r = open(saved_model_path + "finished_inference.json",
                 'r',
                 encoding='utf-8')
        finished_inference = json.load(r)
        r.close()
        finished_inference["time"].append(args.appoint_epoch)
        w = open(saved_model_path + "finished_inference.json",
                 'w',
                 encoding='utf-8')
        json.dump(finished_inference, w)
        w.close()
        print("finished epoch {} inference".format(args.appoint_epoch))
        exit()

    # Polling mode: follow the trainer's checkpoints.json until all epochs
    # have been evaluated.
    while True:
        with open(saved_model_path + "checkpoints.json", 'r',
                  encoding='utf-8') as r:
            checkpoints = json.load(r)
        r = open(saved_model_path + "finished_inference.json",
                 'r',
                 encoding='utf-8')
        finished_inference = json.load(r)
        r.close()
        if len(checkpoints["time"]) == 0:
            print('Inference_mode: wait train finish the first epoch...')
            time.sleep(300)
        else:
            for i in checkpoints["time"]:  # i is the index of epoch
                if i in finished_inference["time"]:
                    print("epoch {} already has been inferenced, skip it".
                          format(i))
                    pass
                else:
                    print('Start inference at epoch', i)
                    inference(args.dataset, i)
                    # Re-read before appending to pick up concurrent updates.
                    r = open(saved_model_path + "finished_inference.json",
                             'r',
                             encoding='utf-8')
                    finished_inference = json.load(r)
                    r.close()
                    finished_inference["time"].append(i)
                    w = open(saved_model_path + "finished_inference.json",
                             'w',
                             encoding='utf-8')
                    json.dump(finished_inference, w)
                    w.close()
                    print("finished epoch {} inference".format(i))
            print(
                "Inference_mode: current all model checkpoints are completed..."
            )
            print("Inference_mode: finished %d modes" %
                  len(finished_inference["time"]))
            if len(finished_inference["time"]) == args.epoches:
                print("All inference is ended")
                break
            else:
                print('Inference_mode: wait train finish the next epoch...')
                time.sleep(300)
def train(args):
    """Train DukeNet end to end.

    Loads (or builds and caches) the preprocessed dataset pickles, creates
    the model, optionally resumes from the last checkpoint recorded in
    ``checkpoints.json``, then runs ``args.epoches`` epochs; after epoch 5
    the BERT encoder is unfrozen and the effective batch configuration is
    adjusted.

    Bug fixed: the raw-data branch passed ``args.datasetdata_path + ...`` as
    the '.answer' file path — a typo for the local ``data_path`` (every
    sibling path argument, and the identical ``load_default`` call in
    ``inference``, uses ``data_path``), which raised AttributeError whenever
    the cached train pickle was absent.
    """
    cudnn.enabled = True
    cudnn.benchmark = True
    cudnn.deterministic = True
    print("torch_version:{}".format(torch.__version__))
    print("CUDA_version:{}".format(torch.version.cuda))
    print("cudnn_version:{}".format(cudnn.version()))
    init_seed(123456)

    data_path = args.base_data_path + args.dataset + '/'
    tokenizer, vocab2id, id2vocab = bert_tokenizer()
    detokenizer = bert_detokenizer()
    print('Vocabulary size', len(vocab2id))

    if os.path.exists(data_path + 'train_DukeNet.pkl'):
        # Cached preprocessed data found — load pickles directly.
        query = torch.load(data_path + 'query_DukeNet.pkl')
        train_samples = torch.load(data_path + 'train_DukeNet.pkl')
        passage = torch.load(data_path + 'passage_DukeNet.pkl')
        print("The number of train_samples:", len(train_samples))
    else:
        # No cache — preprocess raw dataset files and persist the splits.
        # BUG FIX: was `args.datasetdata_path + args.dataset + '.answer'`.
        samples, query, passage = load_default(
            args.dataset, data_path + args.dataset + '.answer',
            data_path + args.dataset + '.passage',
            data_path + args.dataset + '.pool',
            data_path + args.dataset + '.qrel',
            data_path + args.dataset + '.query', tokenizer)
        if args.dataset == "wizard_of_wikipedia":
            train_samples, dev_samples, test_seen_samples, test_unseen_samples = split_data(
                args.dataset, data_path + args.dataset + '.split', samples)
            print("The number of test_seen_samples:", len(test_seen_samples))
            print("The number of test_unseen_samples:",
                  len(test_unseen_samples))
            torch.save(test_seen_samples, data_path + 'test_seen_DukeNet.pkl')
            torch.save(test_unseen_samples,
                       data_path + 'test_unseen_DukeNet.pkl')
        elif args.dataset == "holl_e":
            train_samples, dev_samples, test_samples, = split_data(
                args.dataset, data_path + args.dataset + '.split', samples)
            print("The number of test_samples:", len(test_samples))
            torch.save(test_samples, data_path + 'test_DukeNet.pkl')
        print("The number of train_samples:", len(train_samples))
        print("The number of dev_samples:", len(dev_samples))
        torch.save(query, data_path + 'query_DukeNet.pkl')
        torch.save(passage, data_path + 'passage_DukeNet.pkl')
        torch.save(train_samples, data_path + 'train_DukeNet.pkl')
        torch.save(dev_samples, data_path + 'dev_DukeNet.pkl')

    model = DukeNet(vocab2id, id2vocab, args)
    saved_model_path = os.path.join(args.base_output_path + args.name + "/",
                                    'model/')

    if args.resume is True:
        # Resume from the most recent epoch recorded by a previous run.
        print("Reading checkpoints...")
        with open(saved_model_path + "checkpoints.json", 'r',
                  encoding='utf-8') as r:
            checkpoints = json.load(r)
        last_epoch = checkpoints["time"][-1]
        fuse_dict = torch.load(
            os.path.join(saved_model_path,
                         '.'.join([str(last_epoch), 'pkl'])))
        model.load_state_dict(fuse_dict["model"])
        print('Loading success, last_epoch is {}'.format(last_epoch))
    else:
        # Fresh run: initialize then freeze the encoder for warm-up epochs.
        init_params(model, "enc")
        freeze_params(model, "enc")
        last_epoch = -1
        if not os.path.exists(saved_model_path):
            os.makedirs(saved_model_path)
        with open(saved_model_path + "checkpoints.json", 'w',
                  encoding='utf-8') as w:
            checkpoints = {"time": []}
            json.dump(checkpoints, w)

    # construct an optimizer object
    # model.parameters() returns an iterator over module parameters;
    # this is typically passed to an optimizer.
    model_optimizer = optim.Adam(model.parameters(), args.lr)
    model_scheduler = get_constant_schedule(model_optimizer)
    if args.resume is True:
        model_scheduler.load_state_dict(fuse_dict["scheduler"])
        print('Loading scheduler, last_scheduler is', fuse_dict["scheduler"])

    trainer = CumulativeTrainer(args.name, model, tokenizer, detokenizer,
                                args.local_rank,
                                accumulation_steps=args.accumulation_steps)
    model_optimizer.zero_grad()  # clear any stale gradients before training

    for i in range(last_epoch + 1, args.epoches):
        if i == 5:
            # End of warm-up: unfreeze the encoder and trade batch size for
            # more gradient accumulation to fit the larger trainable model.
            unfreeze_params(model, "enc")
            args.train_batch_size = 2
            args.accumulation_steps = 16
        train_dataset = Dataset(args.mode, train_samples, query, passage,
                                vocab2id, args.max_knowledge_pool_when_train,
                                args.max_knowledge_pool_when_inference,
                                args.context_len, args.knowledge_sentence_len,
                                args.max_dec_length)
        trainer.train_epoch('train', train_dataset, collate_fn,
                            args.train_batch_size, i, model_optimizer,
                            model_scheduler)
        del train_dataset  # free memory before checkpointing
        trainer.serialize(i, model_scheduler,
                          saved_model_path=saved_model_path)
if 'val' in args.test_idx: args.rel_path = True args.save_dir = './val_submissions' # manual seed args.manual_seed = random.randint(0, 10000) # fix seed print("Random Seed: ", args.manual_seed) random.seed(args.manual_seed) np.random.seed(args.manual_seed) torch.manual_seed(args.manual_seed) args.cuda = torch.cuda.is_available() if args.cuda: print('using cuda') if cudnn.enabled: cudnn.benchmark = True print('using cudnn {}'.format(cudnn.version())) print(args) # Data augmentation and normalization for training # Just normalization for validation (scale_size, crop_size) = (370, 299) if 'inception_v3' in args.arch else (256, 224) # create the dataloader for test image idx_files = args.test_idx if args.ten_crop: # TODO: test mix mode dsets = get_augmented_test_set(data_root=args.data_root, idx_file=idx_files, scale_size=scale_size,
def main():
    """
    --------------------------------------------- MAIN --------------------------------------------------------
    Instantiates the model plus loss function and defines the dataloaders for several datasets including
    some data augmentation. Defines the grid for a grid search on lambda_max_divrs and initial_centroid_value_multipliers
    which both have a big influence on the sparsity (and respectively accuracy) of the resulting ternary networks.
    Starts grid search.
    """

    # Manual seed for reproducibility
    torch.manual_seed(363636)

    # Global instances shared with the rest of the module (e.g. grid_search)
    global args, use_cuda, device

    # Instantiating the parser
    args = parser.parse_args()

    # Global CUDA flag: GPU is used only when both requested and available
    use_cuda = args.cuda and torch.cuda.is_available()

    # Defining device and device's map location
    device = torch.device("cuda" if use_cuda else "cpu")
    print('chosen device: ', device)

    # Building the model. The micronet variants are scaled with compound
    # depth/width multipliers raised to the power args.phi.
    if args.model == 'cifar_micronet':
        print('Building MicroNet for CIFAR-100 with depth multiplier {} and width multiplier {} ...'.format(
            args.dw_multps[0] ** args.phi, args.dw_multps[1] ** args.phi))
        model = micronet(args.dw_multps[0] ** args.phi, args.dw_multps[1] ** args.phi)
    elif args.model == 'imagenet_micronet':
        print('Building MicroNet for ImageNet with depth multiplier {} and width multiplier {} ...'.format(
            args.dw_multps[0] ** args.phi, args.dw_multps[1] ** args.phi))
        model = image_micronet(args.dw_multps[0] ** args.phi, args.dw_multps[1] ** args.phi)
    elif args.model == 'efficientnet-b1':
        print('Building EfficientNet-B1 ...')
        model = EfficientNet.efficientnet_b1()
    elif args.model == 'efficientnet-b2':
        print('Building EfficientNet-B2 ...')
        model = EfficientNet.efficientnet_b2()
    elif args.model == 'efficientnet-b3':
        print('Building EfficientNet-B3 ...')
        model = EfficientNet.efficientnet_b3()
    elif args.model == 'efficientnet-b4':
        print('Building EfficientNet-B4 ...')
        model = EfficientNet.efficientnet_b4()

    # Log all parameter names (debug aid).
    # NOTE(review): if args.model matches none of the branches above, `model`
    # is unbound here and this loop raises NameError — confirm the parser
    # restricts --model to the handled choices.
    for name, param in model.named_parameters():
        print('\n', name)

    # Transfers model to device (GPU/CPU).
    model.to(device)

    # Defining loss function and printing CUDA information (if available)
    if use_cuda:
        print("PyTorch version: ")
        print(torch.__version__)
        print("CUDA Version: ")
        print(torch.version.cuda)
        print("cuDNN version is: ")
        print(cudnn.version())
        # benchmark=True lets cuDNN autotune the fastest conv algorithms
        # (results may vary slightly between runs).
        cudnn.benchmark = True
        loss_fct = nn.CrossEntropyLoss().cuda()
    else:
        loss_fct = nn.CrossEntropyLoss()

    # Dataloaders for CIFAR, ImageNet and MNIST.
    # Data augmentation is applied to the training split only; validation gets
    # plain normalization (plus resize/crop for ImageNet).
    if args.dataset == 'CIFAR100':
        print('Loading CIFAR-100 data ...')
        # Channel statistics given on the [0, 255] scale, rescaled to [0, 1].
        normalize = transforms.Normalize(mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
                                         std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
        kwargs = {'num_workers': args.workers, 'pin_memory': True} if use_cuda else {}
        train_loader = torch.utils.data.DataLoader(
            datasets.CIFAR100(root=args.data_path, train=True, transform=transforms.Compose([
                transforms.RandomHorizontalFlip(),
                transforms.RandomCrop(32, 4),
                transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.075),
                transforms.ToTensor(),
                normalize,
                Cutout(n_holes=1, length=16),
            ]), download=True),
            batch_size=args.batch_size, shuffle=True, **kwargs)
        val_loader = torch.utils.data.DataLoader(
            datasets.CIFAR100(root=args.data_path, train=False, transform=transforms.Compose([
                transforms.ToTensor(),
                normalize,
            ])),
            batch_size=args.val_batch_size, shuffle=False, **kwargs)
    elif args.dataset == 'ImageNet':
        print('Loading ImageNet data ...')
        # Expects the standard ImageNet layout: <data_path>/train, <data_path>/val.
        traindir = os.path.join(args.data_path, 'train')
        valdir = os.path.join(args.data_path, 'val')
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        train_dataset = datasets.ImageFolder(
            traindir,
            transforms.Compose([
                transforms.RandomResizedCrop(args.image_size),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]))
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=args.batch_size, shuffle=True,
            num_workers=args.workers, pin_memory=True)
        # EfficientNet models carry their own native input resolution; other
        # architectures validate at the user-provided size.
        if model.__class__.__name__ == 'EfficientNet' or 'efficientnet' in str(args.model):
            image_size = EfficientNet.get_image_size(args.model)
        else:
            image_size = args.image_size
        val_dataset = datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(image_size, interpolation=PIL.Image.BICUBIC),
                transforms.CenterCrop(image_size),
                transforms.ToTensor(),
                normalize,
            ]))
        val_loader = torch.utils.data.DataLoader(
            val_dataset, batch_size=args.val_batch_size, shuffle=False,
            num_workers=args.workers, pin_memory=True)
    elif args.dataset == 'MNIST':
        kwargs = {'num_workers': args.workers, 'pin_memory': True} if use_cuda else {}
        train_loader = torch.utils.data.DataLoader(
            datasets.MNIST(args.data_path, train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ])),
            batch_size=args.batch_size, shuffle=True, **kwargs)
        # NOTE(review): shuffle=True on the validation loader is unusual
        # (harmless for accuracy, but inconsistent with the other datasets) —
        # confirm whether this is intentional.
        val_loader = torch.utils.data.DataLoader(
            datasets.MNIST(args.data_path, train=False,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ])),
            batch_size=args.val_batch_size, shuffle=True, **kwargs)
    elif args.dataset == 'CIFAR10':
        normalize = transforms.Normalize(mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
                                         std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
        kwargs = {'num_workers': args.workers, 'pin_memory': True} if use_cuda else {}
        # NOTE(review): unlike the CIFAR-100 branch, no ColorJitter/Cutout
        # augmentation is applied here — confirm the asymmetry is intended.
        train_loader = torch.utils.data.DataLoader(
            datasets.CIFAR10(root=args.data_path, train=True, transform=transforms.Compose([
                transforms.RandomHorizontalFlip(),
                transforms.RandomCrop(32, 4),
                transforms.ToTensor(),
                normalize,
            ]), download=True),
            batch_size=args.batch_size, shuffle=True, **kwargs)
        val_loader = torch.utils.data.DataLoader(
            datasets.CIFAR10(root=args.data_path, train=False, transform=transforms.Compose([
                transforms.ToTensor(),
                normalize,
            ])),
            batch_size=args.val_batch_size, shuffle=False, **kwargs)
    else:
        raise NotImplementedError('Undefined dataset name %s' % args.dataset)

    # Gridsearch on dividers for lambda_max and initial cluster center values.
    for initial_c_divr in args.ini_c_divrs:
        for lambda_max_divr in args.lambda_max_divrs:
            print('lambda_max_divr: {}, initial_c_divr: {}'.format(lambda_max_divr, initial_c_divr))
            # NOTE(review): the logfile handle is reopened every iteration and
            # never closed/flushed — consider a `with` block.
            logfile = open('./model_quantization/logfiles/logfile.txt', 'a+')
            logfile.write('lambda_max_divr: {}, initial_c_divr: {}'.format(lambda_max_divr, initial_c_divr))
            grid_search(train_loader, val_loader, model, loss_fct, lambda_max_divr, initial_c_divr)
else: handlers = [logging.StreamHandler()] """ add '%(filename)s' to format show source file """ logging.basicConfig(level=logging.DEBUG if debug_mode else logging.INFO, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', handlers=handlers) if __name__ == '__main__': # set args args = parser.parse_args() set_logger(log_file=args.log_file, debug_mode=args.debug_mode) logging.info("Cudnn Version: {}".format(cudnn.version())) cudnn.benchmark = True logging.info("Start evaluation with args:\n" + json.dumps(vars(args), indent=4, sort_keys=True)) # set device states os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpus) # before using torch assert torch.cuda.is_available(), "CUDA is not available" # creat model sym_net, input_config = get_symbol(name=args.network, num_classes=101) # network if torch.cuda.is_available(): sym_net = sym_net.cuda() net = static_model(net=sym_net)
# CaSE model hyper-parameters and training configuration.
parser.add_argument("--dataset", type=str)
parser.add_argument("--output_path", type=str, default='./output/CaSE/')
parser.add_argument("--embedding_size", type=int, default=256)
parser.add_argument("--hidden_size", type=int, default=256)
parser.add_argument("--max_span_size", type=int, default=4)
parser.add_argument("--max_target_length", type=int, default=40)
parser.add_argument("--min_window_size", type=int, default=4)
parser.add_argument("--num_windows", type=int, default=1)
parser.add_argument("--accumulation_steps", type=int, default=1)
parser.add_argument("--epoch", type=int, default=20)
parser.add_argument("--batch_size", type=int, default=16)
parser.add_argument("--num_gpu", type=int, default=4)
args = parser.parse_args()

if torch.cuda.is_available():
    # Distributed setup: one process per GPU, rendezvous through environment
    # variables (MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE), i.e. the script is
    # presumably launched via torch.distributed.launch / torchrun — confirm.
    torch.distributed.init_process_group(backend='NCCL', init_method='env://')
    cudnn.enabled = True
    # NOTE(review): benchmark=True and deterministic=True pull in opposite
    # directions (benchmark autotunes algorithms non-deterministically) —
    # confirm which property is actually wanted here.
    cudnn.benchmark = True
    cudnn.deterministic = True
    print(torch.__version__)
    print(torch.version.cuda)
    print(cudnn.version())

# Fixed seed for reproducibility.
init_seed(123456)

# Dispatch on the run mode (--mode is expected to be registered on the
# parser above this fragment — verify against the full file).
if args.mode == 'test':
    test(args)
elif args.mode == 'train':
    train(args)
def main():
    """
    Build the selected model, set up the loss function and dataset-specific
    dataloaders (with training-time augmentation), then start training with a
    frozen cluster assignment.
    """
    # Manual seed for reproducibility
    torch.manual_seed(363636)

    # Global instances shared with the rest of the module
    global args, use_cuda, device

    # Instantiating the parser
    args = parser.parse_args()

    # Global CUDA flag: GPU is used only when both requested and available
    use_cuda = args.cuda and torch.cuda.is_available()

    # Defining device and device's map location
    device = torch.device("cuda" if use_cuda else "cpu")
    print('chosen device: ', device)

    # Building the model. micronet/lenet variants are scaled with compound
    # depth/width multipliers raised to the power args.phi.
    if args.model == 'cifar_micronet':
        print(
            'Building MicroNet for CIFAR with depth multiplier {} and width multiplier {} ...'
            .format(args.dw_multps[0]**args.phi, args.dw_multps[1]**args.phi))
        # NOTE(review): num_classes is unbound (NameError below) if
        # cifar_micronet is combined with a non-CIFAR dataset — confirm the
        # parser/caller forbids that combination.
        if args.dataset == 'CIFAR100':
            num_classes = 100
        elif args.dataset == 'CIFAR10':
            num_classes = 10
        model = micronet(args.dw_multps[0]**args.phi,
                         args.dw_multps[1]**args.phi, num_classes)
    elif args.model == 'image_micronet':
        print(
            'Building MicroNet for ImageNet with depth multiplier {} and width multiplier {} ...'
            .format(args.dw_multps[0]**args.phi, args.dw_multps[1]**args.phi))
        model = image_micronet(args.dw_multps[0]**args.phi,
                               args.dw_multps[1]**args.phi)
    elif args.model == 'efficientnet-b1':
        print('Building EfficientNet-B1 ...')
        model = EfficientNet.efficientnet_b1()
    elif args.model == 'efficientnet-b2':
        print('Building EfficientNet-B2 ...')
        model = EfficientNet.efficientnet_b2()
    elif args.model == 'efficientnet-b3':
        print('Building EfficientNet-B3 ...')
        model = EfficientNet.efficientnet_b3()
    elif args.model == 'efficientnet-b4':
        print('Building EfficientNet-B4 ...')
        model = EfficientNet.efficientnet_b4()
    elif args.model == 'lenet-5':
        print(
            'Building LeNet-5 with depth multiplier {} and width multiplier {} ...'
            .format(args.dw_multps[0]**args.phi, args.dw_multps[1]**args.phi))
        model = lenet5(d_multiplier=args.dw_multps[0]**args.phi,
                       w_multiplier=args.dw_multps[1]**args.phi)

    # Log all parameter names (debug aid).
    # NOTE(review): `model` is unbound here if args.model matched no branch —
    # confirm the parser restricts --model to the handled choices.
    for name, param in model.named_parameters():
        print('\n', name)

    # Transfers model to device (GPU/CPU).
    model.to(device)

    # Defining loss function and printing CUDA information (if available)
    if use_cuda:
        print("PyTorch version: ")
        print(torch.__version__)
        print("CUDA Version: ")
        print(torch.version.cuda)
        print("cuDNN version is: ")
        print(cudnn.version())
        # benchmark=True lets cuDNN autotune conv algorithms (results may
        # vary slightly between runs).
        cudnn.benchmark = True
        loss_fct = nn.CrossEntropyLoss().cuda()
    else:
        loss_fct = nn.CrossEntropyLoss()

    # Dataloaders for CIFAR, ImageNet and MNIST.
    # Augmentation is applied to the training split only; validation gets
    # plain normalization (plus resize/crop for ImageNet).
    if args.dataset == 'CIFAR100':
        print('Loading CIFAR-100 data ...')
        # Channel statistics given on the [0, 255] scale, rescaled to [0, 1].
        normalize = transforms.Normalize(
            mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
            std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
        kwargs = {
            'num_workers': args.workers,
            'pin_memory': True
        } if use_cuda else {}
        train_loader = torch.utils.data.DataLoader(datasets.CIFAR100(
            root=args.data_path,
            train=True,
            transform=transforms.Compose([
                transforms.RandomHorizontalFlip(),
                transforms.RandomCrop(32, 4),
                transforms.ColorJitter(brightness=0.3,
                                       contrast=0.3,
                                       saturation=0.3,
                                       hue=0.075),
                transforms.ToTensor(),
                normalize,
                Cutout(n_holes=1, length=16),
            ]),
            download=True),
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   **kwargs)
        val_loader = torch.utils.data.DataLoader(
            datasets.CIFAR100(root=args.data_path,
                              train=False,
                              transform=transforms.Compose([
                                  transforms.ToTensor(),
                                  normalize,
                              ])),
            batch_size=args.val_batch_size,
            shuffle=False,
            **kwargs)
    elif args.dataset == 'ImageNet':
        print('Loading ImageNet data ...')
        # Expects the standard ImageNet layout: <data_path>/train, <data_path>/val.
        traindir = os.path.join(args.data_path, 'train')
        valdir = os.path.join(args.data_path, 'val')
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        train_dataset = datasets.ImageFolder(
            traindir,
            transforms.Compose([
                transforms.RandomResizedCrop(args.image_size),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]))
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True)
        # EfficientNet models carry their own native input resolution; other
        # architectures validate at the user-provided size.
        if model.__class__.__name__ == 'EfficientNet' or 'efficientnet' in str(
                args.model):
            image_size = EfficientNet.get_image_size(args.model)
        else:
            image_size = args.image_size
        val_dataset = datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(image_size,
                                  interpolation=PIL.Image.BICUBIC),
                transforms.CenterCrop(image_size),
                transforms.ToTensor(),
                normalize,
            ]))
        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=args.val_batch_size,
            shuffle=False,
            num_workers=args.workers,
            pin_memory=True)
    elif args.dataset == 'MNIST':
        kwargs = {
            'num_workers': args.workers,
            'pin_memory': True
        } if use_cuda else {}
        train_loader = torch.utils.data.DataLoader(datasets.MNIST(
            args.data_path,
            train=True,
            download=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ])),
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   **kwargs)
        # NOTE(review): shuffle=True on the validation loader is unusual
        # (harmless for accuracy, but inconsistent with the other datasets) —
        # confirm whether this is intentional.
        val_loader = torch.utils.data.DataLoader(
            datasets.MNIST(args.data_path,
                           train=False,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307, ), (0.3081, ))
                           ])),
            batch_size=args.val_batch_size,
            shuffle=True,
            **kwargs)
    elif args.dataset == 'CIFAR10':
        print('Loading CIFAR-10 data ...')
        normalize = transforms.Normalize(
            mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
            std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
        kwargs = {
            'num_workers': args.workers,
            'pin_memory': True
        } if use_cuda else {}
        train_loader = torch.utils.data.DataLoader(datasets.CIFAR10(
            root=args.data_path,
            train=True,
            transform=transforms.Compose([
                transforms.RandomHorizontalFlip(),
                transforms.RandomCrop(32, 4),
                transforms.ColorJitter(brightness=0.3,
                                       contrast=0.3,
                                       saturation=0.3,
                                       hue=0.075),
                transforms.ToTensor(),
                normalize,
                Cutout(n_holes=1, length=16),
            ]),
            download=True),
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   **kwargs)
        val_loader = torch.utils.data.DataLoader(
            datasets.CIFAR10(root=args.data_path,
                             train=False,
                             transform=transforms.Compose([
                                 transforms.ToTensor(),
                                 normalize,
                             ])),
            batch_size=args.val_batch_size,
            shuffle=False,
            **kwargs)
    else:
        raise NotImplementedError('Undefined dataset name %s' % args.dataset)

    # Entry point of the training procedure — presumably quantization-aware
    # training with cluster assignments held fixed; confirm against the
    # definition of train_w_frozen_assignment.
    train_w_frozen_assignment(train_loader, val_loader, model, loss_fct)