def sweep_step(work_items, tgroup_id, I, sigma, new_sigma, coefs, directions, sigma_a, sigma_s):
    I[1:-1, 1:-1, 1:-1, :, -1] = np.nan
    tgroup_id[0] = 0
    # Sweep across the graph for the differencing scheme for the gradient.
    chunk_size = 1024
    num_blocks = (work_items.shape[0] + chunk_size - 1) // chunk_size
    assert I.strides[3] == 4
    I_flat = np.swapaxes(I, 3, 4).ravel()
    sigma_flat = sigma.ravel()
    # direction_offset = 1
    frequency_offset = I.shape[3]
    cp.cuda.get_current_stream().synchronize()
    start = perf_counter()
    cuda.profile_start()
    compute_fluxes[num_blocks, chunk_size, 0, uint_t_nbytes](
        work_items, sigma.shape[0], sigma.shape[1], sigma.shape[2],
        sigma.shape[3], I.shape[4], I_flat, sigma_flat, directions,
        sigma_a + sigma_s, tgroup_id, 1. / I.shape[1],
        I.strides[0] // float_t_nbytes, I.strides[1] // float_t_nbytes,
        I.strides[2] // float_t_nbytes, I.strides[3] // float_t_nbytes,
        I.strides[4] // float_t_nbytes)
    cp.cuda.get_current_stream().synchronize()
    cuda.profile_stop()
    stop = perf_counter()
    print("sweep kernel time:", stop - start)
    # Compute the scattering terms in the collision operator.
    compute_new_scattering(sigma_s, I, coefs, new_sigma)
def train():
    # Launch recv td
    print("worker_id(rank)", worker_id, " size:", str(worker_num), " batch_size=", batch_size)
    init_processes(worker_id, worker_num, 'gloo')
    input("Worker End Connection Initialized")
    sub_net.train()
    inputs = None
    outputs = None
    train_loss = 0
    correct = 0
    total = 0
    iteration_num = 100
    iter_n = 0
    loss = None
    sub_optimizer.zero_grad()
    sta = time.time()
    while True:
        inputs = fake_input.to(device)
        targets = fake_target.to(device)
        outputs = sub_net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        comm_time_sta = time.time()
        para_num = 0
        for name, parameters in sub_net.named_parameters():
            if parameters.grad is not None:
                grad_content = parameters.grad.to("cpu")
                para_num += grad_content.numel()
                dist.all_reduce(tensor=grad_content, op=dist.ReduceOp.SUM)
                grad_content = grad_content / worker_num
                parameters.grad = grad_content.to(device)
        comm_time_ed = time.time()
        sub_optimizer.step()
        sub_optimizer.zero_grad()
        print("iter=", iter_n, " comm_time=", str(comm_time_ed - comm_time_sta))
        if iter_n == 10:
            cuda.profile_start()
            print("cuda profile start...")
        if iter_n == 30:
            cuda.profile_stop()
            print("cuda profile end...")
        iter_n = iter_n + 1
        if iter_n % 10 == 0:
            ed = time.time()
            print("iter_n=", iter_n, " time=", (ed - sta), "comm_num=", para_num)
        if iter_n == iteration_num:
            exit(0)
def every_n_step_begin(self, step):
    if self.ended:
        return
    first_check_step = 305
    last_check_step = 325
    if (not self.started) and step > first_check_step:
        print("Profile Start!")
        self.started = True
        cuda.profile_start()
    elif self.started and step > last_check_step:
        print("Profile End! Calling profile_stop().")
        self.ended = True
        cuda.profile_stop()
        print("Done calling profile_stop().")
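# Editor's sketch (not from the original source): how a step-window hook like the one
# above is typically driven from a training loop. `ProfilerHook`, `run_training_step`,
# and `num_steps` are hypothetical names used only for illustration.
hook = ProfilerHook()                 # object exposing .started / .ended and every_n_step_begin()
for step in range(num_steps):
    hook.every_n_step_begin(step)     # opens the capture window after step 305, closes it after 325
    run_training_step(step)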
def train():
    # Launch recv td
    print("worker_id(rank)", worker_id, " size:", str(worker_num), " batch_size=", batch_size)
    init_processes(worker_id, worker_num, 'gloo')
    print("Worker End Connection Initialized")
    global sub_net, sub_optimizer, device
    is_cpu_mode = False
    sub_net.train()
    inputs = None
    outputs = None
    train_loss = 0
    correct = 0
    total = 0
    iteration_num = 100
    iter_n = 0
    loss = None
    sub_optimizer.zero_grad()
    sta = time.time()
    with torch.autograd.profiler.emit_nvtx():
        cuda.profile_start()
        while iter_n <= 10:
            inputs = fake_input.to(device)
            targets = fake_target.to(device)
            outputs = sub_net(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            comm_time_sta = time.time()
            for name, parameters in sub_net.named_parameters():
                if parameters.grad is not None:
                    grad_content = parameters.grad.to("cpu")
                    dist.all_reduce(tensor=grad_content, op=dist.ReduceOp.SUM)
                    grad_content = grad_content / worker_num
                    parameters.grad = grad_content.to(device)
            comm_time_ed = time.time()
            sub_optimizer.step()
            sub_optimizer.zero_grad()
            print("iter=", iter_n)
            iter_n = iter_n + 1
            if iter_n == 5:
                print("Stop")
                cuda.profile_stop()
            if iter_n % 10 == 0:
                ed = time.time()
                print("iter_n=", iter_n, " time=", (ed - sta))
def main(self):
    self.stats.start()
    self.dynamic_adjustment.start()

    if Config.PLAY_MODE:
        for trainer in self.trainers:
            trainer.enabled = False

    learning_rate_multiplier = (
        Config.LEARNING_RATE_END - Config.LEARNING_RATE_START) / Config.ANNEALING_EPISODE_COUNT
    beta_multiplier = (Config.BETA_END - Config.BETA_START) / Config.ANNEALING_EPISODE_COUNT

    while self.stats.episode_count.value < Config.EPISODES:
        # CUDA PROFILING - GUY
        if self.stats.episode_count.value == 1000:
            cuda.profile_start()
        if self.stats.episode_count.value == 2000:
            cuda.profile_stop()
        # CUDA PROFILING - GUY

        step = min(self.stats.episode_count.value, Config.ANNEALING_EPISODE_COUNT - 1)
        self.model.learning_rate = Config.LEARNING_RATE_START + learning_rate_multiplier * step
        self.model.beta = Config.BETA_START + beta_multiplier * step

        # Saving is async - even if we start saving at a given episode, we may save the model at a later episode
        if Config.SAVE_MODELS and self.stats.should_save_model.value > 0:
            self.save_model()
            self.stats.should_save_model.value = 0

        time.sleep(0.01)

    self.dynamic_adjustment.exit_flag = True
    while self.agents:
        self.remove_agent()
    while self.predictors:
        self.remove_predictor()
    while self.trainers:
        self.remove_trainer()
def _fit(self,
         train_iter: data_io.ParallelBucketSentenceIter,
         val_iter: data_io.ParallelBucketSentenceIter,
         output_folder: str,
         max_params_files_to_keep: int,
         metrics: List[AnyStr],
         max_updates: int,
         checkpoint_frequency: int,
         max_num_not_improved: int,
         min_num_epochs: Optional[int] = None,
         # <EcoSys> Parametrizing profiler
         profiler_on: bool = False,
         profiler_start: int = 4500,
         profiler_stop: int = 4600,
         # </EcoSys>
         mxmonitor: Optional[mx.monitor.Monitor] = None):
    """
    Internal fit method. Runtime determined by early stopping.

    :param train_iter: Training data iterator.
    :param val_iter: Validation data iterator.
    :param output_folder: Model output folder.
    :param max_params_files_to_keep: Maximum number of params files to keep in the output folder (last n are kept).
    :param metrics: List of metric names to track on training and validation data.
    :param max_updates: Maximum number of batches to process.
    :param checkpoint_frequency: Frequency of checkpointing.
    :param max_num_not_improved: Maximum number of checkpoints until fitting is stopped if model does not improve,
           -1 for no early stopping.
    :param min_num_epochs: Minimum number of epochs to train, even if validation scores did not improve.
    :param mxmonitor: Optional MXNet monitor instance.
    """
    metric_train = self._create_eval_metric(metrics)
    metric_val = self._create_eval_metric(metrics)

    tic = time.time()

    training_state_dir = os.path.join(output_folder, C.TRAINING_STATE_DIRNAME)
    if os.path.exists(training_state_dir):
        train_state = self.load_checkpoint(training_state_dir, train_iter)
    else:
        train_state = _TrainingState(num_not_improved=0,
                                     epoch=0,
                                     checkpoint=0,
                                     updates=0,
                                     samples=0)

    next_data_batch = train_iter.next()

    logfile = expanduser("~") + "/profiler-" + str(plt.node()) + ".json"
    mx.profiler.profiler_set_config(mode='all', filename=logfile)

    while max_updates == -1 or train_state.updates < max_updates:
        # <EcoSys> Added the profiler start and end point.
        if profiler_on:
            import numba.cuda as cuda
            if train_state.updates == profiler_start:
                cuda.profile_start()
                mx.profiler.profiler_set_state('run')
            if train_state.updates == profiler_stop:
                mx.profiler.profiler_set_state('stop')
                mx.profiler.dump_profile()
                cuda.profile_stop()
                exit()
        # </EcoSys>
        if not train_iter.iter_next():
            train_state.epoch += 1
            train_iter.reset()

        # process batch
        batch = next_data_batch

        if mxmonitor is not None:
            mxmonitor.tic()

        self.module.forward_backward(batch)
        self.module.update()

        if mxmonitor is not None:
            results = mxmonitor.toc()
            if results:
                for _, k, v in results:
                    logger.info('Monitor: Batch [{:d}] {:s} {:s}'.format(
                        train_state.updates, k, v))

        if train_iter.iter_next():
            # pre-fetch next batch
            next_data_batch = train_iter.next()
            self.module.prepare(next_data_batch)

        self.module.update_metric(metric_train, batch.label)

        self.training_monitor.batch_end_callback(train_state.epoch, train_state.updates, metric_train)

        train_state.updates += 1
        train_state.samples += train_iter.batch_size

        if train_state.updates > 0 and train_state.updates % checkpoint_frequency == 0:
            train_state.checkpoint += 1
            self._save_params(output_folder, train_state.checkpoint)
            cleanup_params_files(output_folder, max_params_files_to_keep,
                                 train_state.checkpoint, self.training_monitor.get_best_checkpoint())
            self.training_monitor.checkpoint_callback(train_state.checkpoint, metric_train)

            toc = time.time()
            logger.info("Checkpoint [%d]\tUpdates=%d Epoch=%d Samples=%d Time-cost=%.3f",
                        train_state.checkpoint, train_state.updates, train_state.epoch,
                        train_state.samples, (toc - tic))
            tic = time.time()

            for name, val in metric_train.get_name_value():
                logger.info('Checkpoint [%d]\tTrain-%s=%f', train_state.checkpoint, name, val)
            metric_train.reset()

            # evaluation on validation set
            has_improved, best_checkpoint = self._evaluate(train_state, val_iter, metric_val)
            if self.lr_scheduler is not None:
                self.lr_scheduler.new_evaluation_result(has_improved)

            if has_improved:
                best_path = os.path.join(output_folder, C.PARAMS_BEST_NAME)
                if os.path.lexists(best_path):
                    os.remove(best_path)
                actual_best_fname = C.PARAMS_NAME % best_checkpoint
                os.symlink(actual_best_fname, best_path)
                train_state.num_not_improved = 0
            else:
                train_state.num_not_improved += 1
                logger.info("Model has not improved for %d checkpoints", train_state.num_not_improved)

            stop_fit = False
            if max_num_not_improved >= 0 and train_state.num_not_improved >= max_num_not_improved:
                logger.info("Maximum number of not improved checkpoints (%d) reached: %d",
                            max_num_not_improved, train_state.num_not_improved)
                stop_fit = True

                if min_num_epochs is not None and train_state.epoch < min_num_epochs:
                    logger.info("Minimum number of epochs (%d) not reached yet: %d",
                                min_num_epochs, train_state.epoch)
                    stop_fit = False

            if stop_fit:
                logger.info("Stopping fit")
                self.training_monitor.stop_fit_callback()
                final_training_state_dirname = os.path.join(output_folder, C.TRAINING_STATE_DIRNAME)
                if os.path.exists(final_training_state_dirname):
                    shutil.rmtree(final_training_state_dirname)
                break

            self._checkpoint(train_state, output_folder, train_iter)

    cleanup_params_files(output_folder, max_params_files_to_keep,
                         train_state.checkpoint, self.training_monitor.get_best_checkpoint())
def main():
    args = parse_args()
    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Save configuration to file
    config = {k: v for k, v in args.__dict__.items()}
    config['timestamp'] = "{:.0f}".format(datetime.utcnow().timestamp())
    config['local_timestamp'] = str(datetime.now())
    run_dir = "./run/neumf/{}".format(config['timestamp'])
    print("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()
    # Load Data
    print('Loading data')
    train_dataset = CFTrainDataset(
        os.path.join(args.data, TRAIN_RATINGS_FILENAME), args.negative_samples)
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   pin_memory=True)
    test_ratings = load_test_ratings(os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))
    nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d' %
          (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz, len(test_ratings)))

    # Create model
    model = NeuMF(nb_users, nb_items,
                  mf_dim=args.factors, mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for i in args.layers])
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    # Add optimizer and loss to graph
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    criterion = nn.BCEWithLogitsLoss()

    if use_cuda:
        # Move model and loss to GPU
        model = model.cuda()
        criterion = criterion.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    # Calculate initial Hit Ratio and NDCG
    hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk,
                            use_cuda=use_cuda, processes=args.processes)
    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'.format(
        K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs)))

    for epoch in range(args.epochs):
        model.train()
        losses = utils.AverageMeter()

        begin = time.time()
        loader = tqdm.tqdm(train_dataloader)
        length = len(loader)
        if length < 101:
            print('Exiting, cannot profile the required 100 iterations. '
                  'Please re-run with a smaller batch size.')
            cuda.profile_stop()
            exit()
        for batch_index, (user, item, label) in enumerate(loader):
            if batch_index == length // 2 and epoch == 0:
                print('Starting profiling for 100 iterations.')
                cuda.profile_start()
            if batch_index == length // 2 + 100 and epoch == 0:
                print('Profiling completed, stopping profiling and continuing training.')
                cuda.profile_stop()
            user = torch.autograd.Variable(user, requires_grad=False)
            item = torch.autograd.Variable(item, requires_grad=False)
            label = torch.autograd.Variable(label, requires_grad=False)
            if use_cuda:
                user = user.cuda(async=True)
                item = item.cuda(async=True)
                label = label.cuda(async=True)

            outputs = model(user, item)
            loss = criterion(outputs, label)
            losses.update(loss.data.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Save stats to file
            description = ('Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(epoch, loss=losses))
            loader.set_description(description)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model, test_ratings, test_negs, args.topk,
                                use_cuda=use_cuda, output=valid_results_file,
                                epoch=epoch, processes=args.processes)
        val_time = time.time() - begin
        print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
              ' train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                  epoch=epoch, K=args.topk, hit_rate=np.mean(hits), ndcg=np.mean(ndcgs),
                  train_time=train_time, val_time=val_time))
        if args.threshold is not None:
            if np.mean(hits) >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                return 0
def __call__(self, param):
    import numba.cuda as cuda
    if self.nbatch == param.nbatch and self.nepoch == param.epoch:
        cuda.profile_start()
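# Editor's sketch (not from the original source): a matching stop-side callback in the
# same style, so the capture window opened by the start callback above is closed once a
# target batch/epoch is reached. The class name and constructor arguments are assumptions.
class ProfileStopCallback(object):
    def __init__(self, stop_nbatch, stop_nepoch):
        self.nbatch = stop_nbatch
        self.nepoch = stop_nepoch

    def __call__(self, param):
        import numba.cuda as cuda
        if self.nbatch == param.nbatch and self.nepoch == param.epoch:
            cuda.profile_stop()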
def train(train_loader, model, criterion, optimizer, epoch, args):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    length = len(train_loader)
    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        if i == length // 2:
            print('Starting profiling for 100 iterations.')
            cuda.profile_start()
        if i == length // 2 + 100:
            print('Profiling completed, stopping profiling and exiting.')
            cuda.profile_stop()
            exit()
        # measure data loading time
        data_time.update(time.time() - end)

        if args.gpu is not None:
            input = input.cuda(args.gpu, non_blocking=True)
        target = target.cuda(args.gpu, non_blocking=True)

        # compute output
        output = model(input)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))
        top5.update(acc5[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top1=top1, top5=top5))
#! /usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import print_function, division, absolute_import

from timeit import default_timer as timer

import numpy as np

from numba import jit
from numba.cuda import profile_start, profile_stop

profile_start()


@jit
def mandel(x, y, max_iters):
    """
    Given the real and imaginary parts of a complex number,
    determine if it is a candidate for membership in the Mandelbrot
    set given a fixed number of iterations.
    """
    i = 0
    c = complex(x, y)
    z = 0.0j
    for i in range(max_iters):
        z = z * z + c
        if (z.real * z.real + z.imag * z.imag) >= 4:
            return i

    return 255
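# Editor's note (hedged, not from the original source): numba's profile_start() /
# profile_stop() wrap the CUDA profiler start/stop API, so they only delimit the
# capture window when the profiler is launched with capture initially disabled,
# e.g.  nvprof --profile-from-start off python script.py
# numba also exposes a context-manager form; a minimal sketch:
from numba import cuda

def profiled_region():
    with cuda.profiling():      # enables profiling on entry, disables it on exit
        pass                    # launch the kernels to be captured here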
def main():
    args = parse_arguments()

    if args.use_env and 'LOCAL_RANK' in os.environ:
        args.local_rank = int(os.environ['LOCAL_RANK'])

    random.seed(args.seed + args.local_rank)
    np.random.seed(args.seed + args.local_rank)
    torch.manual_seed(args.seed + args.local_rank)
    torch.cuda.manual_seed(args.seed + args.local_rank)
    worker_init = WorkerInitObj(args.seed + args.local_rank)

    device, args = setup_training(args)
    dllogger.log(step="PARAMETER", data={"Config": [str(args)]})

    # Prepare optimizer
    model, optimizer, lr_scheduler, checkpoint, global_step, criterion = prepare_model_and_optimizer(args, device)

    if is_main_process():
        dllogger.log(step="PARAMETER", data={"SEED": args.seed})

    raw_train_start = time.time()
    if args.do_train:
        if is_main_process():
            dllogger.log(step="PARAMETER", data={"train_start": True})
            dllogger.log(step="PARAMETER", data={"batch_size_per_gpu": args.train_batch_size})
            dllogger.log(step="PARAMETER", data={"learning_rate": args.learning_rate})

        model.train()
        most_recent_ckpts_paths = []
        average_loss = 0.0  # averaged loss every args.log_freq steps
        epoch = 0
        training_steps = 0

        pool = ProcessPoolExecutor(1)

        running_total = 0
        running_count = 0

        # Note: We loop infinitely over epochs, termination is handled via iteration count
        while True:
            thread = None
            if not args.resume_from_checkpoint or epoch > 0 or (args.phase2 and global_step < 1) or args.init_checkpoint:
                files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
                         if os.path.isfile(os.path.join(args.input_dir, f)) and 'training' in f]
                files.sort()
                num_files = len(files)
                random.shuffle(files)
                f_start_id = 0
            else:
                f_start_id = checkpoint['files'][0]
                files = checkpoint['files'][1:]
                args.resume_from_checkpoint = False
                num_files = len(files)

            shared_file_list = {}

            if torch.distributed.is_initialized() and torch.distributed.get_world_size() > num_files:
                remainder = torch.distributed.get_world_size() % num_files
                data_file = files[(f_start_id * torch.distributed.get_world_size() + torch.distributed.get_rank() +
                                   remainder * f_start_id) % num_files]
            else:
                data_file = files[(f_start_id * torch.distributed.get_world_size() + torch.distributed.get_rank()) % num_files]

            previous_file = data_file

            train_data = pretraining_dataset(data_file, args.max_predictions_per_seq)
            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                          batch_size=args.train_batch_size * args.n_gpu,
                                          num_workers=4, worker_init_fn=worker_init,
                                          pin_memory=True)
            # shared_file_list["0"] = (train_dataloader, data_file)

            overflow_buf = None
            if args.allreduce_post_accumulation:
                overflow_buf = torch.cuda.IntTensor([0])

            if len(files) == 1:
                f_start_id = -1
            for f_id in range(f_start_id + 1, len(files)):
                if torch.distributed.get_world_size() > num_files:
                    data_file = files[(f_id * torch.distributed.get_world_size() + torch.distributed.get_rank() +
                                       remainder * f_id) % num_files]
                else:
                    data_file = files[(f_id * torch.distributed.get_world_size() + torch.distributed.get_rank()) % num_files]

                previous_file = data_file

                dataset_future = pool.submit(create_pretraining_dataset, data_file,
                                             args.max_predictions_per_seq, shared_file_list,
                                             args, worker_init)

                train_iter = tqdm(train_dataloader, desc="Iteration",
                                  disable=args.disable_progress_bar) if is_main_process() else train_dataloader
                for step, batch in enumerate(train_iter):
                    if global_step >= 500:
                        batch_start_time = time.time()
                    # profile the file if it has at least 200 batches
                    if args.profile and len(train_dataloader.dataset) > 200 and step == 100:
                        print("Profiling the kernel for 100 iterations")
                        cuda.profile_start()
                    if args.profile and len(train_dataloader.dataset) > 200 and step == 200:
                        cuda.profile_stop()
                        print("Profiling complete, exiting")
                        exit()

                    training_steps += 1
                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
                    prediction_scores, seq_relationship_score = model(input_ids=input_ids,
                                                                      token_type_ids=segment_ids,
                                                                      attention_mask=input_mask)
                    loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels)
                    if args.n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.

                    divisor = args.gradient_accumulation_steps
                    if args.gradient_accumulation_steps > 1:
                        if not args.allreduce_post_accumulation:
                            # this division was merged into predivision
                            loss = loss / args.gradient_accumulation_steps
                            divisor = 1.0
                    if args.fp16:
                        with amp.scale_loss(loss, optimizer, delay_overflow_check=args.allreduce_post_accumulation) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                    average_loss += loss.item()

                    if training_steps % args.gradient_accumulation_steps == 0:
                        lr_scheduler.step()  # learning rate warmup
                        global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)

                    if global_step >= args.max_steps:
                        train_time_raw = time.time() - raw_train_start
                        last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq
                        last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
                        average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda()
                        average_loss = average_loss / (last_num_steps * divisor)
                        if (torch.distributed.is_initialized()):
                            average_loss /= torch.distributed.get_world_size()
                            torch.distributed.all_reduce(average_loss)
                        final_loss = average_loss.item()
                        if is_main_process():
                            dllogger.log(step=(epoch, global_step, ), data={"final_loss": final_loss})
                    elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
                        if is_main_process():
                            dllogger.log(step=(epoch, global_step, ),
                                         data={"average_loss": average_loss / (args.log_freq * divisor),
                                               "step_loss": loss.item() * args.gradient_accumulation_steps / divisor,
                                               "learning_rate": optimizer.param_groups[0]['lr']})
                        average_loss = 0

                    if global_step >= args.max_steps or training_steps % (
                            args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0:
                        if is_main_process() and not args.skip_checkpoint:
                            # Save a trained model
                            dllogger.log(step="PARAMETER", data={"checkpoint_step": global_step})
                            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
                            if args.resume_step < 0 or not args.phase2:
                                output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step))
                            else:
                                output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step + args.phase1_end_step))
                            if args.do_train:
                                torch.save({'model': model_to_save.state_dict(),
                                            'optimizer': optimizer.state_dict(),
                                            'master params': list(amp.master_params(optimizer)),
                                            'files': [f_id] + files}, output_save_file)

                                most_recent_ckpts_paths.append(output_save_file)
                                if len(most_recent_ckpts_paths) > 3:
                                    ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
                                    os.remove(ckpt_to_be_removed)

                        if global_step >= args.max_steps:
                            del train_dataloader
                            # thread.join()
                            return args, final_loss, train_time_raw

                    # give some warmup period and record throughput
                    if global_step > 500:
                        batch_duration = time.time() - batch_start_time
                        running_total += args.train_batch_size * args.max_seq_length
                        running_count += batch_duration

                del train_dataloader
                print("Running throughput average:", running_total / running_count)
                # thread.join()
                # Make sure pool has finished and switch train_dataloader
                # NOTE: Will block until complete
                train_dataloader, data_file = dataset_future.result(timeout=None)

            epoch += 1
def train():
    # kvstore
    kv = mx.kvstore.create(args.kv_store)

    model_prefix = args.model_prefix
    if model_prefix is not None:
        model_prefix += "-%d" % (kv.rank)
    save_model_prefix = args.save_model_prefix
    if save_model_prefix is None:
        save_model_prefix = model_prefix

    log_config(args.log_dir, args.log_file, save_model_prefix, kv.rank)

    devs = mx.cpu() if args.gpus is None else [
        mx.gpu(int(i)) for i in args.gpus.split(',')
    ]

    epoch_size = args.num_examples / args.batch_size

    if args.kv_store == 'dist_sync':
        epoch_size /= kv.num_workers

    # disable kvstore for single device
    if 'local' in kv.type and (args.gpus is None or len(args.gpus.split(',')) is 1):
        kv = None

    # module
    dataiter = rl_data.GymDataIter('Breakout-v0', args.batch_size, args.input_length, web_viz=True)
    net = sym.get_symbol_atari(dataiter.act_dim)
    module = mx.mod.Module(net,
                           data_names=[d[0] for d in dataiter.provide_data],
                           label_names=('policy_label', 'value_label'),
                           context=devs)
    module.bind(data_shapes=dataiter.provide_data,
                label_shapes=[('policy_label', (args.batch_size, )),
                              ('value_label', (args.batch_size, 1))],
                grad_req='add')

    # load model
    if args.load_epoch is not None:
        assert model_prefix is not None
        _, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, args.load_epoch)
    else:
        arg_params = aux_params = None

    # save model
    checkpoint = None if save_model_prefix is None else mx.callback.do_checkpoint(save_model_prefix)

    init = mx.init.Mixed(['fc_value_weight|fc_policy_weight', '.*'], [
        mx.init.Uniform(0.001),
        mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2)
    ])
    module.init_params(initializer=init, arg_params=arg_params, aux_params=aux_params)

    # optimizer
    module.init_optimizer(kvstore=kv, optimizer='adam',
                          optimizer_params={
                              'learning_rate': args.lr,
                              'wd': args.wd,
                              'epsilon': 1e-3
                          })

    # logging
    np.set_printoptions(precision=3, suppress=True)

    T = 0
    dataiter.reset()
    score = np.zeros((args.batch_size, 1))
    final_score = np.zeros((args.batch_size, 1))
    iteration = 0

    for epoch in range(args.num_epochs):
        if save_model_prefix:
            module.save_params('%s-%04d.params' % (save_model_prefix, epoch))

        for _ in range(int(epoch_size / args.t_max)):
            # <EcoSys> Added this profiling check.
            if iteration == args.profile_start:
                print("Profile start.")
                cuda.profile_start()
            elif iteration == args.profile_stop:
                print("Calling profile_stop().")
                cuda.profile_stop()
                print("Done calling profile_stop().")
            # </EcoSys>
            tic = time.time()
            # clear gradients
            for exe in module._exec_group.grad_arrays:
                for g in exe:
                    g[:] = 0

            S, A, V, r, D = [], [], [], [], []
            for t in range(args.t_max + 1):
                data = dataiter.data()
                module.forward(mx.io.DataBatch(data=data, label=None), is_train=False)
                act, _, val = module.get_outputs()
                V.append(val.asnumpy())
                if t < args.t_max:
                    act = act.asnumpy()
                    act = [
                        np.random.choice(dataiter.act_dim, p=act[i])
                        for i in range(act.shape[0])
                    ]
                    reward, done = dataiter.act(act)
                    S.append(data)
                    A.append(act)
                    r.append(reward.reshape((-1, 1)))
                    D.append(done.reshape((-1, 1)))

            err = 0
            R = V[args.t_max]
            for i in reversed(range(args.t_max)):
                R = r[i] + args.gamma * (1 - D[i]) * R
                adv = np.tile(R - V[i], (1, dataiter.act_dim))

                batch = mx.io.DataBatch(data=S[i], label=[mx.nd.array(A[i]), mx.nd.array(R)])
                module.forward(batch, is_train=True)

                pi = module.get_outputs()[1]
                h = -args.beta * (mx.nd.log(pi + 1e-7) * pi)
                out_acts = np.amax(pi.asnumpy(), 1)
                out_acts = np.reshape(out_acts, (-1, 1))
                out_acts_tile = np.tile(-np.log(out_acts + 1e-7), (1, dataiter.act_dim))
                module.backward([mx.nd.array(out_acts_tile * adv), h])

                print('pi', pi[0].asnumpy())
                print('h', h[0].asnumpy())
                err += (adv**2).mean()
                score += r[i]
                final_score *= (1 - D[i])
                final_score += score * D[i]
                score *= 1 - D[i]
                T += D[i].sum()

            module.update()
            logging.info('fps: %f err: %f score: %f final: %f T: %f' %
                         (args.batch_size / (time.time() - tic), err / args.t_max,
                          score.mean(), final_score.mean(), T))
            print(score.squeeze())
            print(final_score.squeeze())
            iteration += 1
def train_step(sess, train_op, global_step, train_step_kwargs):
    """Function that takes a gradient step and specifies whether to stop.

    Args:
      sess: The current session.
      train_op: An `Operation` that evaluates the gradients and returns the total loss.
      global_step: A `Tensor` representing the global training step.
      train_step_kwargs: A dictionary of keyword arguments.

    Returns:
      The total loss and a boolean indicating whether or not to stop training.

    Raises:
      ValueError: if 'should_trace' is in `train_step_kwargs` but `logdir` is not.
    """
    start_time = time.time()

    trace_run_options = None
    run_metadata = None
    if 'should_trace' in train_step_kwargs:
        if 'logdir' not in train_step_kwargs:
            raise ValueError('logdir must be present in train_step_kwargs when '
                             'should_trace is present')
        if sess.run(train_step_kwargs['should_trace']):
            trace_run_options = config_pb2.RunOptions(
                trace_level=config_pb2.RunOptions.FULL_TRACE)
            run_metadata = config_pb2.RunMetadata()

    total_loss, np_global_step = sess.run([train_op, global_step],
                                          options=trace_run_options,
                                          run_metadata=run_metadata)
    time_elapsed = time.time() - start_time

    if 'nvprof_on' in train_step_kwargs:
        import numba.cuda as cuda
        if np_global_step == train_step_kwargs['nvprof_start_step']:
            cuda.profile_start()
        if np_global_step == train_step_kwargs['nvprof_stop_step']:
            cuda.profile_stop()

    if run_metadata is not None:
        tl = timeline.Timeline(run_metadata.step_stats)
        trace = tl.generate_chrome_trace_format()
        trace_filename = os.path.join(train_step_kwargs['logdir'],
                                      'tf_trace-%d.json' % np_global_step)
        logging.info('Writing trace to %s', trace_filename)
        file_io.write_string_to_file(trace_filename, trace)
        if 'summary_writer' in train_step_kwargs:
            train_step_kwargs['summary_writer'].add_run_metadata(
                run_metadata, 'run_metadata-%d' % np_global_step)

    if 'should_log' in train_step_kwargs:
        if sess.run(train_step_kwargs['should_log']):
            logging.info('global step %d: loss = %.4f (%.3f sec/step)',
                         np_global_step, total_loss, time_elapsed)

    # TODO(nsilberman): figure out why we can't put this into sess.run. The
    # issue right now is that the stop check depends on the global step. The
    # increment of global step often happens via the train op, which is
    # created using optimizer.apply_gradients.
    #
    # Since running `train_op` causes the global step to be incremented, one
    # would expect that using a control dependency would allow the
    # should_stop check to be run in the same session.run call:
    #
    #   with ops.control_dependencies([train_op]):
    #     should_stop_op = ...
    #
    # However, this actually seems not to work on certain platforms.
    if 'should_stop' in train_step_kwargs:
        should_stop = sess.run(train_step_kwargs['should_stop'])
    else:
        should_stop = False

    return total_loss, should_stop
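# Editor's sketch (not from the original source): the extra keys the modified
# train_step above expects in `train_step_kwargs`. The step values are illustrative;
# the rest of the slim training setup is assumed to be configured elsewhere.
train_step_kwargs = {
    'nvprof_on': True,           # presence of this key enables the numba.cuda hooks above
    'nvprof_start_step': 500,    # global step at which cuda.profile_start() fires
    'nvprof_stop_step': 600,     # global step at which cuda.profile_stop() fires
}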
def train_proc(conv_wid, conv_wn, fc_wid, fc_wn, wid, wn, pred_wid, succ_wid, bs, subbs, pd, input_shp, output_shp, sub_net, fp_head_list, fp_tail_list, bp_head_list, bp_tail_list, shared_cnters, train_step, global_step, sta_lidx, end_lidx): pid = os.getpid() print("train_proc pid=", pid) device = 'cuda' if torch.cuda.is_available() else 'cpu' optimizer = optim.SGD(sub_net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) iter_thresh = int(bs / subbs) fp_iter = 0 bp_iter = 0 inputs = None outputs = None if sta_lidx == 0: fake_input = torch.randn(input_shp) print(fake_input.size()) if end_lidx == -1: fake_target = torch.from_numpy( np.random.randint(0, 999, size=int(subbs / pd))) criterion = nn.CrossEntropyLoss() print(fake_target.size()) qu = Queue.Queue() local_step = 0 sta = time.time() prof_on = False #with torch.autograd.profiler.emit_nvtx(): while True: if local_step == 5 and prof_on == False: cuda.profile_start() prof_on = True print("Prof Start") if local_step == 10 and prof_on == True: cuda.profile_stop() prof_on = False print("Prof Stop") if not (local_step == global_step): time.sleep(0.001) continue if wid == 0 or wid == 1: #先查BP 再 FP #fp_head_tensor_list, fp_tail_tensor_list, bp_head_tensor_list, bp_tail_tensor_list if bp_iter < fp_iter: if bp_iter < shared_cnters[3]: backward_ctx = bp_tail_list[bp_iter].cuda() outputs = qu.get() outputs.backward(backward_ctx) bp_iter += 1 #print(wid," ", rank, " bp complete ", fp_iter, " ", bp_iter) if bp_iter == iter_thresh: #bp_to_recv has reached bs, then it is time to update grad and reset cnter optimizer.step() optimizer.zero_grad() train_step += 1 fp_iter = 0 bp_iter = 0 shared_cnters[3].zero_() local_step += 1 #print(wid, " ", sync_iter) #FP has not reached the threshold and can be executed if fp_iter < shared_cnters[0]: inputs = fake_input.cuda() outputs = sub_net(inputs) fp_tail_list[fp_iter].copy_(outputs) qu.put(outputs) shared_cnters[1] += 1 fp_iter += 1 #print(wid, " fp complete ", fp_iter, " ", bp_iter) elif wid == wn - 1: #print("last worker") #FP has not reached the threshold and can be executed if fp_iter < shared_cnters[0]: fp_head_list[fp_iter].requires_grad = True inputs = fp_head_list[fp_iter].cuda() outputs = sub_net(inputs) #shared_cnters[1] += 1 fp_iter += 1 target = fake_target.cuda() loss = criterion(outputs, target) loss.backward() #print(HookFunc.hook_dict) #time.sleep(5) #bp_ctx = HookFunc.hook_dict[pid] if HookFunc.hook_dict[pid] is not None: #should be forked bp_head_list[bp_iter].copy_(HookFunc.hook_dict[pid]) HookFunc.hook_dict[pid] = None shared_cnters[2] += 1 else: print("Err") exit(-1) bp_iter += 1 if bp_iter == iter_thresh: #bp_to_recv has reached bs, then it is time to update grad and reset cnter optimizer.step() optimizer.zero_grad() train_step += 1 global_step += 1 fp_iter = 0 bp_iter = 0 shared_cnters[0].zero_() local_step += 1 #print("wid={:d} global_step={:d}".format( int(wid), int(global_step) )) else: #middle #print("ff ", fp_iter, " ", shared_cnters[0], " ", bp_iter) if bp_iter < fp_iter: #print("Pre fp vs bp ", fp_iter, " ", bp_iter) if bp_iter < shared_cnters[3]: backward_ctx = bp_tail_list[bp_iter].cuda() outputs = qu.get() outputs.backward(backward_ctx) #bp_ctx = HookFunc.hook_dict[pid] #exec('bp_ctx = HookFunc_{}.hook_dict["backward_ctx"]'.format(rank)) if HookFunc.hook_dict[pid] is not None: #should be forked bp_head_list[bp_iter].copy_(HookFunc.hook_dict[pid]) #exec('HookFunc_{}.hook_dict["backward_ctx"]=None'.format(rank)) HookFunc.hook_dict[pid] = None shared_cnters[2] += 1 else: 
print("Err") exit(-1) bp_iter += 1 #print("fp vs bp ", fp_iter, " ", bp_iter) if bp_iter == iter_thresh: #bp_to_recv has reached bs, then it is time to update grad and reset cnter optimizer.step() optimizer.zero_grad() train_step += 1 global_step += 1 fp_iter = 0 bp_iter = 0 shared_cnters[0].zero_() shared_cnters[3].zero_() local_step += 1 #print("wid={:d} global_step={:d}".format(int(wid), int(global_step))) #FP has not reached the threshold and can be executed #print("ff ", fp_iter, " ", shared_cnters[0]) if fp_iter < shared_cnters[0]: fp_head_list[fp_iter].requires_grad = True inputs = fp_head_list[fp_iter].cuda() outputs = sub_net(inputs) qu.put(outputs) #print("debug: ", outputs.size(), output_shp) fp_tail_list[fp_iter].copy_(outputs) shared_cnters[1] += 1 fp_iter += 1
def train(hparams, scope=None, target_session="", single_cell_fn=None): """Train a translation model.""" log_device_placement = hparams.log_device_placement out_dir = hparams.out_dir num_train_steps = hparams.num_train_steps steps_per_stats = hparams.steps_per_stats steps_per_external_eval = hparams.steps_per_external_eval steps_per_eval = 10 * steps_per_stats if not steps_per_external_eval: steps_per_external_eval = 5 * steps_per_eval if not hparams.attention: model_creator = nmt_model.Model elif hparams.attention_architecture == "standard": model_creator = attention_model.AttentionModel elif hparams.attention_architecture in ["gnmt", "gnmt_v2"]: model_creator = gnmt_model.GNMTModel else: raise ValueError("Unknown model architecture") train_model = create_train_model(model_creator, hparams, scope, single_cell_fn) eval_model = create_eval_model(model_creator, hparams, scope, single_cell_fn) infer_model = inference.create_infer_model(model_creator, hparams, scope, single_cell_fn) # Preload data for sample decoding. dev_src_file = "%s.%s" % (hparams.dev_prefix, hparams.src) dev_tgt_file = "%s.%s" % (hparams.dev_prefix, hparams.tgt) sample_src_data = inference.load_data(dev_src_file) sample_tgt_data = inference.load_data(dev_tgt_file) summary_name = "train_log" model_dir = hparams.out_dir # Log and output files log_file = os.path.join(out_dir, "log_%d" % time.time()) log_f = tf.gfile.GFile(log_file, mode="a") utils.print_out("# log_file=%s" % log_file, log_f) avg_step_time = 0.0 # TensorFlow model config_proto = utils.get_config_proto( log_device_placement=log_device_placement) train_sess = tf.Session(target=target_session, config=config_proto, graph=train_model.graph) eval_sess = tf.Session(target=target_session, config=config_proto, graph=eval_model.graph) infer_sess = tf.Session(target=target_session, config=config_proto, graph=infer_model.graph) with train_model.graph.as_default(): loaded_train_model, global_step = model_helper.create_or_load_model( train_model.model, model_dir, train_sess, "train") # Summary writer summary_writer = tf.summary.FileWriter(os.path.join(out_dir, summary_name), train_model.graph) # First evaluation run_full_eval(model_dir, infer_model, infer_sess, eval_model, eval_sess, hparams, summary_writer, sample_src_data, sample_tgt_data) last_stats_step = global_step last_eval_step = global_step last_external_eval_step = global_step # This is the training loop. step_time, checkpoint_loss, checkpoint_predict_count = 0.0, 0.0, 0.0 # <EcoSys> Added the measurements on total number of samples. checkpoint_total_count, checkpoint_total_samples = 0.0, 0.0 # checkpoint_total_count = 0.0 # </EcoSys> speed, train_ppl = 0.0, 0.0 start_train_time = time.time() utils.print_out( "# Start step %d, lr %g, %s" % (global_step, loaded_train_model.learning_rate.eval( session=train_sess), time.ctime()), log_f) # Initialize all of the iterators skip_count = hparams.batch_size * hparams.epoch_step utils.print_out("# Init train iterator, skipping %d elements" % skip_count) train_sess.run(train_model.iterator.initializer, feed_dict={train_model.skip_count_placeholder: skip_count}) while global_step < num_train_steps: # <EcoSys> Added the profiler start and end point. 
import numba.cuda as cuda if global_step == 501: cuda.profile_start() if global_step == 511: cuda.profile_stop() # </EcoSys> ### Run a step ### start_time = time.time() try: step_result = loaded_train_model.train(train_sess) (_, step_loss, step_predict_count, step_summary, global_step, step_word_count, batch_size) = step_result hparams.epoch_step += 1 except tf.errors.OutOfRangeError: # Finished going through the training dataset. Go to next epoch. hparams.epoch_step = 0 utils.print_out( "# Finished an epoch, step %d. Perform external evaluation" % global_step) run_sample_decode(infer_model, infer_sess, model_dir, hparams, summary_writer, sample_src_data, sample_tgt_data) dev_scores, test_scores, _ = run_external_eval( infer_model, infer_sess, model_dir, hparams, summary_writer) train_sess.run(train_model.iterator.initializer, feed_dict={train_model.skip_count_placeholder: 0}) continue # Write step summary. summary_writer.add_summary(step_summary, global_step) # update statistics step_time += (time.time() - start_time) checkpoint_loss += (step_loss * batch_size) checkpoint_predict_count += step_predict_count checkpoint_total_count += float(step_word_count) # <EcoSys> Increase the total number of samples by batch size. checkpoint_total_samples += float(batch_size) # </EcoSys> # Once in a while, we print statistics. if global_step - last_stats_step >= steps_per_stats: last_stats_step = global_step # Print statistics for the previous epoch. avg_step_time = step_time / steps_per_stats train_ppl = utils.safe_exp(checkpoint_loss / checkpoint_predict_count) speed = checkpoint_total_count / (1000 * step_time) # <EcoSys> Added samples per second to the log file. speed_samples_per_sec = checkpoint_total_samples / (step_time) utils.print_out( " global step %d lr %g " "step-time %.2fs wps %.2fK sps %5.2f ppl %.2f %s" % (global_step, loaded_train_model.learning_rate.eval(session=train_sess), avg_step_time, speed, speed_samples_per_sec, train_ppl, _get_best_results(hparams)), log_f) # </EcoSys> """ utils.print_out( " global step %d lr %g " "step-time %.2fs wps %.2fK ppl %.2f %s" % (global_step, loaded_train_model.learning_rate.eval(session=train_sess), avg_step_time, speed, train_ppl, _get_best_results(hparams)), log_f) """ if math.isnan(train_ppl): break # Reset timer and loss. 
step_time, checkpoint_loss, checkpoint_predict_count = 0.0, 0.0, 0.0 checkpoint_total_count = 0.0 checkpoint_total_samples = 0.0 if global_step - last_eval_step >= steps_per_eval: last_eval_step = global_step utils.print_out("# Save eval, global step %d" % global_step) utils.add_summary(summary_writer, global_step, "train_ppl", train_ppl) # Save checkpoint loaded_train_model.saver.save(train_sess, os.path.join(out_dir, "translate.ckpt"), global_step=global_step) # Evaluate on dev/test run_sample_decode(infer_model, infer_sess, model_dir, hparams, summary_writer, sample_src_data, sample_tgt_data) dev_ppl, test_ppl = run_internal_eval(eval_model, eval_sess, model_dir, hparams, summary_writer) if global_step - last_external_eval_step >= steps_per_external_eval: last_external_eval_step = global_step # Save checkpoint loaded_train_model.saver.save(train_sess, os.path.join(out_dir, "translate.ckpt"), global_step=global_step) run_sample_decode(infer_model, infer_sess, model_dir, hparams, summary_writer, sample_src_data, sample_tgt_data) dev_scores, test_scores, _ = run_external_eval( infer_model, infer_sess, model_dir, hparams, summary_writer) # Done training loaded_train_model.saver.save(train_sess, os.path.join(out_dir, "translate.ckpt"), global_step=global_step) result_summary, _, dev_scores, test_scores, dev_ppl, test_ppl = run_full_eval( model_dir, infer_model, infer_sess, eval_model, eval_sess, hparams, summary_writer, sample_src_data, sample_tgt_data) utils.print_out( "# Final, step %d lr %g " "step-time %.2f wps %.2fK ppl %.2f, %s, %s" % (global_step, loaded_train_model.learning_rate.eval(session=train_sess), avg_step_time, speed, train_ppl, result_summary, time.ctime()), log_f) utils.print_time("# Done training!", start_train_time) utils.print_out("# Start evaluating saved best models.") for metric in hparams.metrics: best_model_dir = getattr(hparams, "best_" + metric + "_dir") result_summary, best_global_step, _, _, _, _ = run_full_eval( best_model_dir, infer_model, infer_sess, eval_model, eval_sess, hparams, summary_writer, sample_src_data, sample_tgt_data) utils.print_out( "# Best %s, step %d " "step-time %.2f wps %.2fK, %s, %s" % (metric, best_global_step, avg_step_time, speed, result_summary, time.ctime()), log_f) summary_writer.close() return (dev_scores, test_scores, dev_ppl, test_ppl, global_step)
#!/usr/bin/env python

from numba import cuda, float32

cuda.profile_start()

# Controls threads per block and shared memory usage.
# The computation will be done on blocks of TPBxTPB elements.
TPB = 16


@cuda.jit
def fast_matmul(A, B, C):
    # Define an array in the shared memory
    # The size and type of the arrays must be known at compile time
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)

    x, y = cuda.grid(2)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bpg = cuda.gridDim.x    # blocks per grid

    if x >= C.shape[0] and y >= C.shape[1]:
        # Quit if (x, y) is outside of valid C boundary
        return

    # Each thread computes one element in the result matrix.
    # The dot product is chunked into dot products of TPB-long vectors.
    tmp = 0.
    for i in range(bpg):
def gpu_merge(points, err, numobj):
    cuda.profile_start()
    start = time.time()
    # reshape points for gpu
    centre = np.zeros((len(points), 4), dtype='float32')
    related = np.zeros((len(points), len(points)), dtype='int32')
    sources = np.zeros((len(points), numobj, 3), dtype='float32')
    e = 0       # global error
    a_e = []
    # populate arrays for gpu
    for i, p in enumerate(points):
        centre[i, 0] = p[0]
        centre[i, 1] = p[1]
        centre[i, 2] = p[2]
        centre[i, 3] = p[8]
        e += p[8]
        for j, r in enumerate(p[6]):
            if r[0][6] == True:
                for k in xrange(len(points)):
                    if (points[k][0] == r[1][0]) and (points[k][1] == r[1][1]):
                        related[i, j] = k
                        break
            else:
                related[i, j] = -1
        for j in range(len(p[6]), len(points) / 2 + 1):
            related[i, j] = -2
        for j, s in enumerate(p[7]):
            sources[i, j, 0] = s[0]
            sources[i, j, 1] = s[1]
            sources[i, j, 2] = s[2]
    end = time.time()
    print 'reshape time: {0}'.format(end - start)
    start = time.time()
    # transfer arrays to gpu
    d_results = cuda.device_array((len(points), 7), np.float32)
    d_centre = cuda.to_device(centre)
    d_related = cuda.to_device(related)
    d_sources = cuda.to_device(sources)
    end = time.time()
    print 'transfer time: {0}'.format(end - start)
    p = len(points)
    # get grid and block sizes
    b = 32
    g = len(points) / b + 1
    start = time.time()
    while (True):
        # call kernel
        d_get_best[g, b](d_centre, p, d_results, d_related, d_sources, err, numobj)
        results = d_results.copy_to_host()
        a_e.append(e)
        best = np.array([0, 0, 0, 0, 0, 0, err])
        for r in range(results.shape[0]):
            if results[r, 6] < best[6]:
                for q in range(7):
                    best[q] = results[r, q]
        if best[6] + e > err:
            print "Merge criteria met"
            print "Final Error: {0}".format(e)
            break
        else:
            e += best[6]
            h_do_merge(best, points)
            d_best = cuda.to_device(best)
            d_do_merge[g, b](d_best, d_centre, d_related, d_sources, p, numobj, err)
    end = time.time()
    print 'compute time: {0}'.format(end - start)
    cuda.profile_stop()
    return (a_e)