def train(args, trainer, task, epoch_itr):
    """Train the model for one epoch."""
    # Update parameters every N batches
    update_freq = args.update_freq[epoch_itr.epoch - 1] \
        if epoch_itr.epoch <= len(args.update_freq) else args.update_freq[-1]

    # Initialize data iterator
    itr = epoch_itr.next_epoch_itr(
        fix_batches_to_gpus=args.fix_batches_to_gpus,
        shuffle=(epoch_itr.epoch >= args.curriculum),
    )
    itr = iterators.GroupedIterator(itr, update_freq)
    progress = progress_bar.build_progress_bar(
        args, itr, epoch_itr.epoch,
    )

    extra_meters = collections.defaultdict(lambda: AverageMeter())
    valid_subsets = args.valid_subset.split(',')
    max_update = args.max_update or math.inf

    represent_configs = utils.get_represent_configs(args)

    for i, samples in enumerate(progress, start=epoch_itr.iterations_in_epoch):
        if args.train_subtransformer:
            # training one SubTransformer only
            configs = [utils.get_subtransformer_config(args)]
        else:
            # training SuperTransformer by randomly sampling SubTransformers
            configs = [utils.sample_configs(
                utils.get_all_choices(args),
                reset_rand_seed=True,
                rand_seed=trainer.get_num_updates(),
                super_decoder_num_layer=args.decoder_layers,
            )]

        log_output = trainer.train_step(samples, configs=configs)
        if log_output is None:
            continue

        # log mid-epoch stats
        stats = utils.get_training_stats(trainer)
        for k, v in log_output.items():
            if k in ['loss', 'nll_loss', 'ntokens', 'nsentences', 'sample_size']:
                continue  # these are already logged above
            if 'loss' in k or k == 'accuracy':
                extra_meters[k].update(v, log_output['sample_size'])
            else:
                extra_meters[k].update(v)
            stats[k] = extra_meters[k].avg

        utils.log_arch_info(stats, configs[0])

        progress.log(stats, tag='train', step=stats['num_updates'])

        # ignore the first mini-batch in words-per-second calculation
        if i == 0:
            trainer.get_meter('wps').reset()

        num_updates = trainer.get_num_updates()
        if (not args.disable_validation
                and args.save_interval_updates > 0
                and num_updates % args.save_interval_updates == 0
                and num_updates > 0):
            for k, v in represent_configs.items():
                trainer.set_sample_config(config=v)
                valid_losses = validate(args, trainer, task, epoch_itr,
                                        valid_subsets, sampled_arch_name=k)
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        if num_updates >= max_update:
            break

    # log end-of-epoch stats
    stats = utils.get_training_stats(trainer)
    for k, meter in extra_meters.items():
        stats[k] = meter.avg
    progress.print(stats, tag='train', step=stats['num_updates'])

    # reset training meters
    for k in [
        'train_loss', 'train_nll_loss', 'wps', 'ups', 'wpb', 'bsz', 'gnorm', 'clip',
    ]:
        meter = trainer.get_meter(k)
        if meter is not None:
            meter.reset()
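
# Note: `update_freq` above is plain gradient accumulation -- GroupedIterator hands
# `trainer.train_step` a group of `update_freq` mini-batches and one parameter update
# is made per group. A minimal, self-contained sketch of that idea follows; the
# model/optimizer/criterion names and the (inputs, targets) sample layout are
# hypothetical, not the fairseq trainer used in train() above.
def accumulate_and_step(model, optimizer, criterion, sample_group):
    """Illustrative only: one optimizer step over a group of mini-batches."""
    optimizer.zero_grad()
    for inputs, targets in sample_group:            # e.g. one group yielded by GroupedIterator
        loss = criterion(model(inputs), targets) / len(sample_group)
        loss.backward()                             # gradients accumulate across the group
    optimizer.step()                                # a single update for the whole group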
def main(args):
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Print args
    print(args)

    # Setup task
    task = tasks.setup_task(args)

    # Build model
    model = task.build_model(args)
    print(model)

    # specify the length of the dummy input for profile
    # for iwslt, the average length is 23, for wmt, that is 30
    dummy_sentence_length_dict = {'iwslt': 23, 'wmt': 30}
    if 'iwslt' in args.arch:
        dummy_sentence_length = dummy_sentence_length_dict['iwslt']
    elif 'wmt' in args.arch:
        dummy_sentence_length = dummy_sentence_length_dict['wmt']
    else:
        raise NotImplementedError

    dummy_src_tokens = [2] + [7] * (dummy_sentence_length - 1)
    dummy_prev = [7] * (dummy_sentence_length - 1) + [2]

    # for latency predictor: latency dataset generation
    with open(args.lat_dataset_path, 'w') as fid:
        src_tokens_test = torch.tensor([dummy_src_tokens], dtype=torch.long)
        src_lengths_test = torch.tensor([dummy_sentence_length])
        prev_output_tokens_test_with_beam = torch.tensor([dummy_prev] * args.beam, dtype=torch.long)

        if args.latcpu:
            model.cpu()
            print('Measuring model latency on CPU for dataset generation...')
        elif args.latgpu:
            model.cuda()
            src_tokens_test = src_tokens_test.cuda()
            src_lengths_test = src_lengths_test.cuda()
            prev_output_tokens_test_with_beam = prev_output_tokens_test_with_beam.cuda()
            src_tokens_test.get_device()
            print('Measuring model latency on GPU for dataset generation...')
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)

        feature_info = utils.get_feature_info()
        fid.write(','.join(feature_info) + ',')
        latency_info = ['latency_mean_encoder', 'latency_mean_decoder',
                        'latency_std_encoder', 'latency_std_decoder']
        fid.write(','.join(latency_info) + '\n')

        for i in range(args.lat_dataset_size):
            print(i)
            config_sam = utils.sample_configs(utils.get_all_choices(args),
                                              reset_rand_seed=False,
                                              super_decoder_num_layer=args.decoder_layers)

            features = utils.get_config_features(config_sam)
            fid.write(','.join(map(str, features)) + ',')

            model.set_sample_config(config_sam)

            # dry runs
            for _ in range(5):
                encoder_out_test = model.encoder(src_tokens=src_tokens_test, src_lengths=src_lengths_test)

            encoder_latencies = []
            print('Measuring encoder for dataset generation...')
            for _ in tqdm(range(args.latiter)):
                if args.latgpu:
                    start.record()
                elif args.latcpu:
                    start = time.time()

                model.encoder(src_tokens=src_tokens_test, src_lengths=src_lengths_test)

                if args.latgpu:
                    end.record()
                    torch.cuda.synchronize()
                    encoder_latencies.append(start.elapsed_time(end))
                    if not args.latsilent:
                        print('Encoder one run on GPU (for dataset generation): ', start.elapsed_time(end))
                elif args.latcpu:
                    end = time.time()
                    encoder_latencies.append((end - start) * 1000)
                    if not args.latsilent:
                        print('Encoder one run on CPU (for dataset generation): ', (end - start) * 1000)

            # only use the 10% to 90% latencies to avoid outliers
            encoder_latencies.sort()
            encoder_latencies = encoder_latencies[int(args.latiter * 0.1): -max(1, int(args.latiter * 0.1))]
            print(f'Encoder latency for dataset generation: Mean: {np.mean(encoder_latencies)} ms; \t Std: {np.std(encoder_latencies)} ms')

            bsz = 1
            new_order = torch.arange(bsz).view(-1, 1).repeat(1, args.beam).view(-1).long()
            if args.latgpu:
                new_order = new_order.cuda()
            encoder_out_test_with_beam = model.encoder.reorder_encoder_out(encoder_out_test, new_order)

            # dry runs
            for _ in range(5):
                model.decoder(prev_output_tokens=prev_output_tokens_test_with_beam,
                              encoder_out=encoder_out_test_with_beam)

            # decoder is more complicated because we need to deal with incremental states and auto regressive things
            decoder_iterations_dict = {'iwslt': 23, 'wmt': 30}
            if 'iwslt' in args.arch:
                decoder_iterations = decoder_iterations_dict['iwslt']
            elif 'wmt' in args.arch:
                decoder_iterations = decoder_iterations_dict['wmt']

            decoder_latencies = []
            print('Measuring decoder for dataset generation...')
            for _ in tqdm(range(args.latiter)):
                if args.latgpu:
                    start.record()
                elif args.latcpu:
                    start = time.time()

                incre_states = {}
                for k_regressive in range(decoder_iterations):
                    model.decoder(prev_output_tokens=prev_output_tokens_test_with_beam[:, :k_regressive + 1],
                                  encoder_out=encoder_out_test_with_beam,
                                  incremental_state=incre_states)

                if args.latgpu:
                    end.record()
                    torch.cuda.synchronize()
                    decoder_latencies.append(start.elapsed_time(end))
                    if not args.latsilent:
                        print('Decoder one run on GPU (for dataset generation): ', start.elapsed_time(end))
                elif args.latcpu:
                    end = time.time()
                    decoder_latencies.append((end - start) * 1000)
                    if not args.latsilent:
                        print('Decoder one run on CPU (for dataset generation): ', (end - start) * 1000)

            # only use the 10% to 90% latencies to avoid outliers
            decoder_latencies.sort()
            decoder_latencies = decoder_latencies[int(args.latiter * 0.1): -max(1, int(args.latiter * 0.1))]

            print(decoder_latencies)
            print(f'Decoder latency for dataset generation: Mean: {np.mean(decoder_latencies)} ms; \t Std: {np.std(decoder_latencies)} ms')

            lats = [np.mean(encoder_latencies), np.mean(decoder_latencies),
                    np.std(encoder_latencies), np.std(decoder_latencies)]
            fid.write(','.join(map(str, lats)) + '\n')
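
# The CSV written above pairs SubTransformer features with measured encoder/decoder
# latencies and is meant to be consumed by a latency predictor. The sketch below shows
# one way such a predictor could be fit from the file; the column handling, the small
# MLP sizes, the training schedule, and the choice of summing encoder + decoder mean
# latencies into a single target are illustrative assumptions, not the project's
# actual predictor code.
import csv

import numpy as np
import torch
import torch.nn as nn


def fit_latency_predictor(csv_path, epochs=500, lr=1e-3):
    """Sketch: regress total (encoder + decoder) mean latency from SubTransformer features."""
    with open(csv_path) as f:
        rows = list(csv.reader(f))
    header, data = rows[0], np.array(rows[1:], dtype=np.float32)
    n_feat = len(header) - 4                        # last four columns are the latency stats
    x = torch.tensor(data[:, :n_feat])
    y = torch.tensor(data[:, n_feat] + data[:, n_feat + 1]).unsqueeze(1)

    predictor = nn.Sequential(nn.Linear(n_feat, 400), nn.ReLU(),
                              nn.Linear(400, 400), nn.ReLU(),
                              nn.Linear(400, 1))
    optimizer = torch.optim.Adam(predictor.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    for _ in range(epochs):
        optimizer.zero_grad()
        loss = loss_fn(predictor(x), y)
        loss.backward()
        optimizer.step()
    return predictor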
def main(args):
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    torch.manual_seed(args.seed)

    # Print args
    print(args)

    # Setup task
    task = tasks.setup_task(args)

    # Build model
    model = task.build_model(args)
    print(model)

    # specify the length of the dummy input for profile
    # for iwslt, the average length is 23, for wmt, that is 30
    dummy_sentence_length_dict = {'iwslt': 23, 'wmt': 30}
    if 'iwslt' in args.arch:
        dummy_sentence_length = dummy_sentence_length_dict['iwslt']
    elif 'wmt' in args.arch:
        dummy_sentence_length = dummy_sentence_length_dict['wmt']
    else:
        raise NotImplementedError

    dummy_src_tokens = [2] + [7] * (dummy_sentence_length - 1)
    dummy_prev = [7] * (dummy_sentence_length - 1) + [2]

    # for device predictor: device dataset generation
    # The temporary format is (SubTransformer, DeviceFeature) -> CacheMisses
    with open(args.lat_dataset_path, 'w') as fid:
        src_tokens_test = torch.tensor([dummy_src_tokens], dtype=torch.long)
        src_lengths_test = torch.tensor([dummy_sentence_length])
        prev_output_tokens_test_with_beam = torch.tensor([dummy_prev] * args.beam, dtype=torch.long)

        if args.latcpu:
            model.cpu()
            print('Measuring model cache misses on CPU for dataset generation...')
        elif args.latgpu:
            # cache-miss simulation is CPU-only; nothing to do on GPU
            return

        feature_info = utils.get_feature_info()
        fid.write(','.join(feature_info) + ',')
        device_feature_info = ['l1d_size', 'l1i_size', 'l2_size', 'l3_size', 'num_cores']  # cache sizes in KB
        fid.write(','.join(device_feature_info) + ',')
        misses_info = ['l1d_misses', 'l1i_misses', 'l2_misses', 'l3_misses']
        fid.write(','.join(misses_info) + '\n')

        device_feature_set = [
            [32, 32, 256, 4096, 1],
            [32, 32, 512, 4096, 1],
            [32, 32, 512, 8192, 1],
            [64, 64, 512, 8192, 1],
        ]
        num_devices = len(device_feature_set)

        for i in range(args.lat_dataset_size):
            print(i)
            config_sam = utils.sample_configs(utils.get_all_choices(args),
                                              reset_rand_seed=False,
                                              super_decoder_num_layer=args.decoder_layers)

            features = utils.get_config_features(config_sam)
            fid.write(','.join(map(str, features)) + ',')

            device_id = random.randint(0, num_devices - 1)  # randomly select a device
            device_features = device_feature_set[device_id]
            fid.write(','.join(map(str, device_features)) + ',')

            model.set_sample_config(config_sam)

            # dry runs to warm up; `dry_run_iters` is expected to be defined at module level
            for _ in range(dry_run_iters):
                encoder_out_test = model.encoder(src_tokens=src_tokens_test, src_lengths=src_lengths_test)

            # run encoder inference in a new python process under the cache simulator,
            # passing the model (features & device_features) via the helper script;
            # `dynamorio_dir` and `dev_conf_dir` are expected to be configured elsewhere
            arguments = [dynamorio_dir + '/bin64/drrun', '-t', 'drcachesim',
                         '-config_file', dev_conf_dir + '/dev' + str(device_id) + '.conf',
                         '--', 'python', '-u', 'encoder_infer.py', str(dummy_sentence_length)]
            print(arguments)
            enproc = subprocess.Popen(arguments, stdout=subprocess.PIPE)
            while True:
                line = enproc.stdout.readline()
                if not line:
                    break
                print('line: ' + line.decode('utf-8'))

            encoder_misses = []
            print('Measuring encoder for dataset generation...')
            # TODO: run model.encoder(src_tokens=src_tokens_test, src_lengths=src_lengths_test)
            # in the simulator and write the results into encoder_misses
            print(encoder_misses)

            bsz = 1
            new_order = torch.arange(bsz).view(-1, 1).repeat(1, args.beam).view(-1).long()
            encoder_out_test_with_beam = model.encoder.reorder_encoder_out(encoder_out_test, new_order)

            # dry runs to warm up the decoder
            for _ in range(dry_run_iters):
                model.decoder(prev_output_tokens=prev_output_tokens_test_with_beam,
                              encoder_out=encoder_out_test_with_beam)

            # decoder is more complicated because we need to deal with incremental states and auto-regressive decoding
            decoder_iterations_dict = {'iwslt': 23, 'wmt': 30}
            if 'iwslt' in args.arch:
                decoder_iterations = decoder_iterations_dict['iwslt']
            elif 'wmt' in args.arch:
                decoder_iterations = decoder_iterations_dict['wmt']

            decoder_misses = []
            print('Measuring decoder for dataset generation...')
            # TODO: run the commands below in the simulator and write the results into decoder_misses
            # incre_states = {}
            # for k_regressive in range(decoder_iterations):
            #     model.decoder(prev_output_tokens=prev_output_tokens_test_with_beam[:, :k_regressive + 1],
            #                   encoder_out=encoder_out_test_with_beam, incremental_state=incre_states)
            print(decoder_misses)

            # TODO: decide whether the encoder and decoder misses should be added together
            fid.write(','.join(map(str, encoder_misses)) + '\n')  # temporary
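
# One way to resolve the TODOs above is to parse the simulator's textual report from the
# subprocess instead of only echoing it. The helper below is a sketch under an assumed
# output format -- it expects drcachesim to print per-cache blocks such as "L1D stats:"
# followed by a "Misses: <count>" line; check the report of the installed DynamoRIO
# build before relying on it. The returned dictionary would then be mapped onto the
# l1d/l1i/l2/l3 miss columns written above.
import re
import subprocess


def run_and_collect_misses(drrun_args):
    """Sketch: run a command under drcachesim and collect per-cache miss counts."""
    proc = subprocess.Popen(drrun_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    misses = {}
    current_cache = None
    for raw_line in proc.stdout:
        line = raw_line.decode('utf-8', errors='replace')
        header = re.match(r'\s*(\S+) stats:', line)
        if header:
            current_cache = header.group(1)
        miss = re.search(r'Misses:\s*([\d,]+)', line)
        if miss and current_cache is not None:
            misses[current_cache] = int(miss.group(1).replace(',', ''))
    proc.wait()
    return misses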