def process_trip(x, start_time):
    tt = time.localtime(start_time)
    data = [tt.tm_wday, tt.tm_hour]
    # distance and heading between the first two points of the trip
    v_mn = 0
    head = 0
    if len(x) > 1:
        v_mn = haversineKaggle(x[0, :], x[1, :])[0]
        head = heading(x[0, :], x[1, :])
    # distance and heading from the start point to the city center
    d_st = haversineKaggle(x[0, :], CITY_CENTER)
    h_st = heading(x[0, :], CITY_CENTER[0])
    data += [x[-1, 0], x[-1, 1], d_st, h_st, v_mn, head]
    return data

def process_trip(x, start_time):
    tt = time.localtime(start_time)
    data = [tt.tm_wday, tt.tm_hour]
    # distance and heading from the start point to the cutting (last) point
    v_mn = 0
    head = 0
    if len(x) > 1:
        v_mn = haversineKaggle(x[0, :], x[-1, :])[0]
        head = heading(x[0, :], x[-1, :])
    # distance and heading from the start point to the city center
    d_st = haversineKaggle(x[0, :], CITY_CENTER)
    h_st = heading(x[0, :], CITY_CENTER[0])
    data += [x[-1, 0], x[-1, 1], d_st, h_st, v_mn, head]
    return data

def process_trip(x, start_time):
    tt = time.localtime(start_time)
    data = [tt.tm_wday, tt.tm_hour]
    # distance and heading from the point to the city center
    d_st = haversineKaggle(x, CITY_CENTER)
    head = heading(x, CITY_CENTER[0])
    data += [x[0], x[1], d_st, head]
    return data

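# The feature extractors above assume two helpers that aren't shown:
# `haversineKaggle` (a great-circle distance, judging by the name a helper
# from the Kaggle taxi-trip competition kernels) and `heading` (an initial
# bearing). A minimal numpy sketch consistent with the call sites, assuming
# points are stored as [lon, lat] as in the Kaggle POLYLINE data; the km
# unit is also an assumption:
import numpy as np

EARTH_RADIUS_KM = 6371.0

def haversineKaggle(p1, p2):
    # Haversine distance in km; accepts single [lon, lat] pairs or arrays
    # of them, and always returns a 1-D array (hence the `[0]` at call sites).
    p1 = np.atleast_2d(np.radians(p1))
    p2 = np.atleast_2d(np.radians(p2))
    dlon = p2[:, 0] - p1[:, 0]
    dlat = p2[:, 1] - p1[:, 1]
    a = np.sin(dlat / 2) ** 2 + np.cos(p1[:, 1]) * np.cos(p2[:, 1]) * np.sin(dlon / 2) ** 2
    return 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(a))

def heading(p1, p2):
    # Initial great-circle bearing from p1 to p2 in degrees, both [lon, lat].
    lon1, lat1, lon2, lat2 = map(np.radians, (p1[0], p1[1], p2[0], p2[1]))
    dlon = lon2 - lon1
    y = np.sin(dlon) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon)
    return np.degrees(np.arctan2(y, x))
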
def from_pretrained_ckpt(args):
    config = PretrainingConfig(
        model_name='postprocessing',
        data_dir='postprocessing',
        generator_hidden_size=0.3333333,
    )

    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)

    if args.amp:
        policy = tf.keras.mixed_precision.experimental.Policy(
            "mixed_float16", loss_scale="dynamic")
        tf.keras.mixed_precision.experimental.set_policy(policy)
        print('Compute dtype: %s' % policy.compute_dtype)    # Compute dtype: float16
        print('Variable dtype: %s' % policy.variable_dtype)  # Variable dtype: float32

    # Set up model
    model = PretrainingModel(config)

    # Load checkpoint
    checkpoint = tf.train.Checkpoint(step=tf.Variable(1), model=model)
    checkpoint.restore(args.pretrained_checkpoint).expect_partial()
    log(" ** Restored from {} at step {}".format(args.pretrained_checkpoint,
                                                 int(checkpoint.step) - 1))

    disc_dir = os.path.join(args.output_dir, 'discriminator')
    gen_dir = os.path.join(args.output_dir, 'generator')

    heading(" ** Saving discriminator")
    model.discriminator(model.discriminator.dummy_inputs)
    model.discriminator.save_pretrained(disc_dir)

    heading(" ** Saving generator")
    model.generator(model.generator.dummy_inputs)
    model.generator.save_pretrained(gen_dir)

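# Side note: `tf.keras.mixed_precision.experimental` was removed in newer
# TensorFlow releases. On TF >= 2.4 the equivalent setup (a sketch, not part
# of the original script) is:
import tensorflow as tf

tf.keras.mixed_precision.set_global_policy("mixed_float16")
# ...and dynamic loss scaling is obtained by wrapping the optimizer:
# optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
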
def format_results(self, time_width=None, time_dp=None, time_ratio_dp=None,
                   calls_width=None):
    time_width, time_dp, time_ratio_dp, calls_width = _val_widths(
        time_width, time_dp, time_ratio_dp, calls_width)
    return utils.heading("Timer set: " + self.timer_set_name +
                         ", constructed at " + _form_dt_time(self.stime) +
                         ", written at " + _form_dt_time()) + '\n' + \
        reduce(lambda s, l: s + '\n' + l,
               self.format_timers(time_width, time_dp, time_ratio_dp,
                                  calls_width)) + '\n' + \
        TimerSet.format_self_timer(time_width, time_dp, time_ratio_dp)

def process_trip(x, start_time):
    tt = time.localtime(start_time)
    data = [tt.tm_wday, tt.tm_hour]
    # cumulative sum, median and last value of the step distances,
    # plus the heading over the final segment
    d_cs = 0
    vcar = 0
    vmed = 0
    head = 0
    if x.shape[0] > 1:
        d1 = haversineKaggle(x[:-1, :], x[1:, :])
        d_cs = np.sum(d1)
        vmed = np.median(d1)
        vcar = d1[-1]
        head = heading(x[-2, :], x[-1, :])
    # distances and headings between the start / cutting point and the city center
    d_st = haversineKaggle(x[0, :], CITY_CENTER)[0]
    h_st = heading(x[0, :], CITY_CENTER[0])
    d_cut = haversineKaggle(x[-1, :], CITY_CENTER)[0]
    h_cut = heading(CITY_CENTER[0], x[-1, :])
    data += [x.shape[0], x[0, 0], x[0, 1], x[-1, 0], x[-1, 1],
             d_st, h_st, d_cut, h_cut, d_cs, vmed, vcar, head]
    return data

def get_heading(self):
    """Return the angle (in radians) in which the robot is pointing."""
    # `is not None` so that a cached heading of exactly 0.0 is still reused
    if self._heading is not None:
        return self._heading
    self._send_get('/lokarria/localization')
    response = self.mrds.getresponse()
    if response.status == 200:
        position_data = response.read()
        json_data = json.loads(position_data.decode('utf-8'))
        unit_vector = heading(json_data['Pose']['Orientation'])
        self._heading = atan2(unit_vector['Y'], unit_vector['X'])
        return self._heading
    else:
        return UnexpectedResponse(response)

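# Sketch of the `heading` helper assumed by get_heading: the MRDS
# localization endpoint returns the pose orientation as a unit quaternion
# {'W', 'X', 'Y', 'Z'}, and the robot's heading is the world-frame direction
# of its local X axis. This is a minimal reimplementation for illustration,
# not necessarily the project's own helper.
def heading(q):
    """Rotate the unit vector (1, 0, 0) by the quaternion q = {'W','X','Y','Z'}."""
    w, x, y, z = q['W'], q['X'], q['Y'], q['Z']
    return {
        'X': 1 - 2 * (y * y + z * z),
        'Y': 2 * (x * y + w * z),
        'Z': 2 * (x * z - w * y),
    }
# The caller then recovers the yaw angle with atan2(vector['Y'], vector['X']).
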
def load_elements(self):
    self.elements = {
        # Date & Time
        'creation_time': datetime.utcnow(),
        'time_text': self.current['observation_time'],
        'time_epoch': float(self.current['local_epoch']),
        'time_offset': int(self.current['local_tz_offset']),
        'time_local': datetime.fromtimestamp(float(self.current['local_epoch'])),
        'time_utc': datetime.utcfromtimestamp(float(self.current['local_epoch'])),
        # Location info
        'full_name': self.current['display_location']['full'],
        'city': self.current['display_location']['city'],
        'state': self.current['display_location']['state'],
        'country': self.current['display_location']['country'],
        'lat': self.current['display_location']['latitude'],
        'lon': self.current['display_location']['longitude'],
        # Station info
        'station_id': self.current['station_id'],
        'station_name': self.current['observation_location']['full'],
        'station_lat': self.current['observation_location']['latitude'],
        'station_lon': self.current['observation_location']['longitude'],
        # Current conditions info
        'wind_mph': float(self.current['wind_mph']),
        'wind_kph': float(self.current['wind_kph']),
        'wind_dir': heading(self.current['wind_degrees']),
        'wind_gust_kph': self.current['wind_gust_kph'],
        'wind_gust_mph': self.current['wind_gust_mph'],
        'temp_f': int(self.current['temp_f']),
        'temp_c': int(self.current['temp_c']),
        'weather': self.current['weather'],
        # Forecast info
        'rain_prob': self.forecast[0]['pop'],
    }
    self.elements['canfly'] = noaa.canfly(self.elements['wind_mph'],
                                          self.elements['rain_prob'],
                                          self.elements['temp_f'])

def test_heading(self):
    assert_equal(heading(0), 'N')
    assert_equal(heading(45), 'NE')
    assert_equal(heading(90), 'E')
    assert_equal(heading(135), 'SE')
    assert_equal(heading(180), 'S')
    assert_equal(heading(225), 'SW')
    assert_equal(heading(270), 'W')
    assert_equal(heading(315), 'NW')
    assert_equal(heading(360), 'N')
    assert_equal(heading(325.7), 'NW')
    assert_equal(heading(5), 'N')
    assert_equal(heading('5'), 'N')
    assert_equal(heading(359.3), 'N')
    assert_equal(heading('359.3'), 'N')
    assert_equal(heading(112.5), 'E')
    assert_equal(heading('112.5'), 'E')
    assert_equal(heading(110), 'E')
    assert_equal(heading('110'), 'E')

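# A minimal compass-point implementation consistent with the tests above
# (a sketch, not necessarily the tested module's code). It relies on
# Python 3's round-half-to-even, which is exactly what makes
# heading(112.5) == 'E' rather than 'SE'.
COMPASS_POINTS = ('N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW')

def heading(degrees):
    """Map a bearing in degrees (number or numeric string) to an 8-point compass direction."""
    return COMPASS_POINTS[round(float(degrees) / 45) % 8]
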
    dy.append(dest[1])

train_data['ORIGIN_LNG'] = ox
train_data['ORIGIN_LAT'] = oy
train_data['DEST_LNG'] = dx
train_data['DEST_LAT'] = dy

CC_LON = -8.615393063941816
CC_LAT = 41.15767687592546

origin_header = []
origin_distance_to_cc = []
for i in range(train_data.shape[0]):
    origin_lat = float(train_data['ORIGIN_LAT'][i])
    origin_lng = float(train_data['ORIGIN_LNG'][i])
    origin_header.append(heading((origin_lat, origin_lng), (CC_LAT, CC_LON)))
    origin_distance_to_cc.append(calHarDist(origin_lat, origin_lng, CC_LAT, CC_LON))
train_data['ORIGIN_HEADER'] = origin_header
train_data['ORIGIN_DISTANCE_TO_CC'] = origin_distance_to_cc

origin_cutoff_header = []
origin_distance_to_cutoff = []
for i in range(train_data.shape[0]):
    origin_lat = float(train_data['ORIGIN_LAT'][i])
    origin_lng = float(train_data['ORIGIN_LNG'][i])
    cutoff_lat = float(train_data['DEST_LAT'][i])  # was DEST_LNG: lat/lng mix-up fixed
    cutoff_lng = float(train_data['DEST_LNG'][i])
    origin_cutoff_header.append(heading((origin_lat, origin_lng), (cutoff_lat, cutoff_lng)))
    origin_distance_to_cutoff.append(calHarDist(origin_lat, origin_lng, cutoff_lat, cutoff_lng))

def process_row_training(X, row):
    pln = ast.literal_eval(row['POLYLINE'])
    if len(pln) > 3:
        n_samples = MAX_SAMPLES_PER_TRIP
        for i in range(n_samples):
            # pick a random cutting point, skipping cuts before the 4th GPS sample
            idx = np.random.randint(len(pln) - 1) + 1
            if idx < 4:
                continue
            data = [row['TRIP_ID'], row['ORIGIN_CALL'], row['ORIGIN_STAND'],
                    row['TAXI_ID'], row['TIMESTAMP'], row['DATE'],
                    row['END_TIME'], row['dayofweek'], row['hour'],
                    row['ORIGIN_LNG'], row['ORIGIN_LAT'],
                    row['DEST_LNG'], row['DEST_LAT'],
                    row['ORIGIN_HEADER'], row['ORIGIN_DISTANCE_TO_CC']]
            data += [idx, pln[idx][1], pln[idx][0],
                     calHarDist(pln[idx][1], pln[idx][0], CC_LAT, CC_LON),
                     heading([CC_LAT, CC_LON], pln[idx])]
            data += [row['CALL_TYPE_A'], row['CALL_TYPE_B'], row['CALL_TYPE_C'],
                     row['ACTUAL_DAYTYPE_A'], row['ACTUAL_DAYTYPE_B'],
                     row['ACTUAL_DAYTYPE_C'], row['DURATION']]
            X.append(data)
    return X

def main(e2e_start_time):
    # Parse essential arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", required=True)
    parser.add_argument("--model_size", default="base", type=str, help="base or large")
    parser.add_argument("--pretrain_tfrecords", type=str)
    parser.add_argument("--phase2", action='store_true')
    parser.add_argument("--fp16_compression", action='store_true')
    parser.add_argument("--amp", action='store_true', help="Whether to use fp16.")
    parser.add_argument("--xla", action='store_true', help="Whether to use xla.")
    parser.add_argument("--seed", default=42, type=int)
    parser.add_argument("--num_train_steps", type=int)
    parser.add_argument("--num_warmup_steps", type=int)
    parser.add_argument("--learning_rate", type=float)
    parser.add_argument("--train_batch_size", type=int)
    parser.add_argument("--max_seq_length", type=int)
    parser.add_argument("--mask_prob", type=float)
    parser.add_argument("--disc_weight", type=float)
    parser.add_argument("--generator_hidden_size", type=float)
    parser.add_argument("--log_freq", type=int, default=10,
                        help="Training metrics logging frequency")
    parser.add_argument("--save_checkpoints_steps", type=int)
    parser.add_argument("--keep_checkpoint_max", type=int)
    parser.add_argument("--restore_checkpoint", default=None, type=str)
    parser.add_argument("--load_weights", action='store_true')
    parser.add_argument("--weights_dir")
    parser.add_argument("--optimizer", default="adam", type=str, help="adam or lamb")
    parser.add_argument("--skip_adaptive", action='store_true',
                        help="Whether to apply adaptive LR on LayerNorm and biases")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of Gradient Accumulation steps")
    parser.add_argument("--lr_decay_power", type=float, default=0.5, help="LR decay power")
    parser.add_argument("--opt_beta_1", type=float, default=0.878, help="Optimizer beta1")
    parser.add_argument("--opt_beta_2", type=float, default=0.974, help="Optimizer beta2")
    parser.add_argument("--end_lr", type=float, default=0.0, help="Ending LR")
    parser.add_argument("--log_dir", type=str, default=None, help="Path to store logs")
    parser.add_argument("--results_dir", type=str, default=None,
                        help="Path to store all model results")
    parser.add_argument("--skip_checkpoint", action='store_true', default=False,
                        help="Whether to skip saving checkpoints")
    parser.add_argument('--json-summary', type=str, default=None,
                        help='If provided, the json summary will be written to the specified file.')
    args = parser.parse_args()
    config = PretrainingConfig(**args.__dict__)

    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)

    # Set up tensorflow
    hvd.init()

    args.log_dir = config.log_dir
    # DLLogger
    setup_logger(args)

    set_affinity(hvd.local_rank())
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
    tf.config.optimizer.set_jit(config.xla)
    #tf.config.optimizer.set_experimental_options({"auto_mixed_precision": config.amp})

    if config.amp:
        policy = tf.keras.mixed_precision.experimental.Policy(
            "mixed_float16", loss_scale="dynamic")
        tf.keras.mixed_precision.experimental.set_policy(policy)
        print('Compute dtype: %s' % policy.compute_dtype)    # Compute dtype: float16
        print('Variable dtype: %s' % policy.variable_dtype)  # Variable dtype: float32

    #tf.random.set_seed(config.seed)

    # Set up config cont'
    if config.load_weights and config.restore_checkpoint:
        raise ValueError(
            "`load_weights` and `restore_checkpoint` should not be on at the same time.")
    if config.phase2 and not config.restore_checkpoint:
        raise ValueError("`phase2` cannot be used without `restore_checkpoint`.")
    utils.heading("Config:")
    log_config(config)

    # Save pretrain configs
    pretrain_config_json = os.path.join(config.checkpoints_dir, 'pretrain_config.json')
    if is_main_process():
        utils.write_json(config.__dict__, pretrain_config_json)
        log("Configuration saved in {}".format(pretrain_config_json))

    # Set up model
    model = PretrainingModel(config)

    # Set up metrics
    metrics = dict()
    metrics["train_perf"] = tf.keras.metrics.Mean(name="train_perf")
    metrics["total_loss"] = tf.keras.metrics.Mean(name="total_loss")
    metrics["masked_lm_accuracy"] = tf.keras.metrics.Accuracy(name="masked_lm_accuracy")
    metrics["masked_lm_loss"] = tf.keras.metrics.Mean(name="masked_lm_loss")
    if config.electra_objective:
        metrics["sampled_masked_lm_accuracy"] = tf.keras.metrics.Accuracy(
            name="sampled_masked_lm_accuracy")
        if config.disc_weight > 0:
            metrics["disc_loss"] = tf.keras.metrics.Mean(name="disc_loss")
            metrics["disc_auc"] = tf.keras.metrics.AUC(name="disc_auc")
            metrics["disc_accuracy"] = tf.keras.metrics.Accuracy(name="disc_accuracy")
            metrics["disc_precision"] = tf.keras.metrics.Accuracy(name="disc_precision")
            metrics["disc_recall"] = tf.keras.metrics.Accuracy(name="disc_recall")

    # Set up tensorboard
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = os.path.join(
        config.log_dir, current_time,
        'train_' + str(get_rank()) + '_of_' + str(get_world_size()))
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Set up dataset
    dataset = pretrain_utils.get_dataset(config, config.train_batch_size,
                                         world_size=get_world_size(), rank=get_rank())
    train_iterator = iter(dataset)

    # Set up optimizer
    optimizer = create_optimizer(init_lr=config.learning_rate,
                                 num_train_steps=config.num_train_steps,
                                 num_warmup_steps=config.num_warmup_steps,
                                 weight_decay_rate=config.weight_decay_rate,
                                 optimizer=config.optimizer,
                                 skip_adaptive=config.skip_adaptive,
                                 power=config.lr_decay_power,
                                 beta_1=config.opt_beta_1,
                                 beta_2=config.opt_beta_2,
                                 end_lr=config.end_lr)

    accumulator = GradientAccumulator()
    if config.amp:
        optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
            optimizer, "dynamic")

    # Set up model checkpoint
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0), phase2=tf.Variable(False),
                                     optimizer=optimizer, model=model)
    manager = tf.train.CheckpointManager(checkpoint, config.checkpoints_dir,
                                         max_to_keep=config.keep_checkpoint_max)
    if config.restore_checkpoint and config.restore_checkpoint != "latest":
        checkpoint.restore(config.restore_checkpoint)
        log(" ** Restored model checkpoint from {}".format(config.restore_checkpoint))
    elif config.restore_checkpoint and config.restore_checkpoint == "latest" \
            and manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        log(" ** Restored model checkpoint from {}".format(manager.latest_checkpoint))
    elif config.load_weights:
        model.generator(model.generator.dummy_inputs)
        model.discriminator(model.discriminator.dummy_inputs)
        model.generator.load_weights(
            os.path.join(config.weights_dir, 'generator', 'tf_model.h5'))
        model.discriminator.load_weights(
            os.path.join(config.weights_dir, 'discriminator', 'tf_model.h5'))
    else:
        log(" ** Initializing from scratch.")

    restore_iterator = bool(config.restore_checkpoint) and config.restore_checkpoint == "latest"
    # Initialize global step for phase2
    if config.phase2 and not bool(checkpoint.phase2):
        optimizer.iterations.assign(0)
        checkpoint.step.assign(0)
        checkpoint.phase2.assign(True)
        restore_iterator = False
    if bool(checkpoint.phase2):
        manager = tf.train.CheckpointManager(checkpoint, config.checkpoints_dir,
                                             checkpoint_name='ckpt-p2',
                                             max_to_keep=config.keep_checkpoint_max)

    # Set up iterator checkpoint
    iter_checkpoint = tf.train.Checkpoint(train_iterator=train_iterator,
                                          world_size=tf.Variable(get_world_size()),
                                          rank=tf.Variable(get_rank()))
    iter_manager = tf.train.CheckpointManager(
        iter_checkpoint,
        os.path.join(config.checkpoints_dir,
                     'iter_ckpt_rank_' + '{:02}'.format(get_rank())),
        checkpoint_name='iter_ckpt_rank_' + '{:02}'.format(get_rank()),
        max_to_keep=config.keep_checkpoint_max)
    if restore_iterator and iter_manager.latest_checkpoint:
        ckpt_world_size = tf.train.load_variable(
            iter_manager.latest_checkpoint, 'world_size/.ATTRIBUTES/VARIABLE_VALUE')
        if ckpt_world_size == get_world_size():
            iter_checkpoint.restore(iter_manager.latest_checkpoint)
            log(" ** Restored iterator checkpoint from {}".format(
                iter_manager.latest_checkpoint), all_rank=True)

    utils.heading("Running training")
    accumulator.reset()
    train_start, start_step = time.time(), int(checkpoint.step) - 1
    local_step = 0
    saved_ckpt = False
    while int(checkpoint.step) <= config.num_train_steps:
        saved_ckpt = False
        step = int(checkpoint.step)
        features = next(train_iterator)
        iter_start = time.time()

        # if step == 200: tf.profiler.experimental.start(logdir=train_log_dir)
        total_loss, eval_fn_inputs = train_one_step(
            config, model, optimizer, features, accumulator,
            local_step == 1,
            take_step=local_step % args.gradient_accumulation_steps == 0)
        # if step == 300: tf.profiler.experimental.stop()

        metrics["train_perf"].update_state(
            config.train_batch_size * get_world_size() / (time.time() - iter_start))
        metrics["total_loss"].update_state(values=total_loss)
        metric_fn(config, metrics, eval_fn_inputs)

        if (step % args.log_freq == 0) and (
                local_step % args.gradient_accumulation_steps == 0):
            log_info_dict = {
                k: float(v.result().numpy() * 100) if "accuracy" in k
                else float(v.result().numpy())
                for k, v in metrics.items()
            }
            dllogger.log(step=(step, ), data=log_info_dict, verbosity=0)
            log('Step:{step:6d}, Loss:{total_loss:10.6f}, '
                'Gen_loss:{masked_lm_loss:10.6f}, Disc_loss:{disc_loss:10.6f}, '
                'Gen_acc:{masked_lm_accuracy:6.2f}, '
                'Disc_acc:{disc_accuracy:6.2f}, Perf:{train_perf:4.0f}, '
                'Loss Scaler: {loss_scale}, Elapsed: {elapsed}, ETA: {eta}, '.format(
                    step=step,
                    **log_info_dict,
                    loss_scale=optimizer.loss_scale if config.amp else 1,
                    elapsed=utils.get_readable_time(time.time() - train_start),
                    eta=utils.get_readable_time(
                        (time.time() - train_start) / (step - start_step) *
                        (config.num_train_steps - step))),
                all_rank=True)

            with train_summary_writer.as_default():
                for key, m in metrics.items():
                    tf.summary.scalar(key, m.result(), step=step)

            if int(checkpoint.step) < config.num_train_steps:
                for m in metrics.values():
                    m.reset_states()

        # Print allreduced metrics on the last step
        if int(checkpoint.step) == config.num_train_steps and (
                local_step % args.gradient_accumulation_steps == 0):
            log_info_dict = {
                k: float(hvd.allreduce(v.result()).numpy() * 100) if "accuracy" in k
                else float(hvd.allreduce(v.result()).numpy())
                for k, v in metrics.items()
            }
            log_info_dict["training_sequences_per_second"] = log_info_dict["train_perf"]
            log_info_dict["final_loss"] = log_info_dict["total_loss"]
            log_info_dict["e2e_train_time"] = time.time() - e2e_start_time
            dllogger.log(step=(), data=log_info_dict, verbosity=0)
            log('<FINAL STEP METRICS> Step:{step:6d}, Loss:{total_loss:10.6f}, '
                'Gen_loss:{masked_lm_loss:10.6f}, Disc_loss:{disc_loss:10.6f}, '
                'Gen_acc:{masked_lm_accuracy:6.2f}, '
                'Disc_acc:{disc_accuracy:6.2f}, Perf:{train_perf:4.0f},'.format(
                    step=step, **log_info_dict),
                all_rank=False)

        if local_step % args.gradient_accumulation_steps == 0:
            checkpoint.step.assign(int(optimizer.iterations))

        local_step += 1
        if not config.skip_checkpoint and (
                local_step %
                (config.save_checkpoints_steps * args.gradient_accumulation_steps) == 0):
            saved_ckpt = True
            if is_main_process():
                save_path = manager.save(checkpoint_number=step)
                log(" ** Saved model checkpoint for step {}: {}".format(step, save_path))
            iter_save_path = iter_manager.save(checkpoint_number=step)
            log(" ** Saved iterator checkpoint for step {}: {}".format(
                step, iter_save_path), all_rank=True)

    step = (int(checkpoint.step) - 1)
    dllogger.flush()
    if not config.skip_checkpoint and not saved_ckpt:
        if is_main_process():
            save_path = manager.save(checkpoint_number=step)
            log(" ** Saved model checkpoint for step {}: {}".format(step, save_path))
        iter_save_path = iter_manager.save(checkpoint_number=step)
        log(" ** Saved iterator checkpoint for step {}: {}".format(
            step, iter_save_path), all_rank=True)
    return args

def main():
    # Parse essential args
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", required=True,
                        help="Location of data files (model weights, etc).")
    parser.add_argument("--model_name", required=True,
                        help="The name of the model being fine-tuned.")
    parser.add_argument("--pretrain_tfrecords", type=str)
    parser.add_argument("--seed", type=int)
    parser.add_argument("--num_train_steps", type=int)
    parser.add_argument("--num_warmup_steps", type=int)
    parser.add_argument("--learning_rate", type=float)
    parser.add_argument("--train_batch_size", type=int)
    parser.add_argument("--max_seq_length", type=int)
    parser.add_argument("--mask_prob", type=float)
    parser.add_argument("--disc_weight", type=float)
    parser.add_argument("--generator_hidden_size", type=float)
    parser.add_argument("--save_checkpoints_steps", type=int)
    parser.add_argument("--keep_checkpoint_max", type=int)
    parser.add_argument("--restore_checkpoint", action='store_true')
    parser.add_argument("--optimizer", default="adam", type=str, help="adam or lamb")
    args = parser.parse_args()
    config = PretrainingConfig(**args.__dict__)

    # Set up tensorflow
    hvd.init()
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
    tf.config.optimizer.set_jit(config.xla)
    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": config.amp})
    tf.random.set_seed(config.seed)

    # Set up config
    if config.do_train == config.do_eval:
        raise ValueError("Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug and config.do_train:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    log_config(config)

    # Save pretrain configs
    pretrain_config_json = os.path.join(config.checkpoints_dir, 'pretrain_config.json')
    if is_main_process():
        utils.write_json(config.__dict__, pretrain_config_json)
        log("Configuration saved in {}".format(pretrain_config_json))

    # Set up model
    model = PretrainingModel(config)

    # Set up metrics
    perf_metrics = dict()
    perf_metrics["train_perf"] = tf.keras.metrics.Mean(name="train_perf")

    eval_metrics = dict()
    eval_metrics["total_loss"] = tf.keras.metrics.Mean(name="total_loss")
    eval_metrics["masked_lm_accuracy"] = tf.keras.metrics.Accuracy(name="masked_lm_accuracy")
    eval_metrics["masked_lm_loss"] = tf.keras.metrics.Mean(name="masked_lm_loss")
    if config.electra_objective:
        eval_metrics["sampled_masked_lm_accuracy"] = tf.keras.metrics.Accuracy(
            name="sampled_masked_lm_accuracy")
        if config.disc_weight > 0:
            eval_metrics["disc_loss"] = tf.keras.metrics.Mean(name="disc_loss")
            eval_metrics["disc_auc"] = tf.keras.metrics.AUC(name="disc_auc")
            eval_metrics["disc_accuracy"] = tf.keras.metrics.Accuracy(name="disc_accuracy")
            eval_metrics["disc_precision"] = tf.keras.metrics.Accuracy(name="disc_precision")
            eval_metrics["disc_recall"] = tf.keras.metrics.Accuracy(name="disc_recall")

    # Set up tensorboard
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = os.path.join(
        config.log_dir, current_time,
        'train_' + str(get_rank()) + '_of_' + str(get_world_size()))
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Set up dataset
    dataset = pretrain_utils.get_dataset(config, config.train_batch_size,
                                         world_size=get_world_size(), rank=get_rank())
    train_iterator = iter(dataset)

    # Set up optimizer
    optimizer = create_optimizer(init_lr=config.learning_rate,
                                 num_train_steps=config.num_train_steps,
                                 num_warmup_steps=config.num_warmup_steps,
                                 weight_decay_rate=config.weight_decay_rate,
                                 optimizer=config.optimizer)
    if config.amp:
        optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
            optimizer, "dynamic")

    if config.do_train:
        # Set up checkpoint manager
        checkpoint = tf.train.Checkpoint(step=tf.Variable(1),
                                         optimizer=optimizer, model=model)
        manager = tf.train.CheckpointManager(checkpoint, config.checkpoints_dir,
                                             max_to_keep=config.keep_checkpoint_max)
        iter_checkpoint = tf.train.Checkpoint(train_iterator=train_iterator)
        iter_manager = tf.train.CheckpointManager(
            iter_checkpoint,
            os.path.join(config.checkpoints_dir,
                         'iter_ckpt_rank_' + '{:02}'.format(get_rank())),
            checkpoint_name='iter_ckpt_rank_' + '{:02}'.format(get_rank()),
            max_to_keep=config.keep_checkpoint_max)
        if config.restore_checkpoint and manager.latest_checkpoint:
            checkpoint.restore(manager.latest_checkpoint)
            log(" ** Restored model checkpoint from {}".format(manager.latest_checkpoint))
            if iter_manager.latest_checkpoint:
                iter_checkpoint.restore(iter_manager.latest_checkpoint)
                log(" ** Restored iterator checkpoint from {}".format(
                    iter_manager.latest_checkpoint), all_rank=True)
        else:
            log(" ** Initializing from scratch.")

        utils.heading("Running training")
        train_start, start_step = time.time(), int(checkpoint.step) - 1
        while int(checkpoint.step) <= config.num_train_steps:
            step = int(checkpoint.step)
            features = next(train_iterator)
            iter_start = time.time()
            # if step == 200: tf.profiler.experimental.start(logdir=train_log_dir)
            total_loss, eval_fn_inputs = train_one_step(
                config, model, optimizer, features, step <= 1)
            # if step == 300: tf.profiler.experimental.stop()
            perf_metrics["train_perf"].update_state(
                config.train_batch_size * get_world_size() / (time.time() - iter_start))
            eval_metrics["total_loss"].update_state(values=total_loss)
            metric_fn(config, eval_metrics, eval_fn_inputs)
            if step % 100 == 0:
                log('Step:{:6d}, Loss:{:10.6f}, Gen_loss:{:10.6f}, '
                    'Disc_loss:{:10.6f}, Gen_acc:{:6.2f}, '
                    'Disc_acc:{:6.2f}, Perf:{:4.0f}, Elapsed: {}, ETA: {}, '.format(
                        step, total_loss,
                        eval_metrics["masked_lm_loss"].result().numpy(),
                        eval_metrics["disc_loss"].result().numpy(),
                        eval_metrics["masked_lm_accuracy"].result().numpy() * 100,
                        eval_metrics["disc_accuracy"].result().numpy() * 100,
                        perf_metrics["train_perf"].result().numpy(),
                        utils.get_readable_time(time.time() - train_start),
                        utils.get_readable_time(
                            (time.time() - train_start) / (step - start_step) *
                            (config.num_train_steps - step))),
                    all_rank=True)
                with train_summary_writer.as_default():
                    for key, m in eval_metrics.items():
                        tf.summary.scalar(key, m.result(), step=step)
                for m in eval_metrics.values():
                    m.reset_states()
            checkpoint.step.assign_add(1)
            if step % config.save_checkpoints_steps == 0:
                if is_main_process():
                    save_path = manager.save()
                    log(" ** Saved model checkpoint for step {}: {}".format(step, save_path))
                iter_save_path = iter_manager.save()
                log(" ** Saved iterator checkpoint for step {}: {}".format(
                    step, iter_save_path), all_rank=True)

    if config.do_eval:
        pass
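
# Both training loops above call `train_one_step(...)` without showing it.
# Below is a minimal, generic sketch of a TF2 + Horovod training step with
# gradient accumulation in the same style; it is an illustration of the
# pattern, not the repository's actual implementation. The model-returns-loss
# convention and the `GradientAccumulator` interface (callable with a gradient
# list, `.gradients` property, `.reset()`) are assumptions.
import tensorflow as tf
import horovod.tensorflow as hvd

def train_one_step_sketch(model, optimizer, features, accumulator,
                          take_step, amp=False, accumulation_steps=1):
    # Forward and backward pass for one micro-batch.
    with tf.GradientTape() as tape:
        loss = model(features, training=True)  # assumed: the model returns a scalar loss
        scaled_loss = loss / accumulation_steps
        if amp:
            # LossScaleOptimizer scales the loss up to keep fp16 gradients representable.
            scaled_loss = optimizer.get_scaled_loss(scaled_loss)
    grads = tape.gradient(scaled_loss, model.trainable_variables)
    if amp:
        grads = optimizer.get_unscaled_gradients(grads)
    accumulator(grads)  # buffer gradients across micro-batches
    if take_step:
        # Average buffered gradients across Horovod workers, apply, then reset.
        avg_grads = [hvd.allreduce(g) for g in accumulator.gradients]
        optimizer.apply_gradients(zip(avg_grads, model.trainable_variables))
        accumulator.reset()
    return loss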