def process_trip(x, start_time):
    """Build a feature row for one trip.

    x is a 2-D array of coordinates; start_time is a unix timestamp.
    Returns [weekday, hour, last-point coords, distance and heading from
    the start to the city center, first-step distance and heading].
    """
    when = time.localtime(start_time)
    features = [when.tm_wday, when.tm_hour]
    # First-step distance/heading; stay 0 for single-point trips.
    first_step_dist = 0
    first_step_head = 0
    if len(x) > 1:
        first_step_dist = haversineKaggle(x[0, :], x[1, :])[0]
        first_step_head = heading(x[0, :], x[1, :])
    # Distance and heading from the trip start to the city center.
    center_dist = haversineKaggle(x[0, :], CITY_CENTER)
    center_head = heading(x[0, :], CITY_CENTER[0])
    features += [x[-1, 0], x[-1, 1], center_dist, center_head,
                 first_step_dist, first_step_head]
    return features
Example #2
0
def process_trip(x, start_time):
    """Feature row: weekday, hour, end point, start-to-center distance and
    heading, plus start-to-end distance/heading (0 for single-point trips)."""
    clock = time.localtime(start_time)
    span_dist, span_head = 0, 0
    if len(x) > 1:
        # Straight line from the first to the last recorded point.
        span_dist = haversineKaggle(x[0, :], x[-1, :])[0]
        span_head = heading(x[0, :], x[-1, :])
    center_dist = haversineKaggle(x[0, :], CITY_CENTER)
    center_head = heading(x[0, :], CITY_CENTER[0])
    return [clock.tm_wday, clock.tm_hour,
            x[-1, 0], x[-1, 1], center_dist, center_head,
            span_dist, span_head]
Example #3
0
def process_trip(x, start_time):
    """Features for a single coordinate x: weekday, hour, the coordinates
    themselves, and distance/heading relative to the city center."""
    when = time.localtime(start_time)
    dist_to_center = haversineKaggle(x, CITY_CENTER)
    head_to_center = heading(x, CITY_CENTER[0])
    return [when.tm_wday, when.tm_hour,
            x[0], x[1], dist_to_center, head_to_center]
def process_trip(x, start_time):
    """Features for a single coordinate x: time-of-week fields, the point
    itself, and its distance/heading to the city center."""
    t = time.localtime(start_time)
    feats = [t.tm_wday, t.tm_hour]
    feats.append(x[0])
    feats.append(x[1])
    feats.append(haversineKaggle(x, CITY_CENTER))
    feats.append(heading(x, CITY_CENTER[0]))
    return feats
def from_pretrained_ckpt(args):
    """Restore a pretraining checkpoint and export the generator and
    discriminator separately under args.output_dir via save_pretrained."""
    cfg = PretrainingConfig(
        model_name='postprocessing',
        data_dir='postprocessing',
        generator_hidden_size=0.3333333,
    )

    # Pad the vocabulary so its size divides evenly by 8.
    remainder = cfg.vocab_size % 8
    if remainder != 0:
        cfg.vocab_size += 8 - remainder

    if args.amp:
        # Mixed precision: float16 compute with dynamic loss scaling.
        amp_policy = tf.keras.mixed_precision.experimental.Policy(
            "mixed_float16", loss_scale="dynamic")
        tf.keras.mixed_precision.experimental.set_policy(amp_policy)
        print('Compute dtype: %s' % amp_policy.compute_dtype)
        print('Variable dtype: %s' % amp_policy.variable_dtype)

    # Build the model and restore weights from the supplied checkpoint.
    pretraining_model = PretrainingModel(cfg)
    checkpoint = tf.train.Checkpoint(step=tf.Variable(1),
                                     model=pretraining_model)
    checkpoint.restore(args.pretrained_checkpoint).expect_partial()
    log(" ** Restored from {} at step {}".format(args.pretrained_checkpoint,
                                                 int(checkpoint.step) - 1))

    disc_dir = os.path.join(args.output_dir, 'discriminator')
    gen_dir = os.path.join(args.output_dir, 'generator')

    # Run a dummy forward pass so all variables are built before saving.
    heading(" ** Saving discriminator")
    pretraining_model.discriminator(pretraining_model.discriminator.dummy_inputs)
    pretraining_model.discriminator.save_pretrained(disc_dir)

    heading(" ** Saving generator")
    pretraining_model.generator(pretraining_model.generator.dummy_inputs)
    pretraining_model.generator.save_pretrained(gen_dir)
Example #6
0
    def format_results(self,
                       time_width=None,
                       time_dp=None,
                       time_ratio_dp=None,
                       calls_width=None):
        """Return the full timer-set report: a heading, one line per timer,
        and the self-timer footer, joined with newlines."""
        time_width, time_dp, time_ratio_dp, calls_width = _val_widths(
            time_width, time_dp, time_ratio_dp, calls_width)

        header = utils.heading(
            "Timer set: " + self.timer_set_name + ", constructed at " +
            _form_dt_time(self.stime) + ", written at " + _form_dt_time())
        timer_lines = self.format_timers(
            time_width, time_dp, time_ratio_dp, calls_width)
        body = reduce(lambda acc, line: acc + '\n' + line, timer_lines)
        footer = TimerSet.format_self_timer(time_width, time_dp, time_ratio_dp)
        return header + '\n' + body + '\n' + footer
Example #7
0
def process_trip(x, start_time):
    """Rich feature row for a trip prefix: time fields, endpoints,
    distances/headings to the city center, and per-step statistics."""
    ts = time.localtime(start_time)
    feats = [ts.tm_wday, ts.tm_hour]
    # Per-step statistics; all default to 0 for single-point trips.
    total_dist = 0
    last_step = 0
    median_step = 0
    last_head = 0
    if x.shape[0] > 1:
        step_dists = haversineKaggle(x[:-1, :], x[1:, :])
        total_dist = np.sum(step_dists)
        median_step = np.median(step_dists)
        last_step = step_dists[-1]
        last_head = heading(x[-2, :], x[-1, :])
    # Start and cut-off points relative to the city center.
    start_dist = haversineKaggle(x[0, :], CITY_CENTER)[0]
    start_head = heading(x[0, :], CITY_CENTER[0])
    cut_dist = haversineKaggle(x[-1, :], CITY_CENTER)[0]
    cut_head = heading(CITY_CENTER[0], x[-1, :])
    feats += [x.shape[0], x[0, 0], x[0, 1], x[-1, 0], x[-1, 1],
              start_dist, start_head, cut_dist, cut_head,
              total_dist, median_step, last_step, last_head]
    return feats
Example #8
0
 def get_heading(self):
     """Return the robot's heading angle in radians, caching the result.

     Lazily queries the MRDS '/lokarria/localization' endpoint and derives
     the angle with atan2 from the orientation unit vector's X/Y components.
     """
     # Serve the cached value from a previous call, if any.
     if self._heading:
         return self._heading
     else:
         self._send_get('/lokarria/localization')
         response = self.mrds.getresponse()
         if (response.status == 200):
             position_data = response.read()
             json_data = json.loads(position_data.decode('utf-8'))
             # heading() presumably maps the pose orientation to a unit
             # vector with 'X'/'Y' keys — TODO confirm against its definition.
             unit_vector = heading(json_data['Pose']['Orientation'])
             self._heading = atan2(unit_vector['Y'], unit_vector['X'])
             return self._heading
         else:
             # NOTE(review): on a non-200 status this RETURNS (does not
             # raise) an UnexpectedResponse object, so callers get a
             # non-numeric value — confirm this is intentional.
             return UnexpectedResponse(response)
Example #9
0
 def load_elements(self):
     """Populate self.elements from the current conditions and forecast.

     Reads self.current (observation payload) and self.forecast[0] (first
     forecast period), flattening selected fields into a single dict, then
     derives a 'canfly' flag from wind, rain probability and temperature.
     """
     self.elements = {
         # Date & Time
         'creation_time': datetime.utcnow(),
         'time_text': self.current['observation_time'],
         'time_epoch': float(self.current['local_epoch']),
         'time_offset': int(self.current['local_tz_offset']),
         'time_local': datetime.fromtimestamp(float(self.current['local_epoch'])),
         'time_utc': datetime.utcfromtimestamp(float(self.current['local_epoch'])),
         # Location info
         'full_name': self.current['display_location']['full'],
         'city': self.current['display_location']['city'],
         'state': self.current['display_location']['state'],
         'country': self.current['display_location']['country'],
         'lat': self.current['display_location']['latitude'],
         'lon': self.current['display_location']['longitude'],
         # Station info
         'station_id': self.current['station_id'],
         'station_name': self.current['observation_location']['full'],
         'station_lat': self.current['observation_location']['latitude'],
         'station_lon': self.current['observation_location']['longitude'],
         # Current conditions info (heading() converts degrees to a compass point)
         'wind_mph': float(self.current['wind_mph']),
         'wind_kph': float(self.current['wind_kph']),
         'wind_dir': heading(self.current['wind_degrees']),
         'wind_gust_kph': self.current['wind_gust_kph'],
         'wind_gust_mph': self.current['wind_gust_mph'],
         'temp_f': int(self.current['temp_f']),
         'temp_c': int(self.current['temp_c']),
         'weather': self.current['weather'],
         # Forecast info ('pop' = probability of precipitation)
         'rain_prob': self.forecast[0]['pop']
     }
     # Derived go/no-go flag based on wind, rain probability and temperature.
     self.elements['canfly'] = noaa.canfly(
         self.elements['wind_mph'],
         self.elements['rain_prob'],
         self.elements['temp_f']
     )
Example #10
0
 def test_heading(self):
     """heading() maps degrees (numeric or string) to compass points."""
     cases = [
         (0, 'N'), (45, 'NE'), (90, 'E'), (135, 'SE'),
         (180, 'S'), (225, 'SW'), (270, 'W'), (315, 'NW'),
         (360, 'N'), (325.7, 'NW'), (5, 'N'), ('5', 'N'),
         (359.3, 'N'), ('359.3', 'N'), (112.5, 'E'), ('112.5', 'E'),
         (110, 'E'), ('110', 'E'),
     ]
     for degrees, expected in cases:
         assert_equal(heading(degrees), expected)
Example #11
0
    dy.append(dest[1])

train_data['ORIGIN_LNG'] = ox
train_data['ORIGIN_LAT'] = oy
train_data['DEST_LNG'] = dx
train_data['DEST_LAT'] = dy

# City-center reference coordinates used for distance/heading features.
CC_LON = -8.615393063941816
CC_LAT = 41.15767687592546

# Heading and haversine distance from each trip origin to the city center.
origin_header = []
origin_distance_to_cc = []
for i in range(train_data.shape[0]):
    origin_lat = float(train_data['ORIGIN_LAT'][i])
    origin_lng = float(train_data['ORIGIN_LNG'][i])
    origin_header.append(heading((origin_lat, origin_lng), (CC_LAT, CC_LON)))
    origin_distance_to_cc.append(calHarDist(origin_lat, origin_lng, CC_LAT, CC_LON))

train_data['ORIGIN_HEADER'] = origin_header
train_data['ORIGIN_DISTANCE_TO_CC'] = origin_distance_to_cc

# Heading and distance from each origin to the trip cut-off (destination) point.
origin_cutoff_header = []
origin_distance_to_cutoff = []

for i in range(train_data.shape[0]):
    origin_lat = float(train_data['ORIGIN_LAT'][i])
    origin_lng = float(train_data['ORIGIN_LNG'][i])
    # BUG FIX: cutoff_lat previously read the DEST_LNG column (copy-paste
    # error), which made lat == lng for every cut-off point.
    cutoff_lat = float(train_data['DEST_LAT'][i])
    cutoff_lng = float(train_data['DEST_LNG'][i])
    origin_cutoff_header.append(heading((origin_lat, origin_lng), (cutoff_lat, cutoff_lng)))
    origin_distance_to_cutoff.append(calHarDist(origin_lat, origin_lng, cutoff_lat, cutoff_lng))
Example #12
0
def process_row_training(X, row):
    """Append up to MAX_SAMPLES_PER_TRIP randomly-truncated samples for one
    dataframe row to X, and return X."""
    polyline = ast.literal_eval(row['POLYLINE'])
    # Trips with three or fewer points contribute no samples.
    if len(polyline) > 3:
        for _ in range(MAX_SAMPLES_PER_TRIP):
            # Random cut index in [1, len-1]; skip prefixes shorter than 4.
            cut = np.random.randint(len(polyline) - 1) + 1
            if cut < 4:
                continue
            sample = [row['TRIP_ID'], row['ORIGIN_CALL'], row['ORIGIN_STAND'],
                      row['TAXI_ID'], row['TIMESTAMP'], row['DATE'],
                      row['END_TIME'], row['dayofweek'], row['hour'],
                      row['ORIGIN_LNG'], row['ORIGIN_LAT'], row['DEST_LNG'],
                      row['DEST_LAT'], row['ORIGIN_HEADER'],
                      row['ORIGIN_DISTANCE_TO_CC']]
            sample += [cut, polyline[cut][1], polyline[cut][0],
                       calHarDist(polyline[cut][1], polyline[cut][0], CC_LAT, CC_LON),
                       heading([CC_LAT, CC_LON], polyline[cut])]
            sample += [row['CALL_TYPE_A'], row['CALL_TYPE_B'], row['CALL_TYPE_C'],
                       row['ACTUAL_DAYTYPE_A'], row['ACTUAL_DAYTYPE_B'],
                       row['ACTUAL_DAYTYPE_C'], row['DURATION']]
            X.append(sample)
    return X
Example #13
0
def main(e2e_start_time):
    """Run ELECTRA-style pretraining end to end.

    Parses command-line arguments into a PretrainingConfig, initializes
    Horovod/TF (optionally with XLA and AMP), builds the model, restores
    model and per-rank iterator checkpoints, runs the training loop with
    gradient accumulation, and periodically logs metrics and saves
    checkpoints.

    Args:
        e2e_start_time: wall-clock timestamp taken at process start, used
            to report total end-to-end training time in the final metrics.

    Returns:
        The parsed argparse namespace.
    """
    # Parse essential arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", required=True)
    parser.add_argument("--model_size",
                        default="base",
                        type=str,
                        help="base or large")
    parser.add_argument("--pretrain_tfrecords", type=str)
    parser.add_argument("--phase2", action='store_true')
    parser.add_argument("--fp16_compression", action='store_true')
    parser.add_argument("--amp",
                        action='store_true',
                        help="Whether to use fp16.")
    parser.add_argument("--xla",
                        action='store_true',
                        help="Whether to use xla.")
    parser.add_argument("--seed", default=42, type=int)
    parser.add_argument("--num_train_steps", type=int)
    parser.add_argument("--num_warmup_steps", type=int)
    parser.add_argument("--learning_rate", type=float)
    parser.add_argument("--train_batch_size", type=int)
    parser.add_argument("--max_seq_length", type=int)

    parser.add_argument("--mask_prob", type=float)
    parser.add_argument("--disc_weight", type=float)
    parser.add_argument("--generator_hidden_size", type=float)

    parser.add_argument("--log_freq",
                        type=int,
                        default=10,
                        help="Training metrics logging frequency")
    parser.add_argument("--save_checkpoints_steps", type=int)
    parser.add_argument("--keep_checkpoint_max", type=int)
    parser.add_argument("--restore_checkpoint", default=None, type=str)
    parser.add_argument("--load_weights", action='store_true')
    parser.add_argument("--weights_dir")

    parser.add_argument("--optimizer",
                        default="adam",
                        type=str,
                        help="adam or lamb")
    parser.add_argument(
        "--skip_adaptive",
        action='store_true',
        help="Whether to apply adaptive LR on LayerNorm and biases")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=1,
                        help="Number of Gradient Accumulation steps")
    parser.add_argument("--lr_decay_power",
                        type=float,
                        default=0.5,
                        help="LR decay power")
    parser.add_argument("--opt_beta_1",
                        type=float,
                        default=0.878,
                        help="Optimizer beta1")
    parser.add_argument("--opt_beta_2",
                        type=float,
                        default=0.974,
                        help="Optimizer beta2")
    parser.add_argument("--end_lr", type=float, default=0.0, help="Ending LR")
    parser.add_argument("--log_dir",
                        type=str,
                        default=None,
                        help="Path to store logs")
    parser.add_argument("--results_dir",
                        type=str,
                        default=None,
                        help="Path to store all model results")
    parser.add_argument("--skip_checkpoint",
                        action='store_true',
                        default=False,
                        help="Path to store logs")
    parser.add_argument(
        '--json-summary',
        type=str,
        default=None,
        help=
        'If provided, the json summary will be written to the specified file.')
    args = parser.parse_args()
    config = PretrainingConfig(**args.__dict__)
    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)

    # Set up tensorflow
    hvd.init()

    args.log_dir = config.log_dir
    # DLLogger
    setup_logger(args)

    set_affinity(hvd.local_rank())
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Pin each Horovod rank to its own GPU.
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')
    tf.config.optimizer.set_jit(config.xla)
    #tf.config.optimizer.set_experimental_options({"auto_mixed_precision": config.amp})

    if config.amp:
        # Mixed precision: float16 compute with dynamic loss scaling.
        policy = tf.keras.mixed_precision.experimental.Policy(
            "mixed_float16", loss_scale="dynamic")
        tf.keras.mixed_precision.experimental.set_policy(policy)
        print('Compute dtype: %s' %
              policy.compute_dtype)  # Compute dtype: float16
        print('Variable dtype: %s' %
              policy.variable_dtype)  # Variable dtype: float32

    #tf.random.set_seed(config.seed)

    # Set up config cont': these two restore modes are mutually exclusive.
    if config.load_weights and config.restore_checkpoint:
        raise ValueError(
            "`load_weights` and `restore_checkpoint` should not be on at the same time."
        )
    if config.phase2 and not config.restore_checkpoint:
        raise ValueError(
            "`phase2` cannot be used without `restore_checkpoint`.")
    utils.heading("Config:")
    log_config(config)

    # Save pretrain configs (main process only, to avoid racing writes)
    pretrain_config_json = os.path.join(config.checkpoints_dir,
                                        'pretrain_config.json')
    if is_main_process():
        utils.write_json(config.__dict__, pretrain_config_json)
        log("Configuration saved in {}".format(pretrain_config_json))

    # Set up model
    model = PretrainingModel(config)

    # Set up metrics
    metrics = dict()
    metrics["train_perf"] = tf.keras.metrics.Mean(name="train_perf")
    metrics["total_loss"] = tf.keras.metrics.Mean(name="total_loss")
    metrics["masked_lm_accuracy"] = tf.keras.metrics.Accuracy(
        name="masked_lm_accuracy")
    metrics["masked_lm_loss"] = tf.keras.metrics.Mean(name="masked_lm_loss")
    if config.electra_objective:
        metrics["sampled_masked_lm_accuracy"] = tf.keras.metrics.Accuracy(
            name="sampled_masked_lm_accuracy")
        if config.disc_weight > 0:
            # NOTE(review): disc_precision/disc_recall use Accuracy metrics,
            # not Precision/Recall — presumably intentional, confirm.
            metrics["disc_loss"] = tf.keras.metrics.Mean(name="disc_loss")
            metrics["disc_auc"] = tf.keras.metrics.AUC(name="disc_auc")
            metrics["disc_accuracy"] = tf.keras.metrics.Accuracy(
                name="disc_accuracy")
            metrics["disc_precision"] = tf.keras.metrics.Accuracy(
                name="disc_precision")
            metrics["disc_recall"] = tf.keras.metrics.Accuracy(
                name="disc_recall")

    # Set up tensorboard (one log dir per rank)
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = os.path.join(
        config.log_dir, current_time,
        'train_' + str(get_rank()) + '_of_' + str(get_world_size()))
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Set up dataset (sharded across ranks)
    dataset = pretrain_utils.get_dataset(config,
                                         config.train_batch_size,
                                         world_size=get_world_size(),
                                         rank=get_rank())
    train_iterator = iter(dataset)

    # Set up optimizer
    optimizer = create_optimizer(init_lr=config.learning_rate,
                                 num_train_steps=config.num_train_steps,
                                 num_warmup_steps=config.num_warmup_steps,
                                 weight_decay_rate=config.weight_decay_rate,
                                 optimizer=config.optimizer,
                                 skip_adaptive=config.skip_adaptive,
                                 power=config.lr_decay_power,
                                 beta_1=config.opt_beta_1,
                                 beta_2=config.opt_beta_2,
                                 end_lr=config.end_lr)

    accumulator = GradientAccumulator()
    if config.amp:
        optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
            optimizer, "dynamic")

    # Set up model checkpoint
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0),
                                     phase2=tf.Variable(False),
                                     optimizer=optimizer,
                                     model=model)
    manager = tf.train.CheckpointManager(
        checkpoint,
        config.checkpoints_dir,
        max_to_keep=config.keep_checkpoint_max)
    if config.restore_checkpoint and config.restore_checkpoint != "latest":
        checkpoint.restore(config.restore_checkpoint)
        log(" ** Restored model checkpoint from {}".format(
            config.restore_checkpoint))
    elif config.restore_checkpoint and config.restore_checkpoint == "latest" and manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        log(" ** Restored model checkpoint from {}".format(
            manager.latest_checkpoint))
    elif config.load_weights:
        # Dummy forward passes build variables before loading h5 weights.
        model.generator(model.generator.dummy_inputs)
        model.discriminator(model.discriminator.dummy_inputs)
        model.generator.load_weights(
            os.path.join(config.weights_dir, 'generator', 'tf_model.h5'))
        model.discriminator.load_weights(
            os.path.join(config.weights_dir, 'discriminator', 'tf_model.h5'))
    else:
        log(" ** Initializing from scratch.")

    # Only resume the data iterator when resuming from the latest checkpoint.
    restore_iterator = bool(
        config.restore_checkpoint) and config.restore_checkpoint == "latest"
    # Initialize global step for phase2
    if config.phase2 and not bool(checkpoint.phase2):
        optimizer.iterations.assign(0)
        checkpoint.step.assign(0)
        checkpoint.phase2.assign(True)
        restore_iterator = False
    if bool(checkpoint.phase2):
        # Phase-2 checkpoints get a distinct name prefix.
        manager = tf.train.CheckpointManager(
            checkpoint,
            config.checkpoints_dir,
            checkpoint_name='ckpt-p2',
            max_to_keep=config.keep_checkpoint_max)

    # Set up iterator checkpoint (one per rank, so data position survives restarts)
    iter_checkpoint = tf.train.Checkpoint(train_iterator=train_iterator,
                                          world_size=tf.Variable(
                                              get_world_size()),
                                          rank=tf.Variable(get_rank()))
    iter_manager = tf.train.CheckpointManager(
        iter_checkpoint,
        os.path.join(config.checkpoints_dir,
                     'iter_ckpt_rank_' + '{:02}'.format(get_rank())),
        checkpoint_name='iter_ckpt_rank_' + '{:02}'.format(get_rank()),
        max_to_keep=config.keep_checkpoint_max)
    if restore_iterator and iter_manager.latest_checkpoint:
        ckpt_world_size = tf.train.load_variable(
            iter_manager.latest_checkpoint,
            'world_size/.ATTRIBUTES/VARIABLE_VALUE')
        # Only restore if the world size matches; sharding changes otherwise.
        if ckpt_world_size == get_world_size():
            iter_checkpoint.restore(iter_manager.latest_checkpoint)
            log(" ** Restored iterator checkpoint from {}".format(
                iter_manager.latest_checkpoint),
                all_rank=True)

    utils.heading("Running training")
    accumulator.reset()
    train_start, start_step = time.time(), int(checkpoint.step) - 1
    local_step = 0
    saved_ckpt = False
    while int(checkpoint.step) <= config.num_train_steps:
        saved_ckpt = False
        step = int(checkpoint.step)
        features = next(train_iterator)
        iter_start = time.time()

        # if step == 200: tf.profiler.experimental.start(logdir=train_log_dir)
        total_loss, eval_fn_inputs = train_one_step(
            config,
            model,
            optimizer,
            features,
            accumulator,
            local_step == 1,
            take_step=local_step % args.gradient_accumulation_steps == 0)
        # if step == 300: tf.profiler.experimental.stop()

        # Sequences/second across all ranks for this iteration.
        metrics["train_perf"].update_state(config.train_batch_size *
                                           get_world_size() /
                                           (time.time() - iter_start))
        metrics["total_loss"].update_state(values=total_loss)
        metric_fn(config, metrics, eval_fn_inputs)

        # Periodic per-rank logging (only on accumulation boundaries).
        if (step % args.log_freq
                == 0) and (local_step % args.gradient_accumulation_steps == 0):
            log_info_dict = {
                k: float(v.result().numpy() *
                         100) if "accuracy" in k else float(v.result().numpy())
                for k, v in metrics.items()
            }
            dllogger.log(step=(step, ), data=log_info_dict, verbosity=0)
            log('Step:{step:6d}, Loss:{total_loss:10.6f}, Gen_loss:{masked_lm_loss:10.6f}, Disc_loss:{disc_loss:10.6f}, Gen_acc:{masked_lm_accuracy:6.2f}, '
                'Disc_acc:{disc_accuracy:6.2f}, Perf:{train_perf:4.0f}, Loss Scaler: {loss_scale}, Elapsed: {elapsed}, ETA: {eta}, '
                .format(step=step,
                        **log_info_dict,
                        loss_scale=optimizer.loss_scale if config.amp else 1,
                        elapsed=utils.get_readable_time(time.time() -
                                                        train_start),
                        eta=utils.get_readable_time(
                            (time.time() - train_start) / (step - start_step) *
                            (config.num_train_steps - step))),
                all_rank=True)

            with train_summary_writer.as_default():
                for key, m in metrics.items():
                    tf.summary.scalar(key, m.result(), step=step)

            if int(checkpoint.step) < config.num_train_steps:
                for m in metrics.values():
                    m.reset_states()

        # Print allreduced metrics on the last step
        if int(checkpoint.step) == config.num_train_steps and (
                local_step % args.gradient_accumulation_steps == 0):
            log_info_dict = {
                k: float(hvd.allreduce(v.result()).numpy() * 100) if "accuracy"
                in k else float(hvd.allreduce(v.result()).numpy())
                for k, v in metrics.items()
            }
            log_info_dict["training_sequences_per_second"] = log_info_dict[
                "train_perf"]
            log_info_dict["final_loss"] = log_info_dict["total_loss"]
            log_info_dict["e2e_train_time"] = time.time() - e2e_start_time
            dllogger.log(step=(), data=log_info_dict, verbosity=0)
            log('<FINAL STEP METRICS> Step:{step:6d}, Loss:{total_loss:10.6f}, Gen_loss:{masked_lm_loss:10.6f}, Disc_loss:{disc_loss:10.6f}, Gen_acc:{masked_lm_accuracy:6.2f}, '
                'Disc_acc:{disc_accuracy:6.2f}, Perf:{train_perf:4.0f},'.
                format(step=step, **log_info_dict),
                all_rank=False)

        # Advance the global step only when an optimizer step was taken.
        if local_step % args.gradient_accumulation_steps == 0:
            checkpoint.step.assign(int(optimizer.iterations))

        local_step += 1
        # Periodic checkpointing (model on the main process, iterator per rank).
        if not config.skip_checkpoint and (
                local_step %
            (config.save_checkpoints_steps * args.gradient_accumulation_steps)
                == 0):
            saved_ckpt = True
            if is_main_process():
                save_path = manager.save(checkpoint_number=step)
                log(" ** Saved model checkpoint for step {}: {}".format(
                    step, save_path))
            iter_save_path = iter_manager.save(checkpoint_number=step)
            log(" ** Saved iterator checkpoint for step {}: {}".format(
                step, iter_save_path),
                all_rank=True)

    step = (int(checkpoint.step) - 1)
    dllogger.flush()
    # Final checkpoint if the loop exited without just having saved one.
    if not config.skip_checkpoint and not saved_ckpt:
        if is_main_process():
            save_path = manager.save(checkpoint_number=step)
            log(" ** Saved model checkpoint for step {}: {}".format(
                step, save_path))
        iter_save_path = iter_manager.save(checkpoint_number=step)
        log(" ** Saved iterator checkpoint for step {}: {}".format(
            step, iter_save_path),
            all_rank=True)

    return args
Example #14
0
def main():
    """Simpler pretraining entry point: parse args, set up Horovod/TF,
    build the model, and (when do_train) run the training loop with
    periodic logging and checkpointing. The do_eval branch is a no-op here.
    """
    # Parse essential args
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir",
                        required=True,
                        help="Location of data files (model weights, etc).")
    parser.add_argument("--model_name",
                        required=True,
                        help="The name of the model being fine-tuned.")
    parser.add_argument("--pretrain_tfrecords", type=str)

    parser.add_argument("--seed", type=int)
    parser.add_argument("--num_train_steps", type=int)
    parser.add_argument("--num_warmup_steps", type=int)
    parser.add_argument("--learning_rate", type=float)
    parser.add_argument("--train_batch_size", type=int)
    parser.add_argument("--max_seq_length", type=int)

    parser.add_argument("--mask_prob", type=float)
    parser.add_argument("--disc_weight", type=float)
    parser.add_argument("--generator_hidden_size", type=float)

    parser.add_argument("--save_checkpoints_steps", type=int)
    parser.add_argument("--keep_checkpoint_max", type=int)
    parser.add_argument("--restore_checkpoint", action='store_true')

    parser.add_argument("--optimizer",
                        default="adam",
                        type=str,
                        help="adam or lamb")

    args = parser.parse_args()
    config = PretrainingConfig(**args.__dict__)

    # Set up tensorflow
    hvd.init()
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Pin each Horovod rank to its own GPU.
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')
    tf.config.optimizer.set_jit(config.xla)
    tf.config.optimizer.set_experimental_options(
        {"auto_mixed_precision": config.amp})
    tf.random.set_seed(config.seed)

    # Set up config: train and eval are mutually exclusive modes.
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug and config.do_train:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    log_config(config)

    # Save pretrain configs (main process only)
    pretrain_config_json = os.path.join(config.checkpoints_dir,
                                        'pretrain_config.json')
    if is_main_process():
        utils.write_json(config.__dict__, pretrain_config_json)
        log("Configuration saved in {}".format(pretrain_config_json))

    # Set up model
    model = PretrainingModel(config)

    # Set up metrics
    perf_metrics = dict()
    perf_metrics["train_perf"] = tf.keras.metrics.Mean(name="train_perf")

    eval_metrics = dict()
    eval_metrics["total_loss"] = tf.keras.metrics.Mean(name="total_loss")
    eval_metrics["masked_lm_accuracy"] = tf.keras.metrics.Accuracy(
        name="masked_lm_accuracy")
    eval_metrics["masked_lm_loss"] = tf.keras.metrics.Mean(
        name="masked_lm_loss")
    if config.electra_objective:
        eval_metrics["sampled_masked_lm_accuracy"] = tf.keras.metrics.Accuracy(
            name="sampled_masked_lm_accuracy")
        if config.disc_weight > 0:
            # NOTE(review): disc_precision/disc_recall use Accuracy metrics,
            # not Precision/Recall — presumably intentional, confirm.
            eval_metrics["disc_loss"] = tf.keras.metrics.Mean(name="disc_loss")
            eval_metrics["disc_auc"] = tf.keras.metrics.AUC(name="disc_auc")
            eval_metrics["disc_accuracy"] = tf.keras.metrics.Accuracy(
                name="disc_accuracy")
            eval_metrics["disc_precision"] = tf.keras.metrics.Accuracy(
                name="disc_precision")
            eval_metrics["disc_recall"] = tf.keras.metrics.Accuracy(
                name="disc_recall")

    # Set up tensorboard (one log dir per rank)
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = os.path.join(
        config.log_dir, current_time,
        'train_' + str(get_rank()) + '_of_' + str(get_world_size()))
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Set up dataset (sharded across ranks)
    dataset = pretrain_utils.get_dataset(config,
                                         config.train_batch_size,
                                         world_size=get_world_size(),
                                         rank=get_rank())
    train_iterator = iter(dataset)

    # Set up optimizer
    optimizer = create_optimizer(init_lr=config.learning_rate,
                                 num_train_steps=config.num_train_steps,
                                 num_warmup_steps=config.num_warmup_steps,
                                 weight_decay_rate=config.weight_decay_rate,
                                 optimizer=config.optimizer)
    if config.amp:
        optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
            optimizer, "dynamic")

    if config.do_train:
        # Set up checkpoint manager (model + per-rank iterator position)
        checkpoint = tf.train.Checkpoint(step=tf.Variable(1),
                                         optimizer=optimizer,
                                         model=model)
        manager = tf.train.CheckpointManager(
            checkpoint,
            config.checkpoints_dir,
            max_to_keep=config.keep_checkpoint_max)
        iter_checkpoint = tf.train.Checkpoint(train_iterator=train_iterator)
        iter_manager = tf.train.CheckpointManager(
            iter_checkpoint,
            os.path.join(config.checkpoints_dir,
                         'iter_ckpt_rank_' + '{:02}'.format(get_rank())),
            checkpoint_name='iter_ckpt_rank_' + '{:02}'.format(get_rank()),
            max_to_keep=config.keep_checkpoint_max)
        if config.restore_checkpoint and manager.latest_checkpoint:
            checkpoint.restore(manager.latest_checkpoint)
            log(" ** Restored model checkpoint from {}".format(
                manager.latest_checkpoint))
            if iter_manager.latest_checkpoint:
                iter_checkpoint.restore(iter_manager.latest_checkpoint)
                log(" ** Restored iterator checkpoint from {}".format(
                    iter_manager.latest_checkpoint),
                    all_rank=True)
        else:
            log(" ** Initializing from scratch.")

        utils.heading("Running training")
        train_start, start_step = time.time(), int(checkpoint.step) - 1
        while int(checkpoint.step) <= config.num_train_steps:
            step = int(checkpoint.step)
            features = next(train_iterator)
            iter_start = time.time()

            # if step == 200: tf.profiler.experimental.start(logdir=train_log_dir)
            total_loss, eval_fn_inputs = train_one_step(
                config, model, optimizer, features, step <= 1)
            # if step == 300: tf.profiler.experimental.stop()

            # Sequences/second across all ranks for this iteration.
            perf_metrics["train_perf"].update_state(config.train_batch_size *
                                                    get_world_size() /
                                                    (time.time() - iter_start))
            eval_metrics["total_loss"].update_state(values=total_loss)
            metric_fn(config, eval_metrics, eval_fn_inputs)

            # Log every 100 steps, then reset the running metrics.
            if step % 100 == 0:
                log('Step:{:6d}, Loss:{:10.6f}, Gen_loss:{:10.6f}, Disc_loss:{:10.6f}, Gen_acc:{:6.2f}, '
                    'Disc_acc:{:6.2f}, Perf:{:4.0f}, Elapsed: {}, ETA: {}, '.
                    format(
                        step, total_loss,
                        eval_metrics["masked_lm_loss"].result().numpy(),
                        eval_metrics["disc_loss"].result().numpy(),
                        eval_metrics["masked_lm_accuracy"].result().numpy() *
                        100,
                        eval_metrics["disc_accuracy"].result().numpy() * 100,
                        perf_metrics["train_perf"].result().numpy(),
                        utils.get_readable_time(time.time() - train_start),
                        utils.get_readable_time(
                            (time.time() - train_start) / (step - start_step) *
                            (config.num_train_steps - step))),
                    all_rank=True)

                with train_summary_writer.as_default():
                    for key, m in eval_metrics.items():
                        tf.summary.scalar(key, m.result(), step=step)

                for m in eval_metrics.values():
                    m.reset_states()

            checkpoint.step.assign_add(1)
            # Periodic checkpointing (model on the main process, iterator per rank).
            if step % config.save_checkpoints_steps == 0:
                if is_main_process():
                    save_path = manager.save()
                    log(" ** Saved model checkpoint for step {}: {}".format(
                        step, save_path))
                iter_save_path = iter_manager.save()
                log(" ** Saved iterator checkpoint for step {}: {}".format(
                    step, iter_save_path),
                    all_rank=True)

    if config.do_eval:
        # NOTE(review): evaluation branch is a no-op here — possibly a TODO
        # or implemented elsewhere; confirm before relying on --do_eval.
        pass