def __init__(self, params, output_dir):
    self.strategy = tf.distribute.MirroredStrategy()
    self.params = params

    # Datasets: first 30 tfrecord files for training, the rest for validation
    tf_records = [os.path.join(params.data_dir, file)
                  for file in os.listdir(params.data_dir)
                  if file.endswith('.tfrecords')]
    self.train_dataset = self.strategy.experimental_distribute_dataset(input_fn(tf_records[:30]))
    self.val_dataset = self.strategy.experimental_distribute_dataset(input_fn(tf_records[30:]))
    num_samples = len(tf_records[:30])  # note: counts tfrecord files, not individual examples
    self.total_iteration = (num_samples // params.batch_size) * params.epochs

    with self.strategy.scope():
        # Models
        self.models = {}
        self.models['disparity'] = DisparityNet(input_shape=(params.input_h, params.input_w, 3))
        self.models['pose'] = PoseNet(
            input_shape=(params.input_h, params.input_w, 3 * params.num_input_frames),
            num_input_frames=params.num_input_frames)

        # Optimizer with a polynomial learning-rate decay over the full run
        learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            0.0002, end_learning_rate=0.000001,
            decay_steps=self.total_iteration, power=0.5)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate_fn)

        # Tensorboard & Meters
        train_log_dir = os.path.join(output_dir, 'train_logs')
        val_log_dir = os.path.join(output_dir, 'val_logs')
        self.train_summary_writer = tf.summary.create_file_writer(train_log_dir)
        self.test_summary_writer = tf.summary.create_file_writer(val_log_dir)
        self.train_meter = {
            'ssim': tf.keras.metrics.Mean(name='ssim'),
            'l1': tf.keras.metrics.Mean(name='l1'),
            'smooth': tf.keras.metrics.Mean(name='smooth'),
        }
        self.val_meter = {
            'ssim': tf.keras.metrics.Mean(name='ssim'),
            'l1': tf.keras.metrics.Mean(name='l1'),
            'smooth': tf.keras.metrics.Mean(name='smooth'),
        }
        self.step = 0

        # Load states from optimizer and model if available
        self.ckpt_disp, self.manager_disp = self.setup_logger(
            self.models['disparity'], os.path.join(output_dir, 'disparity_model'))
        self.ckpt_pose, self.manager_pose = self.setup_logger(
            self.models['pose'], os.path.join(output_dir, 'pose_model'))
        self.start_epoch = (int(self.ckpt_disp.step) + 1 if self.manager_disp.latest_checkpoint
                            else int(self.ckpt_disp.step))

        # Helpers
        self.pix_coords = pixel_coord(params.batch_size, params.input_h, params.input_w, True)  # [b, 3, npoints]

        print("Starting training step {}".format(self.ckpt_disp.step.numpy()))
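# A minimal sketch of what a setup_logger helper like the one called above could look
# like, assuming it wraps tf.train.Checkpoint / tf.train.CheckpointManager and tracks
# progress in a `step` variable; the real method in the original codebase may differ.
def setup_logger(self, model, checkpoint_dir):
    ckpt = tf.train.Checkpoint(step=tf.Variable(0), optimizer=self.optimizer, model=model)
    manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=3)
    # Restore the latest checkpoint if one exists, otherwise start from scratch.
    ckpt.restore(manager.latest_checkpoint)
    if manager.latest_checkpoint:
        print("Restored from {}".format(manager.latest_checkpoint))
    return ckpt, manager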
def train(num_layers, embedding_size, num_heads, dff, max_seq_len, vocab_size, optimizer="adam", batch_size=16, learning_rate=1e-3, distributed=False): tf_records = glob.glob((_ROOT + "/data/tf_records/*.tfrecord")) if distributed: dist_dataset = input_fn(tf_records, batch_size=batch_size) mirrored_strategy = tf.distribute.MirroredStrategy( devices=["/gpu:0", "/gpu:1"]) dist_dataset = mirrored_strategy.experimental_distribute_dataset( dist_dataset) with mirrored_strategy.scope(): model = Gpt2(num_layers, embedding_size, num_heads, dff, max_seq_len, vocab_size, optimizer=optimizer, learning_rate=learning_rate) model.creat_optimizer() model.create_checkpoint_manager(MODEL_DIR) model.create_summary_writer(LOG_DIR) model.mirrored_strategy = mirrored_strategy model.fit(dist_dataset) else: dataset = input_fn(tf_records, batch_size=batch_size) model = Gpt2(num_layers, embedding_size, num_heads, dff, max_seq_len, vocab_size, optimizer=optimizer, learning_rate=learning_rate) model.creat_optimizer() model.create_checkpoint_manager(MODEL_DIR) model.create_summary_writer(LOG_DIR) model.fit(dataset) print("Training Done................")
def train(model_dir, data_dir, batch_size=16, learning_rate=0.001,
          distributed=False, mxp=False, epochs=5):
    data_dir = os.path.abspath(data_dir)
    model_dir = os.path.abspath(model_dir)

    tf_records = glob.glob(data_dir + "/tf_records/*.tfrecord")
    dataset = input_fn(tf_records, batch_size=batch_size, epoch=epochs)

    if distributed:
        # Mirror the model across two GPUs and shard the dataset accordingly
        mirrored_strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])
        dataset = mirrored_strategy.experimental_distribute_dataset(dataset)
        with mirrored_strategy.scope():
            model = Gpt2.create_from_params(model_dir)
            model.create_optimizer(learning_rate=learning_rate, mixed_precission=mxp)
            model.create_checkpoint_manager(model_dir)
            model.create_summary_writer(LOG_DIR)
            model.mirrored_strategy = mirrored_strategy
    else:
        model = Gpt2.create_from_params(model_dir)
        model.create_optimizer(learning_rate=learning_rate, mixed_precission=mxp)
        model.create_checkpoint_manager(model_dir)
        model.create_summary_writer(LOG_DIR)

    print("Training Model...............")
    model.print_params()
    model.fit(dataset)
    print("Training Done................")
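# The mxp flag above points at optional mixed-precision training. A minimal sketch,
# assuming create_optimizer inside Gpt2 looks roughly like this (the real method is not
# shown here); with TF >= 2.4 the non-experimental tf.keras.mixed_precision.LossScaleOptimizer
# would be used instead.
def create_optimizer(self, learning_rate=0.001, mixed_precission=False):
    self.optimizer = tf.keras.optimizers.Adam(learning_rate)
    if mixed_precission:
        # Loss scaling keeps float16 gradients from underflowing; the 'mixed_float16'
        # policy itself would normally be set before the model is built.
        self.optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
            self.optimizer, loss_scale='dynamic')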
def __init__(self, params, output_dir):
    self.dataset_dir = params.dataset_dir
    self.demo_set = params.demo_set
    self.output_dir = output_dir
    self.params = params

    # Model: restore the trained disparity network for evaluation
    self.models = {
        'disparity': DisparityNet(input_shape=(params.input_h, params.input_w, 3))
    }
    self.load_checkpoint(self.models['disparity'], os.path.join(output_dir, 'disparity_model'))

    # Datasets: reuse the validation split (tfrecord files after the first 30)
    tf_records = [
        os.path.join(params.dataset_dir, file)
        for file in os.listdir(params.dataset_dir)
        if file.endswith('.tfrecords')
    ]
    self.val_dataset = input_fn(tf_records[30:], load_option='val')

    # Cache images and ground-truth depths in memory
    self.images = []
    self.gt_depths = []
    for i, data in enumerate(self.val_dataset):
        image = data['images'][0][0]
        self.images.append(image)
        gt_depth = data['depths'].numpy()[0][0]
        self.gt_depths.append(gt_depth)
    print(f'Total Images: {len(self.images)}')
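# A minimal sketch of the load_checkpoint helper used above, assuming it simply restores
# the latest tf.train.Checkpoint written during training; the real method may also restore
# optimizer state or handle missing checkpoints differently.
def load_checkpoint(self, model, checkpoint_dir):
    ckpt = tf.train.Checkpoint(model=model)
    latest = tf.train.latest_checkpoint(checkpoint_dir)
    if latest is None:
        raise FileNotFoundError(f'No checkpoint found in {checkpoint_dir}')
    # expect_partial() silences warnings about training-only variables (e.g. optimizer slots).
    ckpt.restore(latest).expect_partial()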
def train(num_layers, embedding_size, num_heads, dff, max_seq_len, vocab_size, optimizer="adam", batch_size=16, learning_rate=1e-3, distributed=False): par_map = { "num_layers": num_layers, "d_model": embedding_size, "num_heads": num_heads, "dff": dff, "max_seq_len": max_seq_len, "vocab_size": vocab_size } exp_name = "_".join(['{}_{}'.format(k, v) for k, v in par_map.items()]) if not os.path.exists(MODEL_DIR): os.makedirs(MODEL_DIR) with open(MODEL_DIR + '/model_par.json', 'w') as f: json.dump(par_map, f) tf_records = glob.glob((_ROOT + "/data/tf_records/*.tfrecord")) if distributed: dist_dataset = input_fn(tf_records, batch_size=batch_size) mirrored_strategy = tf.distribute.MirroredStrategy( devices=["/gpu:0", "/gpu:1"]) dist_dataset = mirrored_strategy.experimental_distribute_dataset( dist_dataset) with mirrored_strategy.scope(): model = Gpt2(num_layers, embedding_size, num_heads, dff, max_seq_len, vocab_size, optimizer=optimizer, learning_rate=learning_rate) model.creat_optimizer() model.create_checkpoint_manager(MODEL_DIR) model.create_summary_writer(LOG_DIR) model.mirrored_strategy = mirrored_strategy model.fit(dist_dataset) else: dataset = input_fn(tf_records, batch_size=batch_size) model = Gpt2(num_layers, embedding_size, num_heads, dff, max_seq_len, vocab_size, optimizer=optimizer, learning_rate=learning_rate) model.creat_optimizer() model.create_checkpoint_manager(MODEL_DIR) model.create_summary_writer(LOG_DIR) model.fit(dataset) print("Training Done................")
def train(num_layers, embedding_size, num_heads, dff, max_seq_len, vocab_size,
          optimizer, batch_size, learning_rate, graph_mode, distributed):
    # Persist the model hyper-parameters so the run can be reproduced later
    par_map = {
        "num_layers": num_layers,
        "d_model": embedding_size,
        "num_heads": num_heads,
        "dff": dff,
        "max_seq_len": max_seq_len,
        "vocab_size": vocab_size
    }
    # exp_name = "_".join(['{}_{}'.format(k, v) for k, v in par_map.items()])
    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)
    with open(MODEL_DIR + '/model_par.json', 'w') as f:
        json.dump(par_map, f)

    # 85/15 split of the TFRecord shards into train and test sets
    tf_records = glob.glob(DATA_DIR + "/tf_records/*.tfrecord")
    train_percent = int(len(tf_records) * (85 / 100))
    print("No. of tf records:- ", len(tf_records))
    train_tf_records = tf_records[:train_percent]
    test_tf_records = tf_records[train_percent:]
    train_dataset = input_fn(train_tf_records, batch_size=batch_size)
    test_dataset = input_fn(test_tf_records, batch_size=batch_size)

    if distributed:
        # Mirror the model across all visible GPUs and shard both datasets
        mirrored_strategy = tf.distribute.MirroredStrategy()
        train_dataset = mirrored_strategy.experimental_distribute_dataset(train_dataset)
        test_dataset = mirrored_strategy.experimental_distribute_dataset(test_dataset)
        with mirrored_strategy.scope():
            model = Gpt2(num_layers, embedding_size, num_heads, dff, max_seq_len, vocab_size,
                         optimizer=optimizer, learning_rate=learning_rate)
            model.create_optimizer()
            model.create_checkpoint_manager(MODEL_DIR)
            model.create_summary_writer(LOG_DIR)
            model.mirrored_strategy = mirrored_strategy
            model.global_batch_size = tf.cast(batch_size, tf.float32)
    else:
        model = Gpt2(num_layers, embedding_size, num_heads, dff, max_seq_len, vocab_size,
                     optimizer=optimizer, learning_rate=learning_rate)
        model.create_optimizer()
        model.create_checkpoint_manager(MODEL_DIR)
        model.create_summary_writer(LOG_DIR)

    model.fit([train_dataset, test_dataset], graph_mode)
    print("Training Done................")
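# An illustrative call of the last train() variant above; the hyper-parameter values are
# placeholders for the sketch, not the settings used in the original project.
if __name__ == "__main__":
    train(num_layers=8, embedding_size=768, num_heads=8, dff=3072,
          max_seq_len=512, vocab_size=32000, optimizer="adam", batch_size=16,
          learning_rate=1e-3, graph_mode=True, distributed=False)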