def _get_test_dataset(self):
    """
    Build the input pipeline over the test TFRecord files.

    The files matching ``<self._test_out_dir>/*.tfrecords`` are read in an
    interleaved fashion, each record is decoded with ``self.decode``, and the
    result is batched with ``self._hparams.batch_size`` and prefetched.

    :return: the batched and prefetched ``tf.data`` dataset
    """
    pattern = os.path.join(self._test_out_dir, "*.tfrecords").replace("//", "/")

    # Fail early when the directory holds no TFRecord file at all.
    matched = glob.glob(pathname=pattern)
    assert len(matched) > 0

    # Read several files at once; one reader per configured core.
    file_names = tf.data.Dataset.list_files(pattern)
    records = file_names.interleave(
        tf.data.TFRecordDataset,
        cycle_length=self._num_cores,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Turn each serialized record into whatever `self.decode` produces.
    decoded = records.map(self.decode)
    batched = decoded.batch(batch_size=self._hparams.batch_size,
                            drop_remainder=False)
    dataset = batched.prefetch(self._prefetch_size)

    print_info("Dataset output sizes are: ")
    print_info(dataset)
    return dataset
def _prepare_val_dataset(self):
    """
    Build the validation input pipeline and store it on ``self._val_dataset``.

    The files matching ``<self._val_out_dir>/*.tfrecords`` are interleaved,
    shuffled (buffer of ``10 * self._batch_size`` records, seed 42), decoded
    with ``self.decode``, batched and prefetched.

    :return: None; the dataset is kept in ``self._val_dataset``
    """
    print_info("_get_val_dataset")
    memory_usage_psutil()

    autotune = tf.data.experimental.AUTOTUNE
    pattern = os.path.join(self._val_out_dir, "*.tfrecords").replace("//", "/")

    # Interleave reads across the matching TFRecord files.
    shards = tf.data.Dataset.list_files(pattern)
    records = shards.interleave(tf.data.TFRecordDataset,
                                cycle_length=self._num_cores,
                                num_parallel_calls=autotune)

    # Shuffle the still-serialized records, then decode them in parallel.
    shuffled = records.shuffle(self._batch_size * 10, 42)
    decoded = shuffled.map(map_func=self.decode, num_parallel_calls=autotune)

    batched = decoded.batch(batch_size=self._batch_size, drop_remainder=False)
    self._val_dataset = batched.prefetch(buffer_size=autotune)
def _get_val_dataset(self):
    """
    (Re)build the validation dataset, log it and return it.

    :return: the dataset stored in ``self._val_dataset`` by
             ``_prepare_val_dataset``
    """
    self._prepare_val_dataset()
    val_dataset = self._val_dataset

    print_info("Dataset output sizes are: ")
    print_info(val_dataset)
    memory_usage_psutil()

    return val_dataset
def get_number_steps_per_epcoh(self, num_train_examples):
    """
    Compute and log how many full batches fit into one epoch.

    :param num_train_examples: number of training examples in one epoch
    :return: ``num_train_examples // self._batch_size``
    """
    steps_per_epoch = num_train_examples // self._batch_size
    spacer = "\n\n\n\n\n"

    print(spacer)
    print_info(
        ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
    )
    for message in (
            f"Number of examples per epoch is {num_train_examples}",
            f"Batch size is {self._batch_size}",
            f"Number of steps per epoch is {steps_per_epoch}",
    ):
        print_info(message)
    print_info(
        "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"
    )
    print(spacer)

    return steps_per_epoch
def main(args):
    """
    Generate the dummy data, then either iterate the datasets or train,
    evaluate and export a model while reporting memory/object growth.

    :param args: mapping with the keys read below:
        ``"dataset"`` (``"east"`` or ``"numpy"``), ``"delete"`` (old data is
        removed only when this compares equal to ``True``),
        ``"num_tfrecord_files"`` (converted with ``int``) and ``"mode"``
        (``"test_iterator"`` only iterates the datasets and returns).
    :return: None
    """

    def _report_object_growth():
        # Print the objgraph growth table between two marker lines.
        print('objgraph growth list start')
        objgraph.show_growth(limit=50)
        print('objgraph growth list end')

    # TODO add into argparser
    IS_EAST_IMAGE_TEST = True
    NUM_ARRAYS_PER_FILE = 10000
    # TODO decode function needs this value as part of the dataset map
    # function, hence it is hard-coded for now; if needed, change it manually
    # in `numpy_array_decode` in dummy_dataset.py as well.
    NUM_FEATURES = 250
    NUM_IMAGES_PER_FILE = 8
    BATCH_SIZE = 4
    TRAIN_DATA = os.getcwd() + "/data/train_data_img"
    VAL_DATA = os.getcwd() + "/data/val_data_img"
    MODEL_DIR = os.getcwd() + "/data/" + "east_net"
    EXPORT_DIR = MODEL_DIR + "/" + "export"
    NUM_EPOCHS = 3
    NUM_SAMPLES_PER_FILE = NUM_IMAGES_PER_FILE

    if args["dataset"] == "numpy":
        IS_EAST_IMAGE_TEST = False
        BATCH_SIZE = 128
        TRAIN_DATA = os.getcwd() + "/data/train_data"
        VAL_DATA = os.getcwd() + "/data/val_data"
        MODEL_DIR = os.getcwd() + "/" + "data/fwd_nnet"
        EXPORT_DIR = MODEL_DIR + "/" + "export"
        NUM_EPOCHS = 3
        NUM_SAMPLES_PER_FILE = NUM_ARRAYS_PER_FILE
    elif args["dataset"] != "east":
        # Stop here instead of silently running the EAST configuration for a
        # dataset name that was just reported as invalid.
        print_error("Invalid dataset")
        return

    # NOTE(review): true division, so this is a float (e.g. 78.125 for the
    # numpy dataset) - confirm `_init_tf_config` accepts a non-integer value.
    TOTAL_STEPS_PER_FILE = NUM_SAMPLES_PER_FILE / BATCH_SIZE

    # Kept as an explicit comparison: the type `args["delete"]` arrives in is
    # not visible here, and plain truthiness would also accept strings.
    if args["delete"] == True:
        print_info("Deleting old data files")
        for data_dir in (TRAIN_DATA, VAL_DATA):
            # A first run has nothing to delete yet; do not crash on that.
            if os.path.isdir(data_dir):
                shutil.rmtree(data_dir)

    gen_data(IS_EAST_IMAGE_TEST=IS_EAST_IMAGE_TEST,
             TRAIN_DATA=TRAIN_DATA,
             VAL_DATA=VAL_DATA,
             NUM_SAMPLES_PER_FILE=NUM_SAMPLES_PER_FILE,
             NUM_FEATURES=NUM_FEATURES,
             number_files=int(args["num_tfrecord_files"]))

    if args["mode"] == "test_iterator":
        # Only iterate the datasets and compare object growth before/after.
        _report_object_growth()
        for data_path in (TRAIN_DATA, TRAIN_DATA, VAL_DATA):
            test_dataset(data_path=data_path,
                         BATCH_SIZE=BATCH_SIZE,
                         IS_EAST_IMAGE_TEST=IS_EAST_IMAGE_TEST)
        _report_object_growth()
        return

    if IS_EAST_IMAGE_TEST:
        model = EASTTFModel(model_root_directory="store")
    else:
        model = NNet()

    estimator = tf.estimator.Estimator(
        model_fn=model,
        config=_init_tf_config(TOTAL_STEPS_PER_FILE=TOTAL_STEPS_PER_FILE,
                               MODEL_DIR=MODEL_DIR),
        params=None)

    memory_usage_psutil()
    _report_object_growth()

    print("\n\n\n\n\n\n")
    print_error(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> New Epoch")
    memory_usage_psutil()

    print_error(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Training")
    train_n_evaluate(estimator=estimator,
                     TRAIN_DATA=TRAIN_DATA,
                     VAL_DATA=VAL_DATA,
                     BATCH_SIZE=BATCH_SIZE,
                     IS_EAST_IMAGE_TEST=IS_EAST_IMAGE_TEST,
                     max_steps=None,
                     NUM_EPOCHS=NUM_EPOCHS)

    _report_object_growth()
    memory_usage_psutil()

    print_error(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> New Epoch")
    export_model(estimator=estimator,
                 model_export_path=EXPORT_DIR,
                 IS_EAST_MODEL=IS_EAST_IMAGE_TEST)

    # NOTE(review): the result is discarded, so this call reports nothing -
    # presumably it was meant to be printed; confirm before removing it.
    objgraph.get_leaking_objects()
def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)):
    """Residual block whose shortcut path has its own convolution.

    # Arguments
        input_tensor: input tensor
        kernel_size: kernel size of the middle conv layer of the main path
        filters: three integers, the filter counts of the three conv layers
            of the main path (the last one is also used by the shortcut conv)
        stage: integer, current stage label, used to build layer names
        block: 'a', 'b', ..., current block label, used to build layer names
        strides: strides of the first main-path conv layer and of the
            shortcut conv layer

    # Returns
        Output tensor of the block.
    """
    print_warn(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> conv block")

    def _traced(layer, tensor):
        # Apply `layer` to `tensor` and log the tensor it produces.
        result = layer(tensor)
        print_info(result)
        return result

    width_in, width_mid, width_out = filters
    channel_axis = 3
    name_suffix = str(stage) + block + '_branch'
    conv_prefix = 'res' + name_suffix
    bn_prefix = 'bn' + name_suffix

    print_info(input_tensor)

    # Main path, step 1: 1x1 convolution carrying the block's stride.
    main = _traced(
        layers.Conv2D(width_in, (1, 1),
                      strides=strides,
                      kernel_initializer='he_normal',
                      name=conv_prefix + '2a'), input_tensor)
    main = _traced(
        layers.BatchNormalization(axis=channel_axis, name=bn_prefix + '2a'),
        main)
    main = _traced(layers.Activation('relu'), main)

    # Main path, step 2: `kernel_size` convolution with 'same' padding.
    main = _traced(
        layers.Conv2D(width_mid, kernel_size,
                      padding='same',
                      kernel_initializer='he_normal',
                      name=conv_prefix + '2b'), main)
    main = _traced(
        layers.BatchNormalization(axis=channel_axis, name=bn_prefix + '2b'),
        main)
    main = _traced(layers.Activation('relu'), main)

    # Main path, step 3: 1x1 convolution, no activation before the merge.
    main = _traced(
        layers.Conv2D(width_out, (1, 1),
                      kernel_initializer='he_normal',
                      name=conv_prefix + '2c'), main)
    main = _traced(
        layers.BatchNormalization(axis=channel_axis, name=bn_prefix + '2c'),
        main)

    # Shortcut path: 1x1 convolution with the same stride and output width
    # as the main path, so the two tensors can be added.
    shortcut = _traced(
        layers.Conv2D(width_out, (1, 1),
                      strides=strides,
                      kernel_initializer='he_normal',
                      name=conv_prefix + '1'), input_tensor)
    shortcut = _traced(
        layers.BatchNormalization(axis=channel_axis, name=bn_prefix + '1'),
        shortcut)

    merged = layers.add([main, shortcut])
    print_info(merged)
    return _traced(layers.Activation('relu'), merged)
def model(images, text_scale=512, weight_decay=1e-5, is_training=True):
    """
    Build the network: a Keras ResNet-style backbone followed by a feature
    merging branch and three sigmoid output heads.

    `text_scale`, `weight_decay` and `is_training` are accepted but not read
    anywhere in this function.

    NOTE(review): the slim calls this was ported from (kept as comments in the
    previous revision) multiplied the geometry map by the text scale and
    mapped the angle map to (sigmoid - 0.5) * pi / 2. Here all three heads are
    plain sigmoids - confirm whether that rescaling is applied elsewhere.

    :param images: input image tensor; it is passed through
                   `mean_image_subtraction` first
    :return: tuple ``(F_score, F_geometry)`` where ``F_score`` is the
             1-channel head and ``F_geometry`` is the 4-channel head
             concatenated with the 1-channel angle head on the last axis
    """

    def _run(layer, tensor):
        # Apply `layer` to `tensor` and log the tensor it produces.
        out = layer(tensor)
        print_warn(out)
        return out

    images = mean_image_subtraction(images)
    channel_axis = 3
    end_points = {}

    print_warn(">>>>>>>>>>>>>>> Model Definition Started: ")
    print_warn(images)

    # Stem: pad, 7x7 stride-2 convolution, batch norm, relu, pad, max-pool.
    # http://ethereon.github.io/netscope/#/gist/db945b393d40bfa26006
    net = _run(layers.ZeroPadding2D(padding=(3, 3), name='conv1_pad'), images)
    net = _run(
        layers.Conv2D(64, (7, 7),
                      strides=(2, 2),
                      padding='valid',
                      kernel_initializer='he_normal',
                      name='conv1'), net)
    net = _run(layers.BatchNormalization(axis=channel_axis, name='bn_conv1'),
               net)
    net = _run(layers.Activation('relu'), net)
    net = _run(layers.ZeroPadding2D(padding=(1, 1), name='pool1_pad'), net)
    net = _run(layers.MaxPooling2D((3, 3), strides=(2, 2)), net)

    print_warn(">>>>>>>>>>>>>>> Resnet Definition Started: ")

    # One row per stage: banner, end-point key, stage number, filter widths,
    # labels of the identity blocks, extra conv_block arguments, and the
    # logger used for the stage's last identity block.
    stage_plan = (
        (">>>>> pool2", "pool2", 2, [64, 64, 256], "bc",
         {"strides": (1, 1)}, print_warn),
        (">>>>> pool3", "pool3", 3, [128, 128, 512], "bcd", {}, print_warn),
        (">>>>> pool4", "pool4", 4, [256, 256, 1024], "bcdef", {},
         print_info),
        (">>>>> pool5", "pool5", 5, [512, 512, 2048], "bc", {}, print_warn),
    )
    for banner, key, stage, widths, labels, conv_kwargs, last_log in stage_plan:
        print_warn(banner)
        net = conv_block(net, 3, widths, stage=stage, block='a', **conv_kwargs)
        print_warn(net)
        for label in labels:
            net = identity_block(net, 3, widths, stage=stage, block=label)
            if label == labels[-1]:
                last_log(net)
            else:
                print_warn(net)
        end_points[key] = net

    # Features ordered from the last stage to the first.
    f = [end_points[key] for key in ('pool5', 'pool4', 'pool3', 'pool2')]
    for idx, feature in enumerate(f):
        logging.info('Shape of f_{} : {}'.format(idx, feature.shape))

    # Feature merging: h holds the merged feature of each level, g the tensor
    # handed to the next level (unpooled, or convolved at the last level).
    merge_widths = [None, 128, 64, 32]
    h = []
    g = []
    for idx, (feature, width) in enumerate(zip(f, merge_widths)):
        if idx == 0:
            merged = feature
        else:
            squeeze = layers.Conv2D(filters=width, kernel_size=1)
            squeezed = squeeze(tf.concat([g[idx - 1], feature], axis=-1))
            merged = layers.Conv2D(filters=width,
                                   kernel_size=3,
                                   padding="same")(squeezed)
        h.append(merged)

        if idx <= 2:
            g.append(unpool(merged))
        else:
            g.append(
                layers.Conv2D(filters=width, kernel_size=3,
                              padding="same")(merged))

        logging.info('Shape of h_{} : {}, g_{} : {}'.format(
            idx, h[idx].shape, idx, g[idx].shape))

    # Output heads, all 1x1 convolutions with a sigmoid on the last g.
    head_input = g[3]
    F_score = layers.Conv2D(filters=1,
                            kernel_size=1,
                            activation=tf.nn.sigmoid)(head_input)
    # 4 channels of axis aligned bbox and 1 channel of rotation angle
    geo_map = layers.Conv2D(filters=4,
                            kernel_size=1,
                            activation=tf.nn.sigmoid)(head_input)
    angle_map = layers.Conv2D(filters=1,
                              kernel_size=1,
                              activation=tf.nn.sigmoid)(head_input)
    F_geometry = tf.concat([geo_map, angle_map], axis=-1)

    return F_score, F_geometry
def identity_block(input_tensor, kernel_size, filters, stage, block):
    """Residual block whose shortcut is the unchanged input tensor.

    # Arguments
        input_tensor: input tensor
        kernel_size: kernel size of the middle conv layer of the main path
        filters: three integers, the filter counts of the three conv layers
            of the main path
        stage: integer, current stage label, used to build layer names
        block: 'a', 'b', ..., current block label, used to build layer names

    # Returns
        Output tensor of the block.
    """
    print_warn(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> identity block")

    def _traced(layer, tensor):
        # Apply `layer` to `tensor` and log the tensor it produces.
        result = layer(tensor)
        print_info(result)
        return result

    width_in, width_mid, width_out = filters
    channel_axis = 3
    name_suffix = str(stage) + block + '_branch'
    conv_prefix = 'res' + name_suffix
    bn_prefix = 'bn' + name_suffix

    print_info(input_tensor)

    # Step 1: 1x1 convolution.
    main = _traced(
        layers.Conv2D(width_in, (1, 1),
                      kernel_initializer='he_normal',
                      name=conv_prefix + '2a'), input_tensor)
    main = _traced(
        layers.BatchNormalization(axis=channel_axis, name=bn_prefix + '2a'),
        main)
    main = _traced(layers.Activation('relu'), main)

    # Step 2: `kernel_size` convolution with 'same' padding.
    main = _traced(
        layers.Conv2D(width_mid, kernel_size,
                      padding='same',
                      kernel_initializer='he_normal',
                      name=conv_prefix + '2b'), main)
    main = _traced(
        layers.BatchNormalization(axis=channel_axis, name=bn_prefix + '2b'),
        main)
    main = _traced(layers.Activation('relu'), main)

    # Step 3: 1x1 convolution, no activation before the merge.
    main = _traced(
        layers.Conv2D(width_out, (1, 1),
                      kernel_initializer='he_normal',
                      name=conv_prefix + '2c'), main)
    main = _traced(
        layers.BatchNormalization(axis=channel_axis, name=bn_prefix + '2c'),
        main)

    # The input itself is the shortcut.
    merged = layers.add([main, input_tensor])
    print_info(merged)
    return _traced(layers.Activation('relu'), merged)
def _pad_to_square(im, min_side):
    """Zero-pad `im` (anchored top-left) to a square of side max(h, w, min_side)."""
    height, width, _ = im.shape
    side = np.max([height, width, min_side])
    canvas = np.zeros((side, side, 3), dtype=np.uint8)
    canvas[:height, :width, :] = im
    return canvas


def _show_training_sample(im, text_polys, score_map, geo_map, training_mask):
    """Plot the image with its polygons, the score map, three geo channels and the mask."""
    _fig, axs = plt.subplots(3, 2, figsize=(20, 30))

    axs[0, 0].imshow(im[:, :, ::-1])
    for poly in text_polys:
        poly_h = min(abs(poly[3, 1] - poly[0, 1]),
                     abs(poly[2, 1] - poly[1, 1]))
        poly_w = min(abs(poly[1, 0] - poly[0, 0]),
                     abs(poly[2, 0] - poly[3, 0]))
        axs[0, 0].add_artist(
            Patches.Polygon(poly,
                            facecolor='none',
                            edgecolor='green',
                            linewidth=2,
                            linestyle='-',
                            fill=True))
        axs[0, 0].text(poly[0, 0],
                       poly[0, 1],
                       '{:.0f}-{:.0f}'.format(poly_h, poly_w),
                       color='purple')

    axs[0, 1].imshow(score_map[::, ::])
    axs[1, 0].imshow(geo_map[::, ::, 0])
    axs[1, 1].imshow(geo_map[::, ::, 1])
    axs[2, 0].imshow(geo_map[::, ::, 2])
    axs[2, 1].imshow(training_mask[::, ::])

    for ax in axs.flat:
        ax.set_xticks([])
        ax.set_yticks([])

    plt.tight_layout()
    plt.show()
    plt.close()


def generator(data_path, geometry, min_crop_side_ratio, min_text_size, input_size=512, batch_size=4, background_ratio=3. / 8, random_scale=np.array([0.5, 1, 2.0, 3.0]), vis=False):
    """
    Endlessly yield training batches built from the images under `data_path`.

    Every pass shuffles the image order. Each image is randomly rescaled,
    then either cropped to a text-free background patch (with probability
    `background_ratio`) or cropped around text, zero-padded to a square and
    resized to `input_size` x `input_size`. Samples that fail for any reason
    are skipped after printing the traceback. A partially filled batch is
    dropped at the end of each pass.

    :param data_path: location handed to `get_images`
    :param geometry: only used for background samples, whose empty geometry
                     map gets 5 channels for ``'RBOX'`` and 8 otherwise; text
                     samples always go through `generate_rbox`
    :param min_crop_side_ratio: forwarded to `crop_area`
    :param min_text_size: forwarded to `generate_rbox`
    :param input_size: side length of the produced square images
    :param batch_size: number of samples per yielded batch
    :param background_ratio: probability of attempting a background sample
    :param random_scale: candidate scale factors, one is drawn per image
    :param vis: when true, plot every sample before it is added to the batch
    :return: generator of ``(features, labels)`` where ``features`` is a dict
             with the keys ``"images"``, ``"score_maps"`` and ``"geo_maps"``
             (the maps are sub-sampled by 4 along both spatial axes) and
             ``labels`` is a second array built from the same images
    :raises ValueError: on first iteration, if no image is found
    """
    import traceback

    image_list = np.array(get_images(data_path))
    print_info('{} training images in {}'.format(image_list.shape[0],
                                                 data_path))
    if image_list.shape[0] == 0:
        # Without this the endless loop below spins forever yielding nothing.
        raise ValueError('no training images found in {}'.format(data_path))

    index = np.arange(0, image_list.shape[0])
    while True:
        np.random.shuffle(index)
        images = []
        score_maps = []
        geo_maps = []
        for i in index:
            try:
                im_fn = image_list[i]
                # An unreadable image makes the next line raise; the handler
                # at the bottom reports it and moves on.
                im = cv2.imread(im_fn)
                h, w, _ = im.shape

                # Swap only the real extension. A textual replace of the
                # extension also rewrote matching directory names and picked
                # the wrong piece for file names containing several dots.
                txt_fn = os.path.splitext(im_fn)[0] + '.txt'
                print_info(
                    f"Imgae file name : {im_fn} and text file name {txt_fn}")
                if not os.path.exists(txt_fn):
                    print('text file {} does not exists'.format(txt_fn))
                    continue

                text_polys, text_tags = load_annoataion(txt_fn)
                text_polys, text_tags = check_and_validate_polys(
                    text_polys, text_tags, (h, w))

                # random scale this image
                rd_scale = np.random.choice(random_scale)
                im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale)
                text_polys *= rd_scale

                if np.random.rand() < background_ratio:
                    # Background sample: ask for a text-free crop. This branch
                    # used to pass crop_background=False like the text branch,
                    # contradicting the check right below.
                    im, text_polys, text_tags = crop_area(
                        im=im,
                        polys=text_polys,
                        tags=text_tags,
                        min_crop_side_ratio=min_crop_side_ratio,
                        crop_background=True,
                        max_tries=50)
                    if text_polys.shape[0] > 0:
                        # cannot find background
                        continue

                    im = cv2.resize(_pad_to_square(im, input_size),
                                    dsize=(input_size, input_size))
                    # No text: empty score/geometry maps, mask of ones.
                    score_map = np.zeros((input_size, input_size),
                                         dtype=np.uint8)
                    geo_map_channels = 5 if geometry == 'RBOX' else 8
                    geo_map = np.zeros(
                        (input_size, input_size, geo_map_channels),
                        dtype=np.float32)
                    training_mask = np.ones((input_size, input_size),
                                            dtype=np.uint8)
                else:
                    # Text sample: the crop has to keep at least one polygon.
                    im, text_polys, text_tags = crop_area(
                        im=im,
                        polys=text_polys,
                        tags=text_tags,
                        min_crop_side_ratio=min_crop_side_ratio,
                        crop_background=False,
                        max_tries=50)
                    if text_polys.shape[0] == 0:
                        continue

                    # Pad to a square, resize to the input size and move the
                    # polygons by the same ratios.
                    im = _pad_to_square(im, input_size)
                    padded_h, padded_w, _ = im.shape
                    im = cv2.resize(im, dsize=(input_size, input_size))
                    text_polys[:, :, 0] *= input_size / float(padded_w)
                    text_polys[:, :, 1] *= input_size / float(padded_h)

                    new_h, new_w, _ = im.shape
                    score_map, geo_map, training_mask = generate_rbox(
                        (new_h, new_w),
                        text_polys,
                        text_tags,
                        min_text_size=min_text_size)

                if vis:
                    _show_training_sample(im, text_polys, score_map, geo_map,
                                          training_mask)

                # Reverse the channel axis of the image; keep every 4th pixel
                # of the maps. The training mask is not part of the yielded
                # batch, so it is not accumulated.
                images.append(im[:, :, ::-1].astype(np.float32))
                score_maps.append(score_map[::4, ::4,
                                            np.newaxis].astype(np.float32))
                geo_maps.append(geo_map[::4, ::4, :].astype(np.float32))

                if len(images) == batch_size:
                    yield {
                        "images": np.array(images),
                        "score_maps": np.array(score_maps),
                        "geo_maps": np.array(geo_maps)
                    }, np.array(images)
                    images = []
                    score_maps = []
                    geo_maps = []
            except Exception:
                # Deliberate best effort: report the broken sample, keep going.
                traceback.print_exc()
                continue
def train(self, num_max_steps=None, num_epoch=None):
    """
    Run the training loop with periodic validation and checkpointing.

    The number of steps is ``num_max_steps`` when given, otherwise
    ``num_epoch * (len(self._dataset) // batch_size)``. When both are given,
    ``num_max_steps`` wins.

    Every ``self._validation_interval_steps`` steps the averaged training
    loss is printed and appended to ``./store/<experiment>/log_train.txt``,
    ``self.validation`` is run, and the model is stored whenever accuracy or
    normalised edit distance improves. The model is also stored every 1e5
    steps.

    NOTE(review): when the step counter reaches ``self._max_train_steps`` the
    whole process is terminated with ``sys.exit()``; kept as is because
    callers may depend on it.

    :param num_max_steps: total number of training steps
    :param num_epoch: number of epochs to train for
    :raises ValueError: if neither `num_max_steps` nor `num_epoch` is given
    """
    # The previous `assert (condition, "message")` asserted a non-empty tuple,
    # which is always true, so nothing was ever validated.
    if num_max_steps is None and num_epoch is None:
        raise ValueError(
            "Either num_max_steps or num_epoch must be provided")

    # data parallel for multi-GPU
    model = self.load_model(self._stored_model)
    model.train()

    num_samples = len(self._dataset)  # TODO replace the dataset with actual
    batch_size = self._dataset._batch_size
    num_steps_per_epoch = num_samples // batch_size

    total_num_steps = -1
    if num_epoch:
        total_num_steps = num_steps_per_epoch * num_epoch
    if num_max_steps:
        total_num_steps = num_max_steps

    loss_avg = Averager()
    train_dataset = self._dataset.train_set()

    # Built once: asking for the optimizer on every step would throw away any
    # state it keeps between steps (assuming `get_optimizer` constructs a new
    # optimizer on each call - TODO confirm).
    optimizer = self._model.get_optimizer(model=model)
    grad_clip = 5  # TODO make as a param

    log_dir = f"./store/{self._experiment_name}"
    log_path = f'./store/{self._experiment_name}/log_train.txt'

    start_time = time.time()
    best_accuracy = -1
    best_norm_ed = 1e+6
    step = 0

    while step < total_num_steps:
        print_info("Current step {}".format(step))

        # train part
        image_tensors, labels = train_dataset.get_batch()
        images = image_tensors.to(TorchExecutor.device)

        # NOTE(review): the cost is computed with `self._model` while every
        # other call in this loop uses the loaded `model` - confirm intended.
        cost = self._model.get_cost(model=self._model,
                                    features=images,
                                    labels=labels)

        model.zero_grad()
        cost.backward()
        # gradient clipping with 5 (Default)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()

        loss_avg.add(cost)

        # validation part
        if step % self._validation_interval_steps == 0:
            elapsed_time = time.time() - start_time
            train_log = (
                f'[{step}/{self._max_train_steps}] '
                f'Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}'
            )
            print(train_log)

            os.makedirs(log_dir, exist_ok=True)
            with open(log_path, 'a') as log:
                log.write(train_log + '\n')
                loss_avg.reset()

                model.eval()
                with torch.no_grad():
                    (valid_loss, current_accuracy, current_norm_ed, _preds,
                     _labels, _infer_time,
                     _length_of_data) = self.validation(model=model)
                model.train()

                valid_log = f'[{step}/{self._max_train_steps}] valid loss: {valid_loss:0.5f}'
                valid_log += f' accuracy: {current_accuracy:0.3f}, norm_ED: {current_norm_ed:0.2f}'
                print(valid_log)
                log.write(valid_log + '\n')

                # keep best accuracy model
                if current_accuracy > best_accuracy:
                    best_accuracy = current_accuracy
                    self.store_model(file_name="best_accuracy.pth",
                                     model=model)
                if current_norm_ed < best_norm_ed:
                    best_norm_ed = current_norm_ed
                    self.store_model(file_name="best_norm_ed.pth",
                                     model=model)

                best_model_log = f'best_accuracy: {best_accuracy:0.3f}, best_norm_ed: {best_norm_ed:0.2f}'
                print(best_model_log)
                log.write(best_model_log + '\n')

        # save model per 1e+5 iter.
        if (step + 1) % 100000 == 0:
            self.store_model(file_name=f"iter_{step + 1}.pth", model=model)

        if step == self._max_train_steps:
            print('end the training')
            sys.exit()

        step += 1