def train_model(self, producer=None, sess=None, max_iters=None, restore=False):
    """Debug helper: fetch one batch from `producer` and print its corner data.

    Builds a one-shot iterator over `producer` (a tf.data.Dataset — TODO
    confirm against callers), pulls a single batch through `sess`, prints the
    batch's corner data and the fetch time, then prints the error count.

    Args:
        producer: dataset-like object exposing make_one_shot_iterator().
        sess: open tf.Session used to evaluate the iterator.
        max_iters: unused; kept for interface compatibility.
        restore: unused; kept for interface compatibility.
    """
    iterator = producer.make_one_shot_iterator()
    next_element = iterator.get_next()
    timer = Timer()
    error_count = 0
    while True:
        try:
            timer.tic()
            (img, corner_data, img_info, resize_info,
             segmentation_mask) = sess.run(next_element)
            print(corner_data)
            print(timer.toc())
            break
        except tf.errors.OutOfRangeError:
            # Dataset exhausted -- nothing left to fetch.
            break
        except Exception as exc:
            # Was a bare `except:` that hid the real failure; report the
            # cause before giving up on this batch.
            error_count += 1
            print('get batch error: {!r}'.format(exc))
            break
    print(error_count)
def train_model(self, sess):
    """Run the full training loop for the detection network.

    Builds the loss and optimizer, optionally loads pretrained weights or
    resumes from a checkpoint, then iterates batches from the data layer,
    decaying the learning rate, logging losses, and snapshotting the model.

    Args:
        sess: open tf.Session in which all ops are created and run.

    Raises:
        RuntimeError: if loading the pretrained model or restoring the
            checkpoint fails.
    """
    # The data layer yields pre-processed training batches built from
    # the whole roidb.
    data_layer = get_data_layer(self.roidb, self._cfg)
    total_loss, model_loss, rpn_cross_entropy, rpn_loss_box = \
        self.net.build_loss()

    # Learning rate lives in a tf.Variable so it can be decayed in-session.
    lr = tf.Variable(self._cfg.TRAIN.LEARNING_RATE, trainable=False)
    if self._cfg.TRAIN.SOLVER == 'Adam':
        opt = tf.train.AdamOptimizer(self._cfg.TRAIN.LEARNING_RATE)
    elif self._cfg.TRAIN.SOLVER == 'RMS':
        opt = tf.train.RMSPropOptimizer(self._cfg.TRAIN.LEARNING_RATE)
    else:
        # Default solver: Momentum (TRAIN.SOLVER == 'Momentum').
        opt = tf.train.MomentumOptimizer(lr, self._cfg.TRAIN.MOMENTUM)

    global_step = tf.Variable(0, trainable=False)
    with_clip = True
    if with_clip:
        tvars = tf.trainable_variables()
        # NOTE: tf.gradients may warn about converting sparse IndexedSlices
        # to a dense tensor of unknown shape; this can be memory-hungry.
        grads, norm = tf.clip_by_global_norm(
            tf.gradients(total_loss, tvars), 10.0)
        train_op = opt.apply_gradients(list(zip(grads, tvars)),
                                       global_step=global_step)
    else:
        train_op = opt.minimize(total_loss, global_step=global_step)

    # Initialize all variables before any load/restore.
    sess.run(tf.global_variables_initializer())
    restore_iter = 0

    # Load pretrained (e.g. VGG16) weights when starting fresh.
    if self.pretrained_model is not None and not self._restore:
        try:
            print(('Loading pretrained model '
                   'weights from {:s}').format(self.pretrained_model))
            self.net.load(self.pretrained_model, sess, True)
        except Exception as exc:
            # BUG FIX: `raise 'string'` is a TypeError in Python 3 --
            # raise a real exception, chained to the original cause.
            raise RuntimeError('Check your pretrained model {:s}'.format(
                self.pretrained_model)) from exc

    # Resume from the latest checkpoint after an interrupted run.
    if self._restore:
        try:
            ckpt = tf.train.get_checkpoint_state(self.checkpoints_dir)
            print('Restoring from {}...'.format(
                ckpt.model_checkpoint_path), end=' ')
            self.saver.restore(sess, ckpt.model_checkpoint_path)
            # Checkpoint files are named ..._<iter>; recover the iteration.
            stem = os.path.splitext(
                os.path.basename(ckpt.model_checkpoint_path))[0]
            restore_iter = int(stem.split('_')[-1])
            sess.run(global_step.assign(restore_iter))
            print("The starting iter is {:d}".format(restore_iter))
            print('done')
        except Exception as exc:
            # BUG FIX: was `raise 'string'` (TypeError in Python 3).
            raise RuntimeError('Check your pretrained {:s}'.format(
                ckpt.model_checkpoint_path)) from exc

    timer = Timer()
    loss_list = [total_loss, model_loss, rpn_cross_entropy, rpn_loss_box]
    train_list = [train_op]

    # `step` instead of `iter`: avoid shadowing the builtin.
    for step in range(restore_iter, self.max_iter):
        timer.tic()
        # Decay the learning rate by GAMMA every STEPSIZE iterations.
        if step != 0 and step % self._cfg.TRAIN.STEPSIZE == 0:
            sess.run(tf.assign(lr, lr.eval() * self._cfg.TRAIN.GAMMA))
            # BUG FIX: print the value, not the tf.Variable repr.
            print("learning rate at step {} is {}".format(step, lr.eval()))

        blobs = data_layer.forward()
        gt_boxes = blobs['gt_boxes']
        if not gt_boxes.shape[0] > 0:
            print("warning: abandon a picture named {}, because it has "
                  "no gt_boxes".format(blobs['im_name']))
            continue

        feed_dict = {
            # Source image batch, shape [batch, height, width, channels].
            self.net.data: blobs['data'],
            # [height, width, scale] of the resized image.
            self.net.im_info: blobs['im_info'],
            self.net.keep_prob: 0.5,
            # Ground-truth boxes: N x 8 matrix, one row per gt box.
            self.net.gt_boxes: gt_boxes,
        }
        try:
            _ = sess.run(fetches=train_list, feed_dict=feed_dict)
        except NoPositiveError:
            print("warning: abandon a picture named {}".format(
                blobs['im_name']))
            # BUG FIX: skip this batch -- previously control fell through
            # to the display block and re-ran sess.run on the same feed.
            continue
        except Exception as exc:
            # Was a bare `except:` -- report the failure before skipping.
            print("warning: skip a batch due to {!r}".format(exc))
            continue

        _diff_time = timer.toc(average=False)

        if step % self._cfg.TRAIN.DISPLAY == 0:
            total_loss_val, model_loss_val, rpn_loss_cls_val, \
                rpn_loss_box_val = sess.run(fetches=loss_list,
                                            feed_dict=feed_dict)
            print('iter: %d / %d, total loss: %.4f, model loss: %.4f, '
                  'rpn_loss_cls: %.4f, rpn_loss_box: %.4f, lr: %f' %
                  (step, self.max_iter, total_loss_val, model_loss_val,
                   rpn_loss_cls_val, rpn_loss_box_val, lr.eval()))
            print('speed: {:.3f}s / iter'.format(_diff_time))

        # Snapshot the model every SNAPSHOT_ITERS iterations.
        if (step + 1) % self._cfg.TRAIN.SNAPSHOT_ITERS == 0:
            self.snapshot(sess, step)

    # Record the final state once the loop finishes.
    self.snapshot(sess, self.max_iter - 1)