def distributed_test_epoch(epoch_num):
    """Run one validation pass and return the loss and accuracy meters.

    This function reads ``self`` (model, device, criterion, datasets, fold)
    from the enclosing scope; ``epoch_num`` is accepted for interface
    symmetry with the training epoch function and is not used here.

    Returns:
        tuple: ``(summary_loss, acc_score)`` -- the ``AverageMeter`` holding
        the batch-size weighted loss and the ``ACCMeter`` fed with
        ``(target, output)`` for every batch.
    """
    summary_loss = AverageMeter()
    acc_score = ACCMeter()

    self.model.eval()
    t = time.time()

    with torch.no_grad():
        for step in range(self.val_ds.size):
            # Fix: the loop length comes from the validation set, so the
            # batches must come from it too.  The original pulled batches
            # from ``self.train_ds()`` and therefore validated on training
            # data.
            images, data, target = self.val_ds()

            images = torch.from_numpy(images).to(self.device).float()
            data = torch.from_numpy(data).to(self.device).float()
            target = torch.from_numpy(target).to(self.device).float()

            batch_size = data.shape[0]

            output = self.model(images, data)
            loss = self.criterion(output, target)

            summary_loss.update(loss.detach().item(), batch_size)
            acc_score.update(target, output)

            if step % cfg.TRAIN.log_interval == 0:
                log_message = '[fold %d], '\
                              'Val Step %d, ' \
                              'summary_loss: %.6f, ' \
                              'acc: %.6f, ' \
                              'time: %.6f' % (
                                  self.fold, step, summary_loss.avg,
                                  acc_score.avg, time.time() - t)
                logger.info(log_message)

    return summary_loss, acc_score
def load_weight(self):
    """Restore model parameters into ``self.sess`` according to the config.

    * ``cfg.MODEL.continue_train``: restore every trainable variable plus all
      ``moving_mean`` / ``moving_variance`` variables from
      ``cfg.MODEL.pretrained_model``.
    * otherwise, if a pretrained model is configured: restore only variables
      under the ``cfg.MODEL.net_structure`` scope (trainable ones plus the
      moving statistics whose name contains that scope), skipping any
      variable whose name contains ``'GN'``.
    * otherwise: nothing is restored and a message is logged.
    """

    def _is_moving_statistic(var):
        # Matches the moving averages by variable name.
        return 'moving_mean' in var.name or 'moving_variance' in var.name

    with self._graph.as_default():
        if cfg.MODEL.continue_train:
            # Resume: trainable variables + every moving statistic.
            restore_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            restore_vars.extend(
                var for var in tf.global_variables()
                if _is_moving_statistic(var))

            resume_saver = tf.train.Saver(restore_vars)
            resume_saver.restore(self.sess, cfg.MODEL.pretrained_model)

        elif cfg.MODEL.pretrained_model is not None:
            # Fine-tune: only the backbone scope is restored.
            restore_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                scope=cfg.MODEL.net_structure)
            for var in tf.global_variables():
                if not _is_moving_statistic(var):
                    continue
                if cfg.MODEL.net_structure in var.name:
                    restore_vars.append(var)
            print(restore_vars)

            # Variables whose name contains 'GN' are left at their
            # initial values.
            restore_without_gn = [
                var for var in restore_vars if 'GN' not in var.name
            ]

            backbone_saver = tf.train.Saver(restore_without_gn)
            backbone_saver.restore(self.sess, cfg.MODEL.pretrained_model)

        else:
            logger.info('no pretrained model, train from sctrach')
def distributed_train_epoch(ds, epoch_num):
    """Run ``self.train_step`` over every batch of ``ds`` via ``strategy``.

    ``self`` and ``strategy`` are taken from the enclosing scope.

    Args:
        ds: iterable of batches handed to ``strategy.experimental_run_v2``.
        epoch_num: epoch index, used only in the log line.

    Returns:
        tuple: ``(total_loss, num_train_batches)`` -- the sum of the
        SUM-reduced per-replica losses and the number of batches processed.
    """
    total_loss, num_train_batches = 0.0, 0.0

    for one_batch in ds:
        batch_started = time.time()

        replica_losses = strategy.experimental_run_v2(
            self.train_step, args=(one_batch, ))
        current_loss = strategy.reduce(
            tf.distribute.ReduceOp.SUM, replica_losses, axis=None)

        total_loss += current_loss
        num_train_batches += 1
        self.iter_num += 1

        elapsed = time.time() - batch_started
        images_per_sec = cfg.TRAIN.batch_size / elapsed

        if self.iter_num % cfg.TRAIN.log_interval == 0:
            log_values = (epoch_num, self.iter_num, current_loss,
                          images_per_sec)
            logger.info('epoch_num: %d, '
                        'iter_num: %d, '
                        'loss_value: %.6f, '
                        'speed: %d images/sec ' % log_values)

    return total_loss, num_train_batches
def distributed_test_epoch(epoch_num):
    """Run one validation pass and return the loss meter.

    ``self`` is taken from the enclosing scope; ``epoch_num`` is not used.
    The model returns two outputs; each is scored against its own target
    with ``self.criterion``, and the tracked loss is the second one when
    ``self.pretrain`` is truthy, the first one otherwise.

    Returns:
        AverageMeter: batch-size weighted running loss over the pass.
    """
    summary_loss = AverageMeter()

    self.model.eval()
    started = time.time()

    def _as_float_tensor(array):
        # numpy array -> float tensor on the configured device
        return torch.from_numpy(array).to(self.device).float()

    with torch.no_grad():
        for step in range(self.val_ds.size):
            feature, target1, target2 = self.val_ds()

            feature = _as_float_tensor(feature)
            target1 = _as_float_tensor(target1)
            target2 = _as_float_tensor(target2)

            batch_size = feature.shape[0]

            output, output2 = self.model(feature)

            # Both losses are evaluated, as in the original; only one is
            # recorded.
            loss1 = self.criterion(output, target1)
            loss2 = self.criterion(output2, target2)
            loss = loss2 if self.pretrain else loss1

            summary_loss.update(loss.detach().item(), batch_size)

            if step % cfg.TRAIN.log_interval == 0:
                log_message = '[fold %d], '\
                              'Val Step %d, ' \
                              'summary_loss: %.6f, ' \
                              'time: %.6f' % (
                                  self.fold, step, summary_loss.avg,
                                  time.time() - started)
                logger.info(log_message)

    return summary_loss
def load_weight(self):
    """Initialise model variables in ``self._sess`` according to the config.

    Branches, in priority order:

    * ``cfg.MODEL.continue_train``: restore every ``MODEL_VARIABLES`` entry
      from the checkpoint ``cfg.MODEL.pretrained_model``.
    * ``cfg.MODEL.pretrained_model`` contains ``'npy'``: load a pickled
      ``{variable.name: ndarray}`` dict with numpy and assign it variable by
      variable (backbone scope only).
    * ``cfg.MODEL.pretrained_model`` is any other non-None value: restore
      the backbone scope from a TF checkpoint.
    * no pretrained model: only print the backbone variables and log.

    In the two backbone branches, variables whose name contains
    ``'classifier'`` are skipped when ``cfg.MODEL.cls != 1000``.

    Raises:
        KeyError: in the npy branch, if a variable name is missing from the
            loaded dict (unchanged from the original behaviour).
    """
    pretrained = cfg.MODEL.pretrained_model

    def _backbone_variables():
        return tf.get_collection(tf.GraphKeys.MODEL_VARIABLES,
                                 scope=cfg.MODEL.net_structure)

    def _drop_classifier(variables):
        # Skip the classifier variables when the class count differs
        # from 1000.
        if cfg.MODEL.cls != 1000:
            return [x for x in variables if 'classifier' not in x.name]
        return variables

    with self._graph.as_default():
        if cfg.MODEL.continue_train:
            variables_restore = tf.get_collection(
                tf.GraphKeys.MODEL_VARIABLES)
            print(variables_restore)

            saver2 = tf.train.Saver(variables_restore)
            saver2.restore(self._sess, pretrained)

        # Fix: guard against None before the substring test.  The original
        # evaluated ``'npy' in None`` and raised TypeError, which made the
        # "train from scratch" branch below unreachable.
        elif pretrained is not None and 'npy' in pretrained:
            # NOTE(review): allow_pickle=True unpickles arbitrary objects;
            # only load weight files from a trusted source.
            params_dict = np.load(pretrained, allow_pickle=True).item()

            variables_restore = _drop_classifier(_backbone_variables())
            print(variables_restore)

            for variable in variables_restore:
                logger.info('assign %s with np data' % (variable.name))
                self._sess.run(
                    variable.assign(params_dict[variable.name]))

        elif pretrained is not None:
            variables_restore = _drop_classifier(_backbone_variables())
            print(variables_restore)

            saver2 = tf.train.Saver(variables_restore)
            saver2.restore(self._sess, pretrained)

        else:
            variables_restore = _backbone_variables()
            print(variables_restore)
            logger.info('no pretrained model, train from sctrach')
def load_anns(self):
    """Load the JSON annotation file into ``self.metas`` and log its size."""
    with open(self.ann_json, 'r') as json_file:
        # The parsed JSON content is stored as-is; any filtering of the
        # samples would go here.
        self.metas = json.load(json_file)

    sample_count = len(self.metas)
    logger.info('the datasets contains %d samples' % (sample_count))
def get_image_annos(self):
    """Collect per-image bounding boxes from ``self.coco`` into ``self.metas``.

    For every image id known to the COCO handle, the image file is looked up
    under ``self.image_base_dir``; images missing on disk are skipped with a
    printed notice.  Crowd annotations and boxes narrower or shorter than one
    pixel are dropped.  Each kept box is stored as
    ``[x1, y1, x2, y2, klass]`` where ``klass`` is the index of the category
    name in the category list returned by ``loadCats``.  Images with at least
    one kept box are appended to ``self.metas`` as ``CocoMeta_bbox``.
    """
    images_ids = self.coco.getImgIds()
    cats = self.coco.loadCats(self.coco.getCatIds())

    # category id -> category name
    cat_klass_map = {_cat['id']: _cat['name'] for _cat in cats}
    nms = [cat['name'] for cat in cats]

    print('COCO categories: \n{}\n'.format(' '.join(nms)))
    print(cat_klass_map)

    for image_id in images_ids:
        images_info = self.coco.loadImgs([image_id])
        image_path = os.path.join(self.image_base_dir,
                                  images_info[0]['file_name'])

        # The annotation file may list images that are not on disk.
        if not os.path.exists(image_path):
            print(
                "[skip] json annotation found, but cannot found image: {}".
                format(image_path))
            continue

        annos_info = self.coco.loadAnns(
            self.coco.getAnnIds(imgIds=[image_id]))

        bboxs = []
        for ann in annos_info:
            if ann["iscrowd"]:
                continue

            x, y, box_w, box_h = (ann['bbox'][0], ann['bbox'][1],
                                  ann['bbox'][2], ann['bbox'][3])
            klass = nms.index(cat_klass_map[ann['category_id']])

            # Drop degenerate boxes.
            if box_w < 1 or box_h < 1:
                continue

            # COCO boxes are (x, y, w, h); store corner form plus class.
            bboxs.append([x, y, x + box_w, y + box_h, klass])

        if len(bboxs) > 0:
            self.metas.append(CocoMeta_bbox(image_id, image_path, bboxs))

    logger.info("Overall get {} valid images from {} and {}".format(
        len(self.metas), self.image_base_dir, self.anno_path))
def load_anns(self):
    """Read ``self.ann_file`` and keep its raw lines as ``self.metas``."""
    with open(self.ann_file, 'r') as ann_handle:
        # Lines are stored unparsed (trailing newlines included); any
        # filtering of the samples would go here.
        self.metas = ann_handle.readlines()

    sample_count = len(self.metas)
    logger.info('the datasets contains %d samples' % (sample_count))
def parse_file(self, im_root_path, ann_file):
    """Return every sample that ``data_info`` yields for the given dataset.

    :param im_root_path: image root passed to ``data_info``.
    :param ann_file: annotation file passed to ``data_info``.
    :return: the result of ``data_info(...).get_all_sample()``.
    """
    logger.info("[x] Get dataset from {}".format(im_root_path))

    return data_info(im_root_path, ann_file).get_all_sample()
def parse_file(self, im_root_path, ann_file):
    """Load all samples, record how many there are, and return them balanced.

    :param im_root_path: image root passed to ``data_info``.
    :param ann_file: annotation file passed to ``data_info``.
    :return: the result of ``self.balance`` applied to all samples.
    """
    logger.info("[x] Get dataset from {}".format(im_root_path))

    all_samples = data_info(im_root_path, ann_file).get_all_sample()

    # Size before balancing, kept on the instance.
    self.raw_data_set_size = len(all_samples)

    return self.balance(all_samples)
def _train(self, _epoch):
    """Run ``cfg.TRAIN.iter_num_per_epoch`` training iterations.

    Each iteration pulls one batch from ``self.train_ds``, optionally shows
    it with OpenCV when ``cfg.TRAIN.vis`` is set, splits it across
    ``cfg.TRAIN.num_gpu`` input placeholders, and runs ``self.outputs`` in
    ``self._sess``.  Metrics are logged every ``cfg.TRAIN.log_interval``
    iterations.

    Args:
        _epoch: epoch index, used only in the log line.
    """
    for step in range(cfg.TRAIN.iter_num_per_epoch):
        self.ite_num += 1
        start_time = time.time()

        example_images, example_labels = next(self.train_ds)

        # Visual sanity check of the input data; blocks on a key press for
        # every image, and that wait is counted in the fetch time below.
        if cfg.TRAIN.vis:
            for i in range(cfg.TRAIN.batch_size):
                example_image = example_images[i, :, :, :]
                example_label = example_labels[i]
                print(example_label)
                cv2.namedWindow('img', 0)
                cv2.imshow('img', example_image.astype(np.uint8))
                cv2.waitKey(0)

        fetch_duration = time.time() - start_time

        # Feed each tower its own contiguous slice of the batch.
        for n in range(cfg.TRAIN.num_gpu):
            lo = n * cfg.TRAIN.batch_size
            hi = (n + 1) * cfg.TRAIN.batch_size
            self.train_dict[self.inputs[0][n]] = example_images[lo:hi, :, :, :]
            self.train_dict[self.inputs[1][n]] = example_labels[lo:hi]
        self.train_dict[self.inputs[2]] = True

        _, total_loss_value, loss_value, top1_acc_value, top5_acc_value, l2_loss_value, learn_rate, = \
            self._sess.run([*self.outputs], feed_dict=self.train_dict)

        duration = time.time() - start_time
        run_duration = duration - fetch_duration

        if self.ite_num % cfg.TRAIN.log_interval == 0:
            num_examples_per_step = cfg.TRAIN.batch_size * cfg.TRAIN.num_gpu
            examples_per_sec = num_examples_per_step / duration
            sec_per_batch = duration / cfg.TRAIN.num_gpu

            # Fix: a trailing space was missing after the fetch-time field,
            # which fused it with "run time" in the log output.
            format_str = ('epoch %d: iter %d, '
                          'total_loss=%.6f '
                          'loss=%.6f '
                          'top1 acc=%.6f '
                          'top5 acc=%.6f '
                          'l2_loss=%.6f '
                          'learn_rate =%e '
                          '(%.1f examples/sec; %.3f sec/batch) '
                          'fetch data time = %.6f '
                          'run time = %.6f')

            logger.info(format_str % (_epoch,
                                      self.ite_num,
                                      total_loss_value,
                                      loss_value,
                                      top1_acc_value,
                                      top5_acc_value,
                                      l2_loss_value,
                                      learn_rate,
                                      examples_per_sec,
                                      sec_per_batch,
                                      fetch_duration,
                                      run_duration))
def loop(self):
    """Build the graph, load weights, and save a checkpoint to ``saved_file``.

    ``saved_file`` is not defined in this function; it is resolved from the
    surrounding scope at call time.
    """
    self.build()
    self.load_weight()

    with self._graph.as_default():
        # Saver over every global variable; old checkpoints are never
        # pruned (max_to_keep=None).
        all_variables = tf.global_variables()
        self.saver = tf.train.Saver(all_variables, max_to_keep=None)

        logger.info('A tmp model saved as %s \n' % saved_file)
        self.saver.save(self._sess, save_path=saved_file)
def report(self):
    """Log top-1 / top-5 accuracy and the sample count, then reset counters.

    Accuracies are ``top1_correct / total`` and ``top5_correct / total``.
    When no sample has been accumulated (``total == 0``) both accuracies
    are reported as 0.0 instead of raising ZeroDivisionError.
    """
    total = self.total

    # Fix: guard the division so an empty evaluation still produces a report.
    if total:
        top1_acc = self.top1_correct / total
        top5_acc = self.top5_correct / total
    else:
        top1_acc = 0.0
        top5_acc = 0.0

    message = ''
    message += ('top1 acc:%.6f\n' % (top1_acc))
    message += ('top5 acc:%.6f\n' % (top5_acc))
    message += ('%d samples \n' % total)
    logger.info(message)

    # Start the next evaluation round from a clean state.
    self.top1_correct = 0
    self.top5_correct = 0
    self.total = 0
def forward(self,
            inputs,
            boxes,
            labels,
            l2_regulation,
            training_flag,
            with_loss=True):
    """Build the detection graph and return its regression and class losses.

    Pipeline: preprocess -> backbone feature maps -> SSD head (called with
    ``ratios_per_pixel=2``) -> ``ssd_loss`` in ``'ohem'`` mode ->
    ``self.postprocess`` with the fixed anchor set.

    ``with_loss`` is accepted but not consulted; the loss is always built.

    Returns:
        tuple: ``(reg_loss, cls_loss)`` as returned by ``ssd_loss``.
    """
    # Preprocess, then extract the backbone feature maps.
    net_input = self.preprocess(inputs)
    origin_fms = self.ssd_backbone(net_input, l2_regulation, training_flag)
    print(origin_fms)

    # Regression / classification head.
    logger.info('train with dsfd ')
    reg, cls = self.ssd_head(origin_fms,
                             l2_regulation,
                             training_flag,
                             ratios_per_pixel=2)

    losses = ssd_loss(reg, cls, boxes, labels, 'ohem')
    reg_loss, cls_loss = losses

    # Fixed anchors: the precomputed anchor table divided by the configured
    # input width.  (An adaptive variant built from the runtime input shape
    # via get_all_anchors_fpn existed in the original as commented-out code.)
    anchors_ = anchor_tools.anchors / cfg.DATA.win

    self.postprocess(reg, cls, anchors_)

    return reg_loss, cls_loss
def distributed_train_epoch(epoch_num):
    """Train the model for one pass over ``self.train_ds``.

    ``self`` is taken from the enclosing scope.  The model produces three
    outputs; ``self.loss_function`` returns three losses and three
    accuracies, and the optimised objective is the sum of the three losses.

    Args:
        epoch_num: epoch index, used only in the log line.

    Returns:
        tuple: ``(total_loss, num_train_batches)`` -- the accumulated
        (detached) loss over the epoch and the number of batches processed.
    """
    total_loss = 0.0
    num_train_batches = 0.0

    self.model.train()
    for step in range(self.train_ds.size):
        start = time.time()

        images, target = self.train_ds()
        images_torch = torch.from_numpy(images)
        target_torch = torch.from_numpy(target)
        data, target = images_torch.to(self.device), target_torch.to(
            self.device)

        output1, output2, output3 = self.model(data)
        loss1, loss2, loss3, acc1, acc2, acc3 = self.loss_function(
            [output1, output2, output3], target)
        current_loss = loss1 + loss2 + loss3

        self.optimizer.zero_grad()
        current_loss.backward()
        self.optimizer.step()

        # Fix: accumulate a detached value.  Adding the live loss tensor
        # chains every iteration's autograd history onto ``total_loss`` and
        # makes memory grow over the epoch.  ``detach()`` keeps the result a
        # tensor, so callers see the same type as before.
        total_loss += current_loss.detach()
        num_train_batches += 1
        self.iter_num += 1

        time_cost_per_batch = time.time() - start
        images_per_sec = cfg.TRAIN.batch_size / time_cost_per_batch

        if self.iter_num % cfg.TRAIN.log_interval == 0:
            logger.info(
                'epoch_num: %d, '
                'iter_num: %d, '
                'loss1: %.6f, '
                'acc1: %.6f, '
                'loss2: %.6f, '
                'acc2: %.6f, '
                'loss3: %.6f, '
                'acc3: %.6f, '
                'loss_value: %.6f, '
                'speed: %d images/sec ' %
                (epoch_num, self.iter_num, loss1, acc1, loss2, acc2,
                 loss3, acc3, current_loss, images_per_sec))

    return total_loss, num_train_batches
def _map_func(self, dp, is_training):
    """Load one sample and apply augmentation (training) or a center crop.

    Args:
        dp: a ``(fname, ann)`` pair -- image path and its annotation.
        is_training: when true, apply the random augmentation chain;
            otherwise only ``self.center_crop``.

    Returns:
        tuple: ``(image, label)`` with ``image`` as uint8 and ``label`` as
        int64.  If anything fails while loading or augmenting, a black
        ``cfg.MODEL.hin x cfg.MODEL.win x 3`` image and the label ``-1`` are
        returned instead, and the failure is logged.
    """
    # Bound up front so the error handler can always report it, even when
    # unpacking ``dp`` itself is what failed.
    fname = None
    try:
        fname, ann = dp
        image = cv2.imread(fname, cv2.IMREAD_COLOR)
        if cfg.DATA.rgb:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        label = np.array(ann)

        if is_training:
            image = self.random_crop_resize(image)

            if random.uniform(0, 1) > 0.5:
                image, _ = Mirror(image, label=None, symmetry=None)
            if random.uniform(0, 1) > 0.5:
                angle = random.uniform(-45, 45)
                image, _ = Rotate_aug(image, label=None, angle=angle)

            # Threshold of 1.0 can never be exceeded: this augmentation is
            # effectively switched off.  The draw is kept so the random
            # stream is unchanged.
            if random.uniform(0, 1) > 1.:
                strength = random.uniform(0, 50)
                image, _ = Affine_aug(image, strength=strength, label=None)

            if random.uniform(0, 1) > 0.5:
                image = self.color_augmentor(image)
            # Switched off in the same way as the affine augmentation.
            if random.uniform(0, 1) > 1.0:
                image = pixel_jitter(image, 15)
            if random.uniform(0, 1) > 0.5:
                image = Img_dropout(image, 0.2)
        else:
            image = self.center_crop(image)

        label = label.astype(np.int64)
        image = image.astype(np.uint8)
    # Fix: a bare ``except:`` also swallowed KeyboardInterrupt/SystemExit.
    except Exception:
        # Fix: the original passed ' but handled with -1' as a stray
        # positional argument to logger.info instead of as message text.
        logger.info(
            'some err happended with %s but handled with -1' % fname)
        image = np.zeros(shape=[cfg.MODEL.hin, cfg.MODEL.win, 3],
                         dtype=np.uint8)
        label = np.array(-1, dtype=np.int64)

    return image, label
def save(self):
    """Build the graph, load weights, save a checkpoint, close the session.

    The checkpoint path comes from ``saved_file``, which is not defined in
    this function and is resolved from the surrounding scope at call time.
    """
    self.build()
    self.load_weight()

    with self._graph.as_default():
        # Saver over every global variable; checkpoints are never pruned.
        all_variables = tf.global_variables()
        self.saver = tf.train.Saver(all_variables, max_to_keep=None)

        logger.info('A tmp model saved as %s \n' % saved_file)
        self.saver.save(self.sess, save_path=saved_file)

    self.sess.close()
def read_txt(self):
    """Parse ``self.txt_file`` into ``[image_path, label]`` pairs.

    Lines are sorted, stripped of trailing whitespace, and split once from
    the right on ``'| '``.  The left part is joined onto ``self.root_path``;
    the right part is kept as the label string.  Results are appended to
    ``self.metas``.
    """
    with open(self.txt_file) as _f:
        txt_lines = sorted(_f.readlines())

    for raw_line in txt_lines:
        # One split from the right: [relative path, label].  When the
        # separator is absent both ends are the whole line.
        pieces = raw_line.rstrip().rsplit('| ', 1)
        current_img_path = os.path.join(self.root_path, pieces[0])
        self.metas.append([current_img_path, pieces[-1]])

    logger.info('the dataset contains %d images' % (len(txt_lines)))
    logger.info('the datasets contains %d samples' % (len(self.metas)))
def load_anns(self):
    """Parse ``self.ann_file`` (``fname|label`` per line) into ``self.metas``.

    Each entry appended to ``self.metas`` is ``[image_path, label]`` where
    ``image_path`` is ``fname`` joined onto ``self.root_path`` and ``label``
    is the second ``'|'``-separated field, kept as a string.
    """
    with open(self.ann_file, 'r') as ann_handle:
        image_label_list = ann_handle.readlines()

    for raw_line in image_label_list:
        fields = raw_line.rstrip().split('|')
        fname, label = fields[0], fields[1]
        self.metas.append([os.path.join(self.root_path, fname), label])

    # Any filtering of the samples would go between the two counts below.
    logger.info('the datasets contains %d samples' % (len(image_label_list)))
    logger.info('the datasets contains %d samples after filter' %
                (len(self.metas)))
def train_loop(self):
    """Train for ``cfg.TRAIN.epoch`` epochs, checkpointing after every epoch.

    Builds the graph, loads weights, creates the saver and summary writer,
    then alternates ``self._train`` and ``self._val``.  A checkpoint is
    written after every epoch regardless of the validation loss; the session
    is closed at the end.
    """
    self.build()
    self.load_weight()

    with self._graph.as_default():
        # Saver over all global variables; checkpoints are never pruned.
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)

        # Merge the collected summaries and open the writer.
        self.summary_op = tf.summary.merge(self.summaries)
        self.summary_writer = tf.summary.FileWriter(cfg.MODEL.model_path,
                                                    self.sess.graph)

        # Tracked but never compared: saving is unconditional.
        min_loss_control = 1000.

        for epoch in range(cfg.TRAIN.epoch):
            self._train(epoch)
            val_loss = self._val(epoch)
            logger.info('**************'
                        'val_loss %f ' % (val_loss))

            min_loss_control = val_loss
            low_loss_model_name = ''.join([
                cfg.MODEL.model_path,
                'epoch_', str(epoch),
                'L2_', str(cfg.TRAIN.weight_decay_factor),
                '.ckpt',
            ])
            logger.info('A new low loss model saved as %s \n' %
                        low_loss_model_name)
            self.saver.save(self.sess, save_path=low_loss_model_name)

    self.sess.close()
def forward(self,
            inputs,
            boxes,
            labels,
            l2_regulation,
            training_flag,
            with_loss=True):
    """Build the detection graph and return its regression and class losses.

    Pipeline: preprocess -> backbone feature maps -> SSD head (called with
    ``ratios_per_pixel=2``) -> ``ssd_loss`` in ``'focal_loss'`` mode ->
    ``self.postprocess`` with anchors generated for the runtime input size.

    ``with_loss`` is accepted but not consulted; the loss is always built.

    Returns:
        tuple: ``(reg_loss, cls_loss)`` as returned by ``ssd_loss``.
    """
    # Preprocess, then extract the backbone feature maps.
    net_input = self.preprocess(inputs)
    origin_fms = self.ssd_backbone(net_input, l2_regulation, training_flag)
    print(origin_fms)

    # Regression / classification head.
    logger.info('train with dsfd ')
    reg, cls = self.ssd_head(origin_fms,
                             l2_regulation,
                             training_flag,
                             ratios_per_pixel=2)

    losses = ssd_loss(reg, cls, boxes, labels, 'focal_loss')
    reg_loss, cls_loss = losses

    # Adaptive anchors: sized from axes 1 and 2 of the preprocessed input.
    input_shape = tf.shape(net_input)
    h = input_shape[1]
    w = input_shape[2]
    anchors_ = get_all_anchors_fpn(max_size=[h, w])

    self.postprocess(reg, cls, anchors_)

    return reg_loss, cls_loss
def parse_file(self, feature, target, extra_target):
    """Encode categorical columns, optionally filter controls, return arrays.

    Args:
        feature: feature frame with ``cp_type``, ``cp_dose``, ``cp_time``,
            ``sig_id`` and ``fold`` columns.
        target: label frame with a ``sig_id`` column.
        extra_target: extra label frame with a ``sig_id`` column.

    Returns:
        tuple: ``(train_features, labels_train, extra_labels_train)`` as
        ``.values`` arrays, with ``sig_id`` (and ``fold`` for the features)
        dropped.  When ``cfg.DATA.filter_ctl_vehicle`` is set, rows whose
        encoded ``cp_type`` equals 1 (``ctl_vehicle``) are removed from all
        three.
    """
    # Column -> value encoding, applied in this order.
    column_encodings = (
        ('cp_type', {'trt_cp': 0, 'ctl_vehicle': 1}),
        ('cp_dose', {'D1': 0, 'D2': 1}),
        ('cp_time', {24: 0, 48: 1, 72: 2}),
    )

    def encode_categoricals(frame):
        """Return a copy of ``frame`` with the categorical columns mapped."""
        encoded = frame.copy()
        for column, mapping in column_encodings:
            encoded.loc[:, column] = encoded.loc[:, column].map(mapping)
        return encoded

    train_features = encode_categoricals(feature)
    labels_train = target
    extra_labels_train = extra_target

    # Drop the control-vehicle rows from features and both label frames.
    if cfg.DATA.filter_ctl_vehicle:
        keep_rows = train_features['cp_type'] != 1
        train_features = train_features[keep_rows]
        labels_train = labels_train[keep_rows]
        extra_labels_train = extra_labels_train[keep_rows]

    train_features = train_features.drop(['sig_id', 'fold'], axis=1).values
    labels_train = labels_train.drop('sig_id', axis=1).values
    extra_labels_train = extra_labels_train.drop('sig_id', axis=1).values

    logger.info('dataset contains %d samples' % (train_features.shape[0]))

    return train_features, labels_train, extra_labels_train
def load_weight(self):
    """Initialise model variables in ``self._sess`` according to the config.

    Branches, in priority order:

    * ``cfg.MODEL.continue_train``: restore every ``MODEL_VARIABLES`` entry
      from the checkpoint ``cfg.MODEL.pretrained_model``.
    * a pretrained model is set and ``cfg.MODEL.pruning`` is false: restore
      the ``cfg.MODEL.net_structure`` scope from the checkpoint.
    * ``cfg.MODEL.pruning``: load an ``.npz`` file whose ``arr_0`` entry is
      a pickled ``{variable_name: ndarray}`` dict and assign every model
      variable found in it, skipping variables whose name contains
      ``'output'``.  Variables absent from the file are only logged.
    * otherwise: nothing is restored and a message is logged.
    """
    with self._graph.as_default():
        if cfg.MODEL.continue_train:
            variables_restore = tf.get_collection(
                tf.GraphKeys.MODEL_VARIABLES)
            print(variables_restore)

            saver2 = tf.train.Saver(variables_restore)
            saver2.restore(self._sess, cfg.MODEL.pretrained_model)

        elif cfg.MODEL.pretrained_model is not None and not cfg.MODEL.pruning:
            variables_restore = tf.get_collection(
                tf.GraphKeys.MODEL_VARIABLES,
                scope=cfg.MODEL.net_structure)
            print(variables_restore)

            saver2 = tf.train.Saver(variables_restore)
            saver2.restore(self._sess, cfg.MODEL.pretrained_model)

        elif cfg.MODEL.pruning:
            variables_restore = tf.get_collection(
                tf.GraphKeys.MODEL_VARIABLES)
            print(variables_restore)

            variables_restore_n = [
                v for v in variables_restore if 'output' not in v.name
            ]

            # Fix: ``arr_0`` holds a pickled dict (it is unwrapped with
            # ``[()]``), which numpy >= 1.16.3 refuses to load unless
            # allow_pickle=True is passed.
            # NOTE(review): unpickling executes arbitrary code; only load
            # weight files from a trusted source.
            state_dict = np.load(cfg.MODEL.pretrained_model,
                                 allow_pickle=True)
            state_dict = state_dict['arr_0'][()]

            for var in variables_restore_n:
                # 'scope/name:0' -> 'scope/name'
                var_name = var.name.rsplit(':')[0]
                if var_name in state_dict:
                    logger.info('recover %s from npz file' % var_name)
                    self._sess.run(tf.assign(var, state_dict[var_name]))
                else:
                    logger.info('the params of %s not in npz file' %
                                var_name)

        else:
            logger.info('no pretrained model, train from sctrach')
from lib.helper.logger import logger
from lib.core.base_trainer.net_work import Train
from lib.dataset.dataietr import DataIter
from lib.core.model.ShuffleNet_Series.ShuffleNetV2.network import ShuffleNetV2
from lib.core.model.semodel.SeResnet import se_resnet50
import cv2
import numpy as np
from train_config import config as cfg
import setproctitle

# Module-level side effects: announce start-up and set the process title.
logger.info('The trainer start')
setproctitle.setproctitle("face*_*_")


def main():
    """Build the train/val data iterators and the trainer, then load weights.

    NOTE(review): in this view the function ends inside the ``cfg.TRAIN.vis``
    loop without using the fetched batch and without starting training; the
    remainder of the body may lie outside the visible chunk -- confirm
    against the full file.
    """
    # Build the datasets; the third positional argument is True for the
    # training iterator and False for the validation one.
    train_ds = DataIter(cfg.DATA.root_path, cfg.DATA.train_txt_path, True)
    test_ds = DataIter(cfg.DATA.root_path, cfg.DATA.val_txt_path, False)

    # Build the trainer and restore weights.
    trainer = Train(train_ds=train_ds, val_ds=test_ds)
    trainer.load_weight()

    # Visualisation mode: iterate over the training batches.
    if cfg.TRAIN.vis:
        for step in range(train_ds.size):
            images, labels = train_ds()
def SSD(images, boxes, labels, L2_reg, training=True):
    """Build the SSD/DSFD detection graph and return its losses.

    The backbone is chosen by substring match on ``cfg.MODEL.net_structure``
    (``MobilenetV1``, ``resnet``, ``vgg``, ``efficientnet``).  The head
    configuration depends on ``cfg.MODEL.fpn`` and ``cfg.MODEL.dual_mode``:

    * neither: plain SSD on the original feature maps;
    * fpn only: SSD on the enhanced feature maps;
    * dual_mode: two shots -- the first on the original feature maps with
      the odd-indexed anchor targets, the second (variable scope ``dual``)
      on the enhanced feature maps with the even-indexed targets; the two
      losses are summed.

    Predictions are registered through ``get_predictions`` as a side effect.

    NOTE(review): the source formatting was lost, so the extent of the
    ``with tf.variable_scope(...)`` blocks below is reconstructed -- confirm
    that the anchor / prediction ops belong outside the ``ssd`` scope.

    Returns:
        tuple: ``(reg_loss, cla_loss)``.

    Raises:
        ValueError: if ``cfg.MODEL.net_structure`` matches no supported
            backbone.
    """
    images = preprocess(images)

    # Lazy if/elif chain: only the selected backbone name is looked up.
    if 'MobilenetV1' in cfg.MODEL.net_structure:
        ssd_backbne = mobilenet_ssd
    elif 'resnet' in cfg.MODEL.net_structure:
        ssd_backbne = resnet_ssd
    elif 'vgg' in cfg.MODEL.net_structure:
        ssd_backbne = vgg_ssd
    elif 'efficientnet' in cfg.MODEL.net_structure:
        ssd_backbne = efficient_ssd
    else:
        # Fix: the original printed a message, set the backbone to None and
        # then crashed with "'NoneType' object is not callable".
        raise ValueError('a net structure that not supported: %s' %
                         (cfg.MODEL.net_structure, ))

    origin_fms, enhanced_fms = ssd_backbne(images, L2_reg, training)

    print('origin_fms', origin_fms)
    print('enhanced_fms', enhanced_fms)

    with tf.variable_scope('ssd'):
        if not cfg.MODEL.fpn and not cfg.MODEL.dual_mode:
            logger.info('the model was trained as a plain ssd')
            reg_final, cla_final = ssd_out(origin_fms, L2_reg, training)
            reg_loss, cla_loss = ssd_loss(reg_final, cla_final, boxes,
                                          labels, 'ohem')

        elif cfg.MODEL.fpn and not cfg.MODEL.dual_mode:
            logger.info('the model was trained without dual shot')
            reg_final, cla_final = ssd_out(enhanced_fms, L2_reg, training)
            reg_loss, cla_loss = ssd_loss(reg_final, cla_final, boxes,
                                          labels, 'ohem')

        elif cfg.MODEL.dual_mode:
            logger.info('the model was trained with dual shot, FEM')

            # First shot: original feature maps, odd-indexed targets.
            reg, cla = ssd_out(origin_fms, L2_reg, training, 1)
            boxes_small = boxes[:, 1::2]
            label_small = labels[:, 1::2]
            reg_loss, cla_loss = ssd_loss(reg, cla, boxes_small,
                                          label_small, 'ohem')

            # Second shot: enhanced feature maps, even-indexed targets.
            with tf.variable_scope('dual'):
                reg_final, cla_final = ssd_out(enhanced_fms, L2_reg,
                                               training, 1)
                boxes_norm = boxes[:, 0::2]
                label_norm = labels[:, 0::2]
                reg_loss_dual, cla_loss_dual = ssd_loss(
                    reg_final, cla_final, boxes_norm, label_norm, 'ohem')

            reg_loss = (reg_loss + reg_loss_dual)
            cla_loss = (cla_loss + cla_loss_dual)

    # Anchors are generated for the runtime input size (axes 1 and 2).
    h = tf.shape(images)[1]
    w = tf.shape(images)[2]
    anchors_ = get_all_anchors_fpn(max_size=[h, w])

    # In dual mode the final predictions use every other anchor, matching
    # the even-indexed targets of the second shot.
    if cfg.MODEL.dual_mode:
        anchors_ = anchors_[0::2]

    get_predictions(reg_final, cla_final, anchors_)

    return reg_loss, cla_loss
def custom_loop(self):
    """Train and validate for ``self.epochs`` epochs, saving every epoch.

    Each epoch steps the LR scheduler, runs one training pass over
    ``self.train_ds`` and one evaluation pass over ``self.val_ds``, logs the
    averaged metrics, and writes the model ``state_dict`` into
    ``cfg.MODEL.model_path``.

    Returns:
        tuple: ``(train_loss, test_loss)`` -- the per-batch average losses
        of the last epoch.
    """

    def distributed_train_epoch(epoch_num):
        """One training pass; returns (summed loss, number of batches)."""
        total_loss = 0.0
        num_train_batches = 0.0

        self.model.train()
        for step in range(self.train_ds.size):
            start = time.time()

            images, target = self.train_ds()
            images_torch = torch.from_numpy(images)
            target_torch = torch.from_numpy(target)
            data, target = images_torch.to(self.device), target_torch.to(
                self.device)

            output1, output2, output3 = self.model(data)
            loss1, loss2, loss3, acc1, acc2, acc3 = self.loss_function(
                [output1, output2, output3], target)
            current_loss = loss1 + loss2 + loss3

            self.optimizer.zero_grad()
            current_loss.backward()
            self.optimizer.step()

            # Fix: accumulate a detached value so the running total does
            # not keep every iteration's autograd history alive.
            total_loss += current_loss.detach()
            num_train_batches += 1
            self.iter_num += 1

            time_cost_per_batch = time.time() - start
            images_per_sec = cfg.TRAIN.batch_size / time_cost_per_batch

            if self.iter_num % cfg.TRAIN.log_interval == 0:
                logger.info(
                    'epoch_num: %d, '
                    'iter_num: %d, '
                    'loss1: %.6f, '
                    'acc1: %.6f, '
                    'loss2: %.6f, '
                    'acc2: %.6f, '
                    'loss3: %.6f, '
                    'acc3: %.6f, '
                    'loss_value: %.6f, '
                    'speed: %d images/sec ' %
                    (epoch_num, self.iter_num, loss1, acc1, loss2, acc2,
                     loss3, acc3, current_loss, images_per_sec))

        return total_loss, num_train_batches

    def distributed_test_epoch(epoch_num):
        """One evaluation pass; returns summed loss, accs and batch count."""
        total_loss = 0.
        total_acc1 = 0.
        total_acc2 = 0.
        total_acc3 = 0.
        num_test_batches = 0.0

        self.model.eval()
        with torch.no_grad():
            for i in range(self.val_ds.size):
                images, target = self.val_ds()
                images_torch = torch.from_numpy(images)
                target_torch = torch.from_numpy(target)
                data, target = images_torch.to(
                    self.device), target_torch.to(self.device)

                output1, output2, output3 = self.model(data)
                loss1, loss2, loss3, acc1, acc2, acc3 = self.loss_function(
                    [output1, output2, output3], target)
                cur_loss = loss1 + loss2 + loss3

                total_loss += cur_loss
                total_acc1 += acc1
                total_acc2 += acc2
                total_acc3 += acc3
                num_test_batches += 1

        return total_loss,\
               total_acc1,\
               total_acc2,\
               total_acc3, \
               num_test_batches

    for epoch in range(self.epochs):
        # NOTE(review): the scheduler is stepped before the epoch's
        # optimizer steps; recent PyTorch versions expect it afterwards.
        # Left as-is because moving it shifts the LR schedule by one epoch.
        self.scheduler.step()
        for param_group in self.optimizer.param_groups:
            lr = param_group['lr']
        logger.info('learning rate: [%f]' % (lr))

        start = time.time()

        train_total_loss, num_train_batches = distributed_train_epoch(
            epoch)
        test_total_loss, test_total_acc1, test_total_acc2, test_total_acc3, num_test_batches = distributed_test_epoch(
            epoch)

        time_consume_per_epoch = time.time() - start

        training_massage = 'Epoch: %d, ' \
                           'Train Loss: %.6f, ' \
                           'Test Loss: %.6f ' \
                           'Test acc1: %.6f '\
                           'Test acc2: %.6f '\
                           'Test acc3: %.6f '\
                           'Time consume: %.2f'%(epoch,
                                                 train_total_loss / num_train_batches,
                                                 test_total_loss / num_test_batches,
                                                 test_total_acc1 / num_test_batches,
                                                 test_total_acc2 / num_test_batches,
                                                 test_total_acc3 / num_test_batches,
                                                 time_consume_per_epoch)
        logger.info(training_massage)

        # Save the model at the end of every epoch.
        # Fix: the checkpoint used to go to a hard-coded './model/' while
        # the directory that was checked and created was
        # cfg.MODEL.model_path; both now use the configured path.
        os.makedirs(cfg.MODEL.model_path, exist_ok=True)
        current_model_saved_name = os.path.join(
            cfg.MODEL.model_path,
            'epoch_%d_val_loss%.6f.pth' %
            (epoch, test_total_loss / num_test_batches))

        torch.save(self.model.state_dict(), current_model_saved_name)
        logger.info('A model saved to %s' % current_model_saved_name)

    return (train_total_loss / num_train_batches,
            test_total_loss / num_test_batches)
# Launcher script: configures OpenCV and the process title, then starts the
# trainer's custom training loop.
from lib.helper.logger import logger
from lib.core.base_trainer.network import Train
import setproctitle
import cv2

# Disable OpenCV's own threading and its OpenCL backend for this process.
cv2.setNumThreads(0)
cv2.ocl.setUseOpenCL(False)

logger.info('train start')
setproctitle.setproctitle("detect")

# Runs at import time; there is no __main__ guard.
trainner = Train()
trainner.custom_loop()
def custom_loop(self, train_dist_dataset, test_dist_dataset, strategy):
    """Custom training and testing loop.

    Args:
        train_dist_dataset: Training dataset created using strategy.
        test_dist_dataset: Testing dataset created using strategy.
        strategy: Distribution strategy.

    Returns:
        tuple: ``(train_loss, test_loss)`` -- per-batch average losses of
        the last epoch.
    """

    def distributed_train_epoch(ds, epoch_num):
        """Run ``self.train_step`` on every batch; return (loss sum, count)."""
        total_loss = 0.0
        num_train_batches = 0.0
        for one_batch in ds:
            batch_started = time.time()

            replica_losses = strategy.experimental_run_v2(
                self.train_step, args=(one_batch, ))
            current_loss = strategy.reduce(
                tf.distribute.ReduceOp.SUM, replica_losses, axis=None)

            total_loss += current_loss
            num_train_batches += 1
            self.iter_num += 1

            elapsed = time.time() - batch_started
            images_per_sec = cfg.TRAIN.batch_size / elapsed

            if self.iter_num % cfg.TRAIN.log_interval == 0:
                logger.info('epoch_num: %d, '
                            'iter_num: %d, '
                            'loss_value: %.6f, '
                            'speed: %d images/sec ' %
                            (epoch_num, self.iter_num, current_loss,
                             images_per_sec))
        return total_loss, num_train_batches

    def distributed_test_epoch(ds, epoch_num):
        """Run ``self.test_step`` on every batch; return (loss sum, count)."""
        total_loss = 0.
        num_test_batches = 0.0
        for one_batch in ds:
            replica_losses = strategy.experimental_run_v2(
                self.test_step, args=(one_batch, ))
            current_loss = strategy.reduce(
                tf.distribute.ReduceOp.SUM, replica_losses, axis=None)
            total_loss += current_loss
            num_test_batches += 1
        return total_loss, num_test_batches

    # Optionally compile both epoch functions into graphs.
    if self.enable_function:
        distributed_train_epoch = tf.function(distributed_train_epoch)
        distributed_test_epoch = tf.function(distributed_test_epoch)

    for epoch in range(self.epochs):
        epoch_started = time.time()

        self.optimizer.learning_rate = self.decay(epoch)

        train_total_loss, num_train_batches = distributed_train_epoch(
            train_dist_dataset, epoch)
        test_total_loss, num_test_batches = distributed_test_epoch(
            test_dist_dataset, epoch)

        time_consume_per_epoch = time.time() - epoch_started

        summary_values = (epoch,
                          train_total_loss / num_train_batches,
                          test_total_loss / num_test_batches,
                          time_consume_per_epoch)
        training_massage = 'Epoch: %d, ' \
                           'Train Loss: %.6f, ' \
                           'Test Loss: %.6f '\
                           'Time consume: %.2f' % summary_values
        logger.info(training_massage)

        # Export a SavedModel at the end of every epoch, creating the
        # output directory on first use.
        export_name = 'epoch_%d_val_loss%.6f' % (
            epoch, test_total_loss / num_test_batches)
        current_model_saved_name = os.path.join(cfg.MODEL.model_path,
                                                export_name)

        if not os.access(cfg.MODEL.model_path, os.F_OK):
            os.mkdir(cfg.MODEL.model_path)

        tf.saved_model.save(self.model, current_model_saved_name)
        logger.info('A model saved to %s' % current_model_saved_name)

    return (train_total_loss / num_train_batches,
            test_total_loss / num_test_batches)
def forward(self,
            inputs,
            boxes,
            labels,
            l2_regulation,
            training_flag,
            with_loss=True):
    """Build the detection graph; optionally build and return the losses.

    Head configuration:

    * ``cfg.MODEL.dual_mode and cfg.MODEL.fpn``: two shots with one ratio
      per pixel -- the first on the original feature maps (odd-indexed
      targets), the second under variable scope ``dual`` on the enhanced
      feature maps (even-indexed targets); losses are summed.
    * ``cfg.MODEL.fpn`` only: one head (scope ``dual``) on the enhanced maps.
    * otherwise: one head on the original feature maps.

    Predictions are registered through ``self.postprocess`` as a side effect.

    NOTE(review): the source formatting was lost, so the extent of the
    ``with`` blocks is reconstructed -- confirm against the original file.

    Args:
        with_loss: when false, no loss ops are built.

    Returns:
        tuple: ``(reg_loss, cls_loss)``; both are ``None`` when
        ``with_loss`` is false.
    """
    # Fix: with_loss=False used to leave these names unbound and the final
    # return raised UnboundLocalError.
    reg_loss, cls_loss = None, None

    inputs = self.preprocess(inputs)

    origin_fms, enhanced_fms = self.ssd_backbone(inputs, l2_regulation,
                                                 training_flag)

    if cfg.MODEL.dual_mode and cfg.MODEL.fpn:
        logger.info('train with dsfd ')

        # First shot.
        origin_reg, origin_cls = self.ssd_head(origin_fms,
                                               l2_regulation,
                                               training_flag,
                                               ratios_per_pixel=1)
        # Second shot.
        with tf.variable_scope('dual'):
            final_reg, final_cls = self.ssd_head(enhanced_fms,
                                                 l2_regulation,
                                                 training_flag,
                                                 ratios_per_pixel=1)

        if with_loss:
            # First shot uses the odd-indexed anchor targets.
            boxes_small = boxes[:, 1::2]
            label_small = labels[:, 1::2]
            reg_loss, cls_loss = ssd_loss(origin_reg, origin_cls,
                                          boxes_small, label_small, 'ohem')

            # Second shot uses the even-indexed anchor targets.
            boxes_norm = boxes[:, 0::2]
            label_norm = labels[:, 0::2]
            with tf.name_scope('dual'):
                final_reg_loss, final_cls_loss_dual = ssd_loss(
                    final_reg, final_cls, boxes_norm, label_norm, 'ohem')

            reg_loss = (reg_loss + final_reg_loss)
            cls_loss = (cls_loss + final_cls_loss_dual)

    elif cfg.MODEL.fpn:
        logger.info('train with a ssd with fpn ')
        with tf.variable_scope('dual'):
            final_reg, final_cls = self.ssd_head(enhanced_fms,
                                                 l2_regulation,
                                                 training_flag)
        if with_loss:
            reg_loss, cls_loss = ssd_loss(final_reg, final_cls, boxes,
                                          labels, 'ohem')

    else:
        logger.info('train with a plain ssd')
        final_reg, final_cls = self.ssd_head(origin_fms, l2_regulation,
                                             training_flag)
        if with_loss:
            reg_loss, cls_loss = ssd_loss(final_reg, final_cls, boxes,
                                          labels, 'ohem')

    # Anchors are generated for the runtime input size (axes 1 and 2).
    h = tf.shape(inputs)[1]
    w = tf.shape(inputs)[2]
    anchors_ = get_all_anchors_fpn(max_size=[h, w])

    # NOTE(review): this checks dual_mode alone, while the two-shot branch
    # above also requires fpn -- confirm the dual_mode-without-fpn case.
    if cfg.MODEL.dual_mode:
        anchors_ = anchors_[0::2]

    self.postprocess(final_reg, final_cls, anchors_)

    return reg_loss, cls_loss
def balance(self, anns):
    """Oversample hard face samples and return a shuffled annotation list.

    Starts from a deep copy of ``anns`` and, for every annotation that has
    keypoints, appends extra references to it when it shows:

    * a closed eye (either eye)                      -> 10 copies
    * inter-eye-corner distance < 0.5 * bbox width   -> 20 copies
    * mouth opening > 0.15 * bbox height             -> 20 copies
    * mouth opening / cfg.MODEL.hin above
      ``self.big_mouth_open_thres``                  -> 50 copies
    * exactly one eye closed                         -> 40 copies

    Eye closure is landmark distance / bbox height below
    ``self.eye_close_thres``.  Landmark indices are those used by the
    original code (assumed to be the 68-point layout -- TODO confirm).

    Args:
        anns: list of annotation dicts with ``'keypoints'`` and ``'bbox'``.

    Returns:
        list: the balanced, shuffled annotations.
    """

    def _landmark_distance(points, i, j):
        # Euclidean distance between landmarks i and j.
        return np.sqrt(
            np.square(points[i, 0] - points[j, 0]) +
            np.square(points[i, 1] - points[j, 1]))

    res_anns = copy.deepcopy(anns)

    for ann in anns:
        if ann['keypoints'] is None:
            continue

        # Fix: ``np.float`` was removed in NumPy 1.24; it was an alias of
        # the builtin ``float``.
        label = np.array(ann['keypoints'], dtype=float).reshape((-1, 2))

        bbox = ann['bbox']
        bbox_width = bbox[2] - bbox[0]
        bbox_height = bbox[3] - bbox[1]

        # NOTE(review): a small face is removed here but can still be
        # re-added by the oversampling rules below -- confirm whether a
        # ``continue`` was intended.
        if bbox_width < 50 or bbox_height < 50:
            res_anns.remove(ann)

        left_eye_close = (
            _landmark_distance(label, 37, 41) / bbox_height < self.eye_close_thres
            or _landmark_distance(label, 38, 40) / bbox_height < self.eye_close_thres)
        right_eye_close = (
            _landmark_distance(label, 43, 47) / bbox_height < self.eye_close_thres
            or _landmark_distance(label, 44, 46) / bbox_height < self.eye_close_thres)

        if left_eye_close or right_eye_close:
            res_anns.extend([ann] * 10)

        # Half (profile) face: eye corners close together relative to width.
        if _landmark_distance(label, 36, 45) / bbox_width < 0.5:
            res_anns.extend([ann] * 20)

        mouth_open = _landmark_distance(label, 62, 66)
        if mouth_open / bbox_height > 0.15:
            res_anns.extend([ann] * 20)
        if mouth_open / cfg.MODEL.hin > self.big_mouth_open_thres:
            res_anns.extend([ann] * 50)

        # Asymmetric eyes: exactly one eye closed.
        if left_eye_close != right_eye_close:
            res_anns.extend([ann] * 40)

    logger.info('befor balance the dataset contains %d images' % (len(anns)))
    logger.info('after balanced the datasets contains %d samples' %
                (len(res_anns)))

    random.shuffle(res_anns)
    return res_anns