def run_test():
    parser = argparse.ArgumentParser()
    parser.add_argument('--start_epoch', '-s', default=30, type=int)
    parser.add_argument('--end_epoch', '-e', default=50, type=int)
    parser.add_argument('--devices', '-d', default=1, type=int)
    args = parser.parse_args()

    # model path
    model_dir = config.model_dir
    eval_dir = config.eval_dir
    ensure_dir(eval_dir)
    records = load_json_lines(config.eval_source)
    start_epoch, end_epoch = args.start_epoch, args.end_epoch
    for epoch in range(start_epoch, end_epoch):
        model_file = osp.join(model_dir, 'epoch-{}.pkl'.format(epoch))
        if not osp.exists(model_file):
            continue
        print('Processing {}'.format(osp.basename(model_file)))
        results = eval_all(model_file, records, args)
        fpath = osp.join(eval_dir, 'epoch-{}.human'.format(epoch))
        save_json_lines(results, fpath)
def eval_all(args):
    # model path
    saveDir = config.model_dir
    evalDir = config.eval_dir
    misc_utils.ensure_dir(evalDir)
    model_file = os.path.join(saveDir,
                              'epoch_{}.pkl'.format(args.resume_weights))
    assert os.path.exists(model_file)
    # load data
    records = misc_utils.load_json_lines(config.eval_source)
    # multiprocessing: split the records evenly across devices and collect
    # per-image results through a shared queue
    num_records = len(records)
    num_devs = args.devices
    num_image = math.ceil(num_records / num_devs)
    result_queue = Queue(1000)
    procs = []
    all_results = []
    for i in range(num_devs):
        start = i * num_image
        end = min(start + num_image, num_records)
        split_records = records[start:end]
        proc = Process(target=inference,
                       args=(model_file, i, split_records, result_queue))
        proc.start()
        procs.append(proc)
    pbar = tqdm(total=num_records, ncols=50)
    for i in range(num_records):
        t = result_queue.get()
        all_results.append(t)
        pbar.update(1)
    for p in procs:
        p.join()
    fpath = os.path.join(evalDir, 'dump-{}.json'.format(args.resume_weights))
    misc_utils.save_json_lines(all_results, fpath)
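# Note: the `inference` target spawned above is expected to take
# (model_file, device_id, records, result_queue), which differs from the
# single-`args` inference functions later in this file (they belong to a
# different entry point). A minimal worker sketch under that assumption;
# names and the result layout here are illustrative, not the repo's code.
def _inference_worker_sketch(model_file, device_id, records, result_queue):
    mge.set_default_device('gpu{}'.format(device_id))
    net = network.Network()
    net.eval()
    check_point = mge.load(model_file)
    net.load_state_dict(check_point['state_dict'])
    for record in records:
        # run the forward pass for this record, then hand the per-image
        # result dict back to the parent process
        result_dict = dict(ID=record['ID'])  # plus predicted boxes, etc.
        result_queue.put_nowait(result_dict)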
def train(args):
    # ------------------------ begin training -------------------------- #
    valid_nr_dev = mge.get_device_count("gpu")
    gpu_num = min(valid_nr_dev, args.num_gpus)
    assert gpu_num > 0
    logger.info('Device Count: {}'.format(gpu_num))
    ensure_dir(cfg.model_dir)
    if not osp.exists('output'):
        os.symlink(cfg.output_dir, 'output')
    if gpu_num > 1:
        args.port = find_free_port()
        mp.set_start_method("spawn")
        processes = list()
        for i in range(gpu_num):
            process = mp.Process(target=worker, args=(i, gpu_num, args))
            process.start()
            processes.append(process)
        for p in processes:
            p.join()
    else:
        worker(0, 1, args)
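# A minimal sketch of the per-process `worker` this launcher expects (the
# real one lives elsewhere in the repo; names below are illustrative).
# Assumes MegEngine's dist.init_process_group for the rendezvous on the
# free port chosen above.
def _worker_sketch(rank, world_size, args):
    if world_size > 1:
        dist.init_process_group(
            master_ip="localhost",
            port=args.port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )
    # ... build the model and data pipeline, then run the training loop ...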
def inference(args):
    @jit.trace(symbolic=False)
    def val_func():
        pred_boxes = net(net.inputs)
        return pred_boxes

    # model path
    saveDir = config.model_dir
    evalDir = config.eval_dir
    misc_utils.ensure_dir(evalDir)
    model_file = os.path.join(saveDir,
                              'epoch_{}.pkl'.format(args.resume_weights))
    assert os.path.exists(model_file)
    # load model
    net = network.Network()
    net.eval()
    check_point = mge.load(model_file)
    net.load_state_dict(check_point['state_dict'])
    ori_image, image, im_info = get_data(args.img_path)
    net.inputs["image"].set_value(image.astype(np.float32))
    net.inputs["im_info"].set_value(im_info)
    pred_boxes = val_func().numpy()
    # `top_k` and `if_set_nms` are module-level settings (not shown here)
    num_tag = config.num_classes - 1
    target_shape = (pred_boxes.shape[0] // num_tag // top_k, top_k)
    pred_tags = (np.arange(num_tag) + 1).reshape(-1, 1)
    pred_tags = np.tile(pred_tags, target_shape).reshape(-1, 1)
    # nms
    if if_set_nms:
        from set_nms_utils import set_cpu_nms
        # tag each box with the identity of the proposal it came from, so
        # set NMS can keep overlapping boxes that share a proposal
        n = pred_boxes.shape[0] // top_k
        idents = np.tile(np.arange(n)[:, None], (1, top_k)).reshape(-1, 1)
        pred_boxes = np.hstack((pred_boxes, idents))
        keep = pred_boxes[:, -2] > args.thresh
        pred_boxes = pred_boxes[keep]
        pred_tags = pred_tags[keep]
        keep = set_cpu_nms(pred_boxes, 0.5)
        pred_boxes = pred_boxes[keep][:, :-1]
        pred_tags = pred_tags[keep]
    else:
        from set_nms_utils import cpu_nms
        keep = pred_boxes[:, -1] > args.thresh
        pred_boxes = pred_boxes[keep]
        pred_tags = pred_tags[keep]
        keep = cpu_nms(pred_boxes, 0.5)
        pred_boxes = pred_boxes[keep]
        pred_tags = pred_tags[keep]
    pred_tags = pred_tags.astype(np.int32).flatten()
    pred_tags_name = np.array(config.class_names)[pred_tags]
    visual_utils.draw_boxes(ori_image, pred_boxes[:, :-1], pred_boxes[:, -1],
                            pred_tags_name)
    name = args.img_path.split('/')[-1].split('.')[-2]
    fpath = '/data/jupyter/{}.png'.format(name)
    cv2.imwrite(fpath, ori_image)
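# For reference, a sketch of what the set NMS above does (this is not the
# repo's set_nms_utils.set_cpu_nms, just an illustration of the technique):
# standard greedy NMS, except that boxes carrying the same proposal
# identity (last column) never suppress each other.
def set_cpu_nms_sketch(dets, thresh):
    # dets rows: x1, y1, x2, y2, score, identity
    x1, y1, x2, y2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    scores, idents = dets[:, 4], dets[:, 5]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # IoU of the current top-scoring box against the remaining boxes
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        # survivors: low overlap, or same proposal identity as box i
        same_ident = idents[order[1:]] == idents[i]
        inds = np.where((iou <= thresh) | same_ident)[0]
        order = order[inds + 1]
    return keep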
def inference(args):
    @jit.trace(symbolic=False)
    def val_func():
        pred_boxes = net(net.inputs)
        return pred_boxes

    # model path
    saveDir = config.model_dir
    evalDir = config.eval_dir
    misc_utils.ensure_dir(evalDir)
    model_file = os.path.join(saveDir,
                              'epoch_{}.pkl'.format(args.resume_weights))
    assert os.path.exists(model_file)
    # load model
    net = network.Network()
    net.eval()
    check_point = mge.load(model_file)
    net.load_state_dict(check_point['state_dict'])
    image, im_info = get_data(args.img_path)
    net.inputs["image"].set_value(image.astype(np.float32))
    net.inputs["im_info"].set_value(im_info)
    pred_boxes = val_func().numpy()
    num_tag = config.num_classes - 1
    target_shape = (pred_boxes.shape[0] // num_tag // top_k, top_k)
    pred_tags = (np.arange(num_tag) + 1).reshape(-1, 1)
    pred_tags = np.tile(pred_tags, target_shape).reshape(-1, 1)
    # nms
    if if_set_nms:
        from set_nms_utils import set_cpu_nms
        n = pred_boxes.shape[0] // top_k
        idents = np.tile(np.arange(n)[:, None], (1, top_k)).reshape(-1, 1)
        pred_boxes = np.hstack((pred_boxes, idents))
        keep = pred_boxes[:, -2] > 0.05
        pred_boxes = pred_boxes[keep]
        pred_tags = pred_tags[keep]
        keep = set_cpu_nms(pred_boxes, 0.5)
        pred_boxes = pred_boxes[keep][:, :-1]
        pred_tags = pred_tags[keep].flatten()
    else:
        from set_nms_utils import cpu_nms
        keep = pred_boxes[:, -1] > 0.05
        pred_boxes = pred_boxes[keep]
        pred_tags = pred_tags[keep]
        keep = cpu_nms(pred_boxes, 0.5)
        pred_boxes = pred_boxes[keep]
        pred_tags = pred_tags[keep].flatten()
    result_dict = dict(height=int(im_info[0, -2]),
                       width=int(im_info[0, -1]),
                       dtboxes=boxes_dump(pred_boxes, pred_tags))
    name = args.img_path.split('/')[-1].split('.')[-2]
    misc_utils.save_json_lines([result_dict], '{}.json'.format(name))
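# `boxes_dump` is called above but not defined in this file. A plausible
# sketch, assuming each row is (x1, y1, x2, y2, score) and a CrowdHuman-style
# (x, y, w, h) box encoding; the name and output format are assumptions,
# not the repo's actual helper.
def boxes_dump_sketch(boxes, tags):
    return [dict(box=[float(b[0]), float(b[1]),
                      float(b[2] - b[0]), float(b[3] - b[1])],
                 score=float(b[4]),
                 tag=int(t))
            for b, t in zip(boxes, tags)]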
import os
import os.path as osp
import multiprocessing as mp

from tqdm import tqdm

import megengine as mge
from megengine import distributed as dist
from megengine import optimizer as optim
import megengine.autodiff as autodiff
from megengine import jit
from megengine import data
from megengine.core._imperative_rt.utils import Logger

import network
from config import config as cfg
from dataset.CrowdHuman import CrowdHuman
from misc_utils import ensure_dir

ensure_dir(cfg.output_dir)
logger = mge.get_logger(__name__)
log_path = osp.join(cfg.output_dir, 'logger.log')
mge.set_log_file(log_path, mode='a')
Logger.set_log_level(Logger.LogLevel.Error)


def find_free_port():
    import socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # Binding to port 0 will cause the OS to find an available port for us
    sock.bind(("", 0))
    port = sock.getsockname()[1]
    sock.close()
    # NOTE: there is still a chance the port could be taken by other processes.
    return port


def allreduce_cb(param, grad, group=dist.WORLD):
    # The body was truncated in the source; averaging gradients across the
    # process group is the conventional implementation for a callback with
    # this signature (an assumption, not recovered from the source).
    return dist.functional.all_reduce_sum(grad, group=group) / group.size
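# Hedged usage sketch (illustrative, not code from this repo): a callback
# with allreduce_cb's (param, grad) signature is attached to MegEngine's
# GradManager so each worker's gradients are synchronized during backward.
def _attach_allreduce_sketch(net):
    gm = autodiff.GradManager()
    gm.attach(net.parameters(), callbacks=[allreduce_cb])
    return gm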
def train(params):
    total_nr_iters = config.train_base_iters
    batch_per_gpu = config.train_batch_per_gpu
    base_lr = config.base_lr
    line = 'network.base_lr.{}.train_iter.{}'.format(base_lr, total_nr_iters)
    print(line)
    # set model save path and log save path
    saveDir = config.model_dir
    misc_utils.ensure_dir(saveDir)
    fpath = os.path.join(config.output_dir, line + '.log')
    fid_log = open(fpath, 'a')
    # set data input pipe
    program_name = config.program_name
    # check gpus
    torch.set_default_tensor_type('torch.FloatTensor')
    if not torch.cuda.is_available():
        print('No GPU exists!')
        return
    else:
        num_gpus = torch.cuda.device_count()
    train_iter = total_nr_iters // (num_gpus * batch_per_gpu)
    train_lr_decay = np.array(config.lr_decay) // (num_gpus * batch_per_gpu)
    train_dump_interval = config.model_dump_interval // (num_gpus * batch_per_gpu)
    train_lr = base_lr * num_gpus
    bt_size = num_gpus * batch_per_gpu
    line = 'Num of GPUs:{}, learning rate:{:.5f}, batch size:{}, ' \
           'train_iter:{}, decay_iter:{}, dump_interval:{}'.format(
               num_gpus, train_lr, bt_size, train_iter, train_lr_decay,
               train_dump_interval)
    print(line)
    print("Building network.")
    net = network.Network()
    # Move all model parameters and buffers to the GPU.
    net.cuda()
    if params.resume_weights:
        model_file = os.path.join(saveDir,
                                  'dump-{}.pth'.format(params.resume_weights))
        check_point = torch.load(model_file)
        net.load_state_dict(check_point['state_dict'])
    net = nn.DataParallel(net)
    # set the optimizer with momentum and weight decay
    optimizer = optim.SGD(net.parameters(), lr=train_lr,
                          momentum=config.momentum,
                          weight_decay=config.weight_decay)
    # check if resuming training
    training_data = train_dataset()
    net.train()
    if params.progressbar:
        tqdm.monitor_interval = 0
        pbar = tqdm(total=train_iter, leave=False, ascii=True)
    dump_num = 1
    start_iter = 0
    if params.resume_weights:
        start_iter = int(params.resume_weights) * train_dump_interval
        if start_iter >= train_lr_decay[0]:
            optimizer.param_groups[0]['lr'] = train_lr / 10
        if start_iter >= train_lr_decay[1]:
            optimizer.param_groups[0]['lr'] = train_lr / 100
        dump_num = int(params.resume_weights) + 1
    for step in range(start_iter, train_iter):
        # linear warmup from 0.1x to 1x of the base learning rate
        if step < config.warm_iter:
            alpha = step / config.warm_iter
            lr_new = 0.1 * train_lr + 0.9 * alpha * train_lr
            optimizer.param_groups[0]['lr'] = lr_new
        elif step == config.warm_iter:
            optimizer.param_groups[0]['lr'] = train_lr
        if step == train_lr_decay[0]:
            optimizer.param_groups[0]['lr'] = train_lr / 10
        elif step == train_lr_decay[1]:
            optimizer.param_groups[0]['lr'] = train_lr / 100
        # get training data
        images, gt_boxes, img_info = process(training_data, num_gpus)
        optimizer.zero_grad()
        # forward
        outputs = net(images, img_info, gt_boxes)
        # collect the loss
        total_loss = sum([outputs[key].mean() for key in outputs.keys()])
        total_loss.backward()
        optimizer.step()
        if params.progressbar:
            pbar.update(1)
        # statistics
        if step % config.log_dump_interval == 0:
            stastic_total_loss = total_loss.cpu().data.numpy()
            line = 'Iter {}: lr:{:.5f}, loss is {:.4f}.'.format(
                step, optimizer.param_groups[0]['lr'], stastic_total_loss)
            print(outputs)
            print(line)
            fid_log.write(line + '\n')
            fid_log.flush()
        # save the model
        if (step + 1) % train_dump_interval == 0:
            fpath = os.path.join(saveDir, 'dump-{}.pth'.format(dump_num))
            dump_num += 1
            model = dict(epoch=step,
                         state_dict=net.module.state_dict(),
                         optimizer=optimizer.state_dict())
            torch.save(model, fpath)
    if params.progressbar:
        pbar.close()
    fid_log.close()
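# The warmup/decay logic above, restated as a pure function for clarity
# (an illustrative helper, not part of the repo): linear warmup from 0.1x
# to 1x of the base rate over `warm_iter` steps, then /10 and /100 decays.
def lr_at_step(step, base_lr, warm_iter, decay_steps):
    if step < warm_iter:
        alpha = step / warm_iter
        return 0.1 * base_lr + 0.9 * alpha * base_lr
    if step >= decay_steps[1]:
        return base_lr / 100
    if step >= decay_steps[0]:
        return base_lr / 10
    return base_lr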
def train(args):
    if isinstance(config.train_source, list):
        training_data = multi_train_dataset(args)
    else:
        training_data = train_dataset(args)
    number_of_training_instances = training_data.__next__()
    val_data = eval_dataset(args)
    number_of_val_instances = val_data.__next__()
    total_nr_iters = args.epochs * number_of_training_instances
    batch_per_gpu = config.train_batch_per_gpu
    base_lr = config.base_lr
    line = 'network.base_lr.{}.train_iter.{}'.format(base_lr, total_nr_iters)
    print(line)
    # set model save path and log save path
    saveDir = config.model_dir
    misc_utils.ensure_dir(saveDir)
    # set data input pipe
    program_name = config.program_name
    # check gpus
    torch.set_default_tensor_type('torch.FloatTensor')
    if not torch.cuda.is_available():
        print('No GPU exists!')
        return
    else:
        num_gpus = torch.cuda.device_count()
    train_iter = total_nr_iters // (num_gpus * batch_per_gpu)
    print('[-]', num_gpus, batch_per_gpu, total_nr_iters)
    # rescale the decay schedule (defined for 450k iterations) to this run
    new_decay = (np.array(config.lr_decay) / 450000) * total_nr_iters
    train_lr_decay = new_decay // (num_gpus * batch_per_gpu)
    train_dump_interval = number_of_training_instances // (num_gpus * batch_per_gpu)
    train_lr = base_lr * num_gpus
    bt_size = num_gpus * batch_per_gpu
    line = 'Num of GPUs:{}, learning rate:{:.5f}, batch size:{}, ' \
           'train_iter:{}, decay_iter:{}, dump_interval:{}'.format(
               num_gpus, train_lr, bt_size, train_iter, train_lr_decay,
               train_dump_interval)
    print(line)
    print("[-]Building network.")
    net = network.Network(args)
    net.cuda()
    best = 10e10
    epoch = 0
    if args.resume:
        print("Load base model from:",
              os.path.join(args.save_dir, args.output_name, 'dump_last.pth'))
        check_point = torch.load(
            os.path.join(args.save_dir, args.output_name, 'dump_last.pth'))
        net.load_state_dict(check_point['state_dict'])
        start_iter = check_point['step']
        if 'val_loss' in check_point:
            best = check_point['val_loss']
        epoch = start_iter // train_dump_interval + 1
    elif args.base_model:
        print("Load base model from:", args.base_model)
        check_point = torch.load(args.base_model)
        net.load_state_dict(check_point['state_dict'], strict=False)
        start_iter = 0
    else:
        start_iter = 0
    net = nn.DataParallel(net)
    # set the optimizer with momentum and weight decay
    optimizer = optim.SGD(net.parameters(), lr=train_lr,
                          momentum=config.momentum,
                          weight_decay=config.weight_decay)
    if start_iter >= train_lr_decay[0]:
        optimizer.param_groups[0]['lr'] = train_lr / 10
    if start_iter >= train_lr_decay[1]:
        optimizer.param_groups[0]['lr'] = train_lr / 100
    net.train()
    logger = Logger(args)
    iter_tqdm = None
    val_tqdm = None
    for step in range(start_iter, train_iter):
        # linear warmup from 0.1x to 1x of the base learning rate
        if step < config.warm_iter:
            alpha = step / config.warm_iter
            lr_new = 0.1 * train_lr + 0.9 * alpha * train_lr
            optimizer.param_groups[0]['lr'] = lr_new
        elif step == config.warm_iter:
            optimizer.param_groups[0]['lr'] = train_lr
        if step == train_lr_decay[0]:
            optimizer.param_groups[0]['lr'] = train_lr / 10
        elif step == train_lr_decay[1]:
            optimizer.param_groups[0]['lr'] = train_lr / 100
        # get training data
        images, gt_boxes, img_info, done_an_epoch, extra = process(
            args, training_data, num_gpus)
        if done_an_epoch:
            epoch += 1
        optimizer.zero_grad()
        # forward
        outputs = net(images, img_info, gt_boxes, extra=extra)
        # collect the loss
        total_loss = sum([outputs[key].mean() for key in outputs.keys()])
        total_loss.backward()
        optimizer.step()
        # statistics
        stastic_total_loss = total_loss.cpu().data.numpy()
        line = '[*]Epoch:{} iter<{}> lr:{:.5f}, loss:{:.4f}'.format(
            epoch, step, optimizer.param_groups[0]['lr'],
            float(stastic_total_loss))
        if step % config.log_dump_interval == 0:
            logger.scalar_summary('lr', optimizer.param_groups[0]['lr'], step)
            for k, v in outputs.items():
                v = float(np.mean(v.cpu().data.numpy()))
                logger.scalar_summary(k, v, step)
                line += ', ' + k + ':{:.4}'.format(v)
            logger.scalar_summary('total_loss', float(stastic_total_loss), step)
        else:
            for k, v in outputs.items():
                v = float(np.mean(v.cpu().data.numpy()))
                line += ', ' + k + ':{:.4}'.format(v)
        if iter_tqdm is None:
            iter_tqdm = tqdm(total=train_iter, desc='Iteration')
            iter_tqdm.update(start_iter)
        iter_tqdm.set_description("[-] " + line)
        iter_tqdm.refresh()
        # save checkpoints at epoch boundaries
        if done_an_epoch:
            if args.save_per_epoch > 0:
                if (epoch + 1) % args.save_per_epoch == 0:
                    fpath = os.path.join(saveDir, 'dump_{}.pth'.format(epoch))
                    print('[.] Saving :', fpath)
                    model = dict(epoch=epoch, step=step,
                                 state_dict=net.module.state_dict(),
                                 optimizer=optimizer.state_dict())
                    torch.save(model, fpath)
            fpath = os.path.join(saveDir, 'dump_last.pth')
            print('[.] Saving :', fpath)
            model = dict(epoch=epoch, step=step,
                         state_dict=net.module.state_dict(),
                         optimizer=optimizer.state_dict())
            torch.save(model, fpath)
            net.train()
        iter_tqdm.update(1)
    iter_tqdm.close()
    fpath = os.path.join(saveDir, 'dump_last.pth')
    print('[.] Saving :', fpath)
    model = dict(step=step,
                 state_dict=net.module.state_dict(),
                 optimizer=optimizer.state_dict())
    torch.save(model, fpath)