Example #1
0
def run_test():
    parser = argparse.ArgumentParser()
    # parser.add_argument('--resume_weights', '-r', default=None, type=str)
    parser.add_argument('--start_epoch', '-s', default=30, type=int)
    parser.add_argument('--end_epoch', '-e', default=50, type=int)
    parser.add_argument('--devices', '-d', default=1, type=int)
    args = parser.parse_args()
    # eval_all(args)

    # model_path
    model_dir = config.model_dir
    eval_dir = config.eval_dir
    # misc_utils.ensure_dir(evalDir)
    ensure_dir(config.eval_dir)
    records = load_json_lines(config.eval_source)

    start_epoch, end_epoch = args.start_epoch, args.end_epoch
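    # evaluate every checkpoint in [start_epoch, end_epoch) that exists on disk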
    for epoch in range(start_epoch, end_epoch):
        model_file = osp.join(model_dir, 'epoch-{}.pkl'.format(epoch))
        if not osp.exists(model_file):
            continue
        results = eval_all(model_file, records, args)
        
        fpath = osp.join(eval_dir, 'epoch-{}.human'.format(epoch))
        save_json_lines(results, fpath)
Example #2
0
def eval_all(args):
    # model_path
    saveDir = config.model_dir
    evalDir = config.eval_dir
    misc_utils.ensure_dir(evalDir)
    model_file = os.path.join(saveDir,
                              'epoch_{}.pkl'.format(args.resume_weights))
    assert os.path.exists(model_file)
    # load data
    records = misc_utils.load_json_lines(config.eval_source)
    # multiprocessing
    num_records = len(records)
    num_devs = args.devices
    num_image = math.ceil(num_records / num_devs)
    result_queue = Queue(1000)
    procs = []
    all_results = []
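    # shard the records evenly across the devices and run one inference worker per shard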
    for i in range(num_devs):
        start = i * num_image
        end = min(start + num_image, num_records)
        split_records = records[start:end]
        proc = Process(target=inference,
                       args=(model_file, i, split_records, result_queue))
        proc.start()
        procs.append(proc)
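    # drain the shared queue as workers finish; each worker pushes one result per record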
    pbar = tqdm(total=num_records, ncols=50)
    for i in range(num_records):
        t = result_queue.get()
        all_results.append(t)
        pbar.update(1)
    for p in procs:
        p.join()
    fpath = os.path.join(evalDir, 'dump-{}.json'.format(args.resume_weights))
    misc_utils.save_json_lines(all_results, fpath)
Example #3
0
def train(args):
    # ------------------------ begin training -------------------------- #
    valid_nr_dev = mge.get_device_count("gpu")
    gpu_num = min(valid_nr_dev, args.num_gpus)
    assert gpu_num > 0
    logger.info('Device Count: {}'.format(gpu_num))

    ensure_dir(cfg.model_dir)

    if not osp.exists('output'):
        os.symlink(cfg.output_dir, 'output')

    if gpu_num > 1:
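        # spawn one training worker process per GPU; rank i drives device i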
        args.port = find_free_port()
        mp.set_start_method("spawn")
        processes = list()
        for i in range(gpu_num):
            process = mp.Process(target=worker, args=(i, gpu_num, args))
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        worker(0, 1, args)
Example #4
0
def inference(args):
    @jit.trace(symbolic=False)
    def val_func():
        pred_boxes = net(net.inputs)
        return pred_boxes

    # model path
    saveDir = config.model_dir
    evalDir = config.eval_dir
    misc_utils.ensure_dir(evalDir)
    model_file = os.path.join(saveDir,
                              'epoch_{}.pkl'.format(args.resume_weights))
    assert os.path.exists(model_file)
    # load model
    net = network.Network()
    net.eval()
    check_point = mge.load(model_file)
    net.load_state_dict(check_point['state_dict'])
    ori_image, image, im_info = get_data(args.img_path)
    net.inputs["image"].set_value(image.astype(np.float32))
    net.inputs["im_info"].set_value(im_info)
    pred_boxes = val_func().numpy()
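    # assign a foreground class tag (1..num_classes-1) to every box, matching the per-class top_k layout of pred_boxes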
    num_tag = config.num_classes - 1
    target_shape = (pred_boxes.shape[0] // num_tag // top_k, top_k)
    pred_tags = (np.arange(num_tag) + 1).reshape(-1, 1)
    pred_tags = np.tile(pred_tags, target_shape).reshape(-1, 1)
    # nms
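    # set NMS tags each group of top_k boxes with a shared identifier so that
    # boxes from the same group do not suppress each other; otherwise plain NMS is used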
    if if_set_nms:
        from set_nms_utils import set_cpu_nms
        n = pred_boxes.shape[0] // top_k
        idents = np.tile(np.arange(n)[:, None], (1, top_k)).reshape(-1, 1)
        pred_boxes = np.hstack((pred_boxes, idents))
        keep = pred_boxes[:, -2] > args.thresh
        pred_boxes = pred_boxes[keep]
        pred_tags = pred_tags[keep]
        keep = set_cpu_nms(pred_boxes, 0.5)
        pred_boxes = pred_boxes[keep][:, :-1]
        pred_tags = pred_tags[keep]
    else:
        from set_nms_utils import cpu_nms
        keep = pred_boxes[:, -1] > args.thresh
        pred_boxes = pred_boxes[keep]
        pred_tags = pred_tags[keep]
        keep = cpu_nms(pred_boxes, 0.5)
        pred_boxes = pred_boxes[keep]
        pred_tags = pred_tags[keep]
    pred_tags = pred_tags.astype(np.int32).flatten()
    pred_tags_name = np.array(config.class_names)[pred_tags]
    visual_utils.draw_boxes(ori_image, pred_boxes[:, :-1], pred_boxes[:, -1],
                            pred_tags_name)
    name = args.img_path.split('/')[-1].split('.')[-2]
    fpath = '/data/jupyter/{}.png'.format(name)
    cv2.imwrite(fpath, ori_image)
Example #5
0
def inference(args):
    @jit.trace(symbolic=False)
    def val_func():
        pred_boxes = net(net.inputs)
        return pred_boxes

    # model path
    saveDir = config.model_dir
    evalDir = config.eval_dir
    misc_utils.ensure_dir(evalDir)
    model_file = os.path.join(saveDir,
                              'epoch_{}.pkl'.format(args.resume_weights))
    assert os.path.exists(model_file)
    # load model
    net = network.Network()
    net.eval()
    check_point = mge.load(model_file)
    net.load_state_dict(check_point['state_dict'])
    image, im_info = get_data(args.img_path)
    net.inputs["image"].set_value(image.astype(np.float32))
    net.inputs["im_info"].set_value(im_info)
    pred_boxes = val_func().numpy()
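    # assign a foreground class tag (1..num_classes-1) to every box, following the per-class top_k layout of pred_boxes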
    num_tag = config.num_classes - 1
    target_shape = (pred_boxes.shape[0] // num_tag // top_k, top_k)
    pred_tags = (np.arange(num_tag) + 1).reshape(-1, 1)
    pred_tags = np.tile(pred_tags, target_shape).reshape(-1, 1)
    # nms
    if if_set_nms:
        from set_nms_utils import set_cpu_nms
        n = pred_boxes.shape[0] // top_k
        idents = np.tile(np.arange(n)[:, None], (1, top_k)).reshape(-1, 1)
        pred_boxes = np.hstack((pred_boxes, idents))
        keep = pred_boxes[:, -2] > 0.05
        pred_boxes = pred_boxes[keep]
        pred_tags = pred_tags[keep]
        keep = set_cpu_nms(pred_boxes, 0.5)
        pred_boxes = pred_boxes[keep][:, :-1]
        pred_tags = pred_tags[keep].flatten()
    else:
        from set_nms_utils import cpu_nms
        keep = pred_boxes[:, -1] > 0.05
        pred_boxes = pred_boxes[keep]
        pred_tags = pred_tags[keep]
        keep = cpu_nms(pred_boxes, 0.5)
        pred_boxes = pred_boxes[keep]
        pred_tags = pred_tags[keep].flatten()
    result_dict = dict(height=int(im_info[0, -2]),
                       width=int(im_info[0, -1]),
                       dtboxes=boxes_dump(pred_boxes, pred_tags))
    name = args.img_path.split('/')[-1].split('.')[-2]
    misc_utils.save_json_lines([result_dict], '{}.json'.format(name))
Example #6
0
def run_test():

    parser = argparse.ArgumentParser()
    parser.add_argument('--start_epoch', '-s', default=30, type=int)
    parser.add_argument('--end_epoch', '-e', default=50, type=int)
    parser.add_argument('--devices', '-d', default=1, type=int)
    args = parser.parse_args()

    # model_path
    model_dir = config.model_dir
    eval_dir = config.eval_dir
    ensure_dir(config.eval_dir)
    records = load_json_lines(config.eval_source)

    start_epoch, end_epoch = args.start_epoch, args.end_epoch
    for epoch in range(start_epoch, end_epoch):
        model_file = osp.join(model_dir, 'epoch-{}.pkl'.format(epoch))
        if not osp.exists(model_file):
            continue
        print('Processing {}'.format(osp.basename(model_file)))
        results = eval_all(model_file, records, args)
        
        fpath = osp.join(eval_dir, 'epoch-{}.human'.format(epoch))
        save_json_lines(results, fpath)
Example #7
0
import os.path as osp

from tqdm import tqdm
import megengine as mge
from megengine import distributed as dist
from megengine import optimizer as optim
import megengine.autodiff as autodiff
from megengine import jit
# import dataset
import network
from config import config as cfg
from dataset.CrowdHuman import CrowdHuman
from misc_utils import ensure_dir
from megengine.core._imperative_rt.utils import Logger
from megengine import data
import pdb

ensure_dir(cfg.output_dir)
logger = mge.get_logger(__name__)
log_path = osp.join(cfg.output_dir, 'logger.log')
mge.set_log_file(log_path, mode='a')
Logger.set_log_level(Logger.LogLevel.Error)

def find_free_port():
    import socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # Binding to port 0 will cause the OS to find an available port for us
    sock.bind(("", 0))
    port = sock.getsockname()[1]
    sock.close()
    # NOTE: there is still a chance the port could be taken by other processes.
    return port

def allreduce_cb(param, grad, group=dist.WORLD):
Example #8
0
def train(params):
    total_nr_iters = config.train_base_iters
    batch_per_gpu = config.train_batch_per_gpu
    base_lr = config.base_lr
    line = 'network.base_lr.{}.train_iter.{}'.format(base_lr, total_nr_iters)
    print(line)
    # set model save path and log save path
    saveDir = config.model_dir
    misc_utils.ensure_dir(saveDir)
    fpath = os.path.join(config.output_dir, line+'.log')
    fid_log = open(fpath, 'a')
    # set data input pipe
    program_name = config.program_name
    # check gpus
    torch.set_default_tensor_type('torch.FloatTensor')
    if not torch.cuda.is_available():
        print('No GPU exists!')
        return
    else:
        num_gpus = torch.cuda.device_count()
        train_iter = total_nr_iters // (num_gpus * batch_per_gpu)
        train_lr_decay = np.array(config.lr_decay) // (num_gpus * batch_per_gpu)
        train_dump_interval = config.model_dump_interval // (num_gpus * batch_per_gpu)
    train_lr = base_lr * num_gpus
    bt_size = num_gpus * batch_per_gpu
    line = ('Num of GPUs:{}, learning rate:{:.5f}, batch size:{}, '
            'train_iter:{}, decay_iter:{}, dump_interval:{}'.format(
                num_gpus, train_lr, bt_size, train_iter, train_lr_decay,
                train_dump_interval))
    print(line)
    print("Building netowrk.")
    net = network.Network()
    # Moves all model parameters and buffers to the GPU.
    net.cuda()
    if params.resume_weights:
        model_file = os.path.join(saveDir, 'dump-{}.pth'.format(params.resume_weights))
        check_point = torch.load(model_file)
        net.load_state_dict(check_point['state_dict'])
    net = nn.DataParallel(net)
    # set the optimizer, use momentum and weight_decay
    optimizer = optim.SGD(net.parameters(), lr=train_lr,
                          momentum=config.momentum,
                          weight_decay=config.weight_decay)
    # check if resume training
    training_data = train_dataset()

    net.train()

    if params.progressbar:
        tqdm.monitor_interval = 0
        pbar = tqdm(total=train_iter, leave=False, ascii=True)

    dump_num = 1
    start_iter = 0
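    # when resuming, fast-forward the iteration counter and re-apply any LR decay steps already passed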
    if params.resume_weights:
        start_iter = int(params.resume_weights) * train_dump_interval
        if start_iter >= train_lr_decay[0]:
            optimizer.param_groups[0]['lr'] = train_lr / 10
        if start_iter >= train_lr_decay[1]:
            optimizer.param_groups[0]['lr'] = train_lr / 100
        dump_num = int(params.resume_weights) + 1

    for step in range(start_iter, train_iter):
        # warm up
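        # linearly ramp the LR from 0.1x to 1.0x of train_lr over the first warm_iter steps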
        if step < config.warm_iter:
            alpha = step / config.warm_iter
            lr_new = 0.1 * train_lr + 0.9 * alpha * train_lr
            optimizer.param_groups[0]['lr'] = lr_new
        elif step == config.warm_iter:
            optimizer.param_groups[0]['lr'] = train_lr
        if step == train_lr_decay[0]:
            optimizer.param_groups[0]['lr'] = train_lr / 10
        elif step == train_lr_decay[1]:
            optimizer.param_groups[0]['lr'] = train_lr / 100
        # get training data
        images, gt_boxes, img_info = process(training_data, num_gpus)
        optimizer.zero_grad()
        # forward
        outputs = net(images, img_info, gt_boxes)
        # collect the loss
        total_loss = sum([outputs[key].mean() for key in outputs.keys()])
        total_loss.backward()
        optimizer.step()
        if params.progressbar:
            pbar.update(1)
        # statistics
        if step % config.log_dump_interval == 0:
            stastic_total_loss = total_loss.cpu().data.numpy()
            line = 'Iter {}: lr:{:.5f}, loss is {:.4f}.'.format(
                step, optimizer.param_groups[0]['lr'], stastic_total_loss)
            print(outputs)
            print(line)
            fid_log.write(line + '\n')
            fid_log.flush()
        # save the model
        if (step + 1) % train_dump_interval == 0:
            fpath = os.path.join(saveDir, 'dump-{}.pth'.format(dump_num))
            dump_num += 1
            model = dict(epoch=step,
                         state_dict=net.module.state_dict(),
                         optimizer=optimizer.state_dict())
            torch.save(model, fpath)

    if params.progressbar:
        pbar.close()

    fid_log.close()
Example #9
0
def train(args):
    if isinstance(config.train_source, list):
        training_data = multi_train_dataset(args)
    else:
        training_data = train_dataset(args)
    number_of_training_instances = next(training_data)
    val_data = eval_dataset(args)
    number_of_val_instances = next(val_data)

    total_nr_iters = args.epochs * number_of_training_instances
    batch_per_gpu = config.train_batch_per_gpu

    base_lr = config.base_lr
    line = 'network.base_lr.{}.train_iter.{}'.format(base_lr, total_nr_iters)

    print(line)

    # set model save path and log save path
    saveDir = config.model_dir
    misc_utils.ensure_dir(saveDir)

    # set data input pipe
    program_name = config.program_name
    # check gpus
    torch.set_default_tensor_type('torch.FloatTensor')
    if not torch.cuda.is_available():
        print('No GPU exists!')
        return
    else:
        num_gpus = torch.cuda.device_count()

        train_iter = total_nr_iters // (num_gpus * batch_per_gpu)

        print('[-]', num_gpus, batch_per_gpu, total_nr_iters)

        # rescale the configured LR decay points to this run's iteration budget
        # (450000 presumably being the reference schedule the config values were tuned for)
        new_decay = (np.array(config.lr_decay) / 450000) * total_nr_iters

        train_lr_decay = new_decay // (num_gpus * batch_per_gpu)

        train_dump_interval = number_of_training_instances // (num_gpus *
                                                               batch_per_gpu)

    train_lr = base_lr * num_gpus
    bt_size = num_gpus * batch_per_gpu

    line = ('Num of GPUs:{}, learning rate:{:.5f}, batch size:{}, '
            'train_iter:{}, decay_iter:{}, dump_interval:{}'.format(
                num_gpus, train_lr, bt_size, train_iter, train_lr_decay,
                train_dump_interval))
    print(line)

    print("[-]Building netowrk.")
    net = network.Network(args)
    net.cuda()

    best = 10e10
    epoch = 0
    if args.resume:
        print("Load base model from :",
              os.path.join(args.save_dir, args.output_name, 'dump_last.pth'))
        check_point = torch.load(
            os.path.join(args.save_dir, args.output_name, 'dump_last.pth'))
        net.load_state_dict(check_point['state_dict'])
        start_iter = check_point['step']
        if 'val_loss' in check_point:
            best = check_point['val_loss']
        epoch = start_iter // train_dump_interval + 1
    elif args.base_model:
        print("Load base model from :", args.base_model)
        check_point = torch.load(args.base_model)
        net.load_state_dict(check_point['state_dict'], strict=False)
        start_iter = 0
    else:
        start_iter = 0

    net = nn.DataParallel(net)
    # set the optimizer, use momentum and weight_decay
    optimizer = optim.SGD(net.parameters(), lr=train_lr,
                          momentum=config.momentum,
                          weight_decay=config.weight_decay)

    if start_iter >= train_lr_decay[0]:
        optimizer.param_groups[0]['lr'] = train_lr / 10
    if start_iter >= train_lr_decay[1]:
        optimizer.param_groups[0]['lr'] = train_lr / 100

    # check if resume training
    net.train()
    logger = Logger(args)

    iter_tqdm = None
    val_tqdm = None
    for step in range(start_iter, train_iter):
        # warm up
        if step < config.warm_iter:
            alpha = step / config.warm_iter
            lr_new = 0.1 * train_lr + 0.9 * alpha * train_lr
            optimizer.param_groups[0]['lr'] = lr_new
        elif step == config.warm_iter:
            optimizer.param_groups[0]['lr'] = train_lr
        if step == train_lr_decay[0]:
            optimizer.param_groups[0]['lr'] = train_lr / 10
        elif step == train_lr_decay[1]:
            optimizer.param_groups[0]['lr'] = train_lr / 100
        # get training data
        images, gt_boxes, img_info, done_an_epoch, extra = process(
            args, training_data, num_gpus)
        if done_an_epoch:
            epoch += 1
        optimizer.zero_grad()
        # forward
        outputs = net(images, img_info, gt_boxes, extra=extra)
        # collect the loss
        total_loss = sum([outputs[key].mean() for key in outputs.keys()])
        total_loss.backward()
        optimizer.step()

        # statistics
        stastic_total_loss = total_loss.cpu().data.numpy()
        line = '[*]Epoch:{} iter<{}> lr:{:.5f}, loss:{:.4f}'.format(
            epoch, step, optimizer.param_groups[0]['lr'],
            float(stastic_total_loss))

        if step % config.log_dump_interval == 0:
            logger.scalar_summary('lr', optimizer.param_groups[0]['lr'], step)
            for k, v in outputs.items():
                v = float(np.mean(v.cpu().data.numpy()))
                logger.scalar_summary(k, v, step)
                line += ', ' + k + ':{:.4}'.format(v)
            logger.scalar_summary('total_loss', float(stastic_total_loss),
                                  step)
        else:
            for k, v in outputs.items():
                v = float(np.mean(v.cpu().data.numpy()))
                line += ', ' + k + ':{:.4}'.format(v)
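        # lazily create the progress bar on the first step and fast-forward it to start_iter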
        if iter_tqdm is None:
            iter_tqdm = tqdm(total=train_iter, desc='Iteration')
            iter_tqdm.update(start_iter)
        iter_tqdm.set_description("[-] " + line)
        iter_tqdm.refresh()
        # save the best model
        if done_an_epoch:
            if args.save_per_epoch > 0:
                if (epoch + 1) % args.save_per_epoch == 0:
                    fpath = os.path.join(saveDir, 'dump_{}.pth'.format(epoch))
                    print('[.] Saving :', fpath)
                    model = dict(epoch=epoch,
                                 step=step,
                                 state_dict=net.module.state_dict(),
                                 optimizer=optimizer.state_dict())
                    torch.save(model, fpath)

            fpath = os.path.join(saveDir, 'dump_last.pth')
            print('[.] Saving :', fpath)
            model = dict(epoch=epoch,
                         step=step,
                         state_dict=net.module.state_dict(),
                         optimizer=optimizer.state_dict())
            torch.save(model, fpath)

        net.train()

        iter_tqdm.update(1)
    iter_tqdm.close()

    fpath = os.path.join(saveDir, 'dump_last.pth')
    print('[.] Saving :', fpath)
    model = dict(step=step,
                 state_dict=net.module.state_dict(),
                 optimizer=optimizer.state_dict())
    torch.save(model, fpath)