Code Example #1
File: train_0.py  Project: chicm/detect
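# Assumed imports for this excerpt (not shown in the scraped snippet):
#   import time
#   import torch
#   import torch.optim as optim
#   from torch.autograd import Variable
# plus project-local names: settings, get_train_loader, RetinaNet, FocalLoss,
# running_loss, log, and a module-level batch_size.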
def run_train(args):
    assert torch.cuda.is_available(), 'Error: CUDA not found!'
    best_loss = float('inf')  # best test loss
    start_epoch = 0  # start from epoch 0 or last epoch

    # Data
    print('==> Preparing data..')

    trainloader = get_train_loader(img_dir=settings.IMG_DIR,
                                   batch_size=batch_size)
    #trainloader = get_small_train_loader()
    print(trainloader.num)
    #testloader = get_train_loader(img_dir=settings.IMG_DIR)

    # Model
    net = RetinaNet()
    #net.load_state_dict(torch.load('./model/net.pth'))
    net.load_state_dict(torch.load('./ckps/best_0.pth'))
    net = torch.nn.DataParallel(net,
                                device_ids=range(torch.cuda.device_count()))
    net.cuda()

    criterion = FocalLoss()
    #optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=1e-4)
    optimizer = optim.Adam(net.parameters(), lr=args.lr)

    iter_save = 200
    bgtime = time.time()
    # Training
    for epoch in range(start_epoch, start_epoch + 100):
        print('\nEpoch: %d' % epoch)
        net.train()
        #net.module.freeze_bn()
        train_loss = 0
        for batch_idx, (inputs, loc_targets,
                        cls_targets) in enumerate(trainloader):
            inputs = Variable(inputs.cuda())
            loc_targets = Variable(loc_targets.cuda())
            cls_targets = Variable(cls_targets.cuda())

            optimizer.zero_grad()
            loc_preds, cls_preds = net(inputs)
            loss = criterion(loc_preds, loc_targets, cls_preds, cls_targets)
            loss.backward()
            optimizer.step()

            #train_loss += loss.data[0]
            sample_num = (batch_idx + 1) * batch_size
            avg_loss = running_loss(loss.item())  # loss.data[0] is pre-0.4 PyTorch; .item() is the modern equivalent
            print(
                'Epoch: {}, num: {}/{} train_loss: {:.3f} | run_loss: {:.3f} min: {:.1f}'
                .format(epoch, sample_num, trainloader.num, loss.item(),
                        avg_loss, (time.time() - bgtime) / 60),
                end='\r')

            if batch_idx % iter_save == 0:
                torch.save(
                    net.module.state_dict(),
                    './ckps/best_{}.pth'.format(batch_idx // iter_save % 5))
                log.info('batch: {}, loss: {:.4f}'.format(batch_idx, avg_loss))
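
The running_loss helper called above is project-local and not shown. A minimal sketch of what it plausibly does, assuming it tracks an exponential moving average of the per-batch loss (the helper name comes from the call site; the momentum parameter and its default are assumptions):

_ema = None

def running_loss(batch_loss, momentum=0.9):
    # Hypothetical EMA of the per-batch loss, matching the call site above.
    global _ema
    _ema = batch_loss if _ema is None else momentum * _ema + (1 - momentum) * batch_loss
    return _ema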
Code Example #2
File: train4.py  Project: jiaojiechu/retinanet1
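        # (Excerpt starts mid-function: the 'state' checkpoint dict saved below is assembled just above this point.)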
        }
        # ckpt_path = os.path.join('ckpts', args.exp)
        ckpt_path = '.store'
        if not os.path.isdir(ckpt_path):
            os.makedirs(ckpt_path)
        torch.save(state, os.path.join(ckpt_path, 'ckpt.pth'))
        best_loss = loss


for epoch in range(start_epoch + 1, start_epoch + cfg.num_epochs + 1):
    if epoch in cfg.lr_decay_epochs:
        lr *= 0.1
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
    print('\nTrain Epoch: %d' % epoch)
    net.train()

    train_loss = 0

    for batch_idx, (inputs, loc_targets, cls_targets) in enumerate(trainloader):
        # print(np.any(np.isnan(inputs.numpy())))
        # print(np.any(np.isnan(loc_targets.numpy())))
        # print(np.any(np.isnan(cls_targets.numpy())))
        # ipdb.set_trace()
        pos = cls_targets > 0
        # pos1=cls_targets ==0
        # pos2=cls_targets ==-1

        print(pos.data.long().sum())
        inputs = Variable(inputs.cuda())
        loc_targets = Variable(loc_targets.cuda())
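
The pos = cls_targets > 0 mask above follows the usual RetinaNet target encoding, where class 0 is background and -1 marks ignored anchors, so the printed sum is the number of positive anchors in the batch. A minimal sketch (standard practice, not code from this project) of how such a mask restricts the localization loss to positive anchors:

import torch.nn.functional as F

def masked_loc_loss(loc_preds, loc_targets, cls_targets):
    # Smooth-L1 loss over positive anchors only, normalized by their count.
    pos = cls_targets > 0                         # [N, #anchors] boolean mask
    num_pos = pos.sum().clamp(min=1)              # avoid division by zero
    mask = pos.unsqueeze(2).expand_as(loc_preds)  # broadcast to [N, #anchors, 4]
    loss = F.smooth_l1_loss(loc_preds[mask], loc_targets[mask], reduction='sum')
    return loss / num_pos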
Code Example #3
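# Assumed imports for this excerpt (not shown in the scraped snippet):
#   import os, time
#   import numpy as np
#   import cv2
#   import torch
#   import torch.optim as optim
#   from torch.autograd import Variable
#   from subprocess import Popen, PIPE
#   from torch.utils.tensorboard import SummaryWriter   # or tensorboardX
# plus project-local names: parse_args, ListDataset, Augmentation_traininig,
# RetinaNet, FocalLoss, OHEM_loss, DataEncoder, adjust_learning_rate.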
def train():
    args = parse_args()

    assert torch.cuda.is_available(), 'Error: CUDA not found!'
    assert args.focal_loss, "OHEM + ce_loss is not working... :("

    if not os.path.exists(args.save_folder):
        os.mkdir(args.save_folder)

    if not os.path.exists(args.logdir):
        os.mkdir(args.logdir)

    ###########################################################################
    # Data
    ###########################################################################

    print('==> Preparing data..')
    trainset = ListDataset(root='/mnt/9C5E1A4D5E1A2116/datasets/',
                           dataset=args.dataset,
                           train=True,
                           transform=Augmentation_traininig,
                           input_size=args.input_size,
                           multi_scale=args.multi_scale)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=args.num_workers,
                                              collate_fn=trainset.collate_fn)

    ###########################################################################

    # Training detail options
    stepvalues = (10000, 20000, 30000, 40000, 50000) if args.dataset in ["SynthText"] \
        else (2000, 4000, 6000, 8000, 10000)
    best_loss = float('inf')  # best test loss
    start_epoch = 0  # start from epoch 0 or last epoch
    iteration = 0
    cur_lr = args.lr
    mean = (0.485, 0.456, 0.406)
    var = (0.229, 0.224, 0.225)
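    # mean/var above are the standard ImageNet channel means and stds ('var'
    # actually holds stds); they are used later to de-normalize inputs for
    # TensorBoard visualization.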
    step_index = 0
    pEval = None

    ###########################################################################
    # Model
    ###########################################################################

    # set model (focal_loss vs OHEM_CE loss)
    if args.focal_loss:
        imagenet_pretrain = 'weights/retinanet_se50.pth'
        criterion = FocalLoss()
        num_classes = 1
    else:
        imagenet_pretrain = 'weights/retinanet_se50_OHEM.pth'
        criterion = OHEM_loss()
        num_classes = 2

    net = RetinaNet(num_classes)

    # Restore model weights
    net.load_state_dict(torch.load(imagenet_pretrain))

    if args.resume:
        print('==> Resuming from checkpoint..', args.resume)
        checkpoint = torch.load(args.resume)
        net.load_state_dict(checkpoint['net'])
        #start_epoch = checkpoint['epoch']
        #iteration = checkpoint['iteration']
        #cur_lr = checkpoint['lr']
        #step_index = checkpoint['step_index']
        # optimizer.load_state_dict(state["optimizer"])

    print("multi_scale : ", args.multi_scale)
    print("input_size : ", args.input_size)
    print("stepvalues : ", stepvalues)
    print("start_epoch : ", start_epoch)
    print("iteration : ", iteration)
    print("cur_lr : ", cur_lr)
    print("step_index : ", step_index)
    print("num_gpus : ", torch.cuda.device_count())

    # Data parallelism for multi-gpu training
    net = torch.nn.DataParallel(net,
                                device_ids=range(torch.cuda.device_count()))
    net.cuda()

    # Put model in training mode and freeze batch norm.
    net.train()
    net.module.freeze_bn()  # you must freeze batchnorm

    ###########################################################################
    # Optimizer
    ###########################################################################

    optimizer = optim.SGD(net.parameters(),
                          lr=cur_lr,
                          momentum=0.9,
                          weight_decay=1e-4)
    #optimizer = optim.Adam(net.parameters(), lr=cur_lr)

    ###########################################################################
    # Utils
    ###########################################################################

    encoder = DataEncoder()
    writer = SummaryWriter(log_dir=args.logdir)

    ###########################################################################
    # Training loop
    ###########################################################################

    t0 = time.time()
    for epoch in range(start_epoch, 10000):
        if iteration > args.max_iter:
            break

        for inputs, loc_targets, cls_targets in trainloader:
            inputs = Variable(inputs.cuda())
            loc_targets = Variable(loc_targets.cuda())
            cls_targets = Variable(cls_targets.cuda())

            optimizer.zero_grad()
            loc_preds, cls_preds = net(inputs)

            loc_loss, cls_loss = criterion(loc_preds, loc_targets, cls_preds,
                                           cls_targets)
            loss = loc_loss + cls_loss
            loss.backward()
            optimizer.step()

            if iteration % 20 == 0:
                t1 = time.time()

                print(
                    'iter ' + repr(iteration) + ' (epoch ' + repr(epoch) +
                    ') || loss: %.4f || loc_loss: %.4f || cls_loss: %.4f (Time : %.1f)'
                    % (loss.sum().item(), loc_loss.sum().item(),
                       cls_loss.sum().item(), (t1 - t0)))
                # t0 = time.time()

                writer.add_scalar('loc_loss', loc_loss.sum().item(), iteration)
                writer.add_scalar('cls_loss', cls_loss.sum().item(), iteration)
                writer.add_scalar('loss', loss.sum().item(), iteration)

                # show inference image in tensorboard
                infer_img = np.transpose(inputs[0].cpu().numpy(), (1, 2, 0))
                infer_img *= var
                infer_img += mean
                infer_img *= 255.
                infer_img = np.clip(infer_img, 0, 255)
                infer_img = infer_img.astype(np.uint8)
                h, w, _ = infer_img.shape

                boxes, labels, scores = encoder.decode(loc_preds[0],
                                                       cls_preds[0], (w, h))
                boxes = boxes.reshape(-1, 4, 2).astype(np.int32)

                if boxes.shape[0] != 0:
                    # infer_img = infer_img/np.float32(255)

                    # print(boxes)
                    # print(
                    #     f"infer_img prior to cv2.polylines - dtype: {infer_img.dtype}, shape: {infer_img.shape}, min: {infer_img.min()}, max: {infer_img.max()}")
                    # print(
                    #     f"boxes prior to cv2.polylines - dtype: {boxes.dtype}, shape: {boxes.shape}, min: {boxes.min()}, max: {boxes.max()}")
                    infer_img = cv2.polylines(infer_img.copy(), boxes, True,
                                              (0, 255, 0), 4)

                # print(
                #     f"infer_img - dtype: {infer_img.dtype}, shape: {infer_img.shape}, min: {infer_img.min()}, max: {infer_img.max()}")

                writer.add_image('image',
                                 infer_img,
                                 iteration,
                                 dataformats="HWC")
                writer.add_scalar('input_size', h, iteration)
                writer.add_scalar('learning_rate', cur_lr, iteration)

                t0 = time.time()

            if iteration % args.save_interval == 0 and iteration > 0:
                print('Saving state, iter : ', iteration)
                state = {
                    'net': net.module.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    'iteration': iteration,
                    'epoch': epoch,
                    'lr': cur_lr,
                    'step_index': step_index
                }
                model_file = args.save_folder + \
                    'ckpt_' + repr(iteration) + '.pth'
                torch.save(state, model_file)

            if iteration in stepvalues:
                step_index += 1
                cur_lr = adjust_learning_rate(cur_lr, optimizer, args.gamma,
                                              step_index)

            if iteration > args.max_iter:
                break

            if args.evaluation and iteration % args.eval_step == 0:
                try:
                    if pEval is None:
                        print("Evaluation started at iteration {} on IC15...".
                              format(iteration))
                        eval_cmd = "CUDA_VISIBLE_DEVICES=" + str(args.eval_device) + \
                            " python eval.py" + \
                            " --tune_from=" + args.save_folder + 'ckpt_' + repr(iteration) + '.pth' + \
                            " --input_size=1024" + \
                            " --output_zip=result_temp1"

                        pEval = Popen(eval_cmd,
                                      shell=True,
                                      stdout=PIPE,
                                      stderr=PIPE)

                    elif pEval.poll() is not None:
                        (scorestring, stderrdata) = pEval.communicate()

                        hmean = float(
                            str(scorestring).strip().split(":")[3].split(",")
                            [0].split("}")[0].strip())

                        writer.add_scalar('test_hmean', hmean, iteration)

                        print("test_hmean for {}-th iter : {:.4f}".format(
                            iteration, hmean))

                        if pEval is not None:
                            pEval.kill()
                        pEval = None

                except Exception as e:
                    print("exception happened in evaluation ", e)
                    if pEval is not None:
                        pEval.kill()
                    pEval = None

            iteration += 1
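
The adjust_learning_rate helper used above is not defined in the excerpt. A minimal sketch consistent with its call site, cur_lr = adjust_learning_rate(cur_lr, optimizer, args.gamma, step_index), assuming a step-decay scheme that scales the current rate by gamma at each milestone (the original project's exact formula may differ, e.g. it might compute base_lr * gamma ** step_index instead):

def adjust_learning_rate(cur_lr, optimizer, gamma, step_index):
    # Step decay: scale the current learning rate by gamma and apply it
    # to every parameter group; return the new rate so the caller can track it.
    new_lr = cur_lr * gamma
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr
    return new_lr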