Example #1
0
def train(params):
    """Train a ``Darknet`` classifier and periodically evaluate it.

    The function creates a timestamped run directory under
    ``params.save_root``, logs to a file inside it, builds a training and an
    evaluation image pipeline, optionally resumes from a checkpoint, and then
    runs the train/eval loop. The checkpoint with the lowest evaluation loss
    is saved as ``<epoch>_<step>.pth``; on ``KeyboardInterrupt`` the current
    weights are saved as ``Interrupt_<epoch>_<step>.pth``.

    Args:
        params: Raw configuration accepted by ``Params``. The attributes read
            here are ``seed``, ``save_root``, ``project_name``, ``version``,
            ``num_gpus``, ``data_root``, ``train_set``, ``val_set``,
            ``load_weights`` (the string ``'None'`` disables resuming),
            ``optim``, ``learning_rate``, ``epoch``, ``save_interval`` and
            ``eval_interval``.

    Raises:
        SystemExit: With status 1 when ``params.load_weights`` cannot be
            loaded or its file name does not end in ``<epoch>_<step>``.
    """
    params = Params(params)

    set_random_seeds(params.seed)

    time_now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    run_name = f'{params.project_name}_{time_now}_{params.version}'
    params.save_root = params.save_root + f'/{run_name}'
    os.makedirs(params.save_root, exist_ok=True)

    # The level must be set explicitly: the root logger defaults to WARNING,
    # which would silently discard every logging.info() call below.
    logging.basicConfig(
        filename=f'{params.save_root}/{run_name}.log',
        filemode='a',
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s: %(message)s')

    if params.num_gpus == 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    logging.info(f'Available GPUs: {torch.cuda.device_count()}')

    # Train pipeline: the class label is the integer name of each image's
    # parent directory, shifted to be zero-based.
    files = glob.glob(
        os.path.join(params.data_root, params.project_name, params.train_set,
                     '*/*.JPEG'))
    labels = [
        int(os.path.basename(os.path.dirname(fp))) - 1 for fp in files
    ]
    train_pipeline = TrainImageDecoderPipeline(params=params,
                                               device_id=0,
                                               files=files,
                                               labels=labels)
    train_pipeline.build()
    train_pii = pytorchIterator(train_pipeline,
                                last_batch_policy=LastBatchPolicy.DROP,
                                reader_name='Reader',
                                auto_reset=True)

    # Evaluation pipeline: files are ordered by the token after the last
    # underscore in the file name.
    # NOTE(review): presumably this matches the line order of the ground-truth
    # file loaded below — confirm.
    files = glob.glob(
        os.path.join(params.data_root, params.project_name, params.val_set,
                     '*.JPEG'))
    files = sorted(
        files,
        key=lambda f: os.path.basename(f).split('_')[-1].split('.')[0])
    labels = loadlabel(
        os.path.join(
            params.data_root, params.project_name,
            'ILSVRC2012_devkit_t12/data/ILSVRC2012_validation_ground_truth.txt'
        ))
    eval_pipeline = EvalImageDecoderPipeline(params=params,
                                             device_id=0,
                                             files=files,
                                             labels=labels)
    eval_pipeline.build()
    eval_pii = pytorchIterator(eval_pipeline,
                               last_batch_policy=LastBatchPolicy.PARTIAL,
                               reader_name='Reader',
                               auto_reset=True)

    model = Darknet()

    last_step = 0
    last_epoch = 0
    if params.load_weights != 'None':
        try:
            state_dict = torch.load(params.load_weights)
            model.load_state_dict(state_dict)
            # Parse "<...>_<epoch>_<step>.pth" from the file name only; the
            # run directory itself contains underscores, so splitting the
            # whole path would pick up directory components.
            name_parts = os.path.basename(params.load_weights).split('_')
            last_step = int(name_parts[-1].split('.')[0])
            last_epoch = int(name_parts[-2])
        except Exception:
            logging.exception('Fail to resuming from weight!')
            raise SystemExit(1)

    if params.num_gpus > 0:
        model = model.cuda()
        if params.num_gpus > 1:
            model = nn.DataParallel(model)

    if params.optim == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=params.learning_rate)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=params.learning_rate,
                                    momentum=0.9,
                                    nesterov=True)

    criterion = nn.CrossEntropyLoss()

    epoch = 0
    begin_epoch = max(0, last_epoch)
    step = max(0, last_step)
    best_loss = float('inf')
    logging.info('Begin to train...')
    model.train()
    try:
        for epoch in range(begin_epoch, params.epoch):
            for batch_idx, data in enumerate(train_pii):
                x = data[0]['data']
                label = data[0]['label'].squeeze(-1).long().cuda()
                output = model(x)
                loss = criterion(output, label)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if batch_idx % params.save_interval == 0:
                    logging.info(
                        f'{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")} '
                        f'Train Epoch: {epoch} iter: {batch_idx} loss: {loss.item()}'
                    )
                step += 1

            if epoch % params.eval_interval == 0:
                model.eval()
                loss_sum = 0.0
                num_samples = 0
                predictions = []
                targets = []
                with torch.no_grad():
                    for data in eval_pii:
                        x = data[0]['data']
                        label = data[0]['label'].squeeze(-1).long().cuda()
                        output = model(x)
                        batch_loss = criterion(output, label).item()
                        loss_sum += batch_loss * x.shape[0]
                        num_samples += x.shape[0]
                        predictions.append(output)
                        targets.append(label)
                    # Average over the samples actually seen instead of a
                    # hard-coded validation-set size.
                    eval_loss = loss_sum / max(num_samples, 1)
                    predictions = torch.cat(predictions, dim=0)
                    targets = torch.cat(targets, dim=0)
                    acc = top1accuracy(predictions, targets)
                    acctop5 = top5accuracy(predictions, targets)
                    logging.info(
                        f'{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")} '
                        f'Eval Epoch: {epoch} loss: {eval_loss} accuracy: {acc} Top5 acc: {acctop5}'
                    )
                    if eval_loss < best_loss:
                        best_loss = eval_loss
                        save_checkpoint(
                            model, f'{params.save_root}/{epoch}_{step}.pth')
                model.train()

    except KeyboardInterrupt:
        # Keep the progress made so far when the user aborts the run.
        save_checkpoint(model,
                        f'{params.save_root}/Interrupt_{epoch}_{step}.pth')
Example #2
0
            model.load_darknet_weights(opt.pretrained_weights)

    # Get dataloader
    dataset = ListDataset(train_path,
                          augment=True,
                          multiscale=opt.multiscale_training)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.n_cpu,
        pin_memory=True,
        collate_fn=dataset.collate_fn,
    )

    optimizer = torch.optim.Adam(model.parameters())

    metrics = [
        "grid_size",
        "loss",
        "x",
        "y",
        "w",
        "h",
        "conf",
        "cls",
        "cls_acc",
        "recall50",
        "recall75",
        "precision",
        "conf_obj",
Example #3
0
def train(params):
    """Debug harness comparing an external-source pipeline with a torch loader.

    The function performs the same setup as a real training run (run
    directory, file logging, model, optional checkpoint resume, optimizer),
    then takes the first batch from both the ``ExternalSourcePipeline``
    iterator and the torch ``DataLoader``, writes them to ``x_pii.jpg`` and
    ``x_torch.jpg`` in the working directory, and exits. The actual training
    step is currently commented out.

    Args:
        params: Raw configuration accepted by ``Params``. The attributes read
            here are ``seed``, ``save_root``, ``project_name``, ``version``,
            ``num_gpus``, ``mean``, ``std``, ``data_root``, ``train_set``,
            ``batch_size``, ``load_weights`` (the string ``'None'`` disables
            resuming), ``optim``, ``learning_rate`` and ``epoch``.

    Raises:
        SystemExit: With status 1 when ``params.load_weights`` cannot be
            loaded or parsed; with the default status right after the two
            comparison images have been written.
    """
    import time

    import matplotlib.pyplot as plt

    params = Params(params)

    set_random_seeds(params.seed)

    time_now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    run_name = f'{params.project_name}_{time_now}_{params.version}'
    params.save_root = params.save_root + f'/{run_name}'
    os.makedirs(params.save_root, exist_ok=True)

    # The level must be set explicitly: the root logger defaults to WARNING,
    # which would silently discard every logging.info() call below.
    logging.basicConfig(
        filename=f'{params.save_root}/{run_name}.log',
        filemode='a',
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s: %(message)s')

    if params.num_gpus == 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    logging.info(f'Available GPUs: {torch.cuda.device_count()}')

    data_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=params.mean, std=params.std)
    ])

    train_set = TrainDataset(root=os.path.join(params.data_root,
                                               params.project_name,
                                               params.train_set),
                             transform=data_transform)
    # val_set = EvalDataset(root=os.path.join(params.data_root, params.project_name, params.val_set),
    #                       label_path=os.path.join(params.data_root, params.project_name,
    #                                               'ILSVRC2012_devkit_t12/data/ILSVRC2012_validation_ground_truth.txt'),
    #                       transform=data_transform)
    train_params = {
        'batch_size': params.batch_size,
        'shuffle': False,
        'num_workers': 4,
        'drop_last': True
    }
    # val_params = {'batch_size': params.batch_size, 'shuffle': False, 'num_workers': params.num_gpus * 4,
    #               'drop_last': False}
    train_loader = DataLoader(train_set, **train_params)
    # val_loader = DataLoader(val_set, **val_params)

    eli = ExternalInputIterator(params.batch_size)
    # The pipeline receives mean/std as tensors with two leading unit axes;
    # the torchvision transform above was already built from the raw values.
    params.mean = torch.Tensor(params.mean).unsqueeze(0).unsqueeze(0)
    params.std = torch.Tensor(params.std).unsqueeze(0).unsqueeze(0)
    # NOTE(review): the keyword is spelled `external_date`; presumably this
    # matches ExternalSourcePipeline's signature — confirm before renaming.
    pipe = ExternalSourcePipeline(params=params,
                                  num_threads=4,
                                  device_id=0,
                                  external_date=eli,
                                  seed=params.seed)
    # pipe.build()
    # images, _ = pipe.run()
    # print(np.array(images[0].as_cpu()).shape)
    # plt.imsave('image[0].jpg', np.array(images[0].as_cpu()).transpose((1, 2, 0)))
    # plt.imsave('image[0].jpg', images[0].as_cpu())
    # exit()

    pii = pytorchIterator(pipe,
                          last_batch_padded=True,
                          last_batch_policy=LastBatchPolicy.DROP)

    model = Darknet()
    criterion = nn.CrossEntropyLoss()

    last_step = 0
    last_epoch = 0
    if params.load_weights != 'None':
        try:
            state_dict = torch.load(params.load_weights)
            model.load_state_dict(state_dict)
            # Parse "<...>_<epoch>_<step>.pth" from the file name only; the
            # run directory itself contains underscores, so splitting the
            # whole path would pick up directory components.
            name_parts = os.path.basename(params.load_weights).split('_')
            last_step = int(name_parts[-1].split('.')[0])
            last_epoch = int(name_parts[-2])
        except Exception:
            logging.exception('Fail to resuming from weight!')
            raise SystemExit(1)

    if params.num_gpus > 0:
        model = model.cuda()
        if params.num_gpus > 1:
            model = nn.DataParallel(model)

    if params.optim == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=params.learning_rate)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=params.learning_rate,
                                    momentum=0.9,
                                    nesterov=True)

    epoch = 0
    begin_epoch = max(0, last_epoch)
    step = max(0, last_step)
    logging.info('Begin to train...')
    model.train()
    try:
        for epoch in range(begin_epoch, params.epoch):
            for batch_idx, (data_pii,
                            data_torch) in enumerate(zip(pii, train_loader)):
                t = time.time()
                # type(x) reports torch.Tensor, but x already lives in GPU
                # memory.
                x_pii = data_pii[0]['data']
                label_pii = data_pii[0]['label'].cuda()
                x_torch = data_torch[0].cuda()
                label_torch = data_torch[1].cuda()
                # squeeze(0) only removes the leading axis when it has size 1.
                # NOTE(review): this comparison presumably expects
                # params.batch_size == 1 — confirm.
                x_pii = x_pii.cpu().squeeze(0).numpy().transpose((1, 2, 0))
                x_torch = x_torch.cpu().squeeze(0).numpy().transpose((1, 2, 0))
                plt.imsave('x_pii.jpg', x_pii)
                plt.imsave('x_torch.jpg', x_torch)
                # Stop after dumping the first batch from each loader.
                raise SystemExit
                # print('load data time:', time.time() - t)
                # t = time.time()
                # output = model(x)
                # loss = criterion(output, label)
                # optimizer.zero_grad()
                # loss.backward()
                # optimizer.step()
                # print('running time:', time.time() - t)
                # if batch_idx == 6:
                #     exit()

    except KeyboardInterrupt:
        # Keep the current weights when the user aborts the run.
        save_checkpoint(model,
                        f'{params.save_root}/Interrupt_{epoch}_{step}.pth')
Example #4
0
def train(payload):
    """Train the YOLOv3 ``Darknet`` model on the labeled COCO samples.

    Optionally resumes model and optimizer state from a previous loop's
    checkpoint in ``./log``, trains for the configured number of epochs, and
    writes a new checkpoint holding both state dicts.

    Args:
        payload: Mapping with the keys
            ``"labeled"`` (sample selection passed to ``COCO`` as
            ``samples``), ``"resume_from"`` (checkpoint file name inside
            ``./log`` or ``None``) and ``"ckpt_file"`` (file name of the
            checkpoint to write inside ``./log``).

    Returns:
        None
    """
    labeled = payload["labeled"]
    resume_from = payload["resume_from"]
    ckpt_file = payload["ckpt_file"]

    # hyperparameters
    batch_size = 16
    epochs = 2  # just for demo
    lr = 1e-2
    weight_decay = 1e-2

    coco = COCO("./data", Transforms(), samples=labeled, train=True)
    loader = DataLoader(coco,
                        shuffle=True,
                        batch_size=batch_size,
                        collate_fn=collate_fn)

    config_file = "yolov3.cfg"
    model = Darknet(config_file).to(device)
    optimizer = optim.Adam(model.parameters(),
                           lr=lr,
                           weight_decay=weight_decay)

    # resume model and optimizer from previous loop
    if resume_from is not None:
        # map_location lets a checkpoint written on another device be
        # restored onto the current one.
        ckpt = torch.load(os.path.join("./log", resume_from),
                          map_location=device)
        model.load_state_dict(ckpt["model"])
        optimizer.load_state_dict(ckpt["optimizer"])

    # loss function
    priors = anchors.normalize("xyxy")
    loss_fn = HardNegativeMultiBoxesLoss(priors, device=device)

    model.train()
    # Honour the `epochs` hyperparameter; previously it was defined but the
    # loader was only iterated once.
    for _ in range(epochs):
        for img, boxes, labels in loader:
            img = img.to(device)

            # 3 predictions from 3 yolo layers, each (batch, 3, gx, gy, 85);
            # flatten every layer to (batch, -1, 85) and concatenate them
            # along the prior axis -> (batch, n_priors, 85).
            output = model(img)
            batched_prediction = torch.cat(
                [p.view(p.shape[0], -1, 85) for p in output], dim=1)

            # Last dim of batched_prediction:
            #   [..., :4] coordinates of the predicted bbox
            #   [..., 4]  objectness score
            #   [..., 5:] pre-softmax class distribution
            # bbox_transform converts these into the form expected by
            # HardNegativeMultiBoxesLoss (boxes relative to the normalized
            # anchor priors); see that loss's documentation for its inputs.
            predicted_boxes, predicted_objectness, predicted_class_dist = bbox_transform(
                batched_prediction)

            loss = loss_fn(predicted_boxes, predicted_objectness,
                           predicted_class_dist, boxes, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # save ckpt for this loop
    ckpt = {"model": model.state_dict(), "optimizer": optimizer.state_dict()}

    # Make sure the checkpoint directory exists on the very first loop.
    os.makedirs("./log", exist_ok=True)
    torch.save(ckpt, os.path.join("./log", ckpt_file))
    return
def main(train_path="../data/train/images/",
         val_path="../data/train/images/",
         labels_path="../data/train/yolo_labels/",
         weights_path="../checkpoints/",
         preload_weights_file="darknet53.conv.74",
         output_path="../output",
         yolo_config_file="../config/yolov3-kitti.cfg",
         fraction=1,
         learning_rate=1e-3,
         weight_decay=1e-4,
         batch_size=2,
         epochs=30,
         freeze_struct=(True, 5)):
    """
        This is the point of entry to the neural network program.
        All the training history will be saved as a csv in the output path

        Args
            train_path (string): Directory containing the training images
            val_path (string): Directory containing the val images
            labels_path (string): Directory containing the yolo format labels for data
            weights_path (string): Directory containing the weights (new weights for this program will also be added here)
            preload_weights_file (string): Name of preload weights file
            output_path (string): Directory to store the training history outputs as csv
            yolo_config_file (string): file path of yolo configuration file
            fraction (float): fraction of data to use for training
            learning_rate (float): initial learning rate
            weight_decay (float): weight decay value
            batch_size (int): batch_size for both training and validation
            epochs (int): maximum number of epochs to train the model
            freeze_struct (sequence): [bool, int] indicating whether to freeze the Darknet backbone and until which epoch should it be frozen

        Returns
            None

    """

    # Set up env variables and create required directories
    os.makedirs(output_path, exist_ok=True)
    os.makedirs(weights_path, exist_ok=True)

    # Set up cuda
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("Available device = ", device)

    # Create model and load pretrained darknet weights
    model = Darknet(yolo_config_file)
    print("Loading imagenet weights to darknet")
    model.load_weights(os.path.join(weights_path, preload_weights_file))
    model.to(device)

    # Create datasets
    train_dataset = KITTI2D(train_path,
                            labels_path,
                            fraction=fraction,
                            train=True)
    valid_dataset = KITTI2D(val_path,
                            labels_path,
                            fraction=fraction,
                            train=False)

    # Create dataloaders
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)
    valid_dataloader = DataLoader(valid_dataset,
                                  batch_size=batch_size,
                                  shuffle=False)

    # Create optimizers (only parameters that currently require gradients)
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        model.parameters()),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 10)

    # Match the tensor type to the selected device so the program also runs
    # on machines without CUDA.
    tensor_type = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor

    # Create log csv files; the context manager closes them even when
    # training raises.
    train_log_path = os.path.join(output_path, "train_results.csv")
    valid_log_path = os.path.join(output_path, "valid_results.csv")
    with open(train_log_path, "w", newline="") as train_log_file, \
            open(valid_log_path, "w", newline="") as valid_log_file:
        train_csv = csv.writer(train_log_file)
        valid_csv = csv.writer(valid_log_file)

        print("Starting to train yolov3 model...")

        # Train model here
        train_model(model,
                    device,
                    optimizer,
                    lr_scheduler,
                    train_dataloader,
                    valid_dataloader,
                    train_csv,
                    valid_csv,
                    weights_path,
                    max_epochs=epochs,
                    tensor_type=tensor_type,
                    update_gradient_samples=1,
                    freeze_darknet=freeze_struct[0],
                    freeze_epoch=freeze_struct[1])

    print("Training completed")
Example #6
0
def _str2bool(value):
    """Parse a command-line boolean such as ``true``/``false``/``1``/``0``."""
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(value))


def _evaluate_and_log(model, logger, year, image_set, description, batch_size,
                      epoch):
    """Evaluate ``model`` on one VOC image set, log the metrics, print the APs.

    ``image_set`` (``'val'`` or ``'train'``) is used both as the evaluated
    split and as the scalar tag prefix; ``description`` is the word printed in
    the final mAP line.
    """
    precision, recall, AP, f1, ap_class = evaluate(
        model,
        [year, image_set],
        [0.5, 0.5, 0.5],
        batch_size,
        True,
        diagnosis_code=1
    )
    evaluation_metrics = [
        ("{}_precision".format(image_set), precision.mean()),
        ("{}_recall".format(image_set), recall.mean()),
        ("{}_mAP".format(image_set), AP.mean()),
        ("{}_f1".format(image_set), f1.mean()),
    ]
    for tag, value in evaluation_metrics:
        logger.add_scalar("{}/{}".format(image_set, tag), value.item(), epoch)

    # Print class APs and mAP
    ap_table = [["Index", "Class name", "AP"]]
    for i, c in enumerate(ap_class):
        ap_table += [[c, val2labels(c), "%.5f" % AP[i]]]
    print(AsciiTable(ap_table).table)
    print(f"---- {description} mAP {AP.mean()}")


def main():
    """Train a ``Darknet`` model on Pascal VOC from command-line options.

    Parses the hyperparameters, builds the model (pretrained or randomly
    initialised), loads the VOC detection set, and runs the training loop
    with Adam plus warmup/cosine learning-rate scheduling. Model weights are
    saved every ``--checkpoint_interval`` epochs and the model is evaluated
    on both the validation and the training split every
    ``--evaluation_interval`` epochs.

    Raises:
        RuntimeError: If ``--use_gpu`` is true but CUDA is not available.
    """
    # Hyperparameters parser
    parser = argparse.ArgumentParser()
    parser.add_argument("--year", type=str, default='2012', help="used to select training set")
    parser.add_argument("--set", type=str, default='train', help="used to select training set")
    parser.add_argument("--epochs", type=int, default=201, help="number of epochs")
    parser.add_argument("--batch_size", type=int, default=8, help="size of each image batch")
    parser.add_argument("--model_def", type=str, default="config/net/resnet_dropout.cfg", help="path to model definition file")
    # parser.add_argument("--model_def", type=str, default="config/net/dqnyolo_large.cfg", help="path to model definition file")
    # parser.add_argument("--model_def", type=str, default="config/net/dqnyolo_mini.cfg", help="path to model definition file")
    # parser.add_argument("--model_def", type=str, default="config/net/dqnyolo_tiny.cfg", help="path to model definition file")
    parser.add_argument("--img_size", type=int, default=416, help="size of each image dimension")
    parser.add_argument("--opt_lr", type=float, default=1e-5, help="learning rate for optimizer")
    # Booleans need an explicit parser: without `type`, any value given on the
    # command line arrives as a non-empty string, so "--use_gpu True" failed
    # the `is True` check and "--shuffle_train False" was still truthy.
    parser.add_argument("--use_gpu", type=_str2bool, default=True, help="use GPU to accelerate training")
    parser.add_argument("--shuffle_train", type=_str2bool, default=True, help="shuffle the training dataset")
    parser.add_argument("--checkpoint_interval", type=int, default=20, help="interval between saving model weights")
    parser.add_argument("--evaluation_interval", type=int, default=10, help="interval evaluations on validation set")
    # parser.add_argument("--pretrained_weights", type=str, default="data/backbone/darknet53.conv.74", help="if specified starts from checkpoint model")
    # parser.add_argument("--pretrained_weights", type=str, default="logs/model/model_params_200.ckpt", help="if specified starts from checkpoint model")
    parser.add_argument("--pretrained_weights", type=str, default=None, help="if specified starts from checkpoint model")
    opt = parser.parse_args()
    print(opt)

    if opt.use_gpu:
        if torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            raise RuntimeError("Current Torch doesn't have GPU support.")
    else:
        device = torch.device('cpu')

    logger = SummaryWriter(exist_or_create_folder("./logs/tb/"))

    # Initiate model
    eval_model = Darknet(opt.model_def).to(device)
    if opt.pretrained_weights:
        print("Initialize model with pretrained_model")
        if opt.pretrained_weights.endswith(".ckpt"):
            eval_model.load_state_dict(torch.load(opt.pretrained_weights))
        else:
            eval_model.load_darknet_weights(opt.pretrained_weights)
    else:
        print("Initialize model randomly")
        eval_model.apply(weights_init_normal)
    # eval_model.load_state_dict(torch.load("./logs/saved_exp/master-v2/model_params_80.ckpt"))
    print(eval_model)
    summary(eval_model, (3, 416, 416))

    learn_batch_counter = 0  # for logger update (total numbers)
    batch_size = opt.batch_size

    # Get dataloader
    print("Begin loading train dataset ......")
    t_load_data = time.time()
    dataset = torchvision.datasets.VOCDetection(root='data/VOC/',
                                                year=opt.year,
                                                image_set=opt.set,
                                                transforms=None,
                                                download=True)
    dataset_dict = trans_voc(dataset)
    dataset = ListDataset(dataset_dict)
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=opt.batch_size,
        shuffle=opt.shuffle_train,
        pin_memory=True,
        collate_fn=dataset.collate_fn,
    )
    print("Complete loading train dataset in {} s".format(time.time() - t_load_data))

    optimizer = torch.optim.Adam(eval_model.parameters(), lr=opt.opt_lr)
    # Warmup and learning rate decay
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, opt.epochs)
    # 10 epoch warmup with multiplier 10 (1e-5 to 1e-4 with the default
    # opt_lr), after that schedule as after_scheduler
    scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=10, total_epoch=10,
                                              after_scheduler=scheduler_cosine)

    start_time = time.time()

    for i_epoch in range(opt.epochs):
        eval_model.train()

        for i_batch, (_, imgs, raw_targets, transform_params, tar_boxes) in enumerate(loader):
            print("\n++++++++++ i_epoch-i_batch {}-{} ++++++++++".format(i_epoch, i_batch))

            # Skip the trailing incomplete batch.
            if len(imgs) != batch_size:
                print("Current batch size is smaller than opt.batch_size!")
                continue

            imgs = imgs.to(device)
            raw_targets = raw_targets.to(device)
            tar_boxes = tar_boxes.to(device)

            input_img = imgs

            if i_epoch == 0 and i_batch == 0:
                logger.add_graph(eval_model, input_img)

            # Keep channel 0 and channels 6 onward of the last target axis.
            cls_targets = torch.cat((raw_targets[:, :, :, 0].unsqueeze(3), raw_targets[:, :, :, 6:]), 3)

            loss, pred = eval_model(input_img, cls_targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            learn_batch_counter += 1

            print("Ep-bt: {}-{} | Loss: {}".format(i_epoch, i_batch, loss.item()))
            logger.add_scalar('loss/loss', loss.item(), learn_batch_counter)

        if (i_epoch + 1) % opt.checkpoint_interval == 0:
            print("Saving model in epoch {}".format(i_epoch))
            torch.save(eval_model.state_dict(),
                       exist_or_create_folder("./logs/model/model_params_{}.ckpt".format(i_epoch)))

        # Evaluate the model on the validation set, then on the training set
        if (i_epoch + 1) % opt.evaluation_interval == 0:
            _evaluate_and_log(eval_model, logger, opt.year, 'val',
                              "validation", batch_size, i_epoch)
            _evaluate_and_log(eval_model, logger, opt.year, 'train',
                              "training", batch_size, i_epoch)

        # Warmup and lr decay
        scheduler_warmup.step()

        # Free GPU memory
        torch.cuda.empty_cache()

    total_train_time = time.time() - start_time
    print("Training complete in {} hours".format(total_train_time / 3600))