Example #1
def main(test, s3_data, batch, debug):
    """Train a semantic segmentation FPN model on the CamVid-Tiramisu dataset."""
    if batch:
        run_on_batch(test, debug)

    # Setup options
    batch_sz = 8
    num_workers = 4
    num_epochs = 20
    lr = 1e-4
    backbone_arch = 'resnet18'
    sample_pct = 1.0

    if test:
        batch_sz = 1
        num_workers = 0
        num_epochs = 2
        sample_pct = 0.01

    # Setup data
    tmp_dir_obj = tempfile.TemporaryDirectory()
    tmp_dir = tmp_dir_obj.name
    output_dir = local_output_uri
    make_dir(output_dir)

    data_dir = download_data(s3_data, tmp_dir)
    data = get_databunch(data_dir,
                         sample_pct=sample_pct,
                         batch_sz=batch_sz,
                         num_workers=num_workers)
    print(data)
    plot_data(data, output_dir)

    # Setup and train model
    num_classes = data.c
    model = SegmentationFPN(backbone_arch, num_classes)
    metrics = [acc_camvid]
    learn = Learner(data,
                    model,
                    metrics=metrics,
                    loss_func=SegmentationFPN.loss,
                    path=output_dir)
    learn.unfreeze()

    callbacks = [
        SaveModelCallback(learn, monitor='valid_loss'),
        CSVLogger(learn, filename='log'),
    ]

    learn.fit_one_cycle(num_epochs, lr, callbacks=callbacks)

    # Plot predictions and sync
    plot_preds(data, learn, output_dir)

    if s3_data:
        sync_to_dir(output_dir, remote_output_uri)
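In fastai v1, CSVLogger(learn, filename='log') writes one row of metrics per epoch to log.csv under learn.path, which in this example is output_dir. A minimal sketch of reading that history back after the run above, assuming pandas is installed:

import os
import pandas as pd

# CSVLogger saves its file relative to the learner's path (output_dir here).
log_path = os.path.join(output_dir, 'log.csv')
history = pd.read_csv(log_path)

# One row per epoch: epoch, train_loss, valid_loss, plus metrics such as acc_camvid.
print(history.tail())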
Example #2
def multi_train(get_learn, epoch_len, epochs, opts, lrs, checkpoints, tb_log_root, autoSave=True):
    '''
    Training can resume from a checkpoint. To keep training continuous, lr must be set manually to match the value used when the checkpoint was saved.
    '''
    # clean up the tensorboard log dir
    if os.path.exists(tb_log_root): shutil.rmtree(tb_log_root)
    os.mkdir(tb_log_root)

    if not os.path.exists('./run_log/'): os.mkdir('./run_log/')
    txtlog = open('./run_log/log.txt',mode='w')
    for i,(opt,lr,checkpoint) in enumerate(zip(opts,lrs,checkpoints)):
        # create a learner
        learn = get_learn()

        # set optimizer
        learn.opt_func = opt

        # load checkpoint
        if checkpoint is not None:
            with open(checkpoint,'rb') as f:
                learn.load(f)

        # record this run in the txt log
        csv_log_dir = 'csv_log/'
        if not os.path.exists(learn.path/csv_log_dir): os.mkdir(learn.path/csv_log_dir)
        csv_fname = csv_log_dir+f'run_{i}'
        txt_write(txtlog,i,opt,lr,learn.path,csv_fname)

        callbacks = []
        # get csvlogger callback
        csvLog = CSVLogger(learn,filename=csv_fname)
        callbacks += [csvLog]

        if autoSave:
            # savemodel callback (named saveCb so it does not shadow the autoSave flag)
            saveCb = SaveModelCallback(learn, monitor='valid_loss', mode='min', every='improvement', name=f'run_{i}')
            callbacks += [saveCb]

        # get tensorboard callback
        tbCb = get_tbCb(learn,tb_log_root+f'run_{i}')
        callbacks += [tbCb]

        # train
        fit(learn=learn, epoch_len=epoch_len, epochs=epochs, lr=lr, callbacks=callbacks)

    txtlog.close()
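Each run above logs to csv_log/run_{i}.csv under the learner's path, so the runs can be compared after training. A minimal sketch, assuming pandas is installed, that the learner had a validation set, and that learn_path is pointed at the same learn.path used during training (a placeholder here):

from pathlib import Path
import pandas as pd

learn_path = Path('.')  # assumption: set this to the learn.path used above

# Print the final validation loss of every run for a quick comparison.
for csv_file in sorted((learn_path / 'csv_log').glob('run_*.csv')):
    history = pd.read_csv(csv_file)
    print(csv_file.stem, history['valid_loss'].iloc[-1])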
Example #3
                           imsize=(SZ, SZ),
                           n_classes=NUM_CLASSES,
                           loss_func=CRITERION if TRAIN_MODE else None,
                           metrics=[METRIC],
                           device=DEVICE,
                           model_dir=LOGGING_FOLDER)

    if HS_MODEL is not None:
        learn.model.load_state_dict(torch.load(HS_MODEL)['model'])

    set_BN_momentum(learn.model, batch_size=BATCH_SIZE)
    learn.clip_grad(1.)

    # callbacks
    csv_logger = CSVLogger(learn=learn,
                           filename=f'{LOGGING_FOLDER}/fit_trace',
                           append=True)
    early_stopping = EarlyStoppingCallback(learn=learn,
                                           monitor='dice',
                                           patience=PATIENCE)
    save_model = SaveModelCallback(learn=learn,
                                   monitor='dice',
                                   name='best_model')
    acc_grad = AccumulateStep(learn, 64 // BATCH_SIZE)

    # # find optimal LR
    # learn.lr_find(stop_div=True, num_it=100)
    # learn.recorder.plot(suggestion=True)
    # opt_lr = learn.recorder.min_grad_lr
    # print(f'Initial optimal lr: {opt_lr}')
Example #4

data.c = tree.n_obj + tree.n_parts

loss = parts.Loss(tree, preds_func=split_pred)
metrics = partial(parts.BrodenMetrics, obj_tree=tree, preds_func=split_pred)

learn = unet_learner(data,
                     models.resnet50,
                     loss_func=loss,
                     callback_fns=[metrics, utils.DataTime])

lr = 2e-4
learn.fit_one_cycle(5,
                    lr,
                    callbacks=[
                        SaveModelCallback(learn,
                                          monitor='object-P.A.',
                                          name='unet-stage1'),
                        CSVLogger(learn, filename='unet-stage1')
                    ])
learn.unfreeze()
learn.fit_one_cycle(10,
                    slice(1e-6, lr / 5),
                    callbacks=[
                        SaveModelCallback(learn,
                                          monitor='object-P.A.',
                                          name='unet-stage2'),
                        CSVLogger(learn, filename='unet-stage2')
                    ])
Example #5
def create_callbacks(learn):
    return [
        EarlyStoppingCallback(learn, patience=3),
        SaveModelCallback(learn),
        CSVLogger(learn)]
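A minimal usage sketch for the helper above, assuming learn is any fastai v1 Learner built elsewhere; with these defaults, early stopping and model saving both monitor valid_loss, and the CSV log is written to history.csv under learn.path:

# Hypothetical call; the epoch count and learning rate are placeholders.
callbacks = create_callbacks(learn)
learn.fit_one_cycle(10, 1e-3, callbacks=callbacks)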
Example #6
    def __init__(self,
                 data_path: str = 'lang_model',
                 emb_sz: int = 800,
                 qrnn: bool = False,
                 bidir: bool = False,
                 n_layers: int = 4,
                 n_hid: int = 2500,
                 bs: int = 104,
                 bptt: int = 67,
                 lr: float = 0.0013,
                 wd: float = .012,
                 one_cycle: bool = True,
                 cycle_len: int = 1) -> None:
        """ Instantiate AWD_LSTM Language Model with hyper-parameters.
        
        data_path: str
            path where databunch is loaded from
        emb_sz: int
            size of word embeddings
        qrnn: bool
            whether or not to use a QRNN (requires cuDNN)
        bidir: bool
            if RNN should be bi-directional
        n_layers: int
            number of layers in lang model
        n_hid: int
            number of hidden units in model
        lr: float
            learning rate
        bptt: int
            back-propagation through time; the maximum sequence length over which gradients are back-propagated.
        bs: int
            batch size
        
        The hyper-parameters are stored in a fastai dict called `fastai.text.models.awd_lstm_lm_config`:
           {'emb_sz': 400, 'n_hid': 1150, 'n_layers': 3, 'pad_token': 1, 'qrnn': False, 'bidir': False, 'output_p': 0.1,
            'hidden_p': 0.15, 'input_p': 0.25, 'embed_p': 0.02,'weight_p': 0.2, 'tie_weights': True, 'out_bias': True}
        """
        self.lr, self.wd, self.one_cycle, self.cycle_len = lr, wd, one_cycle, cycle_len
        awd_lstm_lm_config.update(
            dict(emb_sz=emb_sz,
                 qrnn=qrnn,
                 bidir=bidir,
                 n_layers=n_layers,
                 n_hid=n_hid))
        #log params
        wb_handle = wandb.init(config=awd_lstm_lm_config)
        wandb.config.update({
            'data_path': str(data_path),
            'bs': bs,
            'bptt': bptt,
            'lr': lr
        })
        self.csv_name = 'history_' + wb_handle.name
        wandb.config.update({'csvlog_save_path': self.csv_name})

        # instantiate databunch
        self.data_lm = load_data(data_path, bs=bs, bptt=bptt)

        # instantiate language model
        self.learn = language_model_learner(data=self.data_lm,
                                            arch=AWD_LSTM,
                                            pretrained=False,
                                            model_dir=Path('models_' +
                                                           wb_handle.name),
                                            config=awd_lstm_lm_config)
        self.full_model_path = str(self.learn.path / self.learn.model_dir)
        wandb.config.update({'model_save_path': self.full_model_path})

        # prepare callbacks
        escb = EarlyStoppingCallback(learn=self.learn, patience=2)
        smcb = SaveModelCallback(learn=self.learn,
                                 name='best_' + wb_handle.name)
        rpcb = ReduceLROnPlateauCallback(learn=self.learn, patience=1)
        csvcb = CSVLogger(learn=self.learn, filename=self.csv_name)
        wb = wandbCallback(self.learn)
        self.callbacks = [escb, smcb, rpcb, csvcb, wb]

        self.fit()
Example #7
def main(config, args):
    if torch.cuda.is_available():
        cudnn.benchmark = True
        print('Using CUDA')
    else:
        print('**** CUDA is not available ****')

    pprint.pprint(config)

    if args.exp is None:
        if not os.path.exists('./config/old_configs/' + config.exp_name):
            os.makedirs('./config/old_configs/' + config.exp_name)
        shutil.copy2(
            './config/config.py',
            './config/old_configs/{}/config.py'.format(config.exp_name))

    if not os.path.exists('./model_weights/' + config.exp_name):
        os.makedirs('./model_weights/' + config.exp_name)
    if not os.path.exists('./logs/' + config.exp_name):
        os.makedirs('./logs/' + config.exp_name)

    data_df = pd.read_csv(config.DATA_CSV_PATH)
    if os.path.exists('/content/data'):
        print('On Colab')
        data_df['Id'] = data_df['Id'].apply(lambda x: '/content' + x[1:])

    if config.dataclass is not None:
        data_df = data_df[data_df['Type(Full/Head/Unclean/Bad)'] ==
                          config.dataclass].reset_index(drop=True)
    split_train_mask = (data_df['Fold'] != 'Fold{}'.format(args.foldidx))
    train_df = data_df[split_train_mask
                       & (data_df['Split'] == 'Train')].reset_index(drop=True)
    valid_df = data_df[(~split_train_mask)
                       & (data_df['Split'] == 'Train')].reset_index(drop=True)
    test_df = data_df[data_df['Split'] == 'Test'].reset_index(drop=True)
    maintest_df = data_df[data_df['Split'] == 'MainTest'].reset_index(
        drop=True)

    print("Training with valid fold: ", args.foldidx)
    print(valid_df.head())

    if config.pseudo_path is not None:
        assert not (config.add_val_pseudo and config.add_val_orig)
        if config.add_val_pseudo:
            pseudo_df = pd.concat((valid_df, test_df, maintest_df))
        else:
            pseudo_df = pd.concat((test_df, maintest_df))
        pseudo_df['Id'] = pseudo_df['Id'] + '_pseudo'
        if config.add_val_orig:
            pseudo_df = pd.concat((pseudo_df, valid_df))
        train_df = pd.concat((train_df, pseudo_df)).reset_index(drop=True)

    train_tfms = get_train_tfms(config)
    print(train_tfms)
    if config.debug and config.reduce_dataset:
        if config.pseudo_path is not None:
            train_df = pd.concat(
                (train_df[:10], pseudo_df[:10])).reset_index(drop=True)
        else:
            train_df = train_df[:10]
        valid_df = valid_df[:10]


#     DatasetClass = KBPDataset2D if psutil.virtual_memory().total < 20e9 else KBPDataset2DStack
    DatasetClass = KBPDataset2D
    train_ds = DatasetClass(config, train_df, transform=train_tfms)
    valid_ds = DatasetClass(config, valid_df, valid=True)

    # valid_dl = DataLoader(valid_ds, batch_size=128, shuffle=False, num_workers=config.num_workers)

    criterion = KBPLoss(config)

    Net = getattr(model_list, config.model_name)

    net = Net(config=config).to(config.device)
    print(net)

    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    print("Number of parameters: ", count_parameters(net))

    if config.load_model_ckpt is not None:
        print('Loading model from {}'.format(
            config.load_model_ckpt.format(args.foldidx)))
        net.load_state_dict(
            torch.load(config.load_model_ckpt.format(args.foldidx))['model'])

    gpu = setup_distrib(config.gpu)
    opt = config.optimizer
    mom = config.mom
    alpha = config.alpha
    eps = config.eps

    if opt == 'adam':
        opt_func = partial(optim.Adam,
                           betas=(mom, alpha),
                           eps=eps,
                           amsgrad=config.amsgrad)
    elif opt == 'adamw':
        opt_func = partial(optim.AdamW, betas=(mom, alpha), eps=eps)
    elif opt == 'radam':
        opt_func = partial(RAdam,
                           betas=(mom, alpha),
                           eps=eps,
                           degenerated_to_sgd=config.radam_degenerated_to_sgd)
    elif opt == 'sgd':
        opt_func = partial(optim.SGD, momentum=mom, nesterov=config.nesterov)
    elif opt == 'ranger':
        opt_func = partial(Ranger, betas=(mom, alpha), eps=eps)
    else:
        raise ValueError("Optimizer not recognized")
    print(opt_func)

    data = DataBunch.create(train_ds,
                            valid_ds,
                            bs=config.batch_size,
                            num_workers=config.num_workers)

    # metrics = [dose_score, dvh_score, pred_mean, target_mean]
    metrics = [dose_score2D, dvh_score2D, pred_mean2D, target_mean2D]
    evalbatchaccum = EvalBatchAccumulator(config,
                                          target_bs=128,
                                          num_metrics=len(metrics))
    learn = (Learner(evalbatchaccum,
                     data,
                     net,
                     wd=config.weight_decay,
                     opt_func=opt_func,
                     bn_wd=False,
                     true_wd=True,
                     loss_func=criterion,
                     metrics=metrics,
                     path='./model_weights/{}/'.format(config.exp_name)))
    if config.fp16:
        print('Training with mixed precision...')
        learn = learn.to_fp16(dynamic=True)
    else:
        print('Full precision training...')
    if gpu is None: learn.to_parallel()
    elif num_distrib() > 1: learn.to_distributed(gpu)
    if config.mixup: learn = learn.mixup(alpha=config.mixup, stack_y=False)
    print("Learn path: ", learn.path)
    best_save_cb = SaveBestModel(learn,
                                 config,
                                 outfile='_fold{}'.format(args.foldidx))
    logger_cb = CSVLogger(learn)
    logger_cb.path = Path(
        str(logger_cb.path).replace('model_weights/', 'logs/').replace(
            '.csv', '_fold{}.csv'.format(args.foldidx)))
    callbacks = [best_save_cb, logger_cb]

    if config.teachers is not None:
        package = 'config.old_configs.{}.config'.format(config.teachers)
        teacherconfig = importlib.import_module(package).config
        teachers = []
        for fold in range(5):
            teacher = getattr(model_list, teacherconfig.model_name)
            teacher = teacher(teacherconfig)
            model_ckpt = './model_weights/{}/models/best_dose_fold{}.pth'.format(
                teacherconfig.exp_name, fold)
            print("Loading teacher {} encoder from {}".format(
                fold, model_ckpt))
            teacher.load_state_dict(torch.load(model_ckpt)['model'])
            teacher.to(config.device)
            teacher.eval()
            for param in teacher.parameters():
                param.requires_grad = False
            teachers.append(teacher)
    else:
        teachers = None

    if config.wandb:
        wandb.init(project=config.wandb_project, name=config.exp_name)
        wandb_cb = WandbCallback(learn)
        callbacks.append(wandb_cb)

    print(learn.loss_func.config.loss_dict)
    print(learn.opt_func)
    print("Weight decay: ", learn.wd)

    learn.fit_one_cycle(config.epochs,
                        config.lr,
                        callbacks=callbacks,
                        div_factor=config.div_factor,
                        pct_start=config.pct_start,
                        final_div=config.final_div,
                        teachers=teachers)

    best_str = "Best valid loss: {}, dose score: {}, dvh score: {}".format(
        best_save_cb.best_loss, best_save_cb.best_dose.item(),
        best_save_cb.best_dvh.item())
    print(best_str)
    f = open(
        "./logs/{}/bestmetrics_fold{}.txt".format(config.exp_name,
                                                  args.foldidx), "a")
    f.write(best_str)
    f.close()
Example #8
def get_callbacks(learn, name, monitor):
    """TODO"""
    cbs = [SaveModelCallback(learn, monitor)]
    cbs += [CSVLogger(learn, filename=name, append=True)]
    return cbs
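A usage sketch for get_callbacks, assuming a fastai v1 learner named learn; because the logger is created with append=True, repeated fits with the same name keep extending <name>.csv under learn.path instead of overwriting it:

# Hypothetical name/monitor values; any metric the learner records will work.
cbs = get_callbacks(learn, name='stage1_log', monitor='valid_loss')
learn.fit_one_cycle(5, 1e-3, callbacks=cbs)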
Example #9
def main(test, s3_data, batch, debug):
    """Train an object detection model on the PASCAL VOC 2007 dataset."""
    if batch:
        run_on_batch(test, debug)

    # Setup options
    bs = 16
    size = 256
    num_workers = 4
    num_epochs = 100
    lr = 1e-4
    # for size 256
    # Subtract 2 because there's no padding on final convolution
    grid_sz = 8 - 2

    if test:
        bs = 8
        size = 128
        num_debug_images = 32
        num_workers = 0
        num_epochs = 1
        # for size 128
        grid_sz = 4 - 2

    # Setup data
    make_dir(output_dir)

    data_dir = untar_data(URLs.PASCAL_2007, dest='/opt/data/pascal2007/data')
    img_path = data_dir/'train/'
    trn_path = data_dir/'train.json'
    trn_images, trn_lbl_bbox = get_annotations(trn_path)
    val_path = data_dir/'valid.json'
    val_images, val_lbl_bbox = get_annotations(val_path)

    images, lbl_bbox = trn_images+val_images, trn_lbl_bbox+val_lbl_bbox
    img2bbox = dict(zip(images, lbl_bbox))
    get_y_func = lambda o: img2bbox[o.name]

    with open(trn_path) as f:
        d = json.load(f)
        classes = sorted(d['categories'], key=lambda x: x['id'])
        classes = [x['name'] for x in classes]
        classes = ['background'] + classes
        num_classes = len(classes)

    anc_sizes = torch.tensor([
        [1, 1],
        [2, 2],
        [3, 3],
        [3, 1],
        [1, 3]], dtype=torch.float32)
    grid = ObjectDetectionGrid(grid_sz, anc_sizes, num_classes)
    score_thresh = 0.1
    iou_thresh = 0.8

    class MyObjectCategoryList(ObjectCategoryList):
        def analyze_pred(self, pred):
            boxes, labels, _ = grid.get_preds(
                pred.unsqueeze(0), score_thresh=score_thresh,
                iou_thresh=iou_thresh)
            return (boxes[0], labels[0])

    class MyObjectItemList(ObjectItemList):
        _label_cls = MyObjectCategoryList

    def get_data(bs, size):
        src = MyObjectItemList.from_folder(img_path)
        if test:
            src = src[0:num_debug_images]
        src = src.split_by_files(val_images)
        src = src.label_from_func(get_y_func, classes=classes)
        src = src.transform(get_transforms(), size=size, tfm_y=True)
        return src.databunch(path=data_dir, bs=bs, collate_fn=bb_pad_collate,
                             num_workers=num_workers)

    data = get_data(bs, size)
    print(data)
    plot_data(data, output_dir)

    # Setup model
    model = ObjectDetectionModel(grid)

    def loss(out, gt_boxes, gt_classes):
        gt = model.grid.encode(gt_boxes, gt_classes)
        box_loss, class_loss = model.grid.compute_losses(out, gt)
        return box_loss + class_loss

    metrics = [F1(grid, score_thresh=score_thresh, iou_thresh=iou_thresh)]
    learn = Learner(data, model, metrics=metrics, loss_func=loss,
                    path=output_dir)
    callbacks = [
        CSVLogger(learn, filename='log')
    ]
    # model.freeze_body()
    learn.fit_one_cycle(num_epochs, lr, callbacks=callbacks)

    plot_preds(data, learn, output_dir)

    if s3_data:
        sync_to_dir(output_dir, output_uri)