Example #1
        logger.info('Using Cuda...')
        model = net.Net(params).cuda()
    else:
        params.device = torch.device('cpu')
        # torch.manual_seed(230)
        logger.info('Not using cuda...')
        model = net.Net(params)

    utils.set_logger(os.path.join(model_dir, 'train.log'))
    logger.info('Loading the datasets...')

    train_set = TrainDataset(data_dir, args.dataset, params.num_class)
    test_set = TestDataset(data_dir, args.dataset, params.num_class)
    sampler = WeightedSampler(data_dir, args.dataset) # Use weighted sampler instead of random sampler
    train_loader = DataLoader(train_set, batch_size=params.batch_size, sampler=sampler, num_workers=4)
    test_loader = DataLoader(test_set, batch_size=params.predict_batch, sampler=RandomSampler(test_set), num_workers=4)
    logger.info('Loading complete.')

    logger.info(f'Model: \n{str(model)}')
    optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)

    # fetch loss function
    loss_fn = net.loss_fn

    # Train the model
    logger.info('Starting training for {} epoch(s)'.format(params.num_epochs))
    train_and_evaluate(model,
                       train_loader,
                       test_loader,
                       optimizer,
                       loss_fn,
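
For reference: the WeightedSampler used above is a project-specific helper that is not shown here. Below is a minimal, self-contained sketch of the same idea with PyTorch's built-in WeightedRandomSampler; the toy dataset and the inverse-class-frequency weights are illustrative, not taken from the example.

import torch
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler

# Toy imbalanced dataset: 90 samples of class 0, 10 samples of class 1.
labels = torch.cat([torch.zeros(90, dtype=torch.long), torch.ones(10, dtype=torch.long)])
features = torch.randn(100, 8)
dataset = TensorDataset(features, labels)

# Weight every sample by the inverse frequency of its class.
class_counts = torch.bincount(labels).float()
sample_weights = (1.0 / class_counts)[labels]

sampler = WeightedRandomSampler(weights=sample_weights,
                                num_samples=len(dataset),
                                replacement=True)
loader = DataLoader(dataset, batch_size=16, sampler=sampler)

for x, y in loader:
    pass  # batches are now roughly class-balanced on average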
Example #2
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
        discriminator = nn.DataParallel(discriminator)

    model.to(params.device)
    discriminator.to(params.device)

    utils.set_logger(os.path.join(model_dir, 'train.log'))
    logger.info('Loading the datasets...')

    train_set = TrainDataset(data_dir, args.dataset, params.num_class)
    valid_set = ValidDataset(data_dir, args.dataset, params.num_class)
    test_set = TestDataset(data_dir, args.dataset, params.num_class)
    #sampler = WeightedSampler(data_dir, args.dataset) # Use weighted sampler instead of random sampler
    train_loader = DataLoader(train_set, batch_size=params.batch_size, sampler=RandomSampler(train_set), num_workers=4)
    valid_loader = DataLoader(valid_set, batch_size=params.predict_batch, sampler=RandomSampler(valid_set), num_workers=4)
    test_loader = DataLoader(test_set, batch_size=params.predict_batch, sampler=RandomSampler(test_set), num_workers=4)
    logger.info('Loading complete.')

    n_updates_total = (len(train_set) // params.batch_size) * params.num_epochs

    optimizer_D = optim.RMSprop(discriminator.parameters(), lr = params.lr_d)
    optimizer_G = OpenAIAdam(model.parameters(),
                           lr=params.lr,
                           schedule=params.lr_schedule,
                           warmup=params.lr_warmup,
                           t_total=n_updates_total,
                           b1=0.9,
                           b2=0.999,
                           e=1e-8,
Example #3
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode='tq_a',
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode='tq_a',
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = BCEWithLogitsLoss()
        state_dict = RobertaModel.from_pretrained(MODEL_PRETRAIN).state_dict()
        model = RobertaModelForBinaryMultiLabelClassifier(
            num_labels=len(LABEL_COL),
            config_path=MODEL_CONFIG_PATH,
            state_dict=state_dict,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader,
                                       DEVICE)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader, DEVICE, mode='valid')

            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [
                    val_loss,
                ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [
                    val_metric,
                ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(val_metric_raws)
            else:
                histories['val_metric_raws'][fold] = [
                    val_metric_raws,
                ]

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}', logger)
            model = model.to('cpu')
            model = model.module
            save_checkpoint(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', model,
                            optimizer, scheduler, histories, val_y_preds,
                            val_y_trues, val_qa_ids, fold, epoch, val_loss,
                            val_metric)
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(
                histories["val_metric"][fold])])
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer,
                                      clean=False)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
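
Both loaders above pass worker_init_fn=lambda x: np.random.seed() so that every DataLoader worker re-seeds NumPy; without this, workers forked from the same parent inherit an identical NumPy RNG state and can apply identical "random" augmentations. Below is a small sketch of a common variant that keeps runs reproducible by deriving the NumPy seed from each worker's torch seed; seed_worker and the toy dataset are illustrative, not part of the example.

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset


def seed_worker(worker_id):
    # torch already assigns each worker a distinct seed; reuse it for NumPy
    # so workers differ from each other but the run stays reproducible.
    worker_seed = torch.initial_seed() % 2 ** 32
    np.random.seed(worker_seed)


dataset = TensorDataset(torch.randn(64, 4))
loader = DataLoader(dataset,
                    batch_size=8,
                    sampler=RandomSampler(dataset),
                    num_workers=2,
                    worker_init_fn=seed_worker)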
Example #4
    environment_provider=env,
    properties=['energy', 'forces'])

if not os.path.isdir(args.modelpath):
    os.makedirs(args.modelpath)

if args.split_path is not None:
    copyfile(args.split_path, split_path)

#from sklearn.model_selection import train_test_split
#train,test = train_test_split(df, test_size=0.20, random_state=42,stratify=df['Ebin'].values)
print(args.batch_size)
train_loader = schnetpack2.custom.data.AtomsLoader(
    data_train,
    batch_size=args.batch_size,
    sampler=RandomSampler(data_train),
    num_workers=9 * torch.cuda.device_count(),
    pin_memory=True)
val_loader = schnetpack2.custom.data.AtomsLoader(data_val,
                                                 batch_size=args.batch_size,
                                                 num_workers=9 *
                                                 torch.cuda.device_count(),
                                                 pin_memory=True)
mean, stddev = train_loader.get_statistics('energy', False)

mean_forces, stddev_forces = train_loader.get_statistics('forces', True)
#mean = -4178.7568
#stddev = 29.6958
if args.uncertainty:
    mean, stddev = torch.tensor([-1202.6432, 0]), torch.tensor([12.3304, 1])
else:
Example #5
def fit(
    model,
    train_dataset,
    val_dataset,
    optimizer_name="adam",
    samples_per_player=0,
    epochs=50,
    batch_size=32,
    val_bs=32,
    warmup_prop=0.1,
    lr=1e-3,
    acc_steps=1,
    swa_first_epoch=50,
    num_classes_aux=0,
    aux_mode="sigmoid",
    verbose=1,
    first_epoch_eval=0,
    device="cuda",
):
    """
    Fitting function for the classification task.

    Args:
        model (torch model): Model to train.
        train_dataset (torch dataset): Dataset to train with.
        val_dataset (torch dataset): Dataset to validate with.
        optimizer_name (str, optional): Optimizer name. Defaults to 'adam'.
        samples_per_player (int, optional): Number of images to use per player. Defaults to 0.
        epochs (int, optional): Number of epochs. Defaults to 50.
        batch_size (int, optional): Training batch size. Defaults to 32.
        val_bs (int, optional): Validation batch size. Defaults to 32.
        warmup_prop (float, optional): Warmup proportion. Defaults to 0.1.
        lr (float, optional): Learning rate. Defaults to 1e-3.
        acc_steps (int, optional): Accumulation steps. Defaults to 1.
        swa_first_epoch (int, optional): Epoch to start applying SWA from. Defaults to 50.
        num_classes_aux (int, optional): Number of auxiliary classes. Defaults to 0.
        aux_mode (str, optional): Mode for auxiliary classification. Defaults to 'sigmoid'.
        verbose (int, optional): Period (in epochs) to display logs at. Defaults to 1.
        first_epoch_eval (int, optional): Epoch to start evaluating at. Defaults to 0.
        device (str, optional): Device for torch. Defaults to "cuda".

    Returns:
        numpy array [len(val_dataset)]: Last predictions on the validation data.
        numpy array [len(val_dataset) x num_classes_aux]: Last aux predictions on the val data.
    """

    optimizer = define_optimizer(optimizer_name, model.parameters(), lr=lr)

    if swa_first_epoch <= epochs:
        optimizer = SWA(optimizer)

    loss_fct = nn.BCEWithLogitsLoss()
    loss_fct_aux = nn.BCEWithLogitsLoss(
    ) if aux_mode == "sigmoid" else nn.CrossEntropyLoss()
    aux_loss_weight = 1 if num_classes_aux else 0

    if samples_per_player:
        sampler = PlayerSampler(
            RandomSampler(train_dataset),
            train_dataset.players,
            batch_size=batch_size,
            drop_last=True,
            samples_per_player=samples_per_player,
        )
        train_loader = DataLoader(
            train_dataset,
            batch_sampler=sampler,
            num_workers=NUM_WORKERS,
            pin_memory=True,
        )

        print(
            f"Using {len(train_loader)} out of {len(train_dataset) // batch_size} "
            f"batches by limiting to {samples_per_player} samples per player.\n"
        )
    else:
        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            drop_last=True,
            num_workers=NUM_WORKERS,
            pin_memory=True,
        )

    val_loader = DataLoader(
        val_dataset,
        batch_size=val_bs,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True,
    )

    num_training_steps = int(epochs * len(train_loader))
    num_warmup_steps = int(warmup_prop * num_training_steps)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps,
                                                num_training_steps)

    for epoch in range(epochs):
        model.train()

        start_time = time.time()
        optimizer.zero_grad()

        avg_loss = 0

        if epoch + 1 > swa_first_epoch:
            optimizer.swap_swa_sgd()

        for batch in train_loader:
            images = batch[0].to(device)
            y_batch = batch[1].to(device).view(-1).float()
            y_batch_aux = batch[2].to(device).float()
            y_batch_aux = y_batch_aux.float(
            ) if aux_mode == "sigmoid" else y_batch_aux.long()

            y_pred, y_pred_aux = model(images)

            loss = loss_fct(y_pred.view(-1), y_batch)
            if aux_loss_weight:
                loss += aux_loss_weight * loss_fct_aux(y_pred_aux, y_batch_aux)
            loss.backward()

            avg_loss += loss.item() / len(train_loader)
            optimizer.step()
            scheduler.step()
            for param in model.parameters():
                param.grad = None

        if epoch + 1 >= swa_first_epoch:
            optimizer.update_swa()
            optimizer.swap_swa_sgd()

        preds = np.empty(0)
        preds_aux = np.empty((0, num_classes_aux))
        model.eval()
        avg_val_loss, auc, scores_aux = 0., 0., 0.
        if epoch + 1 >= first_epoch_eval or epoch + 1 == epochs:
            with torch.no_grad():
                for batch in val_loader:
                    images = batch[0].to(device)
                    y_batch = batch[1].to(device).view(-1).float()
                    y_aux = batch[2].to(device).float()
                    y_batch_aux = y_aux.float(
                    ) if aux_mode == "sigmoid" else y_aux.long()

                    y_pred, y_pred_aux = model(images)

                    loss = loss_fct(y_pred.detach().view(-1), y_batch)
                    if aux_loss_weight:
                        loss += aux_loss_weight * loss_fct_aux(
                            y_pred_aux.detach(), y_batch_aux)

                    avg_val_loss += loss.item() / len(val_loader)

                    y_pred = torch.sigmoid(y_pred).view(-1)
                    preds = np.concatenate(
                        [preds, y_pred.detach().cpu().numpy()])

                    if num_classes_aux:
                        y_pred_aux = (y_pred_aux.sigmoid() if aux_mode
                                      == "sigmoid" else y_pred_aux.softmax(-1))
                        preds_aux = np.concatenate(
                            [preds_aux,
                             y_pred_aux.detach().cpu().numpy()])

            auc = roc_auc_score(val_dataset.labels, preds)

            if num_classes_aux:
                if aux_mode == "sigmoid":
                    scores_aux = np.round(
                        [
                            roc_auc_score(val_dataset.aux_labels[:, i],
                                          preds_aux[:, i])
                            for i in range(num_classes_aux)
                        ],
                        3,
                    ).tolist()
                else:
                    scores_aux = np.round(
                        [
                            roc_auc_score((val_dataset.aux_labels
                                           == i).astype(int), preds_aux[:, i])
                            for i in range(num_classes_aux)
                        ],
                        3,
                    ).tolist()
            else:
                scores_aux = 0

        elapsed_time = time.time() - start_time
        if (epoch + 1) % verbose == 0:
            elapsed_time = elapsed_time * verbose
            lr = scheduler.get_last_lr()[0]
            print(
                f"Epoch {epoch + 1:02d}/{epochs:02d} \t lr={lr:.1e}\t t={elapsed_time:.0f}s \t"
                f"loss={avg_loss:.3f}",
                end="\t",
            )

            if epoch + 1 >= first_epoch_eval:
                print(
                    f"val_loss={avg_val_loss:.3f} \t auc={auc:.3f}\t aucs_aux={scores_aux}"
                )
            else:
                print("")

    del val_loader, train_loader, y_pred
    torch.cuda.empty_cache()

    return preds, preds_aux
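
The fit() function above passes the custom PlayerSampler as batch_sampler and therefore leaves batch_size, shuffle and drop_last off that DataLoader call: DataLoader treats these options as mutually exclusive with batch_sampler, since a batch sampler yields whole index lists. Here is a minimal sketch of that contract using the stock BatchSampler; PlayerSampler is not shown in the snippet, so the toy dataset and sizes are illustrative.

import torch
from torch.utils.data import BatchSampler, DataLoader, RandomSampler, TensorDataset

dataset = TensorDataset(torch.randn(100, 3))

# A batch_sampler yields complete lists of indices, so batch_size, shuffle,
# sampler and drop_last must stay at their defaults on the DataLoader itself.
batch_sampler = BatchSampler(RandomSampler(dataset), batch_size=16, drop_last=True)
loader = DataLoader(dataset, batch_sampler=batch_sampler)

for (x,) in loader:
    assert x.shape[0] == 16  # every yielded batch has exactly 16 samples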
Example #6
        article_torch = utils.to_tensor((X_data[index]))

        dict_ = {'article': article_torch}

        return dict_


def my_collate(batch):
    return batch


if __name__ == '__main__':

    batch_size = 8
    train_data, val_data, vocabulary = (
        utils.to_tensor(np.concatenate(np.load('./dataset/wiki.train.npy'))),
        utils.to_tensor(np.concatenate(np.load('./dataset/wiki.valid.npy'))),
        np.load('./dataset/vocab.npy'))

    wiki_train_ds = WikiDataset(train_data)

    wiki_train_loader = data.DataLoader(wiki_train_ds,
                                        batch_size=batch_size,
                                        sampler=RandomSampler(wiki_train_ds),
                                        collate_fn=my_collate)

    for batch_index, batch_dict in enumerate(wiki_train_loader):
        print(batch_dict)
        if batch_index == 0: break
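
my_collate above returns each batch as a plain Python list, which is one way to keep variable-length articles unbatched. A common alternative is to pad them into a single tensor with torch.nn.utils.rnn.pad_sequence; here is a minimal sketch with made-up token-id sequences (pad_collate is an illustrative name, not something from the example).

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, RandomSampler

# Toy variable-length sequences of token ids.
sequences = [torch.randint(0, 100, (n,)) for n in (5, 9, 3, 7)]


def pad_collate(batch):
    lengths = torch.tensor([len(seq) for seq in batch])
    padded = pad_sequence(batch, batch_first=True, padding_value=0)
    return padded, lengths


loader = DataLoader(sequences,
                    batch_size=2,
                    sampler=RandomSampler(sequences),
                    collate_fn=pad_collate)

for padded, lengths in loader:
    pass  # padded: (batch, max_len) tensor, lengths: original sequence lengths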
Example #7
def construct_loader(cfg, split, is_precise_bn=False):
    """
    Constructs the data loader for the given dataset.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        split (str): the split of the data loader. Options include `train`,
            `val`, and `test`.
    """
    assert split in ["train", "val", "test"]
    if split in ["train"]:
        dataset_name = cfg.TRAIN.DATASET
        batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS))
        shuffle = True
        drop_last = True
    elif split in ["val"]:
        dataset_name = cfg.TRAIN.DATASET
        batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS))
        shuffle = False
        drop_last = False
    elif split in ["test"]:
        dataset_name = cfg.TEST.DATASET
        batch_size = int(cfg.TEST.BATCH_SIZE / max(1, cfg.NUM_GPUS))
        shuffle = False
        drop_last = False

    # Construct the dataset
    dataset = build_dataset(dataset_name, cfg, split)

    if cfg.MULTIGRID.SHORT_CYCLE and split in ["train"] and not is_precise_bn:
        # Create a sampler for multi-process training
        sampler = (
            DistributedSampler(dataset)
            if cfg.NUM_GPUS > 1
            else RandomSampler(dataset)
        )
        batch_sampler = ShortCycleBatchSampler(
            sampler, batch_size=batch_size, drop_last=drop_last, cfg=cfg
        )
        # Create a loader
        loader = torch.utils.data.DataLoader(
            dataset,
            batch_sampler=batch_sampler,
            num_workers=cfg.DATA_LOADER.NUM_WORKERS,
            pin_memory=cfg.DATA_LOADER.PIN_MEMORY,
        )
    else:
        # Create a sampler for multi-process training
        sampler = DistributedSampler(dataset) if cfg.NUM_GPUS > 1 else None
        # Create a loader
        loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=(False if sampler else shuffle),
            sampler=sampler,
            num_workers=cfg.DATA_LOADER.NUM_WORKERS,
            pin_memory=cfg.DATA_LOADER.PIN_MEMORY,
            drop_last=drop_last,
            collate_fn=detection_collate if cfg.DETECTION.ENABLE else None,
        )
    return loader
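
construct_loader sets shuffle=(False if sampler else shuffle) because DataLoader rejects an explicit sampler combined with shuffle=True. A small illustration of that rule follows; the TensorDataset and sizes are made up.

import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

dataset = TensorDataset(torch.randn(32, 3))
sampler = RandomSampler(dataset)

# shuffle=True together with an explicit sampler raises a ValueError,
# hence the `shuffle=(False if sampler else shuffle)` guard above.
try:
    DataLoader(dataset, batch_size=8, shuffle=True, sampler=sampler)
except ValueError as err:
    print(err)

# Either let shuffle=True build the sampler internally...
loader_a = DataLoader(dataset, batch_size=8, shuffle=True)
# ...or pass the sampler explicitly and leave shuffle at its default.
loader_b = DataLoader(dataset, batch_size=8, sampler=sampler)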
Example #8
def load_data(args):
    """Load data from here and return.
    Note:
        Compose Composes several transforms together and if augmentation is chosen you compose an additional
        bunch of transforms to be applied to the train data and you send this to the DataTransformer class
        which returns the data set that is used in the data loader. The data loader then takes in this dataset with a
        batch size and sampler. Sampler is defines the strategy to draw samples from the dataset. Here for training
        data random sampling is used and for validation sequential is used. You can also write a custom sampler class
        if you want.
    :param args:
        main_dir (string)       : path to the main directory from the args.
        image_size (int)        : size of the image to be resized.
        transform_prob (float)  : probability to apply transformations on the data.
        batch_size (int)        : batch size to be used in the data loader.
    :return:
        the train loader and validation loader to be used for training and validating.
    """
    # get data set file path
    data_path = os.path.join(args.main_dir, 'data', 'train-volume.tif')
    labels_path = os.path.join(args.main_dir, 'data', 'train-labels.tif')

    # compose the transforms for the train set
    train_data = Compose([Resize(args.image_size), ToTensor()])

    # choose between augmentations for train data
    if args.augment:
        train_augment = augmentations(args)
        train_transform = DataTransformer(data_path,
                                          labels_path,
                                          image_transform=train_data,
                                          image_augmentation=train_augment)

    else:
        # transforming the train data and returning a 4D tensor
        train_transform = DataTransformer(data_path,
                                          labels_path,
                                          image_transform=train_data,
                                          image_augmentation=None)

    # transform for validation data
    val_data = Compose([Resize(args.image_size), ToTensor()])
    val_transform = DataTransformer(data_path,
                                    labels_path,
                                    image_transform=val_data,
                                    image_augmentation=None)

    # split the train and validation indices
    train_indices, validation_indices = train_test_split(range(
        len(train_transform)),
                                                         test_size=0.15)

    # call the sampler for the train and validation data
    train_samples = RandomSampler(train_indices)
    validation_samples = SequentialSampler(validation_indices)

    # load train and validation data
    train_loader = DataLoader(train_transform,
                              batch_size=args.batch_size,
                              sampler=train_samples)
    val_loader = DataLoader(val_transform,
                            batch_size=args.batch_size,
                            sampler=validation_samples)

    return train_loader, val_loader
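
One caveat about the snippet above: the index lists returned by train_test_split are handed to RandomSampler and SequentialSampler, which sample positions within those lists rather than the chosen dataset indices, so the split is not actually honored when indexing the full dataset. When each loader should draw only its own subset, SubsetRandomSampler is the sampler built for that; a minimal sketch of that variant follows, with a toy TensorDataset standing in for DataTransformer.

import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, SubsetRandomSampler, TensorDataset

dataset = TensorDataset(torch.randn(200, 1, 28, 28), torch.randint(0, 2, (200,)))

train_indices, val_indices = train_test_split(list(range(len(dataset))), test_size=0.15)

# SubsetRandomSampler yields exactly the given dataset indices (in random order),
# so the two loaders draw from disjoint parts of the same dataset.
train_loader = DataLoader(dataset, batch_size=16,
                          sampler=SubsetRandomSampler(train_indices))
val_loader = DataLoader(dataset, batch_size=16,
                        sampler=SubsetRandomSampler(val_indices))

For a deterministic validation order, one could instead wrap the dataset in torch.utils.data.Subset and pair it with a SequentialSampler.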
Example #9
train_dataset = DatasetRetriever(
    image_ids=train_csv.frame_no.unique(),
    marking=train_csv,
    transforms=get_valid_transforms(),
    test=False,
)
validation_dataset = DatasetRetriever(
    image_ids=test_csv.frame_no.unique(),
    marking=test_csv,
    transforms=get_valid_transforms(),
    test=True,
)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=RandomSampler(train_dataset),
    pin_memory=False,
    drop_last=True,
    num_workers=6,
    collate_fn=collate_fn,
)
val_loader = torch.utils.data.DataLoader(
    validation_dataset,
    batch_size=BATCH_SIZE,
    num_workers=6,
    shuffle=False,
    sampler=SequentialSampler(validation_dataset),
    pin_memory=False,
    collate_fn=collate_fn,
)
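
The two loaders above keep pin_memory=False. On a CUDA machine, a common refinement is pin_memory=True combined with non_blocking device copies, which lets host-to-device transfer overlap with computation. Here is a minimal sketch under that assumption; the dataset and shapes are illustrative.

import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

dataset = TensorDataset(torch.randn(256, 3, 64, 64), torch.randint(0, 2, (256,)))
loader = DataLoader(dataset,
                    batch_size=32,
                    sampler=RandomSampler(dataset),
                    num_workers=2,
                    pin_memory=True)  # batches land in page-locked host memory

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
for images, targets in loader:
    # non_blocking=True only helps when the source tensor is pinned and the
    # target is a CUDA device; otherwise it behaves like a normal copy.
    images = images.to(device, non_blocking=True)
    targets = targets.to(device, non_blocking=True)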
Example #10
def main():
    # define parser and arguments
    args = get_train_test_args()

    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained(
        "distilbert-base-uncased")
    if args.do_finetune:
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        for name, param in model.named_parameters():
            if name.startswith("distilbert.embeddings."):
                param.requires_grad = False
            for i in range(args.freeze_layer):
                if name.startswith("distilbert.transformer.layer.%s." % i):
                    param.requires_grad = False
        return
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        trainer = Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets,
                                       args.train_dir, tokenizer, 'train')
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets,
                                            args.val_dir, tokenizer, 'val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))
        best_scores = trainer.train(model, train_loader, val_loader, val_dict)
    if args.do_eval:
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        model.to(args.device)
        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets,
                                              args.eval_dir, tokenizer,
                                              split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model,
                                                   eval_loader,
                                                   eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}'
                                for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')
        # Write submission file
        sub_path = os.path.join(args.save_dir,
                                split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
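
util.set_seed(args.seed) above presumably seeds the global RNGs. An alternative (or complement) is to give RandomSampler its own seeded torch.Generator so that the shuffling order is reproducible independently of other random calls in the program; a minimal sketch with made-up data follows.

import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

dataset = TensorDataset(torch.arange(10))

# A dedicated, seeded generator fixes the order produced by the sampler.
generator = torch.Generator().manual_seed(42)
loader = DataLoader(dataset,
                    batch_size=4,
                    sampler=RandomSampler(dataset, generator=generator))

print([batch[0].tolist() for batch in loader])  # same order on every run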
Example #11
def run_training():

    #-------------------------------------------- Training settings --------------------------------------------
    out_dir = '../'  # s_xx1'
    # initial_checkpoint = None
    initial_checkpoint = '../checkpoint/best_train_model.pth'
    # pretrained_file = '../trained_models/LB=0.69565_inc3_00075000_model.pth'
    pretrained_file = None
    skip = []  #['fc.weight', 'fc.bias']

    num_iters = 1000 * 1000
    iter_smooth = 50
    iter_valid = 100  #500
    iter_log = 5
    iter_save_freq = 50
    iter_save = [0, num_iters - 1] + list(
        range(0, num_iters, 1 *
              iter_save_freq))  # first and last iters, then every iter_save_freq iters

    validation_num = 10000

    batch_size = 128  #60   #512  #96 #256
    validation_batch_size = 128
    iter_accum = 4  #2  #448//batch_size

    valid_loss = 0.0
    valid_acc = 0.0
    batch_loss = 0.0
    batch_acc = 0.0
    best_valid_acc = 0.0
    best_train_acc = 0.0
    rate = 0

    iter_time_meter = AverageMeter()
    train_loss_meter = AverageMeter()
    train_acc_meter = AverageMeter()

    j = 0  # number of iters in total
    i = 0  # number of real iters where bp is conducted

    #-----------------------------------------------------------------------------------------------------------

    ## setup  ---------------------------
    os.makedirs(out_dir + '/checkpoint/' + IDENTIFIER, exist_ok=True)
    os.makedirs(out_dir + '/backup/' + IDENTIFIER, exist_ok=True)

    log.write('\n--- [START %s] %s\n\n' %
              (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '-' * 64))
    log.write('** some experiment setting **\n')
    log.write('\tIDENTIFIER   = %s\n' % IDENTIFIER)
    log.write('\tSEED         = %u\n' % SEED)
    log.write('\tPROJECT_PATH = %s\n' % PROJECT_PATH)
    log.write('\tout_dir      = %s\n' % out_dir)

    ## net -------------------------------
    log.write('** net setting **\n')
    net = Net(in_shape=(3, CDISCOUNT_HEIGHT, CDISCOUNT_WIDTH),
              num_classes=CDISCOUNT_NUM_CLASSES)
    if use_cuda: net.cuda()
    ####
    # if 0: #freeze early layers
    #     for p in net.layer0.parameters():
    #         p.requires_grad = False
    #     for p in net.layer1.parameters():
    #         p.requires_grad = False
    #     for p in net.layer2.parameters():
    #         p.requires_grad = False
    #     for p in net.layer3.parameters():
    #         p.requires_grad = False

    log.write('%s\n\n' % (type(net)))
    # log.write('\n%s\n'%(str(net)))
    # log.write(inspect.getsource(net.__init__)+'\n')
    # log.write(inspect.getsource(net.forward )+'\n')
    log.write('\n')

    ## optimiser ----------------------------------
    #LR = StepLR([ (0, 0.01),  (200, 0.001),  (300, -1)])
    LR = StepLR([(0, 0.01), (1, 0.001), (3, 0.0001)])

    ## optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=0.0005)  ###0.0005
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()),
                          lr=0.01,
                          momentum=0.1,
                          weight_decay=0.0001)

    ## dataset ----------------------------------------
    log.write('** dataset setting **\n')
    transform_train = transforms.Compose([
        # transforms.ToTensor(): Converts a PIL.Image or numpy.ndarray (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0].
        transforms.Lambda(lambda x: train_augment(x))
    ])

    transform_valid = transforms.Compose([
        # transforms.ToTensor(): Converts a PIL.Image or numpy.ndarray (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0].
        transforms.Lambda(lambda x: valid_augment(x))
    ])

    train_dataset = CDiscountDataset(csv_dir + train_data_filename,
                                     root_dir,
                                     transform=transform_train)

    train_loader = DataLoader(
        train_dataset,
        #sampler = RandomSampler1(train_dataset,50000),
        sampler=RandomSampler(train_dataset),
        batch_size=batch_size,
        drop_last=True,
        num_workers=0,
        pin_memory=False)
    # if train_loader != None: print("Train loader loaded!")

    valid_dataset = CDiscountDataset(csv_dir + validation_data_filename,
                                     root_dir,
                                     transform=transform_valid)

    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=validation_batch_size,
                              drop_last=False,
                              num_workers=0,
                              pin_memory=False)

    # if valid_loader != None: print("Valid loader loaded!")

    # log.write('\ttrain_dataset.split = %s\n'%(train_dataset.split))
    # log.write('\tvalid_dataset.split = %s\n'%(valid_dataset.split))
    log.write('\tlen(train_dataset)  = %d\n' % (len(train_dataset)))
    log.write('\tlen(valid_dataset)  = %d\n' % (len(valid_dataset)))
    log.write('\tlen(train_loader)   = %d\n' % (len(train_loader)))
    log.write('\tlen(valid_loader)   = %d\n' % (len(valid_loader)))
    log.write('\tbatch_size  = %d\n' % (batch_size))
    log.write('\titer_accum  = %d\n' % (iter_accum))
    log.write('\tbatch_size*iter_accum  = %d\n' % (batch_size * iter_accum))
    # log.write('\n')

    # log.write(inspect.getsource(train_augment)+'\n')
    # log.write(inspect.getsource(valid_augment)+'\n')
    # log.write('\n')
    ####

    # if 0:  ## check data
    #     check_dataset(train_dataset, train_loader)
    #     exit(0)

    ## resume from previous ----------------------------------
    start_iter = 0
    start_epoch = 0.
    if initial_checkpoint is not None:  # load a checkpoint and resume from previous training
        log.write('\tloading @ initial_checkpoint = %s\n' % initial_checkpoint)

        # load
        if os.path.isfile(initial_checkpoint):
            print("=> loading checkpoint '{}'".format(initial_checkpoint))
            checkpoint = torch.load(initial_checkpoint)
            start_epoch = checkpoint['epoch']
            start_iter = checkpoint['iter']
            best_train_acc = checkpoint['best_train_acc']
            best_valid_acc = checkpoint['best_valid_acc']
            net.load_state_dict(checkpoint['state_dict']
                                )  # load model weights from the checkpoint
            optimizer.load_state_dict(checkpoint['optimizer'])

            # net.load_state_dict(checkpoint)
            log.write(
                "=> loaded checkpoint '{}' (epoch: {}, iter: {}, best_train_acc: {}, best_valid_acc: {})"
                .format(initial_checkpoint, start_epoch, start_iter,
                        best_train_acc, best_valid_acc))
        else:
            print("=> no checkpoint found at '{}'".format(initial_checkpoint))
            exit(0)

    elif pretrained_file is not None:  # load a pretrained model and train from the beginning
        log.write('\tloading @ pretrained_file = %s\n' % pretrained_file)
        net.load_pretrain_pytorch_file(pretrained_file, skip)

    ## start training here! ##############################################
    log.write('** start training here! **\n')

    log.write('\toptimizer=%s\n' % str(optimizer))
    # log.write(' LR=%s\n\n'%str(LR) )
    log.write(
        '   rate   iter   epoch  | valid_loss/acc | train_loss/acc | batch_loss/acc | total time | avg iter time | i j |\n'
    )
    log.write(
        '----------------------------------------------------------------------------------------------------------------\n'
    )

    # Custom setting
    # start_iter = 75000
    # start_epoch= 2.98
    i = start_iter

    start = timer()
    end = time.time()
    while i < num_iters:
        net.train()
        optimizer.zero_grad()
        ##############################
        # for images, labels, indices in train_loader:
        #for images, labels in train_loader:#delete indices for testing
        ################################
        #print("start iteration")
        for k, data in enumerate(train_loader, 0):
            images, labels = data

            i = j / iter_accum + start_iter
            epoch = (i - start_iter) * batch_size * iter_accum / len(
                train_dataset) + start_epoch

            if i % iter_log == 0:
                # print('\r',end='',flush=True)
                log.write('\r%0.4f  %5.1f k   %4.2f  | %0.4f  %0.4f | %0.4f  %0.4f | %0.4f  %0.4f | %5.0f min | %5.2f s | %d,%d \n' % \
                        (rate, i/1000, epoch, valid_loss, valid_acc, train_loss_meter.avg, train_acc_meter.avg, batch_loss, batch_acc,(timer() - start)/60,
                            iter_time_meter.avg, i, j))

            #if 1:
            if i in iter_save and i != start_iter:
                # torch.save(net.state_dict(),out_dir +'/checkpoint/%08d_model.pth'%(i))
                # torch.save({
                #     'optimizer': optimizer.state_dict(),
                #     'iter'     : i,
                #     'epoch'    : epoch,
                #     'state_dict': net.state_dict(),
                #     'best_valid_acc': best_valid_acc
                # }, out_dir +'/checkpoint/%08d_model.pth'%(i))
                save_checkpoint(optimizer, i, epoch, net, best_valid_acc,
                                best_train_acc, out_dir,
                                '%08d_model.pth' % (i))

            if i % iter_valid == 0 and i != start_iter:
                net.eval()
                valid_loss, valid_acc = evaluate(net, valid_loader,
                                                 validation_num)
                net.train()

                # update the best valid_acc and the best model
                if valid_acc > best_valid_acc:
                    best_valid_acc = valid_acc

                    # update best model on validation set
                    # torch.save(net.state_dict(), out_dir + '/checkpoint/best_model.pth')
                    save_checkpoint(optimizer, i, epoch, net, best_valid_acc,
                                    best_train_acc, out_dir,
                                    "best_val_model.pth")
                    log.write(
                        "=> Best validation model updated: iter %d, validation acc %f\n"
                        % (i, best_valid_acc))

            # learning rate scheduler -------------
            lr = LR.get_rate(epoch)
            if lr < 0: break
            adjust_learning_rate(optimizer, lr / iter_accum)
            rate = get_learning_rate(optimizer)[0] * iter_accum

            end = time.time()
            # one iteration update  -------------
            images = Variable(images.type(
                torch.FloatTensor)).cuda() if use_cuda else Variable(
                    images.type(torch.FloatTensor))
            labels = Variable(labels).cuda() if use_cuda else Variable(labels)
            logits = net(images)
            probs = F.softmax(logits, dim=1)
            loss = F.cross_entropy(logits, labels)
            batch_loss = loss.item()  # loss.data[0] is a pre-0.4 idiom and fails on current PyTorch
            train_loss_meter.update(batch_loss)

            ####
            # loss = FocalLoss()(logits, labels)  #F.cross_entropy(logits, labels)
            # acc  = top_accuracy(probs, labels, top_k=(1,))
            ####

            # optimizer.zero_grad()
            # loss.backward()
            # optimizer.step()

            # accumulate gradients
            loss.backward()

            ## update gradients every iter_accum
            if j % iter_accum == 0:
                #torch.nn.utils.clip_grad_norm(net.parameters(), 1)
                #print("optim step")
                optimizer.step()
                optimizer.zero_grad()

            # measure elapsed time
            iter_time_meter.update(time.time() - end)

            # print statistics  ------------
            batch_acc = get_accuracy(probs, labels)
            train_acc_meter.update(batch_acc)

            if i % iter_smooth == 0:  # reset train stats every iter_smooth iters
                if train_acc_meter.avg > best_train_acc:
                    best_train_acc = train_acc_meter.avg
                    # update best model on train set
                    save_checkpoint(optimizer, i, epoch, net, best_valid_acc,
                                    best_train_acc, out_dir,
                                    "best_train_model.pth")
                    log.write(
                        "=> Best train model updated: iter %d, train acc %f\n"
                        % (i, best_train_acc))

                train_loss_meter = AverageMeter()
                train_acc_meter = AverageMeter()


            print('\r%0.4f  %5.1f k   %4.2f  | %0.4f  %0.4f | %0.4f  %0.4f | %0.4f  %0.4f | %5.0f min | %5.2f s | %d,%d' % \
                    (rate, i/1000, epoch, valid_loss, valid_acc, train_loss_meter.avg, train_acc_meter.avg, batch_loss, batch_acc,(timer() - start)/60, iter_time_meter.avg, i, j),\
                    end='',flush=True)
            j = j + 1
        pass  #-- end of one data loader --
    pass  #-- end of all iterations --

    ## check : load model and re-test
    if 1:
        # torch.save(net.state_dict(),out_dir +'/checkpoint/%d_model.pth'%(i))
        # torch.save({
        #     'optimizer': optimizer.state_dict(),
        #     'iter'     : i,
        #     'epoch'    : epoch,
        # }, out_dir +'/checkpoint/%d_optimizer.pth'%(i))
        save_checkpoint(optimizer, i, epoch, net, best_valid_acc,
                        best_train_acc, out_dir, '%d_optimizer.pth' % (i))

    log.write('\n')
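
run_training above accumulates gradients over iter_accum mini-batches before each optimizer step and compensates by dividing the learning rate by iter_accum. A stripped-down sketch of the same pattern follows; here the loss is divided by the accumulation factor instead, which for plain SGD has a similar averaging effect. The model and data are toys.

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

accum_steps = 4  # plays the role of iter_accum above
model = nn.Linear(16, 10)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
criterion = nn.CrossEntropyLoss()

dataset = TensorDataset(torch.randn(512, 16), torch.randint(0, 10, (512,)))
loader = DataLoader(dataset, batch_size=32,
                    sampler=RandomSampler(dataset), drop_last=True)

optimizer.zero_grad()
for step, (x, y) in enumerate(loader):
    loss = criterion(model(x), y) / accum_steps  # average over the accumulated batches
    loss.backward()                              # gradients add up across iterations
    if (step + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()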
Example #12
def main(args, logger):
    trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')

    gkf = GroupKFold(n_splits=5).split(X=trn_df.question_body,
                                       groups=trn_df.question_body)

    histories = {
        'trn_loss': [],
        'val_loss': [],
        'val_metric': [],
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            continue
        fold_trn_df = trn_df.iloc[trn_idx]
        fold_val_df = trn_df.iloc[val_idx]
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        tokens = []

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = soft_binary_cross_entropy
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=30,
            pretrained_model_name_or_path=MODEL_PRETRAIN,
        )
        # model.resize_token_embeddings(len(trn_dataset.tokenizer))
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            model = model.to(DEVICE)
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader)
            val_loss, val_metric, val_y_preds, val_y_trues, val_qa_ids = test(
                model, val_loader)

            scheduler.step()
            histories['trn_loss'].append(trn_loss)
            histories['val_loss'].append(val_loss)
            histories['val_metric'].append(val_metric)
            sel_log(
                f'epoch : {epoch} -- fold : {fold} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f}', logger)
            model = model.to('cpu')
            save_checkpoint(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', model,
                            optimizer, scheduler, histories, val_y_preds,
                            val_y_trues, val_qa_ids, fold, epoch, val_loss,
                            val_metric)
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer)
        del model
    sel_log('now saving best checkpoints...', logger)
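
The save_checkpoint and load_checkpoint helpers used above are project-specific and not shown in the snippet. Below is a minimal sketch of what such helpers typically persist (model, optimizer and scheduler state dicts plus bookkeeping), assuming plain torch.save/torch.load; the keys and the filename are illustrative.

import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(8, 1)
optimizer = optim.Adam(model.parameters(), lr=3e-5)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-5)

# Save everything needed to resume mid-training.
torch.save({
    'fold': 0,
    'epoch': 3,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
}, 'checkpoint.pth')

# ...later: restore the states and continue from the saved epoch.
checkpoint = torch.load('checkpoint.pth', map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
start_epoch = checkpoint['epoch'] + 1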
Example #13
def training(model_name, model_type, optimizer_name, lr_scheduler_name, lr,
             batch_size, valid_batch_size, num_epoch, start_epoch,
             accumulation_steps, train_data_folder, checkpoint_folder,
             train_split, val_split, fold, load_pretrain):

    COMMON_STRING = '@%s:  \n' % os.path.basename(__file__)
    COMMON_STRING += '\tset random seed\n'
    COMMON_STRING += '\t\tSEED = %d\n' % SEED

    torch.backends.cudnn.benchmark = False  # disable the cudnn auto-tuner; benchmark mode picks the fastest convolution algorithms but is non-deterministic
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.deterministic = True

    COMMON_STRING += '\tset cuda environment\n'
    COMMON_STRING += '\t\ttorch.__version__              = %s\n' % torch.__version__
    COMMON_STRING += '\t\ttorch.version.cuda             = %s\n' % torch.version.cuda
    COMMON_STRING += '\t\ttorch.backends.cudnn.version() = %s\n' % torch.backends.cudnn.version(
    )
    try:
        COMMON_STRING += '\t\tos[\'CUDA_VISIBLE_DEVICES\']     = %s\n' % os.environ[
            'CUDA_VISIBLE_DEVICES']
        NUM_CUDA_DEVICES = len(os.environ['CUDA_VISIBLE_DEVICES'].split(','))
    except Exception:
        COMMON_STRING += '\t\tos[\'CUDA_VISIBLE_DEVICES\']     = None\n'
        NUM_CUDA_DEVICES = 1

    COMMON_STRING += '\t\ttorch.cuda.device_count()      = %d\n' % torch.cuda.device_count(
    )
    COMMON_STRING += '\n'

    os.makedirs(checkpoint_folder + '/' + model_type + '/' + model_name,
                exist_ok=True)

    log = Logger()
    log.open(checkpoint_folder + '/' + model_type + '/' + model_name + '/' +
             model_name + '_fold_' + str(fold) + '_log_train.txt',
             mode='a+')
    log.write('\t%s\n' % COMMON_STRING)
    log.write('\n')

    log.write('\tSEED         = %u\n' % SEED)
    log.write('\tPROJECT_PATH = %s\n' % train_data_folder)
    log.write('\t__file__     = %s\n' % __file__)
    log.write('\tout_dir      = %s\n' % checkpoint_folder)
    log.write('\n')

    ## dataset ----------------------------------------
    log.write('** dataset setting **\n')

    train_dataset = URESDataset(
        data_dir=train_data_folder,
        mode='train',
        csv=[
            'train.csv',
        ],
        split=train_split,
        augment=transform_train,
        size=(1024, 1024),
    )
    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=batch_size,
                                  drop_last=True,
                                  num_workers=4,
                                  pin_memory=True,
                                  collate_fn=null_collate)

    valid_dataset = URESDataset(
        data_dir=train_data_folder,
        mode='train',
        csv=[
            'train.csv',
        ],
        split=val_split,
        augment=transform_valid,
        size=(1024, 1024),
    )
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=SequentialSampler(valid_dataset),
                                  batch_size=valid_batch_size,
                                  drop_last=False,
                                  num_workers=4,
                                  pin_memory=True,
                                  collate_fn=null_collate)

    log.write('train_dataset : \n%s\n' % (train_dataset))
    log.write('valid_dataset : \n%s\n' % (valid_dataset))
    log.write('\n')

    ############################################################################## define unet model with backbone
    def load(model, pretrain_file, skip=[]):
        pretrain_state_dict = torch.load(pretrain_file)
        state_dict = model.state_dict()
        keys = list(state_dict.keys())
        for key in keys:
            if any(s in key for s in skip): continue
            try:
                state_dict[key] = pretrain_state_dict[key]
            except:
                print(key)
        model.load_state_dict(state_dict)

        return model

    def get_deeplab_model(model_name="deep_se101", in_channel=3, num_classes=1, criterion=SoftDiceLoss_binary(), \
            load_pretrain=False, checkpoint_filepath=None):

        if model_name == 'deep_se50':
            model = DeepSRNX50V3PlusD_m1(in_channel=in_channel,
                                         num_classes=num_classes,
                                         criterion=criterion)
        elif model_name == 'deep_se101':
            model = DeepSRNX101V3PlusD_m1(in_channel=in_channel,
                                          num_classes=num_classes,
                                          criterion=criterion)
        elif model_name == 'WideResnet38':
            model = DeepWR38V3PlusD_m1(in_channel=in_channel,
                                       num_classes=num_classes,
                                       criterion=criterion)
        elif model_name == 'unet_ef3':
            model = EfficientNet_3_unet()
        elif model_name == 'unet_ef5':
            model = EfficientNet_5_unet()
        else:
            print('No model name in it')
            model = None

        if (load_pretrain):
            model = load(model, checkpoint_filepath)

        return model

    def get_unet_model(model_name="efficientnet-b3", IN_CHANNEL=3, NUM_CLASSES=1, \
            WIDTH=MASK_WIDTH, HEIGHT=MASK_HEIGHT, load_pretrain=False, checkpoint_filepath=None):

        model = model_iMet(model_name, IN_CHANNEL, NUM_CLASSES, WIDTH, HEIGHT)

        if (load_pretrain):
            model.load_pretrain(checkpoint_filepath)

        return model

    def get_aspp_model(model_name="efficientnet-b3",
                       NUM_CLASSES=1,
                       load_pretrain=False,
                       checkpoint_filepath=None):

        model = Net(model_name, IN_CHANNEL, NUM_CLASSES, WIDTH, HEIGHT)
        if (load_pretrain):
            state_dict = torch.load(checkpoint_filepath,
                                    map_location=lambda storage, loc: storage)
            model.load_state_dict(state_dict, strict=True)

        return model

    ############################################################################### training parameters
    checkpoint_filename = model_type + '/' + model_name + '/' + model_name + "_" + model_type + '_fold_' + str(
        fold) + "_checkpoint.pth"
    checkpoint_filepath = os.path.join(checkpoint_folder, checkpoint_filename)

    ############################################################################### model and optimizer
    if model_type == 'unet':
        model = get_unet_model(model_name=model_name, IN_CHANNEL=3, NUM_CLASSES=NUM_CLASS, \
            WIDTH=MASK_WIDTH, HEIGHT=MASK_HEIGHT, load_pretrain=load_pretrain, checkpoint_filepath=checkpoint_filepath)
    elif model_type == 'deeplab':
        model = get_deeplab_model(model_name=model_name, in_channel=3, num_classes=NUM_CLASS, \
            criterion=BCEDiceLoss(), load_pretrain=load_pretrain, checkpoint_filepath=checkpoint_filepath)
    elif model_type == 'aspp':
        model = get_aspp_model(model_name=model_name, NUM_CLASSES=NUM_CLASS, \
            load_pretrain=load_pretrain, checkpoint_filepath=checkpoint_filepath)

    model = model.cuda()

    if optimizer_name == "Adam":
        if model_type != 'deeplab':
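            # differential learning rates: full lr (with weight decay) for the decoder,
            # and 5% of lr for the pretrained encoder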
            optimizer = torch.optim.Adam([{
                'params': model.decoder.parameters(),
                'lr': lr,
                'weight_decay': 0.01
            }, {
                'params': model.encoder.parameters(),
                'lr': lr * 0.05
            }])
        else:
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    elif optimizer_name == "adamonecycle":
        flatten_model = lambda m: (sum(map(flatten_model, m.children()), [])
                                   if num_children(m) else [m])
        get_layer_groups = lambda m: [nn.Sequential(*flatten_model(m))]

        optimizer_func = partial(optim.Adam, betas=(0.9, 0.99))
        optimizer = OptimWrapper.create(optimizer_func,
                                        3e-3,
                                        get_layer_groups(model),
                                        wd=1e-4,
                                        true_wd=True,
                                        bn_wd=True)
    elif optimizer_name == "Ranger":
        if model_type != 'deeplab':
            optimizer = Ranger([{
                'params': model.decoder.parameters(),
                'lr': lr,
                'weight_decay': 0.01
            }, {
                'params': model.encoder.parameters(),
                'lr': lr * 0.05
            }])
        else:
            optimizer = Ranger(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr,
                               weight_decay=1e-5)
    else:
        raise NotImplementedError

    if lr_scheduler_name == "adamonecycle":
        scheduler = lsf.OneCycle(optimizer,
                                 len(train_dataset) * num_epoch, lr,
                                 [0.95, 0.85], 10.0, 0.4)
        lr_scheduler_each_iter = True
    elif lr_scheduler_name == "CosineAnealing":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               num_epoch,
                                                               eta_min=0,
                                                               last_epoch=-1)
        lr_scheduler_each_iter = False
    elif lr_scheduler_name == "WarmRestart":
        scheduler = WarmRestart(optimizer, T_max=5, T_mult=1, eta_min=1e-6)
        lr_scheduler_each_iter = False
    else:
        raise NotImplementedError

    log.write('net\n  %s\n' % (model_name))
    log.write('optimizer\n  %s\n' % (optimizer_name))
    log.write('scheduler\n  %s\n' % (lr_scheduler_name))
    log.write('\n')

    # mix precision
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    ############################################################################### training
    log.write('** start training here! **\n')
    log.write('   batch_size=%d,  accumulation_steps=%d\n' %
              (batch_size, accumulation_steps))
    log.write('   experiment  = %s\n' % str(__file__.split('/')[-2:]))

    valid_loss = np.zeros(3, np.float32)
    train_loss = np.zeros(3, np.float32)
    valid_metric_optimal = np.inf
    eval_step = len(train_dataloader)  # evaluate once per epoch
    log_step = 10
    eval_count = 0

    # define tensorboard writer and timer
    writer = SummaryWriter()
    start_timer = timer()

    # define criterion
    criterion = BCEDiceLoss()
    metric = FscoreMetric(activation=None)

    for epoch in range(1, num_epoch + 1):

        torch.cuda.empty_cache()

        # update lr and start from start_epoch
        # if (not lr_scheduler_each_iter):
        #     if epoch < 600:
        #         if epoch != 0:
        #             scheduler.step()
        #             scheduler = warm_restart(scheduler, T_mult=2)
        #     elif epoch > 600 and epoch < 800:
        #         optimizer.param_groups[0]['lr'] = 1e-5
        #     else:
        #         optimizer.param_groups[0]['lr'] = 5e-6
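        # NOTE: CosineAnnealingWarmUpRestarts is assumed here to be a project-specific
        # helper that returns a scalar LR multiplier for the current epoch (warm-up for
        # the first 15 epochs, then cosine decay with restarts), not the torch.optim
        # scheduler of the same name.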

        affect_rate = CosineAnnealingWarmUpRestarts(
            epoch,
            T_0=num_epoch,
            T_warmup=15,
            gamma=0.8,
        )
        optimizer.param_groups[0]['lr'] = affect_rate * lr

        if epoch < 100:
            optimizer.param_groups[0]['lr'] = affect_rate * lr
        elif epoch < 150:
            lr = 4e-4
            optimizer.param_groups[0]['lr'] = affect_rate * lr
        else:
            lr = 1e-4
            optimizer.param_groups[0]['lr'] = affect_rate * lr

        # optimizer.param_groups[0]['lr'] = rate * lr
        # optimizer.param_groups[1]['lr'] = rate * lr * 0.01

        if (epoch < start_epoch):
            continue

        log.write("Epoch%s\n" % epoch)
        log.write('\n')

        for param_group in optimizer.param_groups:
            rate = param_group['lr']

        sum_train_loss = np.zeros_like(train_loss)
        sum_train = np.zeros_like(train_loss)

        seed_everything(SEED + epoch)
        torch.cuda.empty_cache()
        optimizer.zero_grad()

        for tr_batch_i, (X, truth_mask) in enumerate(train_dataloader):

            if (lr_scheduler_each_iter):
                scheduler.step(tr_batch_i)

            model.train()

            X = X.cuda().float()
            truth_mask = truth_mask.cuda()
            prediction = model(X)  # [N, C, H, W]
            # loss = criterion_mask(prediction, truth_mask, weight=None)
            loss = criterion(prediction, truth_mask)

            with amp.scale_loss(loss / accumulation_steps,
                                optimizer) as scaled_loss:
                scaled_loss.backward()

            #loss.backward()
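            # gradient accumulation: step the optimizer only every
            # accumulation_steps mini-batches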

            if ((tr_batch_i + 1) % accumulation_steps == 0):
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_norm=5.0,
                                               norm_type=2)
                optimizer.step()
                optimizer.zero_grad()

                writer.add_scalar(
                    'train_loss_' + str(fold), loss.item(),
                    (epoch - 1) * len(train_dataloader) * batch_size +
                    tr_batch_i * batch_size)

            # print statistics  --------
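            # F-score is computed separately for the foreground mask (positive)
            # and its complement (negative / background)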

            # probability_mask  = prediction
            probability_mask = torch.sigmoid(prediction)
            mask_positive = torch.where(truth_mask > 0.5,
                                        torch.ones_like(truth_mask),
                                        truth_mask)
            mask_negative = 1 - mask_positive
            fscore_positive = metric(probability_mask, mask_positive)
            fscore_negative = metric(1 - probability_mask, mask_negative)

            # probability_mask  = torch.sigmoid(prediction)
            # mask_positive = np.where(truth_mask.clone().detach().cpu().numpy().flatten() > 0, 1, 0)
            # mask_negative = 1 - mask_positive
            # mask_pred_positive = np.where(probability_mask.detach().clone().cpu().numpy().flatten() > 0.5, 1, 0)
            # mask_pred_negative = 1 - mask_pred_positive
            # fscore_positive = f1_score(mask_positive, mask_pred_positive)
            # fscore_negative = f1_score(mask_negative, mask_pred_negative)

            l = np.array(
                [loss.item() * batch_size, fscore_positive, fscore_negative])
            n = np.array([batch_size])
            sum_train_loss = sum_train_loss + l
            sum_train = sum_train + n

            # log for training
            if (tr_batch_i + 1) % log_step == 0:
                train_loss = sum_train_loss / (sum_train + 1e-12)
                sum_train_loss[...] = 0
                sum_train[...] = 0
                log.write('lr: %f train loss: %f fscore_positive: %f fscore_negative: %f\n' % \
                    (rate, train_loss[0], train_loss[1], train_loss[2]))

            if (tr_batch_i + 1) % eval_step == 0:

                eval_count += 1

                valid_loss = np.zeros(3, np.float32)
                valid_num = np.zeros_like(valid_loss)
                valid_metric = []

                with torch.no_grad():

                    torch.cuda.empty_cache()

                    for val_batch_i, (
                            X, truth_mask) in enumerate(valid_dataloader):

                        model.eval()

                        X = X.cuda().float()
                        truth_mask = truth_mask.cuda()
                        prediction = model(X)  # [N, C, H, W]

                        # loss = criterion_mask(prediction, truth_mask, weight=None)
                        loss = criterion(prediction, truth_mask)

                        writer.add_scalar(
                            'val_loss_' + str(fold), loss.item(),
                            (eval_count - 1) * len(valid_dataloader) *
                            valid_batch_size + val_batch_i * valid_batch_size)

                        # print statistics  --------

                        # probability_mask  = prediction
                        probability_mask = torch.sigmoid(prediction)
                        mask_positive = torch.where(
                            truth_mask > 0.5, torch.ones_like(truth_mask),
                            truth_mask)
                        mask_negative = 1 - mask_positive
                        fscore_positive = metric(probability_mask,
                                                 mask_positive)
                        fscore_negative = metric(1 - probability_mask,
                                                 mask_negative)

                        # if (epoch == 1) and (val_batch_i == 0):
                        #     predict = probability_mask[0, :, :].detach().squeeze().cpu().numpy()
                        #     predict = predict > 0.5 # Threshould
                        #     predict = (1 - predict)*255
                        #     cv2.imwrite('result/0_0.tiff', predict.astype(np.uint8))

                        # probability_mask  = torch.sigmoid(prediction)
                        # mask_positive = np.where(truth_mask.clone().detach().cpu().numpy().flatten() > 0, 1, 0)
                        # mask_negative = 1 - mask_positive
                        # mask_pred_positive = np.where(probability_mask.detach().clone().cpu().numpy().flatten() > 0.5, 1, 0)
                        # mask_pred_negative = 1 - mask_pred_positive
                        # fscore_positive = f1_score(mask_positive, mask_pred_positive)
                        # fscore_negative = f1_score(mask_negative, mask_pred_negative)

                        #---
                        l = np.array([
                            loss.item() * valid_batch_size, fscore_positive,
                            fscore_negative
                        ])
                        n = np.array([valid_batch_size])
                        valid_loss = valid_loss + l
                        valid_num = valid_num + n

                    valid_loss = valid_loss / valid_num

                    log.write('validation loss: %f fscore_positive: %f fscore_negative: %f\n' % \
                        (valid_loss[0], valid_loss[1], valid_loss[2]))

        val_metric_epoch = valid_loss[0]

        if (val_metric_epoch <= valid_metric_optimal):

            log.write('Validation metric improved ({:.6f} --> {:.6f}).  Saving model ...'.format(\
                    valid_metric_optimal, val_metric_epoch))

            valid_metric_optimal = val_metric_epoch
            torch.save(model.state_dict(), checkpoint_filepath)
Example #14
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    tst_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/test.csv')
    trn_df = pd.concat([trn_df, tst_df], axis=0).fillna(-1)
    trn_df['is_original'] = 1
    # raw_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/raw_pseudo_tst_df.csv')
    # half_opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/half_opt_pseudo_tst_df.csv')
    # opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/opt_pseudo_tst_df.csv')

    # clean texts
    # trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer'])

    # load additional tokens
    # with open('./mnt/inputs/nes_info/trn_over_10_vocab.pkl', 'rb') as fin:
    #     additional_tokens = pickle.load(fin)

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold > 0:
            break
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            trn_df = trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]  #  + additional_tokens

        fold_trn_df = trn_df.drop(['is_original', 'question_body_le'], axis=1)
        # fold_trn_df = pd.concat([fold_trn_df, raw_pseudo_df, opt_pseudo_df, half_opt_pseudo_df], axis=0)

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[SEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
            use_category=False,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        model = BertForMaskedLM.from_pretrained('bert-base-uncased')

        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue

            # model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch_ML(model, optimizer, trn_loader, DEVICE)

            scheduler.step()
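            # this LM fine-tuning loop runs no separate validation pass, so
            # trn_loss is also stored in the val_* history slots as a placeholder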
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(trn_loss)
            else:
                histories['val_loss'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(trn_loss)
            else:
                histories['val_metric'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(trn_loss)
            else:
                histories['val_metric_raws'][fold] = [
                    trn_loss,
                ]

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- ',
                logger)
            model = model.to('cpu')
            # model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                [],
                [],
                [],
                fold,
                epoch,
                trn_loss,
                trn_loss,
            )
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer,
                                      clean=False)
        del model

    send_line_notification('fini!')

    sel_log('now saving best checkpoints...', logger)
Example #15
def mnist(argv=None):
    args = parse_arguments(argv)
    use_cuda = args.cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    # Train
    # -----
    train_source = datasets.MNIST(
        '../data',
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ]),
    )
    train_sampler = RandomSampler(train_source)
    train_loader = torch.utils.data.DataLoader(
        train_source,
        batch_size=args.batch_size,
        shuffle=True,
        # sampler=train_sampler,
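        # note: DataLoader rejects a custom sampler combined with shuffle=True,
        # so train_sampler stays unused here and is only handed to TrainClassifier below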
        **kwargs
    )

    # Test
    # ----
    test_source = datasets.MNIST(
        '../data',
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])
    )

    test_loader = torch.utils.data.DataLoader(
        test_source,
        batch_size=args.test_batch_size,
        shuffle=False,
        sampler=RandomSampler(test_source),
        **kwargs
    )

    model = Net().to(device)
    optimizer = optim.SGD(
        model.parameters(),
        lr=args.lr,
        momentum=args.momentum
    )

    trainer = TrainClassifier(
        optimizer,
        nn.NLLLoss(reduction='sum'),
        model,
        train_sampler,
        device)

    trainer.fit(args.epochs, train_loader)
    result = trainer.eval_model(test_loader)

    print(f'Eval (acc: {result.acc * 100}) (loss: {result.loss})')
Example #16
def run(fold, df, meta_features, n_meta_features, transforms_train,
        transforms_val, mel_idx):
    if args.DEBUG:
        args.n_epochs = 5
        df_train = df[df['fold'] != fold].sample(args.batch_size * 5)
        df_valid = df[df['fold'] == fold].sample(args.batch_size * 5)
    else:
        df_train = df[df['fold'] != fold]
        df_valid = df[df['fold'] == fold]

    dataset_train = MelanomaDataset(df_train,
                                    'train',
                                    meta_features,
                                    transform=transforms_train)
    dataset_valid = MelanomaDataset(df_valid,
                                    'valid',
                                    meta_features,
                                    transform=transforms_val)
    train_loader = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=args.batch_size,
        sampler=RandomSampler(dataset_train),
        num_workers=args.num_workers)  # random sampling without replacement
    valid_loader = torch.utils.data.DataLoader(dataset_valid,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers)
    model = ModelClass(
        args.enet_type,
        n_meta_features=n_meta_features,
        n_meta_dim=[int(nd) for nd in args.n_meta_dim.split(',')],
        out_dim=args.out_dim,
        pretrained=True)
    if DP:
        model = apex.parallel.convert_syncbn_model(model)
    model = model.to(device)

    auc_max = 0.
    auc_20_max = 0.
    model_file = os.path.join(args.model_dir,
                              f'{args.kernel_type}_best_fold{fold}.pth')
    model_file2 = os.path.join(args.model_dir,
                               f'{args.kernel_type}_best_20_fold{fold}.pth')
    model_file3 = os.path.join(args.model_dir,
                               f'{args.kernel_type}_final_fold{fold}.pth')

    optimizer = optim.Adam(model.parameters(), lr=args.init_lr)
    if args.use_amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    if DP:
        model = nn.DataParallel(model)
    #     scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.n_epochs - 1)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, args.n_epochs - 1)
    scheduler_warmup = GradualWarmupSchedulerV2(
        optimizer,
        multiplier=10,
        total_epoch=1,
        after_scheduler=scheduler_cosine)

    print(len(dataset_train), len(dataset_valid))
    for epoch in range(1, args.n_epochs + 1):
        print(time.ctime(), f'Epoch {epoch}', f'Fold {fold}')
        #         scheduler_warmup.step(epoch - 1)

        train_loss = train_epoch(model, train_loader, optimizer)
        val_loss, acc, auc, auc_20 = val_epoch(
            model, valid_loader, mel_idx, is_ext=df_valid['is_ext'].values)

        content = time.ctime(
        ) + ' ' + f'Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {train_loss:.5f}, valid loss: {(val_loss):.5f}, acc: {(acc):.4f}, auc: {(auc):.6f}, auc_20: {(auc_20):.6f}.'
        print(content)
        with open(os.path.join(args.log_dir, f'log_{args.kernel_type}.txt'),
                  'a') as appender:
            appender.write(content + '\n')

        scheduler_warmup.step()
        if epoch == 2: scheduler_warmup.step()  # bug workaround

        if auc > auc_max:
            print('auc_max ({:.6f} --> {:.6f}). Saving model ...'.format(
                auc_max, auc))
            torch.save(model.state_dict(), model_file)
            auc_max = auc
        if auc_20 > auc_20_max:
            print('auc_20_max ({:.6f} --> {:.6f}). Saving model ...'.format(
                auc_20_max, auc_20))
            torch.save(model.state_dict(), model_file2)
            auc_20_max = auc_20

    torch.save(model.state_dict(), model_file3)
Example #17
def main():
    # define parser and arguments
    args = get_train_test_args()

    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained(
        args.model_checkpoint)
    tokenizer = DistilBertTokenizerFast.from_pretrained(args.model_checkpoint)
    with wandb.init(project="qa-system", config=args) as run:
        run.name = args.run_name
        wandb.watch(model)
        if args.do_train:
            if not os.path.exists(args.save_dir):
                os.makedirs(args.save_dir)
            args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
            log = util.get_logger(args.save_dir, 'log_train')
            log.info(
                f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
            log.info("Preparing Training Data...")
            args.device = torch.device(
                'cuda') if torch.cuda.is_available() else torch.device('cpu')
            trainer = Trainer(args, log)
            train_dataset, _ = get_dataset(args, args.train_datasets,
                                           args.train_dir, tokenizer, 'train')
            log.info("Preparing Validation Data...")
            val_dataset, val_dict = get_dataset(args, args.val_datasets,
                                                args.val_dir, tokenizer, 'val')
            train_loader = DataLoader(train_dataset,
                                      batch_size=args.batch_size,
                                      sampler=RandomSampler(train_dataset))
            val_loader = DataLoader(val_dataset,
                                    batch_size=args.batch_size,
                                    sampler=SequentialSampler(val_dataset))
            best_scores = trainer.train(model, train_loader, val_loader,
                                        val_dict)
            model_artifact = wandb.Artifact(
                args.run_name,
                type="model",
            )
            model_artifact.add_dir(os.path.join(args.save_dir, 'checkpoint'))
            run.log_artifact(model_artifact)

        if args.do_eval:
            args.device = torch.device(
                'cuda') if torch.cuda.is_available() else torch.device('cpu')
            split_name = 'test' if 'test' in args.eval_dir else 'validation'
            log = util.get_logger(args.save_dir, f'log_{split_name}')
            trainer = Trainer(args, log)
            if args.checkpoint_path != "":
                model = DistilBertForQuestionAnswering.from_pretrained(
                    args.checkpoint_path)
            else:
                checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
                model = DistilBertForQuestionAnswering.from_pretrained(
                    checkpoint_path)
            model.to(args.device)
            eval_dataset, eval_dict = get_dataset(args, args.eval_datasets,
                                                  args.eval_dir, tokenizer,
                                                  split_name)
            eval_loader = DataLoader(eval_dataset,
                                     batch_size=args.batch_size,
                                     sampler=SequentialSampler(eval_dataset))
            eval_preds, eval_scores = trainer.evaluate(model,
                                                       eval_loader,
                                                       eval_dict,
                                                       return_preds=True,
                                                       split=split_name)
            results_str = ', '.join(f'{k}: {v:05.2f}'
                                    for k, v in eval_scores.items())
            log.info(f'Eval {results_str}')
            # Write submission file
            sub_path = os.path.join(args.save_dir,
                                    split_name + '_' + args.sub_file)
            log.info(f'Writing submission file to {sub_path}...')
            with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
                csv_writer = csv.writer(csv_fh, delimiter=',')
                csv_writer.writerow(['Id', 'Predicted'])
                for uuid in sorted(eval_preds):
                    csv_writer.writerow([uuid, eval_preds[uuid]])
Example #18
def main():
    args = parser.parse_args()

    log_out_dir = os.path.join(RESULT_DIR, 'logs', args.out_dir,
                               'fold%d' % args.fold)
    if not os.path.exists(log_out_dir):
        os.makedirs(log_out_dir)
    log = Logger()
    log.open(os.path.join(log_out_dir, 'log.train.txt'), mode='a')

    model_out_dir = os.path.join(RESULT_DIR, 'models', args.out_dir,
                                 'fold%d' % args.fold)
    log.write(">> Creating directory if it does not exist:\n>> '{}'\n".format(
        model_out_dir))
    if not os.path.exists(model_out_dir):
        os.makedirs(model_out_dir)

    # set cuda visible device
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    cudnn.benchmark = True
    # cudnn.enabled = False

    # set random seeds
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    np.random.seed(0)

    model_params = {}
    model_params['architecture'] = args.arch
    model_params['num_classes'] = args.num_classes
    model_params['in_channels'] = args.in_channels
    if 'efficientnet' in args.arch:
        model_params['image_size'] = args.img_size
        model_params['encoder'] = args.effnet_encoder
    model = init_network(model_params)

    if args.load_state_dict_path is not None:
        init_pretrained = torch.load(args.load_state_dict_path)
        model.load_state_dict(init_pretrained['state_dict'])
    # state_dict = model.state_dict()
    # torch.save({
    #     'state_dict': state_dict
    # }, '../output/densenet121_bestfitting_converted_classes.h5')
    # sys.exit(0)
    # move network to gpu
    # model = DataParallel(model)

    if args.clip_and_replace_grad_explosures:

        def clip_and_replace_explosures(grad):
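            # replace NaN/Inf gradient entries with 0, then clamp everything to [-0.5, 0.5]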
            grad[torch.logical_or(
                torch.isnan(grad),
                torch.isinf(grad))] = torch.tensor(0.0).cuda()
            grad = torch.clamp(grad, -0.5, 0.5)
            return grad

        for param in model.parameters():
            if param.requires_grad:
                param.register_hook(clip_and_replace_explosures)
    model.cuda()

    # define loss function (criterion)
    try:
        criterion = eval(args.loss)().cuda()
    except:
        raise (RuntimeError("Loss {} not available!".format(args.loss)))

    start_epoch = 0
    best_loss = 1e5
    best_epoch = 0
    best_map = 0

    # define scheduler
    try:
        scheduler = eval(args.scheduler)(
            scheduler_lr_multiplier=args.scheduler_lr_multiplier,
            scheduler_epoch_offset=args.scheduler_epoch_offset)
    except:
        raise (RuntimeError("Scheduler {} not available!".format(
            args.scheduler)))
    optimizer = scheduler.schedule(model, start_epoch, args.epochs)[0]

    # optionally resume from a checkpoint
    if args.resume:
        # args.resume = os.path.join(model_out_dir, args.resume)
        if os.path.isfile(args.resume):
            # load checkpoint weights and update model and optimizer
            log.write(">> Loading checkpoint:\n>> '{}'\n".format(args.resume))

            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            best_epoch = checkpoint['best_epoch']
            best_map = checkpoint['best_score']
            model.load_state_dict(checkpoint['state_dict'])  # model is not wrapped in DataParallel here

            optimizer_fpath = args.resume.replace('.pth', '_optim.pth')
            if os.path.exists(optimizer_fpath):
                log.write(">> Loading checkpoint:\n>> '{}'\n".format(
                    optimizer_fpath))
                optimizer.load_state_dict(
                    torch.load(optimizer_fpath)['optimizer'])
            log.write(">>>> loaded checkpoint:\n>>>> '{}' (epoch {})\n".format(
                args.resume, checkpoint['epoch']))
        else:
            log.write(">> No checkpoint found at '{}'\n".format(args.resume))

    # Data loading code
    train_transform = train_multi_augment2

    with open('../input/imagelevel_folds_obvious_staining_5.pkl', 'rb') as f:
        folds = pickle.load(f)
    fold = args.fold
    trn_img_paths, val_img_paths = folds[fold]

    train_df = get_train_df_ohe(clean_from_duplicates=args.clean_duplicates,
                                clean_mitotic=args.clean_mitotic_samples,
                                clean_aggresome=args.clean_aggresome)
    if args.ignore_negs:
        train_df['Negative'] = 0

    train_paths_set = set(train_df['img_base_path'])

    basepath_2_ohe_vector = {
        img: vec
        for img, vec in zip(train_df['img_base_path'],
                            train_df.iloc[:, 2:].values)
    }

    train_paths_set = set(train_df['img_base_path'])

    if not args.without_public_data:
        public_hpa_df_17 = get_public_df_ohe(
            clean_from_duplicates=args.clean_duplicates,
            clean_mitotic=args.clean_mitotic_samples,
            clean_aggresome=args.clean_aggresome)
        if args.ignore_negs:
            public_hpa_df_17['Negative'] = 0
        public_basepath_2_ohe_vector = {
            img_path: vec
            for img_path, vec in zip(public_hpa_df_17['img_base_path'],
                                     public_hpa_df_17.iloc[:, 2:].values)
        }
        basepath_2_ohe_vector.update(public_basepath_2_ohe_vector)
    else:
        trn_img_paths = [
            path for path in trn_img_paths if path in train_paths_set
        ]

    if not args.without_public_data:
        available_paths = set(
            np.concatenate((train_df['img_base_path'].values,
                            public_hpa_df_17['img_base_path'].values)))
    else:
        available_paths = set(train_df['img_base_path'].values)
    trn_img_paths = [path for path in trn_img_paths if path in available_paths]
    val_img_paths = [path for path in val_img_paths if path in available_paths]

    if args.copy_paste_augment_mitotic_aggresome:
        train_ids = {os.path.basename(x) for x in trn_img_paths}
        id_2_ohe_vector = {
            os.path.basename(path): ohe
            for path, ohe in basepath_2_ohe_vector.items()
        }

        cherrypicked_mitotic_spindle = pd.read_csv(
            '../input/mitotic_cells_selection.csv')
        cherrypicked_mitotic_spindle = cherrypicked_mitotic_spindle[
            cherrypicked_mitotic_spindle['ID'].isin(train_ids)]

        cherrypicked_aggresome = pd.read_csv(
            '../input/aggressome_cells_selection.csv')
        cherrypicked_aggresome = cherrypicked_aggresome[
            cherrypicked_aggresome['ID'].isin(train_ids)]

        cherrypicked_mitotic_spindle['ohe'] = cherrypicked_mitotic_spindle[
            'ID'].map(id_2_ohe_vector)
        cherrypicked_aggresome['ohe'] = cherrypicked_aggresome['ID'].map(
            id_2_ohe_vector)

        mitotic_idx = [
            idx for idx, colname in enumerate(train_df.columns)
            if colname == 'Mitotic spindle'
        ][0]
        aggresome_idx = [
            idx for idx, colname in enumerate(train_df.columns)
            if colname == 'Aggresome'
        ][0]
        mitotic_ohe = np.zeros_like(cherrypicked_aggresome['ohe'].values[0])
        mitotic_ohe[mitotic_idx] = 1

        aggresome_ohe = np.zeros_like(cherrypicked_aggresome['ohe'].values[0])
        aggresome_ohe[aggresome_idx] = 1

        cherrypicked_mitotic_spindle.loc[
            cherrypicked_mitotic_spindle['is_pure'] == 1, 'ohe'] = pd.Series(
                [
                    mitotic_ohe for _ in range(
                        sum(cherrypicked_mitotic_spindle['is_pure'] == 1))
                ],
                index=cherrypicked_mitotic_spindle.index[
                    cherrypicked_mitotic_spindle['is_pure'] == 1])

        cherrypicked_aggresome.loc[
            cherrypicked_aggresome['is_pure'] == 1,
            'ohe'] = pd.Series([
                aggresome_ohe
                for _ in range(sum(cherrypicked_aggresome['is_pure'] == 1))
            ],
                               index=cherrypicked_aggresome.index[
                                   cherrypicked_aggresome['is_pure'] == 1])

        class_purity_2_weight = {1: 4, 0: 1}
        cherrypicked_mitotic_spindle[
            'sampling_weight'] = cherrypicked_mitotic_spindle['is_pure'].map(
                class_purity_2_weight)
        cherrypicked_aggresome['sampling_weight'] = cherrypicked_aggresome[
            'is_pure'].map(class_purity_2_weight)
    else:
        cherrypicked_mitotic_spindle = None
        cherrypicked_aggresome = None

    train_dataset = ProteinDatasetImageLevel(
        trn_img_paths,
        basepath_2_ohe=basepath_2_ohe_vector,
        img_size=args.img_size,
        is_trainset=True,
        return_label=True,
        in_channels=args.in_channels,
        transform=train_transform,
        cherrypicked_mitotic_spindle_df=cherrypicked_mitotic_spindle,
        cherrypicked_aggresome_df=cherrypicked_aggresome)

    class_names = get_class_names()
    if args.balance_classes:
        sampler = BalancingSubSampler(trn_img_paths,
                                      basepath_2_ohe_vector,
                                      class_names,
                                      required_class_count=1500)
    else:
        sampler = RandomSampler(train_dataset)

    train_loader = DataLoader(
        train_dataset,
        sampler=sampler,
        batch_size=args.batch_size,
        drop_last=True,
        num_workers=args.workers,
        pin_memory=True,
    )

    # val_img_paths = [path for path in val_img_paths if path in train_paths_set]

    valid_dataset = ProteinDatasetImageLevel(
        val_img_paths,
        basepath_2_ohe=basepath_2_ohe_vector,
        img_size=args.img_size,
        is_trainset=True,
        return_label=True,
        in_channels=args.in_channels,
        transform=train_transform)
    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=args.batch_size,
                              drop_last=False,
                              num_workers=args.workers,
                              pin_memory=True)

    focal_loss = FocalLoss().cuda()
    log.write('** start training here! **\n')
    log.write('\n')
    log.write(
        'epoch    iter      rate     |  train_loss/acc  |    valid_loss/acc/focal/map     |best_epoch/best_map|  min \n'
    )
    log.write(
        '-----------------------------------------------------------------------------------------------------------------\n'
    )
    start_epoch += 1

    if args.eval_at_start:
        with torch.no_grad():
            valid_loss, valid_acc, valid_focal_loss, valid_map = validate(
                valid_loader, model, criterion, -1, focal_loss, log)
        print('\r', end='', flush=True)
        log.write(
            '%5.1f   %5d    %0.6f   |  %0.4f  %0.4f  |    %0.4f  %6.4f %6.4f %6.4f    |  %6.1f    %6.4f   | %3.1f min \n' % \
            (-1, -1, -1, -1, -1, valid_loss, valid_acc, valid_focal_loss, valid_map,
             best_epoch, best_map, -1))

    for epoch in range(start_epoch, args.epochs + 1):
        end = time.time()

        # set manual seeds per epoch
        np.random.seed(epoch)
        torch.manual_seed(epoch)
        torch.cuda.manual_seed_all(epoch)

        # adjust learning rate for each epoch
        lr_list = scheduler.step(model, epoch, args.epochs)
        lr = lr_list[0]

        # train for one epoch on train set
        iter, train_loss, train_acc = train(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            clipnorm=args.clipnorm,
            lr=lr,
            agg_steps=args.gradient_accumulation_steps)
        if np.isnan(train_loss):
            print('WARNING: training loss is NaN!')
        else:
            print('training loss is finite')

        with torch.no_grad():
            valid_loss, valid_acc, valid_focal_loss, valid_map = validate(
                valid_loader, model, criterion, epoch, focal_loss, log)

        # remember best loss and save checkpoint
        is_best = valid_map > best_map
        best_loss = min(valid_focal_loss, best_loss)
        best_epoch = epoch if is_best else best_epoch
        best_map = valid_map if is_best else best_map

        print('\r', end='', flush=True)
        log.write(
            '%5.1f   %5d    %0.6f   |  %0.4f  %0.4f  |    %0.4f  %6.4f %6.4f %6.4f    |  %6.1f    %6.4f   | %3.1f min \n' % \
            (epoch, iter + 1, lr, train_loss, train_acc, valid_loss, valid_acc, valid_focal_loss, valid_map,
             best_epoch, best_map, (time.time() - end) / 60))

        save_model(model,
                   is_best,
                   model_out_dir,
                   optimizer=optimizer,
                   epoch=epoch,
                   best_epoch=best_epoch,
                   best_map=best_map)
Example #19
dtype = torch.cuda.FloatTensor

#csv_path='final_label.csv'
#data=pd.read_csv(csv_path,sep='\t')
#train,test=train_test_split(data,test_size=0.3,shuffle=True,random_state=1235)
#train.to_csv('train.csv',index=False)
#test.to_csv('test.csv',index=False)

composed_transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((.5, .5, .5), (.5, .5, .5))])

train_ds = CDATA('train.csv', 'data/', transform=composed_transform)
train_loader = data.DataLoader(train_ds,
                               batch_size=1,
                               sampler=RandomSampler(train_ds))

val_ds = CDATA('test.csv', 'data/', transform=composed_transform)
val_loader = data.DataLoader(val_ds,
                             batch_size=1,
                             sampler=RandomSampler(val_ds))

retinet = RetiNet().type(dtype)
optimizer = optim.Adam(retinet.parameters(), lr=.0002)
loss = nn.BCELoss()
print(len(train_loader))
num_epochs = 10
c = 0
fopen = open('loss.txt', 'w')
for epoch in range(num_epochs):
    for x, y in train_loader:
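        # The original loop body is cut off here; a minimal training step is
        # sketched below (assumptions: RetiNet ends in a sigmoid so nn.BCELoss
        # applies directly, and y is a float target of matching shape).
        optimizer.zero_grad()
        output = retinet(x.type(dtype))
        batch_loss = loss(output, y.type(dtype))
        batch_loss.backward()
        optimizer.step()
        c += 1
        fopen.write('epoch %d iter %d loss %f\n' % (epoch, c, batch_loss.item()))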
Example #20
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1
    for HOST in HOSTs:
        trn_df.loc[trn_df.host.str.contains(HOST).values,
                   'host'] = f'HOST_{HOST}'.casefold()
    # aug_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/ContextualWordEmbsAug_sub_df.pkl')
    # aug_df['is_original'] = 0

    # trn_df = pd.concat([trn_df, aug_df], axis=0).reset_index(drop=True)

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    # calc max_seq_len using quest dataset
    # max_seq_len = QUESTDataset(
    #     df=trn_df,
    #     mode='train',
    #     tokens=[],
    #     augment=[],
    #     pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
    # ).MAX_SEQUENCE_LENGTH
    # max_seq_len = 9458
    # max_seq_len = 1504
    max_seq_len = 512

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
            #            'host_stackexchange',
            #            'host_askubuntu',
            #            'host_mathoverflow',
            #            'host_serverfault',
            #            'host_stackoverflow',
            #            'host_superuser',
        ]

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = BCEWithLogitsLoss()
        # fobj = MSELoss()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=30,
            pretrained_model_name_or_path=MODEL_PRETRAIN,
            # cat_num=5,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
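            # epoch 0: train only the classification head with BERT frozen,
            # then unfreeze BERT for the remaining epochs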
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader)

            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [
                    val_loss,
                ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [
                    val_metric,
                ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(val_metric_raws)
            else:
                histories['val_metric_raws'][fold] = [
                    val_metric_raws,
                ]

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}', logger)
            model = model.to('cpu')
            model = model.module
            save_checkpoint(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', model,
                            optimizer, scheduler, histories, val_y_preds,
                            val_y_trues, val_qa_ids, fold, epoch, val_loss,
                            val_metric)
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(
                histories["val_metric"][fold])])
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
Example #21
def get_tpu_sampler(dataset: Dataset):
    if xm.xrt_world_size() <= 1:
        return RandomSampler(dataset)
    return DistributedSampler(dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal())
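# Usage sketch (assumes torch_xla's xm, torch's DataLoader, and the samplers are
# imported/available as in this snippet):
#   sampler = get_tpu_sampler(train_dataset)
#   loader = DataLoader(train_dataset, batch_size=32, sampler=sampler)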
Example #22
import torch
from torch.utils.data import DataLoader, Dataset
from model import NestedUNet
from torch.utils.data.sampler import RandomSampler
from dataloader import random_seed, PolypDataset
# from model_without_effcientnet_encoder import NestedUNet

test_images_file = "data/test_images.txt"
test_labels_file = "data/test_masks.txt"

input_size = (128, 128)
torch.manual_seed(15)
test_set = PolypDataset(test_images_file, test_labels_file, input_size)
test_loader = DataLoader(test_set,
                         batch_size=1,
                         sampler=RandomSampler(test_set))

# Inference device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load Model
model_path = "experiment_test/polyp_unet_deepsupervision1.pth"
# model = NestedUNet(n_channels = 3, n_classes = 1, bilinear = False).to(device)
model = NestedUNet(num_classes=1, input_channels=3, bilinear=False).to(device)

model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()
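# Minimal inference sketch (not part of the original snippet); it assumes the
# dataset yields (image, mask) pairs and that the model returns a single logit map.
with torch.no_grad():
    for image, mask in test_loader:
        image = image.to(device)
        prob = torch.sigmoid(model(image))   # [N, 1, H, W] probabilities
        pred_mask = (prob > 0.5).float()     # binary segmentation mask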
Example #23
def main():
    args = parser.parse_args()

    log_out_dir = os.path.join(RESULT_DIR, 'logs', args.out_dir,
                               'fold%d' % args.fold)
    if not os.path.exists(log_out_dir):
        os.makedirs(log_out_dir)
    log = Logger()
    log.open(os.path.join(log_out_dir, 'log.train.txt'), mode='a')

    model_out_dir = os.path.join(RESULT_DIR, 'models', args.out_dir,
                                 'fold%d' % args.fold)
    log.write(">> Creating directory if it does not exist:\n>> '{}'\n".format(
        model_out_dir))
    if not os.path.exists(model_out_dir):
        os.makedirs(model_out_dir)

    # set cuda visible device
    if not args.all_gpus:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    cudnn.benchmark = True
    # cudnn.enabled = False

    # set random seeds
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    np.random.seed(0)

    model_params = {}
    model_params['architecture'] = args.arch
    model_params['num_classes'] = args.num_classes
    model_params['in_channels'] = args.in_channels
    if 'efficientnet' in args.arch:
        model_params['image_size'] = args.img_size
        model_params['encoder'] = args.effnet_encoder

    model = init_network(model_params)

    if args.load_state_dict_path is not None:
        if args.load_state_dict_path == 'use-img-level-densenet-ckpt':
            model_dir = '../output/models/densenet121_1024_all_data__obvious_neg__gradaccum_20__start_lr_3e6'
            pretrained_ckpt_path = os.path.join(f'{model_dir}',
                                                f'fold{args.fold}',
                                                'final.pth')
        else:
            pretrained_ckpt_path = args.load_state_dict_path
        init_pretrained = torch.load(pretrained_ckpt_path)
        model.load_state_dict(init_pretrained['state_dict'])

    if args.all_gpus:
        model = DataParallel(model)
    model.cuda()

    # define loss function (criterion)
    try:
        criterion = eval(args.loss)().cuda()
    except:
        raise (RuntimeError("Loss {} not available!".format(args.loss)))

    start_epoch = 0
    best_loss = 1e5
    best_epoch = 0
    best_focal = float('inf')

    # define scheduler
    try:
        scheduler = eval(args.scheduler)(
            scheduler_lr_multiplier=args.scheduler_lr_multiplier,
            scheduler_epoch_offset=args.scheduler_epoch_offset)
    except:
        raise (RuntimeError("Scheduler {} not available!".format(
            args.scheduler)))
    optimizer = scheduler.schedule(model, start_epoch, args.epochs)[0]

    # optionally resume from a checkpoint
    if args.resume:
        args.resume = os.path.join(model_out_dir, args.resume)
        if os.path.isfile(args.resume):
            # load checkpoint weights and update model and optimizer
            log.write(">> Loading checkpoint:\n>> '{}'\n".format(args.resume))

            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            best_epoch = checkpoint['best_epoch']
            best_focal = checkpoint['best_map']
            model.load_state_dict(checkpoint['state_dict'])

            optimizer_fpath = args.resume.replace('.pth', '_optim.pth')
            if os.path.exists(optimizer_fpath):
                log.write(">> Loading checkpoint:\n>> '{}'\n".format(
                    optimizer_fpath))
                optimizer.load_state_dict(
                    torch.load(optimizer_fpath)['optimizer'])
            log.write(">>>> loaded checkpoint:\n>>>> '{}' (epoch {})\n".format(
                args.resume, checkpoint['epoch']))
        else:
            log.write(">> No checkpoint found at '{}'\n".format(args.resume))

    # Data loading code
    train_transform = train_multi_augment2

    with open('../input/imagelevel_folds_obvious_staining_5.pkl', 'rb') as f:
        folds = pickle.load(f)
    fold = args.fold
    trn_img_paths, val_img_paths = folds[fold]

    train_df = get_train_df_ohe(clean_from_duplicates=True)
    basepath_2_ohe_vector = {
        img: vec
        for img, vec in zip(train_df['img_base_path'],
                            train_df.iloc[:, 2:].values)
    }

    public_hpa_df_17 = get_public_df_ohe(clean_from_duplicates=True)
    public_basepath_2_ohe_vector = {
        img_path: vec
        for img_path, vec in zip(public_hpa_df_17['img_base_path'],
                                 public_hpa_df_17.iloc[:, 2:].values)
    }
    basepath_2_ohe_vector.update(public_basepath_2_ohe_vector)

    available_paths = set(
        np.concatenate((train_df['img_base_path'].values,
                        public_hpa_df_17['img_base_path'].values)))
    trn_img_paths = [path for path in trn_img_paths if path in available_paths]
    val_img_paths = [path for path in val_img_paths if path in available_paths]
    labels_df = pd.read_hdf(args.cell_level_labels_path)

    # modifying minor class labels
    cherrypicked_mitotic_spindle = pd.read_csv(
        '../input/mitotic_cells_selection.csv')

    cherrypicked_mitotic_spindle_img_cell = set(
        cherrypicked_mitotic_spindle[['ID', 'cell_i']].apply(tuple,
                                                             axis=1).values)

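    # re-index: the selection CSV presumably stores 1-based cell_i, while labels_df uses 0-based cell indices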
    cherrypicked_mitotic_spindle_img_cell = {
        (img, cell_i - 1)
        for img, cell_i in cherrypicked_mitotic_spindle_img_cell
    }

    class_names = get_class_names()
    mitotic_spindle_class_i = class_names.index('Mitotic spindle')

    if args.include_nn_mitotic:
        cherrypicked_mitotic_spindle_based_on_nn = pd.read_csv(
            '../input/mitotic_pos_nn_added.csv')
        cherrypicked_mitotic_spindle_img_cell.update(
            set(cherrypicked_mitotic_spindle_based_on_nn[[
                'ID', 'cell_i'
            ]].apply(tuple, axis=1).values))
        print('len cherrypicked_mitotic_spindle_img_cell',
              len(cherrypicked_mitotic_spindle_img_cell))
    mitotic_bool_idx = labels_df.index.isin(
        cherrypicked_mitotic_spindle_img_cell)

    def modify_label(labels, idx, val):
        labels[idx] = val
        return labels

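    # force the Mitotic spindle label to 1 for the cherry-picked (and optionally NN-selected) cells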
    labels_df.loc[mitotic_bool_idx, 'image_level_pred'] = labels_df.loc[
        mitotic_bool_idx, 'image_level_pred'].map(
            lambda x: modify_label(x, mitotic_spindle_class_i, 1))

    if args.include_nn_mitotic:
        cherrypicked_not_mitotic_spindle_based_on_nn = pd.read_csv(
            '../input/mitotic_neg_nn_added.csv')
        cherrypicked_not_mitotic_spindle_based_on_nn = set(
            cherrypicked_not_mitotic_spindle_based_on_nn[[
                'ID', 'cell_i'
            ]].apply(tuple, axis=1).values)
        not_mitotic_bool_idx = labels_df.index.isin(
            cherrypicked_not_mitotic_spindle_based_on_nn)
        labels_df.loc[not_mitotic_bool_idx,
                      'image_level_pred'] = labels_df.loc[
                          not_mitotic_bool_idx,
                          'image_level_pred'].map(lambda x: modify_label(
                              x, mitotic_spindle_class_i, 0))

    if args.ignore_negative:
        raise NotImplementedError

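    # upsample rare classes: the cherry-picked mitotic-spindle cells plus cells with confident (>0.9) Aggresome predictions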
    if args.upsample_minorities:
        cells_to_upsample = list(cherrypicked_mitotic_spindle_img_cell)
        aggresome_class_i = class_names.index('Aggresome')
        confident_aggresome_indices = list(
            labels_df.index[labels_df['image_level_pred'].map(
                lambda x: x[aggresome_class_i] > 0.9)])
        print('confident_aggresome_indices len',
              len(confident_aggresome_indices))
        print('confident_aggresome_indices[:5]',
              confident_aggresome_indices[:5])
        cells_to_upsample += confident_aggresome_indices
    else:
        cells_to_upsample = None
    train_dataset = ProteinDatasetCellSeparateLoading(
        trn_img_paths,
        labels_df=labels_df,
        cells_to_upsample=cells_to_upsample,
        img_size=args.img_size,
        in_channels=args.in_channels,
        transform=train_transform,
        basepath_2_ohe=basepath_2_ohe_vector,
        normalize=args.normalize,
        target_raw_img_size=args.target_raw_img_size)
    train_loader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=args.batch_size,
        drop_last=False,
        num_workers=args.workers,
        pin_memory=True,
    )

    # valid_dataset = ProteinDatasetCellLevel(val_img_paths,
    #                                         labels_df=labels_df,
    #                                         img_size=args.img_size,
    #                                         batch_size=64,
    #                                         is_trainset=True,
    #                                         in_channels=args.in_channels)

    valid_dataset = ProteinDatasetCellSeparateLoading(
        val_img_paths,
        labels_df=labels_df,
        img_size=args.img_size,
        in_channels=args.in_channels,
        basepath_2_ohe=basepath_2_ohe_vector,
        normalize=args.normalize,
        target_raw_img_size=args.target_raw_img_size)
    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=args.batch_size,
                              drop_last=False,
                              num_workers=args.workers,
                              pin_memory=True)

    log.write('** start training here! **\n')
    log.write('\n')
    log.write(
        'epoch    iter      rate     |  train_loss/acc  |    valid_loss/acc/map/focal     |best_epoch/best_focal|  min \n'
    )
    log.write(
        '-----------------------------------------------------------------------------------------------------------------\n'
    )
    start_epoch += 1

    if args.eval_at_start:
        with torch.no_grad():
            valid_loss, valid_acc, val_focal, val_map_score = validate(
                valid_loader, model, criterion, -1, log)
        print('\r', end='', flush=True)
        log.write(
            '%5.1f   %5d    %0.6f   |  %0.4f  %0.4f  |    %0.4f  %6.4f %6.4f %6.1f  |    %6.4f  %6.4f   | %3.1f min \n' % \
            (-1, -1, -1, -1, -1, valid_loss, valid_acc, val_map_score, val_focal,
                   best_epoch, best_focal, -1))

    for epoch in range(start_epoch, args.epochs + 1):
        end = time.time()

        # set manual seeds per epoch
        np.random.seed(epoch)
        torch.manual_seed(epoch)
        torch.cuda.manual_seed_all(epoch)

        # adjust learning rate for each epoch
        lr_list = scheduler.step(model, epoch, args.epochs)
        lr = lr_list[0]

        # train for one epoch on train set
        iter, train_loss, train_acc = train(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            clipnorm=args.clipnorm,
            lr=lr,
            agg_steps=args.gradient_accumulation_steps)

        with torch.no_grad():
            valid_loss, valid_acc, val_focal, val_map_score = validate(
                valid_loader, model, criterion, epoch, log)

        # remember best loss and save checkpoint
        is_best = val_focal < best_focal
        best_loss = min(valid_loss, best_loss)
        best_epoch = epoch if is_best else best_epoch
        best_focal = val_focal if is_best else best_focal

        print('\r', end='', flush=True)
        log.write('%5.1f   %5d    %0.6f   |  %0.4f  %0.4f  |    %0.4f  %6.4f %6.4f  %6.1f |  %6.4f  %6.4f | %3.1f min \n' % \
                  (epoch, iter + 1, lr, train_loss, train_acc, valid_loss, valid_acc, val_map_score, val_focal,
                   best_epoch, best_focal, (time.time() - end) / 60))

        save_model(model,
                   is_best,
                   model_out_dir,
                   optimizer=optimizer,
                   epoch=epoch,
                   best_epoch=best_epoch,
                   best_map=best_focal)
Ejemplo n.º 24
0
def run(fold, df, meta_features, n_meta_features, transforms_train,
        transforms_val, target_idx, df_test):
    '''
    Main function that runs training.

    :param fold: fold index held out for validation in cross-validation
    :param df: DataFrame listing all training data
    :param meta_features, n_meta_features: which additional non-image metadata to use (if any)
    :param transforms_train, transforms_val: dataset transform functions
    :param target_idx:
    '''

    if args.DEBUG:
        args.n_epochs = 5
        df_train = df[df['fold'] != fold].sample(args.batch_size * 5)
        df_valid = df[df['fold'] == fold].sample(args.batch_size * 5)
    else:
        df_train = df[df['fold'] != fold]
        df_valid = df[df['fold'] == fold]

        # https://discuss.pytorch.org/t/error-expected-more-than-1-value-per-channel-when-training/26274
        # batch normalization can fail when a batch contains a single sample, so drop one example to avoid a trailing batch of size 1
        if len(df_train) % args.batch_size == 1:
            df_train = df_train.sample(len(df_train) - 1)
        if len(df_valid) % args.batch_size == 1:
            df_valid = df_valid.sample(len(df_valid) - 1)

    # load the datasets
    dataset_train = MMC_ClassificationDataset(df_train,
                                              'train',
                                              meta_features,
                                              transform=transforms_train)
    dataset_valid = MMC_ClassificationDataset(df_valid,
                                              'valid',
                                              meta_features,
                                              transform=transforms_val)
    train_loader = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=args.batch_size,
        sampler=RandomSampler(dataset_train),
        num_workers=args.num_workers)
    valid_loader = torch.utils.data.DataLoader(dataset_valid,
                                               batch_size=args.batch_size,
                                               num_workers=args.num_workers)

    auc_max = 0.
    auc_no_ext_max = 0.
    model_file = os.path.join(args.model_dir,
                              f'{args.kernel_type}_best_fold{fold}.pth')
    model_file2 = os.path.join(
        args.model_dir, f'{args.kernel_type}_best_no_ext_fold{fold}.pth')
    model_file3 = os.path.join(args.model_dir,
                               f'{args.kernel_type}_final_fold{fold}.pth')
    # model_file_astraining = os.path.join(args.model_dir, f'{args.kernel_type}_best_aspsg_fold{fold}.pth')

    # resume from the final checkpoint if one already exists
    if os.path.isfile(model_file3):
        model = ModelClass(
            args.enet_type,
            n_meta_features=n_meta_features,
            n_meta_dim=[int(nd) for nd in args.n_meta_dim.split(',')],
            out_dim=args.out_dim,
            pretrained=True)
        model.load_state_dict(torch.load(model_file3))
    else:
        model = ModelClass(
            args.enet_type,
            n_meta_features=n_meta_features,
            n_meta_dim=[int(nd) for nd in args.n_meta_dim.split(',')],
            out_dim=args.out_dim,
            pretrained=True)

    # parallelize across multiple GPUs
    # if DP:
    #     model = apex.parallel.convert_syncbn_model(model)
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.init_lr)
    # if args.use_amp:
    #     model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    if DP:
        model = nn.DataParallel(model)

    # amp triggers a bug here, so use_amp is disabled
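    # warm the LR up during the first epoch (multiplier=10), then hand off to cosine annealing for the remaining epochs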
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, args.n_epochs - 1)
    scheduler_warmup = GradualWarmupSchedulerV2(
        optimizer,
        multiplier=10,
        total_epoch=1,
        after_scheduler=scheduler_cosine)

    for epoch in range(1, args.n_epochs + 1):
        print(time.ctime(), f'Fold {fold}, Epoch {epoch}')

        train_loss = train_epoch(model, train_loader, optimizer)
        val_loss, acc, auc, auc_no_ext = val_epoch(model,
                                                   valid_loader,
                                                   target_idx,
                                                   is_ext=0)

        if args.use_ext:
            content = time.ctime(
            ) + ' ' + f'Fold {fold}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {train_loss:.5f}, valid loss: {(val_loss):.5f}, Acc: {(acc):.4f}, AUC: {(auc):.6f}, AUC_no_ext: {(auc_no_ext):.6f}.'
        else:
            content = time.ctime(
            ) + ' ' + f'Fold {fold}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {train_loss:.5f}, valid loss: {(val_loss):.5f}, Acc: {(acc):.4f}, AUC: {(auc):.6f}.'

        print(content)
        with open(os.path.join(args.log_dir, f'log_{args.kernel_type}.txt'),
                  'a') as appender:
            appender.write(content + '\n')

        scheduler_warmup.step()
        if epoch == 2:
            scheduler_warmup.step()  # bug workaround

        if auc > auc_max:
            print('auc_max ({:.6f} --> {:.6f}). Saving model ...'.format(
                auc_max, auc))
            torch.save(model.state_dict(), model_file)
            auc_max = auc

        # when external data is used, separately save the model that scores best on the non-external data
        if args.use_ext:
            if auc_no_ext > auc_no_ext_max:
                print('auc_no_ext_max ({:.6f} --> {:.6f}). Saving model ...'.
                      format(auc_no_ext_max, auc_no_ext))
                torch.save(model.state_dict(), model_file2)
                auc_no_ext_max = auc_no_ext

    torch.save(model.state_dict(), model_file3)
Ejemplo n.º 25
0
def run(fold):
    df_train = df_study[(df_study['fold'] != fold)]
    df_valid = df_study[(df_study['fold'] == fold)]

    dataset_train = RSNADataset3D(df_train,
                                  'train',
                                  transform=train_transforms)
    dataset_valid = RSNADataset3D(df_valid, 'val', transform=val_transforms)
    train_loader = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=4,
        sampler=RandomSampler(dataset_train),
        num_workers=num_workers)
    valid_loader = torch.utils.data.DataLoader(dataset_valid,
                                               batch_size=4,
                                               num_workers=num_workers)

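    # MONAI 3D DenseNet-121 (spatial_dims=3) with 3 input channels and out_dim output logits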
    model = monai.networks.nets.densenet.densenet121(
        spatial_dims=3, in_channels=3, out_channels=out_dim).to(device)

    val_loss_best = 1000
    model_file = f'{kernel_type}_best_fold{fold}.pth'

    optimizer = optim.Adam(model.parameters(), lr=init_lr)
    if use_amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")


#     if len(os.environ['CUDA_VISIBLE_DEVICES'].split(',')) > 1:
#         model = nn.DataParallel(model)

    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, cosine_epo)
    scheduler_warmup = GradualWarmupSchedulerV2(
        optimizer,
        multiplier=10,
        total_epoch=warmup_epo,
        after_scheduler=scheduler_cosine)

    print(len(dataset_train), len(dataset_valid))

    for epoch in range(1, n_epochs + 1):
        print(time.ctime(), 'Epoch:', epoch)
        scheduler_warmup.step(epoch - 1)

        train_loss = train_epoch(model, train_loader, optimizer)
        val_loss, acc, auc = val_epoch(model, valid_loader)

        content = time.ctime(
        ) + ' ' + f'Fold {fold}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {np.mean(train_loss):.5f}, valid loss: {(val_loss):.5f}, acc: {(acc):.4f}, auc: {(auc):.6f}'
        print(content)
        with open(f'log_{kernel_type}.txt', 'a') as appender:
            appender.write(content + '\n')

        if val_loss < val_loss_best:
            print(
                'val_loss_best ({:.6f} --> {:.6f}).  Saving model ...'.format(
                    val_loss_best, val_loss))
            torch.save(model.state_dict(), model_file)
            val_loss_best = val_loss

    torch.save(model.state_dict(), f'{kernel_type}_model_fold{fold}.pth')
Ejemplo n.º 26
0
                                    weight_decay=weight_decay)
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, scheduler_step, min_lr)

        # Load data
        train_id = fold_train[idx]
        val_id = fold_valid[idx]

        X_train, y_train = trainImageFetch(train_id)
        X_val, y_val = trainImageFetch(val_id)

        train_data = DataSource1(X_train, mode='train', mask_list=y_train, fine_size=fine_size,
                                 pad_left=pad_left,
                                 pad_right=pad_right)
        train_loader = DataLoader(
            train_data,
            sampler=RandomSampler(train_data),
            batch_size=batch_size,
            num_workers=8,
            pin_memory=True)

        val_data = DataSource1(X_val, mode='val', mask_list=y_val, fine_size=fine_size, pad_left=pad_left,
                               pad_right=pad_right)
        val_loader = DataLoader(
            val_data,
            shuffle=False,
            batch_size=batch_size,
            num_workers=8,
            pin_memory=True)

        num_snapshot = 0
        best_acc = 0
Ejemplo n.º 27
0
def get_dataloader(dataset, batchsize, use_hidden=True):
    dataloader = data.DataLoader(dataset,
                                 batch_size=batchsize,
                                 sampler=RandomSampler(dataset),
                                 collate_fn=get_collate_fn(use_hidden))
    return dataloader
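# Illustrative usage (the dataset object and batch size below are placeholders, not part of this snippet):
#
#   loader = get_dataloader(my_dataset, batchsize=32, use_hidden=False)
#   for batch in loader:
#       ...  # batches are drawn in random order via RandomSampler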
Ejemplo n.º 28
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help="Path to the configuration file for the BERT model.")
    ## Other parameters
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--discr",
                        default=False,
                        action='store_true',
                        help="Whether to do discriminative fine-tuning.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--accumulate_gradients",
        type=int,
        default=1,
        help=
        "Number of steps to accumulate gradient on (divide the batch_size and accumulate)"
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--layers',
        type=int,
        nargs='+',
        default=[-2],
        help="choose the layers that used for downstream tasks, "
        "-2 means use pooled output, -1 means all layer,"
        "else means the detail layers. default is -2")
    parser.add_argument('--num_datas',
                        default=None,
                        type=int,
                        help="the number of data examples")
    parser.add_argument('--num_test_datas',
                        default=None,
                        type=int,
                        help="the number of data examples")
    parser.add_argument('--pooling_type',
                        default=None,
                        type=str,
                        choices=[None, 'mean', 'max'])
    args = parser.parse_args()

    processors = {"yelp": YELPProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend, which takes care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    summary_writer = SummaryWriter(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained("bert-large-cased")

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir,
                                                      data_num=args.num_datas)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size *
            args.num_train_epochs)

    bert_config = BertConfig.from_json_file(args.bert_config_file)
    model = BertForSequenceClassification(bert_config,
                                          len(label_list),
                                          args.layers,
                                          pooling=args.pooling_type)

    if args.init_checkpoint is not None:
        model.bert.load_state_dict(
            torch.load(args.init_checkpoint, map_location='cpu'))

    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'gamma', 'beta']
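    # parameters are grouped so that biases and LayerNorm weights (gamma/beta) receive no weight decay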

    optimizer_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate':
        0.0
    }]

    optimizer = AdamW(optimizer_parameters,
                      lr=args.learning_rate,
                      correct_bias=False)

    global_step = 0
    global_train_step = 0

    eval_examples = processor.get_dev_examples(args.data_dir,
                                               data_num=args.num_test_datas)
    eval_features = convert_examples_to_features(eval_examples, label_list,
                                                 args.max_seq_length,
                                                 tokenizer)

    all_input_ids = eval_features['input_ids']
    all_input_mask = eval_features['attention_mask']
    all_segment_ids = eval_features['token_type_ids']
    all_label_ids = eval_features['labels']

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_label_ids)
    eval_dataloader = DataLoader(eval_data,
                                 batch_size=args.eval_batch_size,
                                 shuffle=False)

    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = train_features['input_ids']
        all_input_mask = train_features['attention_mask']
        all_segment_ids = train_features['token_type_ids']
        all_label_ids = train_features['labels']

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
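        # RandomSampler shuffles within a single process; DistributedSampler shards and shuffles across ranks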
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        print("TOTAL STEPS: ",
              (len(train_dataloader) * int(args.num_train_epochs)))

        epoch = 0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            epoch += 1
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, token_type_ids, label_ids = batch
                loss, _ = model(input_ids,
                                attention_mask=input_mask,
                                token_type_ids=token_type_ids,
                                labels=label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
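                # dividing by the accumulation steps keeps the summed gradients at full-batch scale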
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # we have accumulated enough gradients
                    # scheduler.step()

                    summary_writer.add_scalar('Loss/train', loss.item(),
                                              global_step)

                    # possibly comment this out
                    max_grad_norm = 1.0
                    _clip_grad_norm(optimizer_parameters, max_grad_norm)
                    model.zero_grad()
                    global_step += 1

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            pos_eval_prec, pos_eval_recall, pos_eval_f1 = 0, 0, 0
            neg_eval_prec, neg_eval_recall, neg_eval_f1 = 0, 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            with open(
                    os.path.join(args.output_dir,
                                 "results_ep" + str(epoch) + ".txt"),
                    "w") as f:
                for input_ids, input_mask, segment_ids, label_ids in tqdm(
                        eval_dataloader, desc="Evaluate"):
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_eval_loss, logits = model(
                            input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            labels=label_ids)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.detach().to('cpu').numpy()
                    outputs = np.argmax(logits, axis=1)
                    for output in outputs:
                        f.write(str(output) + "\n")
                    tmp_eval_accuracy = np.sum(outputs == label_ids)
                    tmp_eval_prec, tmp_eval_recall, tmp_eval_f1 = get_analytics_neg_sent(
                        outputs, label_ids)

                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy
                    neg_eval_prec += tmp_eval_prec
                    neg_eval_recall += tmp_eval_recall
                    neg_eval_f1 += tmp_eval_f1

                    tmp_eval_prec, tmp_eval_recall, tmp_eval_f1 = get_analytics_pos_sent(
                        outputs, label_ids)
                    pos_eval_prec += tmp_eval_prec
                    pos_eval_recall += tmp_eval_recall
                    pos_eval_f1 += tmp_eval_f1

                    global_train_step += 1

                    summary_writer.add_scalar("Loss/test",
                                              tmp_eval_loss.mean().item(),
                                              global_train_step)
                    summary_writer.add_scalar("Accuracy/test",
                                              tmp_eval_accuracy,
                                              global_train_step)

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_examples

            pos_eval_prec = pos_eval_prec / nb_eval_steps
            pos_eval_recall = pos_eval_recall / nb_eval_steps
            pos_eval_f1 = pos_eval_f1 / nb_eval_steps

            neg_eval_prec = neg_eval_prec / nb_eval_steps
            neg_eval_recall = neg_eval_recall / nb_eval_steps
            neg_eval_f1 = neg_eval_f1 / nb_eval_steps

            result = {
                'eval_loss': eval_loss,
                'eval_accuracy': eval_accuracy,
                'global_step': global_step,
                'loss': tr_loss / nb_tr_steps,
                'pos_eval_precision': pos_eval_prec,
                'neg_eval_precision': neg_eval_prec,
                'pos_eval_recall': pos_eval_recall,
                'neg_eval_recall': neg_eval_recall,
                'pos_eval_f1': pos_eval_f1,
                'neg_eval_f1': neg_eval_f1
            }

            summary_writer.add_scalar("Epoch_loss/train", tr_loss, epoch)
            summary_writer.add_scalar("Epoch_loss/test", eval_loss, epoch)
            summary_writer.add_scalar("Epoch_accuracy/test", eval_accuracy,
                                      epoch)

            summary_writer.add_scalar("Epoch_positive_precision/test",
                                      pos_eval_prec, epoch)
            summary_writer.add_scalar("Epoch_negative_precision/test",
                                      neg_eval_prec, epoch)

            summary_writer.add_scalar("Epoch_positive_recall/test",
                                      pos_eval_recall, epoch)
            summary_writer.add_scalar("Epoch_negative_recall/test",
                                      neg_eval_recall, epoch)

            summary_writer.add_scalar("Epoch_positive_f1/test", pos_eval_f1,
                                      epoch)
            summary_writer.add_scalar("Epoch_negative_f1/test", neg_eval_f1,
                                      epoch)

            output_eval_file = os.path.join(
                args.output_dir, "eval_results_ep" + str(epoch) + ".txt")
            print("output_eval_file=", output_eval_file)
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
            print("Saving model")
            torch.save(
                (model.module if hasattr(model, 'module') else model).state_dict(),
                os.path.join(
                    args.output_dir,
                    "yelp-finetuned-bert-model_" + str(epoch) + ".pth"))
Ejemplo n.º 29
0
                                        hebtb)
dev_inf_set = get_morph_dataset_partition('dev-inf', home_path, tb_vocab,
                                          hebtb)
test_inf_set = get_morph_dataset_partition('test-inf', home_path, tb_vocab,
                                           hebtb)
dev_uninf_set = get_morph_dataset_partition('dev-uninf', home_path, tb_vocab,
                                            hebtb)
test_uninf_set = get_morph_dataset_partition('test-uninf', home_path, tb_vocab,
                                             hebtb)
train_set = get_model_morpheme_dataset_partition(home_path, train_set)
dev_inf_set = get_model_morpheme_dataset_partition(home_path, dev_inf_set)
test_inf_set = get_model_morpheme_dataset_partition(home_path, test_inf_set)
dev_uninf_set = get_model_morpheme_dataset_partition(home_path, dev_uninf_set)
test_uninf_set = get_model_morpheme_dataset_partition(home_path,
                                                      test_uninf_set)
train_sampler = RandomSampler(train_set)
dev_inf_sampler = SequentialSampler(dev_inf_set)
test_inf_sampler = SequentialSampler(test_inf_set)
dev_uninf_sampler = SequentialSampler(dev_uninf_set)
test_uninf_sampler = SequentialSampler(test_uninf_set)
train_dataloader = DataLoader(train_set, sampler=train_sampler)
dev_inf_dataloader = DataLoader(dev_inf_set, sampler=dev_inf_sampler)
test_inf_dataloader = DataLoader(test_inf_set, sampler=test_inf_sampler)
dev_uninf_dataloader = DataLoader(dev_uninf_set, sampler=dev_uninf_sampler)
test_uninf_dataloader = DataLoader(test_uninf_set, sampler=test_uninf_sampler)
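# only the training loader is shuffled (RandomSampler); dev/test loaders use SequentialSampler for a reproducible evaluation order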

# Embedding
ft_form_vec_file_path = Path(
    'data/processed/spmrl/hebtb-morph-vocab/word-form.vec')
ft_lemma_vec_file_path = Path(
    'data/processed/spmrl/hebtb-morph-vocab/word-lemma.vec')
Ejemplo n.º 30
0
def train_model(model, dataset_train, dataset_val, lr, num_epochs, model_dir, exp_name, scale_lr=None):
    params = itertools.chain(
        model.parameters())
    optimizer = optim.Adam(params, lr=lr)

    criterion = nn.BCELoss()

    step = 0
    optimizer.zero_grad()
    for epoch in range(num_epochs):
        sampler = RandomSampler(dataset_train)
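        # iterate the RandomSampler directly to visit samples one at a time in a fresh random order each epoch (no DataLoader batching)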
        for i, sample_id in enumerate(sampler):
            data = dataset_train[sample_id]
            feats = {
                'human_rcnn': Variable(torch.cuda.FloatTensor(data['human_feat'])),
                'object_rcnn': Variable(torch.cuda.FloatTensor(data['object_feat'])),
                'box': Variable(torch.cuda.FloatTensor(data['box_feat'])),
                "human_det_score": Variable(torch.cuda.FloatTensor(data["human_prob"])),
                "object_det_score": Variable(torch.cuda.FloatTensor(data["object_prob"])),
                "object_word2vec": Variable(torch.cuda.FloatTensor(data['verb_obj_vec'][:, 300:])),
            }

            model.train()
            binary_score = model(feats)

            # add
            binary_label = Variable(torch.cuda.FloatTensor(data['nis_labels']))
            loss_binary = criterion(binary_score, binary_label.view(binary_score.size(0), 1))

            loss_binary.backward()
            if step % 1 == 0:  # always true: step the optimizer after every sample (increase the modulus to accumulate gradients)
                optimizer.step()
                optimizer.zero_grad()

            if step % 20 == 0:
                num_tp = np.sum(data['hoi_label'])
                num_fp = data['hoi_label'].shape[0] - num_tp
                log_str = \
                    'Epoch: {} | Iter: {} | Step: {} | ' + \
                    ' Train Loss binary: {:.8f}' \
                    '| TPs: {} | FPs: {} | lr:{} '
                log_str = log_str.format(
                    epoch,
                    i,
                    step,
                    loss_binary.data[0],
                    num_tp,
                    num_fp,
                    optimizer.param_groups[0]['lr'])
                print(log_str)

            if step % 100 == 0:
                log_value('train_loss_binary', loss_binary.data[0], step)
                print(exp_name)

            if step % 1000 == 0 and step > 2000:
                val_loss_binary, recall, tp, fp = eval_model(model, dataset_val)
                log_value('val_loss_binary', val_loss_binary, step)
                log_value('recall', recall, step)
                log_value('tp', tp, step)
                log_value('fp', fp, step)

                log_str = \
                    'Epoch: {} | Iter: {} | Step: {} | Val Loss binary: {:.8f}' \
                    '| recall: {:.2f} | tp: {:.2f}|fp: {:.2f}'
                log_str = log_str.format(
                    epoch,
                    i,
                    step,
                    val_loss_binary,
                    recall,
                    tp,
                    fp)
                print(log_str)

            if step == 10 or (step % 1000 == 0 and step > 2000):
                hoi_classifier_pth = os.path.join(
                    model_dir, "model",
                    f'hoi_classifier_{step}')
                torch.save(
                    model.state_dict(),
                    hoi_classifier_pth)

            step += 1

            if scale_lr is not None and step == scale_lr:
                # once the requested step is reached, scale the learning rate down by 10x
                for param_group in optimizer.param_groups:
                    param_group['lr'] *= 0.1