Example #1
def run_one_fold(fold_id):

    with timer('load csv data'):

        debug = config.DEBUG
        df_train = pd.read_csv(
            config.TRAIN_PATH).dropna().reset_index(drop=True)

        if debug:
            df_train = df_train.sample(
                1000, random_state=SEED).dropna().reset_index(drop=True)

        # Some selected_text values do not match the text.
        # https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/142011
        df_train.loc[df_train['sentiment'] == 'neutral',
                     'selected_text'] = df_train[df_train['sentiment'] ==
                                                 'neutral']['text']

        num_folds = config.NUM_FOLDS
        kf = StratifiedKFold(n_splits=num_folds,
                             shuffle=True,
                             random_state=SEED)
        splits = list(kf.split(X=df_train, y=df_train[['sentiment']]))
        train_idx = splits[fold_id][0]
        val_idx = splits[fold_id][1]

        print(len(train_idx), len(val_idx))

        gc.collect()

    with timer('prepare train/validation data'):
        train_dataset = TweetDataset(
            tweet=df_train.iloc[train_idx].text.values,
            sentiment=df_train.iloc[train_idx].sentiment.values,
            selected_text=df_train.iloc[train_idx].selected_text.values)

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            shuffle=True,
            batch_size=config.TRAIN_BATCH_SIZE,
            num_workers=0,
            pin_memory=True)

        val_dataset = TweetDataset(
            tweet=df_train.iloc[val_idx].text.values,
            sentiment=df_train.iloc[val_idx].sentiment.values,
            selected_text=df_train.iloc[val_idx].selected_text.values)

        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            shuffle=False,
            batch_size=config.VALID_BATCH_SIZE,
            num_workers=0,
            pin_memory=True)

        del train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        model = TweetRoBERTaModel(config.ROBERTA_PATH)
        model = model.to(device)

        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay': 0.001
            },
            {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay': 0.0
            },
        ]

        num_train_steps = int(
            len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
        optimizer = transformers.AdamW(optimizer_parameters, lr=3e-5)
        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

        model = nn.DataParallel(model)

        # pretrain_path = 'models/exp11_fold0.pth'
        # model.load_state_dict(torch.load(pretrain_path))
        # LOGGER.info(f'pretrained model (exp11) loaded')

    with timer('training loop'):
        best_score = -999
        best_epoch = 0
        patience = 3
        p = 0
        for epoch in range(1, config.EPOCHS + 1):

            LOGGER.info("Starting {} epoch...".format(epoch))

            engine.train_fn(train_loader, model, optimizer, device, scheduler)
            score = engine.eval_fn(val_loader, model, device)

            LOGGER.info(f"Jaccard Score = {score}")

            if score > best_score:
                best_score = score
                best_epoch = epoch
                torch.save(
                    model.state_dict(),
                    os.path.join(config.OUT_DIR,
                                 '{}_fold{}.pth'.format(EXP_ID, fold_id)))
                LOGGER.info("save model at score={} on epoch={}".format(
                    best_score, best_epoch))
                p = 0

            if p > 0:
                LOGGER.info(f'best score has not improved for {p} epoch(s)')
            p += 1
            if p > patience:
                LOGGER.info('Early Stopping')
                break

        LOGGER.info("best score={} on epoch={}".format(best_score, best_epoch))
Example #2
def run_one_fold(fold_id):

    fnc_df = pd.read_csv(config.FNC_PATH)
    loading_df = pd.read_csv(config.LOADING_PATH)
    labels_df = pd.read_csv(config.TRAIN_SCORES_PATH)

    fnc_features, loading_features = list(fnc_df.columns[1:]), list(
        loading_df.columns[1:])
    df = fnc_df.merge(loading_df, on="Id")
    labels_df["is_train"] = True

    df = df.merge(labels_df, on="Id", how="left")

    df['bin_age'] = pd.cut(df['age'], [i for i in range(0, 100, 10)],
                           labels=False)

    df_test = df[df["is_train"] != True].copy()
    df_train = df[df["is_train"] == True].copy()

    num_folds = config.NUM_FOLDS
    kf = StratifiedKFold(n_splits=num_folds,
                         shuffle=True,
                         random_state=SEED)
    splits = list(kf.split(X=df_train, y=df_train[['bin_age']]))

    train_idx = splits[fold_id][0]
    val_idx = splits[fold_id][1]

    target_cols = [
        'age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2'
    ]

    print(len(train_idx), len(val_idx))

    train_dataset = TReNDSDataset(df=df_train,
                                  target_cols=target_cols,
                                  indices=train_idx,
                                  map_path=config.TRAIN_MAP_PATH)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        shuffle=True,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=0,
        pin_memory=True)

    val_dataset = TReNDSDataset(df=df_train,
                                target_cols=target_cols,
                                indices=val_idx,
                                map_path=config.TRAIN_MAP_PATH)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=0,
        pin_memory=True)

    del train_dataset, val_dataset
    gc.collect()

    device = config.DEVICE
    model = resnet34()

    # https://github.com/Tencent/MedicalNet/blob/35ecd5be96ae4edfc1be29816f9847c11d067db0/model.py#L89
    net_dict = model.state_dict()
    # pretrain = torch.load("inputs/pretrain/resnet_10.pth")
    LOGGER.info('pytorch 3d model pretrained weight loading ...')
    pretrain = torch.load("inputs/r3d34_K_200ep.pth")
    pretrain_dict = {
        k: v
        for k, v in pretrain['state_dict'].items() if k in net_dict.keys()
    }
    net_dict.update(pretrain_dict)

    model.load_state_dict(net_dict)
    print("pretrained model loaded !")
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                           T_max=30,
                                                           eta_min=1e-6)
    # scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, min_lr=1e-5)

    patience = 3
    p = 0
    min_loss = 999
    best_score = -999
    best_epoch = 0

    for epoch in range(1, config.EPOCHS + 1):

        LOGGER.info("Starting {} epoch...".format(epoch))

        engine.train_fn(train_loader, model, optimizer, device, scheduler)
        score, val_loss = engine.eval_fn(val_loader, model, device)
        scheduler.step()
        # scheduler.step(val_loss)

        if val_loss < min_loss:
            min_loss = val_loss
            best_score = score
            best_epoch = epoch
            torch.save(
                model.state_dict(),
                os.path.join(config.OUT_DIR,
                             '{}_fold{}.pth'.format(EXP_ID, fold_id)))
            LOGGER.info("val loss is {}".format(val_loss))
            LOGGER.info("save model at score={} on epoch={}".format(
                best_score, best_epoch))
            p = 0

        if p > 0:
            LOGGER.info(f'val loss has not improved for {p} epoch(s)')
        p += 1
        if p > patience:
            LOGGER.info('Early Stopping')
            break
    LOGGER.info("best score={} on epoch={}".format(best_score, best_epoch))
Example #3
def run_one_fold(fold_id):

    with timer('load csv data'):

        debug = config.DEBUG
        df_train = pd.read_csv(
            config.TRAIN_PATH).dropna().reset_index(drop=True)

        if debug:
            df_train = df_train.sample(
                1000, random_state=SEED).dropna().reset_index(drop=True)

        num_folds = 5
        kf = StratifiedKFold(n_splits=num_folds,
                             shuffle=True,
                             random_state=SEED)
        splits = list(kf.split(X=df_train, y=df_train[['sentiment']]))
        train_idx = splits[fold_id][0]
        val_idx = splits[fold_id][1]

        print(len(train_idx), len(val_idx))

        gc.collect()

    with timer('prepare train/validation data'):
        train_dataset = TweetDataset(
            tweet=df_train.iloc[train_idx].text.values,
            sentiment=df_train.iloc[train_idx].sentiment.values,
            selected_text=df_train.iloc[train_idx].selected_text.values)

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            shuffle=True,
            batch_size=config.TRAIN_BATCH_SIZE,
            num_workers=0,
            pin_memory=True)

        val_dataset = TweetDataset(
            tweet=df_train.iloc[val_idx].text.values,
            sentiment=df_train.iloc[val_idx].sentiment.values,
            selected_text=df_train.iloc[val_idx].selected_text.values)

        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            shuffle=False,
            batch_size=config.VALID_BATCH_SIZE,
            num_workers=0,
            pin_memory=True)

        del train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        model = BERTBaseUncased()
        model = model.to(device)

        # t_max=10
        # scheduler_cosine = CosineAnnealingLR(optimizer, T_max=t_max)
        # scheduler = GradualWarmupScheduler(optimizer, multiplier=1.1, total_epoch=5,
        #                                    after_scheduler=scheduler_cosine)

        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay': 0.001
            },
            {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay': 0.0
            },
        ]

        num_train_steps = int(
            len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
        optimizer = transformers.AdamW(optimizer_parameters, lr=5e-5)
        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

        model = nn.DataParallel(model)

    with timer('training loop'):
        best_score = -999
        best_epoch = 0
        for epoch in range(1, config.EPOCHS + 1):

            LOGGER.info("Starting {} epoch...".format(epoch))

            engine.train_fn(train_loader, model, optimizer, device, scheduler)
            score, val_outputs = engine.eval_fn(val_loader, model, device)

            LOGGER.info(f"Jaccard Score = {score}")

            if score > best_score:
                best_score = score
                best_epoch = epoch
                torch.save(
                    model.state_dict(),
                    os.path.join(config.OUT_DIR,
                                 '{}_fold{}.pth'.format(EXP_ID, fold_id)))
                to_pickle(
                    os.path.join(config.OUT_DIR,
                                 "{}_fold{}_oof.pkl".format(EXP_ID, fold_id)),
                    [val_idx, val_outputs])
                LOGGER.info("save model at score={} on epoch={}".format(
                    best_score, best_epoch))

        LOGGER.info("best score={} on epoch={}".format(best_score, best_epoch))
Example #4
def run_one_fold(fold_id):

    fnc_df = pd.read_csv(config.FNC_PATH)
    loading_df = pd.read_csv(config.LOADING_PATH)
    labels_df = pd.read_csv(config.TRAIN_SCORES_PATH)

    fnc_features, loading_features = list(fnc_df.columns[1:]), list(
        loading_df.columns[1:])
    df = fnc_df.merge(loading_df, on="Id")
    labels_df["is_train"] = True

    df = df.merge(labels_df, on="Id", how="left")

    df['bin_age'] = pd.cut(df['age'], [i for i in range(0, 100, 10)],
                           labels=False)

    df_test = df[df["is_train"] != True].copy()
    df_train = df[df["is_train"] == True].copy()

    num_folds = config.NUM_FOLDS
    kf = StratifiedKFold(n_splits=num_folds,
                         shuffle=True,
                         random_state=SEED)
    splits = list(kf.split(X=df_train, y=df_train[['bin_age']]))

    train_idx = splits[fold_id][0]
    val_idx = splits[fold_id][1]

    target_cols = [
        'age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2'
    ]

    print(len(train_idx), len(val_idx))

    train_dataset = TReNDSDataset(df=df_train,
                                  target_cols=target_cols,
                                  indices=train_idx,
                                  map_path=config.TRAIN_MAP_PATH)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        shuffle=True,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=0,
        pin_memory=True)

    val_dataset = TReNDSDataset(df=df_train,
                                target_cols=target_cols,
                                indices=val_idx,
                                map_path=config.TRAIN_MAP_PATH)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=0,
        pin_memory=True)

    del train_dataset, val_dataset
    gc.collect()

    device = config.DEVICE
    model = resnet10()
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                           T_max=30,
                                                           eta_min=1e-6)
    # scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, min_lr=1e-5)

    patience = 5
    p = 0
    min_loss = 999
    best_score = -999
    best_epoch = 0

    for epoch in range(1, config.EPOCHS + 1):

        print("Starting {} epoch...".format(epoch))

        engine.train_fn(train_loader, model, optimizer, device, scheduler)
        score, val_loss = engine.eval_fn(val_loader, model, device)
        scheduler.step()
        # scheduler.step(val_loss)

        if val_loss < min_loss:
            min_loss = val_loss
            best_score = score
            best_epoch = epoch
            torch.save(
                model.state_dict(),
                os.path.join(config.OUT_DIR,
                             '{}_fold{}.pth'.format(EXP_ID, fold_id)))
            print("save model at score={} on epoch={}".format(
                best_score, best_epoch))
            p = 0

        if p > 0:
            print(f'val loss has not improved for {p} epoch(s)')
        p += 1
        if p > patience:
            print('Early Stopping')
            break
    print("best score={} on epoch={}".format(best_score, best_epoch))
Example #5
def run_one_fold(fold_id):

    df_train = pd.read_csv(config.TRAIN_PATH)
    print(df_train.shape)

    DEBUG = 0
    if DEBUG:
        df_train = df_train.head(100)

    TARGETS = 'isup_grade'

    kf = StratifiedKFold(n_splits=config.NUM_FOLDS,
                         shuffle=True,
                         random_state=SEED)
    splits = list(kf.split(X=df_train, y=df_train[TARGETS].values))

    train_idx = splits[fold_id][0]
    val_idx = splits[fold_id][1]

    train_dataset = PANDADataset(df=df_train,
                                 indices=train_idx,
                                 transform=data_transforms)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        shuffle=True,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=0,
        pin_memory=True)

    val_dataset = PANDADataset(df=df_train,
                               indices=val_idx,
                               transform=data_transforms_test)

    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=0,
        pin_memory=True)

    del train_dataset, val_dataset
    gc.collect()

    device = config.DEVICE
    model = CustomSEResNeXt(model_name='se_resnext50_32x4d')
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config.LR)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                           T_max=30,
                                                           eta_min=1e-6)
    # scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, min_lr=1e-5)

    patience = config.PATIENCE
    p = 0
    min_loss = 999
    best_score = -999
    best_epoch = 0

    for epoch in range(1, config.EPOCHS + 1):

        LOGGER.info("Starting {} epoch...".format(epoch))

        engine.train_fn(train_loader, model, optimizer, device, scheduler)
        score, val_loss, val_ids, val_preds = engine.eval_fn(
            val_loader, model, device)
        scheduler.step()
        # scheduler.step(val_loss)

        if val_loss < min_loss:
            min_loss = val_loss
            best_score = score
            best_epoch = epoch
            torch.save(
                model.state_dict(),
                os.path.join(config.OUT_DIR,
                             '{}_fold{}.pth'.format(EXP_ID, fold_id)))
            LOGGER.info("save model at score={} on epoch={}".format(
                best_score, best_epoch))
            p = 0

        if p > 0:
            LOGGER.info(f'val loss has not improved for {p} epoch(s)')
        p += 1
        if p > patience:
            LOGGER.info('Early Stopping')
            break

    to_pickle(
        os.path.join(config.OUT_DIR, '{}_fold{}.pkl'.format(EXP_ID, fold_id)),
        [val_ids, val_preds])
    LOGGER.info("best score={} on epoch={}".format(best_score, best_epoch))
Example #6
def run_one_fold(fold_id):

    with timer('load csv data'):

        debug = config.DEBUG
        df_train = pd.read_csv(
            config.TRAIN_PATH).dropna().reset_index(drop=True)

        if debug:
            df_train = df_train.sample(
                1000, random_state=SEED).dropna().reset_index(drop=True)

        num_folds = 5
        kf = StratifiedKFold(n_splits=num_folds,
                             shuffle=True,
                             random_state=SEED)
        splits = list(kf.split(X=df_train, y=df_train[['sentiment']]))
        train_idx = splits[fold_id][0]
        val_idx = splits[fold_id][1]

        print(len(train_idx), len(val_idx))

        gc.collect()

    with timer('prepare train/validation data'):
        train_dataset = TweetDataset(
            tweet=df_train.iloc[train_idx].text.values,
            sentiment=df_train.iloc[train_idx].sentiment.values,
            selected_text=df_train.iloc[train_idx].selected_text.values)

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            shuffle=True,
            batch_size=config.TRAIN_BATCH_SIZE,
            num_workers=0,
            pin_memory=True)

        val_dataset = TweetDataset(
            tweet=df_train.iloc[val_idx].text.values,
            sentiment=df_train.iloc[val_idx].sentiment.values,
            selected_text=df_train.iloc[val_idx].selected_text.values)

        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            shuffle=False,
            batch_size=config.VALID_BATCH_SIZE,
            num_workers=0,
            pin_memory=True)

        del train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        #model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
        model_config = transformers.BertConfig.from_pretrained(
            "bert-large-uncased-whole-word-masking", output_hidden_states=True)
        # model_config.output_hidden_states = True

        model = TweetModelLargeWWM("bert-large-uncased-whole-word-masking",
                                   model_config)
        model = model.to(device)

        # t_max=10
        # scheduler_cosine = CosineAnnealingLR(optimizer, T_max=t_max)
        # scheduler = GradualWarmupScheduler(optimizer, multiplier=1.1, total_epoch=5,
        #                                    after_scheduler=scheduler_cosine)

        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay': 0.001
            },
            {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay': 0.0
            },
        ]

        num_train_steps = int(
            len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
        optimizer = transformers.AdamW(optimizer_parameters, lr=3e-5)
        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

        model = nn.DataParallel(model)

        # https://www.kaggle.com/irustandi/bertlargeuncasedwwmfinetunedsquad
        # pretrain_path = 'inputs/bert-large-uncased-wwm-finetuned-squad/pytorch_model.bin'
        # model.load_state_dict(torch.load(pretrain_path))
        # LOGGER.info(f'pretrained model (WWM uncased squad) loaded')

    with timer('training loop'):
        best_score = -999
        best_epoch = 0
        patience = 2
        p = 0
        for epoch in range(1, config.EPOCHS + 1):

            LOGGER.info("Starting {} epoch...".format(epoch))

            engine.train_fn(train_loader, model, optimizer, device, scheduler)
            score = engine.eval_fn(val_loader, model, device)

            LOGGER.info(f"Jaccard Score = {score}")

            if score > best_score:
                best_score = score
                best_epoch = epoch
                torch.save(
                    model.state_dict(),
                    os.path.join(config.OUT_DIR,
                                 '{}_fold{}.pth'.format(EXP_ID, fold_id)))
                LOGGER.info("save model at score={} on epoch={}".format(
                    best_score, best_epoch))
                p = 0

            if p > 0:
                LOGGER.info(f'best score has not improved for {p} epoch(s)')
            p += 1
            if p > patience:
                LOGGER.info('Early Stopping')
                break

        LOGGER.info("best score={} on epoch={}".format(best_score, best_epoch))