Example #1
# Project-level pieces this example assumes: configs, CrowdFlowerDataset,
# BERTBaseUncased, train_loop_fn, eval_loop_fn and quadratic_weighted_kappa.
import time

import pandas as pd
import torch
from sklearn.model_selection import train_test_split


def run():
    data_df = pd.read_csv('../input/train.csv')
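    # Hold out 10% of the data for validation, with a fixed seed for reproducibility.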
    train_df, valid_df = train_test_split(data_df,
                                          random_state=42,
                                          test_size=0.1)
    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)

    train_y = train_df['median_relevance'].values
    valid_y = valid_df['median_relevance'].values

    train_dataset = CrowdFlowerDataset(
        query=train_df['query'].values,
        prod_title=train_df['product_title'].values,
        prod_description=train_df['product_description'].values,
        targets=train_y)
    valid_dataset = CrowdFlowerDataset(
        query=valid_df['query'].values,
        prod_title=valid_df['product_title'].values,
        prod_description=valid_df['product_description'].values,
        targets=valid_y)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=configs.TRAIN_BATCH_SIZE, shuffle=True)

    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=configs.VALID_BATCH_SIZE, shuffle=False)

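    # Total optimizer steps across all epochs; the LR scheduler needs this up front.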
    num_train_steps = int(
        len(train_dataset) / configs.TRAIN_BATCH_SIZE * configs.EPOCHS)
    device = configs.DEVICE
    model = BERTBaseUncased().to(device)
    optimizer = configs.OPTIMIZER(model.parameters(), lr=configs.LR)
    scheduler = configs.SCHEDULER(optimizer,
                                  num_warmup_steps=0,
                                  num_training_steps=num_train_steps)

    for epoch in range(configs.EPOCHS):

        epoch_start = time.time()

        epoch_train_loss = train_loop_fn(train_dataloader, model, optimizer,
                                         scheduler)
        outputs, targets, epoch_valid_loss = eval_loop_fn(
            valid_dataloader, model)

        epoch_end = time.time()
        epoch_time_elapsed = (epoch_end - epoch_start) / 60.0
        print(f'Time taken to run epoch: {epoch_time_elapsed:.2f} min')
        print(f'Epoch {epoch}: training loss - {epoch_train_loss}, '
              f'valid loss - {epoch_valid_loss}')

        qw_kappa = quadratic_weighted_kappa(targets.flatten(),
                                            outputs.flatten())
        print(f'Quadratic Weighted Kappa: {qw_kappa}')
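
quadratic_weighted_kappa above is a project-level helper (the metric of the CrowdFlower Search Results Relevance competition). A drop-in equivalent can be built on scikit-learn; the assumption that predictions are already rounded to integer labels is mine, not from the original code:

from sklearn.metrics import cohen_kappa_score

def quadratic_weighted_kappa(y_true, y_pred):
    # Quadratic weighted kappa is Cohen's kappa with quadratic weights;
    # both arguments must be integer class labels, not raw model scores.
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')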

Example #2
# Project-level pieces this example assumes: config, BertDataset,
# BERTBaseUncased, train_loop_fn and eval_loop_fn.
import time

import numpy as np
import pandas as pd
import torch
from scipy import stats
from sklearn.model_selection import train_test_split


def run():
    data_df = pd.read_csv('../input/train.csv')
    train_df, valid_df = train_test_split(data_df, random_state=42, test_size=0.1)
    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)

    sample_sub_df = pd.read_csv("../input/sample_submission.csv")
    target_cols = list(sample_sub_df.drop("qa_id", axis=1).columns)
    train_y = train_df[target_cols].values
    valid_y = valid_df[target_cols].values

    train_dataset = BertDataset(
        qtitle=train_df.question_title.values,
        qbody=train_df.question_body.values,
        answer=train_df.answer.values,
        targets=train_y
    )

    valid_dataset = BertDataset(
        qtitle=valid_df.question_title.values,
        qbody=valid_df.question_body.values,
        answer=valid_df.answer.values,
        targets=valid_y
    )

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=True
    )

    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        shuffle=False
    )

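    # Total number of optimizer steps, required by the warmup scheduler below.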
    num_train_steps = int(len(train_dataset) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    device = config.DEVICE
    model = BERTBaseUncased().to(device)
    optimizer = config.OPTIMIZER(model.parameters(), lr=config.LR)
    scheduler = config.SCHEDULER(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    for epoch in range(config.EPOCHS):

        epoch_start = time.time()

        epoch_train_loss = train_loop_fn(train_dataloader, model, optimizer, scheduler)
        outputs, targets, epoch_valid_loss = eval_loop_fn(valid_dataloader, model)

        epoch_end = time.time()
        epoch_time_elapsed = (epoch_end - epoch_start) / 60.0
        print(f'Time taken to run epoch: {epoch_time_elapsed:.2f} min')
        print(f'Epoch {epoch}: training loss - {epoch_train_loss}, valid loss - {epoch_valid_loss}')

        # Column-wise Spearman correlation averaged over all target columns;
        # NaN correlations (e.g. from a constant column) are replaced with 0.
        spear = []
        for jj in range(targets.shape[1]):
            coef, _ = stats.spearmanr(targets[:, jj], outputs[:, jj])
            spear.append(np.nan_to_num(coef))
        spear = np.mean(spear)
        print(f"Spearman coeff: {spear}")