Example #1
def train_gpt(config: PipeLineConfig):
    logging.basicConfig(level=logging.INFO)

    logging.info("Reading data...")
    input_folder = "../input/jigsaw-unintended-bias-in-toxicity-classification/"
    train = pd.read_csv(os.path.join(input_folder, "train.csv"))

    logging.info("Reading wiki PL...")
    wiki_sents = pd.read_csv("../input/wiki_sents.csv")
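    # Keep wiki sentences that are confidently non-toxic (target < 0.1) but clearly
    # mention at least one identity; these serve as extra de-biasing examples.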
    wiki_subset = wiki_sents[(wiki_sents.target < 0.1) & (
        wiki_sents[IDENTITY_COLUMNS].max(1) >= 0.33)].copy()
    wiki_subset.drop(columns=["any_identity", "max_identity", "target_aux"],
                     inplace=True)
    wiki_subset.iloc[:, :6] = 0.0  # They are not toxic by definition

    logging.info("Sampling extra data...")
    seed_everything(config.seed + 1)
    extras = []
    t = convert_dataframe_to_bool(train)
    for identity in IDENTITY_COLUMNS:
        Ip = np.sum(t[identity] & t.target)
        I = np.sum(t[identity])
        Bp = np.sum(~t[identity] & t.target)
        B = np.sum(~t[identity])
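        # Number of extra non-toxic identity examples needed so the identity
        # group's toxic rate matches the background rate:
        # Ip / (I + n) = Bp / B  =>  n = (Ip * B - Bp * I) / Bp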
        required = (Ip * B - Bp * I) // Bp

        extra = wiki_subset[wiki_subset[identity] >= 0.333].copy()
        logging.info("Mitigating bias for %s", identity)
        logging.info("Need %d extra samples, got %d", required, len(extra))
        if len(extra) > required:
            logging.info("Downsampling extra dataframe")
            extra = extra.sample(required)
        extras.append(extra)

    enriched = pd.concat([train] + extras,
                         ignore_index=True,
                         sort=False,
                         axis=0)

    logging.info("Tokenizing...")

    with multiprocessing.Pool(processes=32) as pool:
        text_list = enriched.comment_text.tolist()
        sequences = pool.map(convert_line_gpt, text_list)

    logging.info("Building tensors for training...")
    sequences = np.array(sequences)
    logging.info("Sequences shape: %s", sequences.shape)
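    # The first padding token (id 0) marks the sequence end; np.argmax returns 0
    # for rows with no padding at all, so give those rows the full length.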
    lengths = np.argmax(sequences == 0, axis=1)
    lengths[lengths == 0] = sequences.shape[1]

    logging.info("Building target tensor...")
    iden = enriched[IDENTITY_COLUMNS].fillna(0).values
    subgroup_target = np.hstack([
        (iden >= 0.5).any(axis=1, keepdims=True).astype(np.int64),
        iden,
        iden.max(axis=1, keepdims=True),
    ])
    sub_target_weights = (~enriched[IDENTITY_COLUMNS].isna().values.any(
        axis=1, keepdims=True)).astype(np.int64)

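    # Per-sample loss weights: +1 if the comment mentions an identity, +1 for toxic
    # comments with some identity score below 0.5, +1 for non-toxic comments with
    # some identity score at or above 0.5; normalized to mean 1 so the overall
    # loss scale is unchanged.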
    weights = np.ones(len(enriched))
    weights += (iden >= 0.5).any(1)
    weights += (enriched["target"].values >= 0.5) & (iden < 0.5).any(1)
    weights += (enriched["target"].values < 0.5) & (iden >= 0.5).any(1)
    weights /= weights.mean()

    y_aux_train = enriched[AUX_TARGETS]
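    # Pack everything the loss needs into a single tensor: col 0 = target,
    # col 1 = per-sample weight, then the aux targets, the subgroup targets and
    # the subgroup weight; the custom loss slices these as targets[:, 2:8],
    # targets[:, 8:19] and targets[:, 19:20].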
    y_train_torch = torch.tensor(
        np.hstack([
            enriched.target.values[:, None],
            weights[:, None],
            y_aux_train,
            subgroup_target,
            sub_target_weights,
        ])).float()

    logging.info("Seeding with seed %d ...", config.seed)
    seed_everything(config.seed)

    logging.info("Creating dataset...")
    dataset = data.TensorDataset(torch.tensor(sequences), y_train_torch,
                                 torch.tensor(lengths))
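    # clip_to_max_len presumably trims each batch to its longest sequence so that
    # padding beyond the batch maximum is not fed through the model.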
    train_loader = data.DataLoader(dataset,
                                   batch_size=BATCH_SIZE,
                                   collate_fn=clip_to_max_len,
                                   shuffle=True)

    logging.info("Creating a model...")
    model = GPT2CNN.from_pretrained("gpt2", num_labels=18)
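    # 18 outputs: 1 main toxicity logit, 6 auxiliary logits and 11 subgroup logits,
    # matching the slices used by the custom loss.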
    model.zero_grad()
    model = model.cuda()

    logs_file = f"./tb_logs/final_{config.expname}"
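    # Apply weight decay only to parameters selected by should_decay (typically
    # everything except biases and LayerNorm weights).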
    optimizer_grouped_parameters = [
        {
            "params":
            [p for n, p in model.named_parameters() if should_decay(n)],
            "weight_decay": config.decay,
        },
        {
            "params":
            [p for n, p in model.named_parameters() if not should_decay(n)],
            "weight_decay":
            0.00,
        },
    ]

    optimizer = BertAdam(
        optimizer_grouped_parameters,
        lr=config.lr,
        warmup=config.warmup,
        t_total=config.epochs * len(train_loader) // ACCUM_STEPS,
    )
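    # t_total is the total number of optimizer updates (batches // ACCUM_STEPS over
    # all epochs), so BertAdam's warmup fraction refers to actual parameter updates.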

    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level="O1",
                                      verbosity=0)
    model = model.train()

    writer = SummaryWriter(logs_file)
    agg = TensorboardAggregator(writer)
    custom_loss = prepare_loss(config)

    for _ in range(config.epochs):
        for j, (X, y) in enumerate(train_loader):

            X = X.cuda()
            y = y.cuda()

            y_pred = model(X)
            loss = custom_loss(y_pred, y)

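            # A positive logit corresponds to a predicted probability above 0.5,
            # so compare raw logits with 0 and the soft labels with 0.5.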
            accuracy = ((y_pred[:, 0] > 0) == (y[:, 0] > 0.5)).float().mean()
            agg.log({
                "train_loss": loss.item(),
                "train_accuracy": accuracy.item()
            })

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

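            # Accumulate gradients over ACCUM_STEPS batches before each optimizer
            # step to emulate a larger effective batch size.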
            if (j + 1) % ACCUM_STEPS == 0:
                optimizer.step()
                optimizer.zero_grad()

    torch.save(model.state_dict(),
               f"./models/final-pipe6-{config.expname}.bin")


def train_bert(config: PipeLineConfig):
    logging.basicConfig(level=logging.INFO)

    logging.info("Reading data...")
    input_folder = "../input/jigsaw-unintended-bias-in-toxicity-classification/"
    train = pd.read_csv(os.path.join(input_folder, "train.csv"))

    logging.info("Tokenizing...")

    with multiprocessing.Pool(processes=32) as pool:
        text_list = train.comment_text.tolist()
        sequences = pool.map(convert_line_uncased, text_list)

    logging.info("Building tensors for training...")
    sequences = np.array(sequences)
    lengths = np.argmax(sequences == 0, axis=1)
    lengths[lengths == 0] = sequences.shape[1]

    logging.info("Building target tensor...")
    iden = train[IDENTITY_COLUMNS].fillna(0).values
    subgroup_target = np.hstack([
        (iden >= 0.5).any(axis=1, keepdims=True).astype(np.int64),
        iden,
        iden.max(axis=1, keepdims=True),
    ])
    sub_target_weights = (~train[IDENTITY_COLUMNS].isna().values.any(
        axis=1, keepdims=True)).astype(np.int64)

    weights = np.ones(len(train))
    weights += (iden >= 0.5).any(1)
    weights += (train["target"].values >= 0.5) & (iden < 0.5).any(1)
    weights += (train["target"].values < 0.5) & (iden >= 0.5).any(1)
    weights /= weights.mean()

    y_aux_train = train[AUX_TARGETS]
    y_train_torch = torch.tensor(
        np.hstack([
            train.target.values[:, None],
            weights[:, None],
            y_aux_train,
            subgroup_target,
            sub_target_weights,
        ])).float()

    perfect_output = torch.tensor(
        np.hstack([train.target.values[:, None], y_aux_train,
                   subgroup_target])).float()

    logging.info("Seeding with seed %d ...", config.seed)
    seed_everything(config.seed)

    logging.info("Creating dataset...")

    dataset = data.TensorDataset(
        torch.from_numpy(sequences).long(), y_train_torch,
        torch.from_numpy(lengths))
    train_loader = data.DataLoader(dataset,
                                   batch_size=BATCH_SIZE,
                                   collate_fn=clip_to_max_len,
                                   shuffle=True)

    logging.info("Creating a model...")
    model = BertForSequenceClassification.from_pretrained("bert-ft/",
                                                          num_labels=18)
    model.zero_grad()
    model = model.cuda()
    model.classifier.bias = nn.Parameter(
        perfect_bias(perfect_output.mean(0)).cuda())
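    # Initialize the classifier bias from the column means of the targets
    # (perfect_bias presumably converts mean rates to logits) so training starts
    # from the base rates instead of 0.5.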

    logs_file = f"./tb_logs/final_{config.expname}"

    optimizer_grouped_parameters = [
        {
            "params":
            [p for n, p in model.named_parameters() if should_decay(n)],
            "weight_decay": config.decay,
        },
        {
            "params":
            [p for n, p in model.named_parameters() if not should_decay(n)],
            "weight_decay":
            0.00,
        },
    ]

    optimizer = BertAdam(
        optimizer_grouped_parameters,
        lr=config.lr,
        warmup=config.warmup,
        t_total=config.epochs * len(train_loader) // ACCUM_STEPS,
    )

    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level="O1",
                                      verbosity=0)
    model = model.train()

    writer = SummaryWriter(logs_file)
    agg = TensorboardAggregator(writer)
    custom_loss = prepare_loss(config)

    for _ in range(config.epochs):
        for j, (X, y) in enumerate(train_loader):

            X = X.cuda()
            y = y.cuda()

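            # Padding tokens have id 0, so (X > 0) masks them out of attention.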
            y_pred = model(X, attention_mask=(X > 0))
            loss = custom_loss(y_pred, y)

            accuracy = ((y_pred[:, 0] > 0) == (y[:, 0] > 0.5)).float().mean()
            agg.log({
                "train_loss": loss.item(),
                "train_accuracy": accuracy.item()
            })

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            if (j + 1) % ACCUM_STEPS == 0:
                optimizer.step()
                optimizer.zero_grad()

    torch.save(model.state_dict(),
               f"./models/final-pipe1-{config.expname}.bin")
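

# The loss below uses the packed target layout built in the training functions:
# targets[:, 0] is the toxicity label weighted per sample by targets[:, 1],
# targets[:, 2:8] are the aux targets, and targets[:, 8:19] the subgroup targets
# weighted by targets[:, 19].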
def custom_loss(data, targets):
    bce_loss_1 = nn.BCEWithLogitsLoss(weight=targets[:, 1:2])(data[:, :1],
                                                              targets[:, :1])
    bce_loss_2 = nn.BCEWithLogitsLoss()(data[:, 1:7], targets[:, 2:8])
    bce_loss_3 = nn.BCEWithLogitsLoss(weight=targets[:, 19:20])(data[:, 7:18],
                                                                targets[:, 8:19])
    return bce_loss_1 + bce_loss_2 + (bce_loss_3 / 4)


def save_nn_without_embedding_weights(model, path: str):
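    # Drop the large embedding matrix from the checkpoint; presumably the
    # embedding weights are restored separately when the model is loaded.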
    temp_dict = model.state_dict()
    del temp_dict["embedding.weight"]
    torch.save(temp_dict, path)


if __name__ == "__main__":
    seed_everything(1234)
    torch.cuda.set_device(0)

    logging.info("Reading data...")
    INPUT_FOLDER = "../input/jigsaw-unintended-bias-in-toxicity-classification/"
    train = pd.read_csv(os.path.join(INPUT_FOLDER, "train.csv"))
    y = train["target"].values

    logging.info("Preprocessing...")
    with multiprocessing.Pool(processes=32) as pool:
        text_list = pool.map(normalize, train.comment_text.tolist())

    logging.info("Tokenization...")
    tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
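    # strip_handles removes Twitter @mentions; reduce_len shortens runs of 3+
    # repeated characters (e.g. "waaaayyy" -> "waaayyy").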
    word_sequences = []
    word_dict = {}