Example #1
def run():
    dfx = pd.read_csv(config.TRAINING_FILE).fillna("none")
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)

    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")

    model = BERTBaseUncased()

    model.to(device)

    param_optimizer = list(model.named_parameters())

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001,
        },
        {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,  # no weight decay for bias and LayerNorm parameters
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, scheduler, device)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")

        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
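
All of these examples lean on a project-local config module (paths, batch sizes, tokenizer) that is not shown. A minimal, hypothetical sketch of what such a module might contain; names and values here are placeholders, not taken from the original code:

# config.py -- hypothetical values, adjust to your own setup
import transformers

MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 10
BERT_PATH = "bert-base-uncased"
MODEL_PATH = "model.bin"
TRAINING_FILE = "../input/imdb.csv"
TOKENIZER = transformers.BertTokenizer.from_pretrained(BERT_PATH, do_lower_case=True)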
Example #2
def run():
    df1 = pd.read_csv("../input/jigsaw-toxic-comment-train.csv",
                      usecols=["comment_text", "toxic"])
    df2 = pd.read_csv("../input/jigsaw-unintended-bias-train.csv",
                      usecols=["comment_text", "toxic"])

    df_train = pd.concat([df1, df2], axis=0).reset_index(drop=True)
    df_valid = pd.read_csv("../input/validation.csv")

    train_dataset = dataset.BERTDataset(
        comment_text=df_train.comment_text.values,
        target=df_train.toxic.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.BERTDataset(
        comment_text=df_valid.comment_text.values,
        target=df_valid.toxic.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device(config.DEVICE)
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [{
        "params":
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        "weight_decay":
        0.001
    }, {
        "params":
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay":
        0.0
    }]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        targets = np.array(targets) >= 0.5
        accuracy = metrics.roc_auc_score(targets, outputs)
        print(f"AUC Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
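
dataset.BERTDataset is also project-local and not shown; the keyword it takes varies across examples (review, comment_text, text). A plausible sketch of the review/target variant, assuming a HuggingFace tokenizer exposed through the hypothetical config module sketched above:

import torch
import config  # hypothetical config module sketched above

class BERTDataset:
    def __init__(self, review, target):
        self.review = review
        self.target = target
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN

    def __len__(self):
        return len(self.review)

    def __getitem__(self, item):
        # collapse repeated whitespace before tokenizing
        review = " ".join(str(self.review[item]).split())
        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }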
Example #3
def run():
    df1 = pd.read_csv(config.TRAINING_FILE, usecols=["comment_text","toxic"])
    
    train_dataset = dataset.BERTDataset(
        review=df1.comment_text.values,
        target=df1.toxic.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )
    df2=pd.read_csv("../input/validation.csv", usecols=["comment_text","toxic"])
    valid_dataset = dataset.BERTDataset(
        review=df2.comment_text.values,
        target=df2.toxic.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BERTBaseUncased()
    model.to(device)
    
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    num_train_steps = int(len(df1) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    model = nn.DataParallel(model)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
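
engine.train_fn and engine.eval_fn are likewise left out of these examples. A rough sketch that matches the (data_loader, model, optimizer, device, scheduler) call order used by most examples here, with eval_fn returning (outputs, targets); the batch keys are assumptions based on the dataset sketch above:

import torch
import torch.nn as nn

def loss_fn(outputs, targets):
    return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))

def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()
    for batch in data_loader:
        ids = batch["ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        targets = batch["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()

def eval_fn(data_loader, model, device):
    model.eval()
    fin_outputs, fin_targets = [], []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch["ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            targets = batch["targets"].to(device, dtype=torch.float)
            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            fin_targets.extend(targets.cpu().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().numpy().tolist())
    return fin_outputs, fin_targets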
Example #4
def run():
    dfx = pd.read_csv(config.TRAINING_FILE, nrows=30).dropna().reset_index(drop=True)


    df_train, df_valid = model_selection.train_test_split(
        dfx, 
        test_size = 0.1,
        random_state = 42,
        stratify = dfx.sentiment.values
    )

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweet = df_train.text.values,
        sentiment = df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=1
    )

    valid_dataset = dataset.TweetDataset(
        tweet = df_valid.text.values,
        sentiment = df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1 
    )

    device = torch.device('cpu')
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        print("here")
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        mean_jac = engine.eval_fn(valid_data_loader, model, device)
        print("jaccard_score = {mean_jac}".format(mean_jac=mean_jac))
        if(mean_jac>best_jaccard):
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = mean_jac
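
For the tweet-extraction examples, engine.eval_fn is expected to return a mean Jaccard score. The per-example metric it would average is presumably the usual word-level Jaccard similarity between the predicted and true selected text, roughly:

def jaccard(str1, str2):
    # word-level Jaccard similarity between two strings
    a = set(str(str1).lower().split())
    b = set(str(str2).lower().split())
    c = a.intersection(b)
    if len(a) + len(b) - len(c) == 0:
        return 0.0
    return float(len(c)) / (len(a) + len(b) - len(c))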
Example #5
def run(opt_level="O2",
        keep_batchnorm_fp32=True,
        batch_size=5,
        nb_epochs=10,
        data_path="../inputs/IMDB_Dataset.csv",
        model_path="./"):

    df = pd.read_csv(data_path).fillna("none")[0:100]
    df.sentiment = df.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    df_train, df_valid = model_selection.train_test_split(
        df, test_size=0.1, random_state=42, stratify=df.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)
    # Creating the datasets
    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)

    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)
    # Creating the dataloaders
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size,
                                                   num_workers=10,
                                                   drop_last=True)

    valid_dataloader = torch.utils.data.DataLoader(valid_dataset,
                                                   batch_size,
                                                   num_workers=10,
                                                   drop_last=True)
    # Defining the model and sending to the device
    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    parameters = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias",
                "LayerNorm.weight"]  # We don't want any decay for them
    optimizer_parameters = [{
        "params":
        [p for n, p in parameters if not any(nd in n for nd in no_decay)],
        "weight_decay":
        0.001
    }, {
        "params":
        [p for n, p in parameters if any(nd in n for nd in no_decay)],
        "weight_decay":
        0.0
    }]

    num_train_steps = int(len(df_train) * nb_epochs / batch_size)
    # Defining the optimizer and the scheduler
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    # Initialize the pytorch model and the optimizer to allow automatic mixed-precision training
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=opt_level,
                                      keep_batchnorm_fp32=keep_batchnorm_fp32,
                                      loss_scale="dynamic")

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # No warmup
        num_training_steps=num_train_steps)

    # Train the model
    engine.global_trainer(train_dataloader, valid_dataloader, model, optimizer,
                          scheduler, device, nb_epochs, model_path)
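
With apex, the backward pass goes through amp.scale_loss so fp16 gradients are scaled before they can underflow. A sketch of how the training step inside a function like engine.global_trainer might look, assuming the same batch layout as the other examples:

import torch
import torch.nn as nn
from apex import amp

def amp_train_one_epoch(data_loader, model, optimizer, scheduler, device):
    # hypothetical helper: one epoch of apex mixed-precision training
    model.train()
    for batch in data_loader:
        ids = batch["ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        targets = batch["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        loss = nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()  # apex scales the loss for fp16 stability
        optimizer.step()
        scheduler.step()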
Example #6
def run():
    dfx = pd.read_csv(config.TRAINING_FILE).fillna("none")
    dfx.sentiment = dfx.sentiment.apply(  # can use label encoding
        lambda x: 1 if x == "positive" else 0  # can use map fn
    )

    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        # stratify keeps the same positive-to-negative ratio in train and validation
        stratify=dfx.sentiment.values,
    )

    df_train = df_train.reset_index(drop=True)  # 0 to length of df_train
    df_valid = df_valid.reset_index(drop=True)  # 0 to length of df_valid

    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")  # using cuda
    model = BERTBaseUncased()  # calling from model.py
    model.to(device)  # move the model to the GPU before building the optimizer

    param_optimizer = list(
        model.named_parameters())  # specify parameters to train
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.001
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    """ These parameters are adjustable, we should take a look at different layers and
    the decay we want, how much learning rate etc."""

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # model = nn.DataParallel(model)              # converting to multi gpu model

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, target = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(target, outputs)
        print(f"Accuracy score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(
                model.state_dict(),
                config.MODEL_PATH)  # saving the model only if it improves
            best_accuracy = accuracy
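
BERTBaseUncased itself is defined elsewhere (a model.py in most of these projects). A minimal sketch of what it likely looks like: a pretrained BERT encoder, dropout, and a single-logit classification head; config.BERT_PATH and return_dict=False are assumptions for recent transformers versions:

import torch.nn as nn
import transformers
import config  # hypothetical config module with BERT_PATH

class BERTBaseUncased(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(config.BERT_PATH)
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        # pooled output summarises the [CLS] token for classification
        _, pooled_output = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.out(self.bert_drop(pooled_output))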
Example #7
def run(dataset_index):

    datasets = [
        "gold.prep-auto.full.prep.{0}.csv",
        "gold.prep-auto.no-emoticons.prep.{0}.csv",
        "gold.prep-auto.prep.{0}.csv", "gold.prep-english.prep.{0}.csv",
        "gold.prep-peisenieks.prep.{0}.csv", "gold.prep.{0}.csv"
    ]
    # dataset_index = 5 #0-5

    train_file = config.DATASET_LOCATION + datasets[dataset_index].format(
        "train")
    df_train = pd.read_csv(train_file).fillna("none")
    df_train.label = df_train.label.apply(label_encoder)

    valid_file = config.DATASET_LOCATION + datasets[dataset_index].format(
        "dev"
    )  #"gold.prep-auto.full.prep.dev.csv" #gold.prep-auto.no-emoticons.prep.dev.csv" #gold.prep-auto.prep.dev.csv" #"gold.prep-english.prep.dev.csv" #"gold.prep-peisenieks.prep.dev.csv" #"gold.prep.dev.csv"
    df_valid = pd.read_csv(valid_file).fillna("none")
    df_valid.label = df_valid.label.apply(label_encoder)

    test_file = config.DATASET_LOCATION + "eval.prep.test.csv"
    df_test = pd.read_csv(test_file).fillna("none")
    df_test.label = df_test.label.apply(label_encoder)

    logger.info(f"Bert Model: {config.BERT_PATH}")
    logger.info(
        f"Current date and time :{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} "
    )

    logger.info(f"Train file: {train_file}")
    logger.info(f"Valid file: {valid_file}")
    logger.info(f"Test file: {test_file}")

    logger.info(f"Train size : {len(df_train):.4f}")
    logger.info(f"Valid size : {len(df_valid):.4f}")
    logger.info(f"Test size : {len(df_test):.4f}")

    train_dataset = dataset.BERTDataset(review=df_train.text.values,
                                        target=df_train.label.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
        shuffle=True)

    valid_dataset = dataset.BERTDataset(review=df_valid.text.values,
                                        target=df_valid.label.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    test_dataset = dataset.BERTDataset(review=df_test.text.values,
                                       target=df_test.label.values)

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')  #torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # model = nn.DataParallel(model)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        logger.info(f"epoch={epoch}")

        train_loss, train_acc = engine.train_fn(train_data_loader, model,
                                                optimizer, device, scheduler)

        for tag, parm in model.named_parameters():
            if parm.grad is not None:
                writer.add_histogram(tag, parm.grad.data.cpu().numpy(), epoch)

        outputs, targets, val_loss, val_acc = engine.eval_fn(
            valid_data_loader, model, device)
        val_mcc = metrics.matthews_corrcoef(outputs, targets)
        logger.info(f"val_MCC_Score = {val_mcc:.3f}")

        outputs, targets, test_loss, test_acc = engine.eval_fn(
            test_data_loader, model, device)
        test_mcc = metrics.matthews_corrcoef(outputs, targets)
        logger.info(f"test_MCC_Score = {test_mcc:.3f}")

        logger.info(
            f"train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, test_loss={test_loss:.4f}"
        )
        writer.add_scalar('loss/train', train_loss,
                          epoch)  # data grouping by `slash`
        writer.add_scalar('loss/val', val_loss,
                          epoch)  # data grouping by `slash`
        writer.add_scalar('loss/test', test_loss,
                          epoch)  # data grouping by `slash`

        logger.info(
            f"train_acc={train_acc:.3f}, val_acc={val_acc:.3f}, test_acc={test_acc:.3f}"
        )
        writer.add_scalar('acc/train', train_acc,
                          epoch)  # data grouping by `slash`
        writer.add_scalar('acc/val', val_acc,
                          epoch)  # data grouping by `slash`
        writer.add_scalar('acc/test', test_acc,
                          epoch)  # data grouping by `slash`

        logger.info(f"val_mcc={val_acc:.3f}, test_mcc={test_acc:.3f}")
        writer.add_scalar('mcc/val', val_mcc,
                          epoch)  # data grouping by `slash`
        writer.add_scalar('mcc/test', test_mcc,
                          epoch)  # data grouping by `slash`

        accuracy = metrics.accuracy_score(targets, outputs)
        logger.info(f"Accuracy Score = {accuracy:.3f}")

        if accuracy > best_accuracy:
            print(f"Saving model with Accuracy Score = {accuracy:.3f}")
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
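
The label_encoder applied to the label columns above is not shown; a hypothetical stand-in (the real mapping depends on the dataset's label set) could be:

def label_encoder(label):
    # hypothetical: map string labels to integers; adjust to the actual label set
    return 1 if label == "positive" else 0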
Example #8
def run():
    dfx = pd.read_csv(configr.TRAINING_FILE).fillna('none')
    dfx.sentiment = dfx.sentiment.map({"positive": 1, "negative": 0})

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=configr.TRAIN_BATCH_SIZE, num_workers=1)

    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=configr.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device('cpu')
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / configr.TRAIN_BATCH_SIZE * configr.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(configr.EPOCHS):
        print("here")
        engine.train_fn(train_data_loader, model, optimizer,
                        configr.ACCUMULATION, device)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print("accuracy_score = {accuracy}".format(accuracy=accuracy))
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), configr.MODEL_PATH)
            best_accuracy = accuracy
Example #9
def train():
    # this function trains the model

    # read the training file and fill NaN values with "none"
    # you can also choose to drop NaN values in this
    # specific dataset
    dfx = pd.read_csv(config_2.TRAINING_FILE).fillna("none")

    # sentiment = 1 if it's positive
    # else sentiment = 0
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    # we split the data into single training
    # and validation fold
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)

    # reset index
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    # initialize BERTDataset from dataset.py
    # for training dataset
    train_dataset = dataset_2.BERTDataset(review=df_train.review.values,
                                          target=df_train.sentiment.values)

    # create training dataloader
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config_2.TRAIN_BATCH_SIZE, num_workers=4)

    # initialize BERTDataset from dataset.py
    # for validation dataset
    valid_dataset = dataset_2.BERTDataset(review=df_valid.review.values,
                                          target=df_valid.sentiment.values)

    # create validation data loader
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config_2.VALID_BATCH_SIZE, num_workers=1)

    # initialize the cuda device
    # use cpu if you don't have a GPU
    #device = torch.device("cuda")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # load model and send it to the device
    model = BERTBaseUncased()
    model.to(device)

    # create parameters we want to optimize
    # we generally dont use any decay for bias
    # and weight layers
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
        },
    ]

    # calculate the number of training steps
    # this is used by scheduler
    num_train_steps = int(
        len(df_train) / config_2.TRAIN_BATCH_SIZE * config_2.EPOCHS)

    # AdamW optimizer
    # AdamW is the most widely used optimizer
    # for transformer based networks
    optimizer = AdamW(optimizer_parameters, lr=3e-5)

    # fetch a scheduler
    # you can also try using reduce lr on plateau
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # if you have multiple GPUs
    # wrap the model in DataParallel
    # to use all of them
    model = nn.DataParallel(model)

    # start training the epochs
    best_accuracy = 0
    for epoch in range(config_2.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config_2.MODEL_PATH)
            best_accuracy = accuracy
Example #10
def run():
    dfx = pd.read_csv(config.TRAINING_FILE,
                      nrows=100).dropna().reset_index(drop=True)
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model = nn.DataParallel(model)

    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
Example #11
def train():
    df = pd.read_csv(config.TRAINING_FILE).fillna("none")
    df['sentiment'] = df['sentiment'].map({"positive": 1, "negative": 0})

    df_train, df_valid = train_test_split(df,
                                          test_size=0.1,
                                          random_state=42,
                                          stratify=df.sentiment.values)

    # reset index of both splits
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=False,
        num_workers=4,
    )

    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)

    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        shuffle=False,
        num_workers=4,
    )

    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=int(len(df_train) / config.TRAIN_BATCH_SIZE) *
        config.EPOCHS)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_dataloader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_dataloader, model, device)

        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(outputs, targets)
        print(f"Accuracy: {accuracy:.3f}")
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), config.MODEL_PATH)
Example #12
def run():
    dfx = pd.read_csv(config.TRAINING_FILE).fillna("none")

    #convert positive to 1 and negative to 0
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    #stratified split so that the positive-to-negative ratio is the same in train and validation
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    #specify what parameters you want to train
    param_optimizer = list(model.named_parameters())

    #we don't want any decay for these layer names, i.e. bias and LayerNorm parameters
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    optimizer_parameters = [
        {
            #don't decay weight for above no_decay list else decay
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    #experiment with lr
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    #convert model to a multi-GPU model; skip this if you do not have multiple gpus

    model = nn.DataParallel(
        model)  # use @amp.autocast() in model.py if DataParallel() is enabled

    scaler = amp.GradScaler()  # from torch.cuda import amp
    #the scaler is required for automatic mixed precision; pass it to train_fn

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler,
                        scaler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
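
Since this example creates a torch.cuda.amp GradScaler and passes it to engine.train_fn, that train function presumably wraps the forward pass in autocast and steps through the scaler. A sketch under that assumption, mirroring the call order used above:

import torch
import torch.nn as nn
from torch.cuda import amp

def train_fn(data_loader, model, optimizer, device, scheduler, scaler):
    # sketch: native mixed-precision training loop matching the call above
    model.train()
    for batch in data_loader:
        ids = batch["ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        targets = batch["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        with amp.autocast():
            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            loss = nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()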
Example #13
def run():
    dfx = pd.read_csv(
        config.TRAINING_FILE).fillna("none").reset_index(drop=True)
    # df_test = pd.read_csv(config.TESTING_FILE).fillna("none").reset_index(drop=True)

    df_train, df_test = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.label.values)

    df_train = df_train.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(text=df_train.title.values,
                                        label=df_train.label.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    test_dataset = dataset.BERTDataset(text=df_test.title.values,
                                       label=df_test.label.values)

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.TEST_BATCH_SIZE, num_workers=1)

    device = torch.device("cpu")
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model = nn.DataParallel(model)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, labels = engine.eval_fn(test_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(labels, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
Example #14
def run():

    #df = preprocess()
    #df = pd.read_csv(config.PROCESSED_FILE)
    df = pd.read_csv('data/processed_train_data.csv')
    #print(df.columns)

    df_train, df_valid = model_selection.train_test_split(
        df, test_size=0.3, random_state=32, stratify=df.offensive.values)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.text.values,
                                        target=df_train.offensive.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.BERTDataset(review=df_valid.text.values,
                                        target=df_valid.offensive.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    model = BERTBaseUncased()
    model.to(config.DEVICE)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model = nn.DataParallel(model)

    best_accuracy = 0
    for epoch in range(5):
        engine.train_fn(train_data_loader, model, optimizer, config.DEVICE,
                        scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model,
                                          config.DEVICE)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
Example #15
def run():
    df = pd.read_csv(config.training_file).fillna("none")
    df.sentiment = df.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    df_train, df_valid = model_selection.train_test_split(
        df, test_size=0.1, random_state=42, stratify=df.sentiment.values)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.bert_dataset(review=df_train.review.values,
                                         target=df_train.sentiment.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.train_batch, num_workers=4)

    valid_dataset = dataset.bert_dataset(review=df_valid.review.values,
                                         target=df_valid.sentiment.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.valid_batch, num_workers=1)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = BERTBaseUncased()
    model.to(device)  # move the model to the selected device before training

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_params = [{
        "params":
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.001
    }, {
        "params":
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    num_train_steps = int(len(df_train) / config.train_batch) * config.epochs

    optimizer = AdamW(optimizer_params, lr=3e-5)

    scheduler = WarmupLinearSchedule(optimizer=optimizer,
                                     warmup_steps=0,
                                     t_total=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.epochs):
        engine.train_fn(data_loader=train_data_loader,
                        model=model,
                        optimizer=optimizer,
                        device=device,
                        scheduler=scheduler)

        outputs, targets = engine.eval_fn(valid_data_loader,
                                          model=model,
                                          device=device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(outputs, targets)
        print(f"Accuracy Score= {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.model_path)
            best_accuracy = accuracy
Example #16
def run():

    train_filename, label = sys.argv[1:3]

    model_path = "models2/" + label + "_best.pt"

    assert 'train' in train_filename
    filenames = {'train': train_filename,
        'dev': train_filename.replace('train', 'dev'),
        'test':train_filename.replace('train', 'test')}

    dataframes = {}
    num_classes = 0
    for subset, filename in filenames.items():
      dataframes[subset] = preprocess(filename, label)
      num_classes = max(num_classes, max(dataframes[subset].ENCODE_CAT) + 1)

    dataloaders = {}
    for subset, filename in filenames.items():
      if subset == 'train':
        batch_size = config.TRAIN_BATCH_SIZE
        num_workers = 4
      else:
        batch_size = config.VALID_BATCH_SIZE
        num_workers = 1
      dataloaders[subset] = process_dataset(
          dataframes[subset], batch_size, num_workers)

    device = torch.device(config.DEVICE)
    model = BERTBaseUncased(num_classes)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0,
        num_training_steps=get_num_train_steps(filenames["train"], label)
    )


    best_val_accuracy = float('-inf')
    best_val_epoch = None

    for epoch in range(config.EPOCHS):
        engine.train_fn(
            dataloaders["train"], model, optimizer, device, scheduler, epoch)
        outputs, targets = engine.eval_fn(
            dataloaders['dev'], model, device, epoch)
        accuracy =  metrics.accuracy_score(outputs, targets)
        print(f"Validation Accuracy  = {accuracy}")
        if accuracy > best_val_accuracy:
            torch.save(model.state_dict(), model_path)
            best_val_accuracy = accuracy
            best_val_epoch = epoch
            print("Best val accuracy till now {}".format(best_val_accuracy))

        if best_val_epoch < (epoch - config.PATIENCE):
          break

    model.load_state_dict(torch.load(model_path))
    for subset in ['train', 'dev', 'test']:
      outputs, targets = engine.eval_fn(
            dataloaders[subset], model, device, epoch)

      result_df_dicts = []
      for o, t in zip(outputs, targets):
        result_df_dicts.append({"output":o, "target":t})

      result_df = pd.DataFrame.from_dict(result_df_dicts)

      final_df = pd.concat([dataframes[subset], result_df], axis=1)
      for i in final_df.itertuples():
        assert i.ENCODE_CAT == i.target

      result_file = "results2/" + subset + "_" + label + ".csv"
      final_df.to_csv(result_file)
Example #17
def main(_):
    LEARNING_RATE = config.LEARNING_RATE
    DROPOUT = config.DROPOUT

    if FLAGS.lr:
        LEARNING_RATE = FLAGS.lr
    if FLAGS.dropout:
        DROPOUT = FLAGS.dropout

    train_file = config.TRAIN_PROC
    df_train = pd.read_csv(train_file).fillna("none")

    valid_file = config.DEVEL_PROC
    df_valid = pd.read_csv(valid_file).fillna("none")

    test_file = config.EVAL_PROC
    df_test = pd.read_csv(test_file).fillna("none")
    
    logger.info(f"Bert Model: {config.BERT_PATH}")
    logger.info(f"Current date and time :{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ")

    logger.info(f"Train file: {train_file}")
    logger.info(f"Valid file: {valid_file}")
    logger.info(f"Test file: {test_file}")

    logger.info(f"Train size : {len(df_train):.4f}")
    logger.info(f"Valid size : {len(df_valid):.4f}")
    logger.info(f"Test size : {len(df_test):.4f}")

    train_dataset = dataset.BERTDataset(
        review=df_train.text.values,
        target=df_train.label.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4, shuffle=True
    )

    valid_dataset = dataset.BERTDataset(
        review=df_valid.text.values,
        target=df_valid.label.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    test_dataset = dataset.BERTDataset(
        review=df_test.text.values,
        target=df_test.label.values
    )

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #torch.device("cuda")
    model = BERTBaseUncased(DROPOUT)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(
            nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(
            nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    # model = nn.DataParallel(model)

    best_accuracy = 0
    es = 0  # early-stopping counter, needed before the first `es += 1`
    for epoch in range(config.EPOCHS):
        logger.info(f"Epoch = {epoch}")

        train_loss, train_acc = engine.train_fn(
            train_data_loader, model, optimizer, device, scheduler)

        for tag, parm in model.named_parameters():
            if parm.grad is not None:
                writer.add_histogram(tag, parm.grad.data.cpu().numpy(), epoch)

        outputs, targets, val_loss, val_acc = engine.eval_fn(
            valid_data_loader, model, device)
        val_mcc = metrics.matthews_corrcoef(outputs, targets)
        logger.info(f"val_MCC_Score = {val_mcc:.4f}")

        outputs, targets, test_loss, test_acc = engine.eval_fn(
            test_data_loader, model, device)
        test_mcc = metrics.matthews_corrcoef(outputs, targets)
        logger.info(f"test_MCC_Score = {test_mcc:.4f}")

        logger.info(
            f"train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, test_loss={test_loss:.4f}")
        writer.add_scalar('loss/train', train_loss, epoch) # data grouping by `slash`
        writer.add_scalar('loss/val', val_loss, epoch) # data grouping by `slash`
        writer.add_scalar('loss/test', test_loss, epoch) # data grouping by `slash`
        
        logger.info(
            f"train_acc={train_acc:.4f}, val_acc={val_acc:.4f}, test_acc={test_acc:.4f}")
        writer.add_scalar('acc/train', train_acc, epoch) # data grouping by `slash`
        writer.add_scalar('acc/val', val_acc, epoch) # data grouping by `slash`
        writer.add_scalar('acc/test', test_acc, epoch) # data grouping by `slash`
        
        logger.info(f"val_mcc={val_acc:.4f}, test_mcc={test_acc:.4f}")
        writer.add_scalar('mcc/val', val_mcc, epoch) # data grouping by `slash`
        writer.add_scalar('mcc/test', test_mcc, epoch) # data grouping by `slash`

        accuracy = metrics.accuracy_score(targets, outputs)
        logger.info(f"Accuracy Score = {accuracy:.4f}")
        
        if accuracy < 0.4:
            logger.info(f"Something is very wrong! Accuracy is only {accuracy:.4f} Stopping...")
            break

        if accuracy > best_accuracy:
            logger.info(f"Saving model with Accuracy Score = {accuracy:.4f}")
            torch.save(model.state_dict(), config.MODEL_PATH[:-4] + "." + str(round(accuracy*100, 2)) + ".bin")
            best_accuracy = accuracy
            es = 0
        else:
            es += 1
            logger.info(f"Not improved for {es} times of 5. Best so far - {best_accuracy:.4f}")

            if es > 4:
                logger.info(f"Early stopping with best accuracy: {best_accuracy:.4f} and accuracy for this epoch: {accuracy:.4f} ...")
                break
Example #18
def run():
    df1 = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-train.csv",
                      usecols=['comment_text', 'toxic'])
    df1 = pd.read_csv("../input/jigsaw-unintended-bias-train.csv",
                      usecols=['comment_text', 'toxic'])

    #combined df1 and df2 and made big dataframe
    df_train = pd.concat([df1, df2], axis=0).reset_index(drop=True)

    #validation dataframe has been given by kaggle
    df_valid - pd.read_csv("../input/validation.csv")

    train_dataset = dataset.BERTDataset(
        comment_text=df_train.comment_text.values,
        target=df_train.toxic.values)

    #--------------------------------------
    #use a distributed sampler when training on tpu, otherwise skip it
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True)
    #----------------------------------------

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
        sampler=train_sampler,
        #torch_xla on tpu crashes if the last batch has a different size, so use drop_last
        drop_last=True)

    valid_dataset = dataset.BERTDataset(
        comment_text=df_valid.comment_text.values,
        target=df_valid.toxic.values)

    #--------------------------------------
    #use a distributed sampler when training on tpu, otherwise skip it
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True)
    #----------------------------------------------

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1,
        sampler=valid_sampler,
        #no need of drop_last here
    )

    device = xm.xla_device()  #xla_device means tpu
    model = BERTBaseUncased()
    # model.to(device)  #no need to move data on device

    #specify what parameters you want to train
    param_optimizer = list(model.named_parameters())

    #we don't want any decay for these layer names, i.e. bias and LayerNorm parameters
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    optimizer_parameters = [
        {
            #don't decay weight for above no_decay list else decay
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE / xm.xrt_world_size() *
        config.EPOCHS)

    lr = 3e-5 * xm.xrt_world_size()
    #experiment with lr
    optimizer = AdamW(optimizer_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):

        #parallel loader for tpus
        para_loader = pl.ParallelLoader(train_data_loader, [device])
        engine.train_fn(para_loader.per_device_loader(device), model,
                        optimizer, device, scheduler)

        parallel_loader = pl.ParallelLoader(valid_data_loader, [device])
        outputs, targets = engine.eval_fn(
            parallel_loader.per_device_loader(device), model, device)

        #threshold the target instead of output
        targets = np.array(targets) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:

            #instead of torch.save use xm.save
            xm.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
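
This TPU version is normally launched once per core with torch_xla's multiprocessing helper rather than called directly. A minimal sketch of such a launcher; nprocs=8 assumes an eight-core TPU and is not taken from the original code:

import torch_xla.distributed.xla_multiprocessing as xmp

def _mp_fn(rank, flags):
    # each TPU core runs its own copy of run()
    run()

if __name__ == "__main__":
    xmp.spawn(_mp_fn, args=({},), nprocs=8, start_method="fork")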
Example #19
def run():
    '''
    Entire training loop
        - Create DataLoaders
        - Define Training Configuration
        - Launch Training Loop
    '''

    # Num of available TPU cores
    if config.TPUs:
        n_TPUs = xm.xrt_world_size()
        DEVICE = xm.xla_device()
    else:
        DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(DEVICE)
    
    # Read Data
    
    # df1 = pd.read_csv('data/jigsaw-toxic-comment-train.csv', usecols=['comment_text', 'toxic'])
    # df2 = pd.read_csv('data/jigsaw-unintended-bias-train.csv', usecols=['comment_text', 'toxic'], engine='python') # don't know why it was breaking with default C parser
    # df_train = df1 # pd.concat([df1,df2], axis=0).reset_index(drop=True)
    # df_valid = pd.read_csv('data/validation.csv')
    
    # Subsample
    df_train = pd.read_csv('data/jigsaw-toxic-comment-train-small.csv', usecols=['comment_text', 'toxic'])
    df_valid = pd.read_csv('data/validation-small.csv', usecols=['comment_text', 'toxic']) 

    # Preprocess
    
    train_dataset = dataset.BERTDataset(
        comment=df_train.comment_text.values,
        target=df_train.toxic.values
    )

    valid_dataset = dataset.BERTDataset(
        comment=df_valid.comment_text.values,
        target=df_valid.toxic.values
    )

    drop_last=False
    train_sampler, valid_sampler = None, None
    if config.TPUs:
        drop_last=True
        train_sampler = DistributedSampler(
            train_dataset, 
            num_replicas=n_TPUs,
            rank=xm.get_ordinal(),
            shuffle=True
        )
        valid_sampler = DistributedSampler(
            valid_dataset, 
            num_replicas=n_TPUs,
            rank=xm.get_ordinal(),
            shuffle=True
        )


    # Create Data Loaders

    train_data_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
        drop_last=drop_last,
        sampler=train_sampler
    )


    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1,
        drop_last=drop_last,
        sampler=valid_sampler
    )

    # Machine Configuration

    if config.MODEL == 'bert':
        model = BERTBaseUncased()
    elif config.MODEL == 'distil-bert':
        model = DistilBERTBaseUncased()
    else:
        print('Model chosen in config not valid')
        exit()
    model.to(device)
    
    # Optimizer Configuration 

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    lr = config.LR
    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    # TODO: why does the LR increase for distributed training?
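    # (a common heuristic, the "linear scaling rule": with n_TPUs replicas the
    #  effective batch size is n_TPUs times larger, so the learning rate is
    #  scaled up by the same factor while each replica takes fewer steps)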
    if config.TPUs:
        num_train_steps /= n_TPUs
        lr *= n_TPUs

    optimizer = AdamW(optimizer_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    if not config.TPUs:
        if N_GPU > 1:
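            # (nn.DataParallel replicates the model on every visible GPU and
            #  splits each batch across them; N_GPU is assumed to be defined
            #  elsewhere, e.g. via torch.cuda.device_count())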
            model = nn.DataParallel(model)
    
    # Training loop

    best_score = 0
    
    for epoch in range(config.EPOCHS):
    
        if config.TPUs:
            train_loader = pl.ParallelLoader(train_data_loader, [device])
            valid_loader = pl.ParallelLoader(valid_data_loader, [device])
            train_fn(train_loader.per_device_loader(device), model, optimizer, device, scheduler)
            outputs, targets = eval_fn(valid_loader.per_device_loader(device), model, device)

        else:
            train_fn(train_data_loader, model, optimizer, device, scheduler)
            outputs, targets = eval_fn(valid_data_loader, model, device)
        
        targets = np.array(targets) >= 0.5 # TODO: why ?
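        # (likely because some Jigsaw targets are soft labels (a fraction of
        #  annotators marking a comment toxic), while roc_auc_score expects
        #  binary ground-truth labels, hence the 0.5 threshold)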
        auc_score = metrics.roc_auc_score(targets, outputs)
            
        # Save if best
        print(f"AUC Score = {auc_score}")
        if auc_score > best_score:
            if not config.TPUs:
                torch.save(model.state_dict(), config.MODEL_PATH)
            else:
                xm.save(model.state_dict(), config.MODEL_PATH)
            best_score = auc_score
def run():
    dfx = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)

    # stratified split so the sentiment class distribution is preserved in both the train and validation sets
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        target=df_train.sentiment.values,
        selected_text=df_train.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        target=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    # collect the named parameters so we can set per-group optimizer options
    param_optimizer = list(model.named_parameters())

    # we don't want any weight decay for parameters whose names contain these substrings (bias, LayerNorm)
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    optimizer_parameters = [
        {
            # no weight decay for parameters in the no_decay list; decay the rest
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    #experiment with lr
    optimizer = AdamW(optimizer_parameters, lr=3e-5)

    #scheduler can be of your choice
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
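    # (with num_warmup_steps=0 the schedule simply decays the learning rate
    #  linearly from its initial value to 0 over num_training_steps)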

    # wrap the model for multi-GPU training (not needed if you only have a single GPU)
    model = nn.DataParallel(model)

    # evaluation metric is Jaccard
    best_jaccard = 0

    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)

        print(f"Jaccard Score = {jaccard}")
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard
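
A minimal sketch of the word-level Jaccard similarity typically used as the metric in this kind of span-extraction setup; the helper below is illustrative only and is not the actual engine.eval_fn implementation:

def jaccard(str1, str2):
    # word-level Jaccard: |intersection| / |union| of the two token sets
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union = len(a) + len(b) - len(c)
    return float(len(c)) / union if union else 0.0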
Example #21
0
def run():

    df_train = preprocess('./review-sentence_train_clean.csv')
    df_valid = preprocess('./review-sentence_dev_clean.csv')

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.sentence.values,
                                        target=df_train.ENCODE_CAT.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.BERTDataset(review=df_valid.sentence.values,
                                        target=df_valid.ENCODE_CAT.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device(config.DEVICE)
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler,
                        epoch)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device,
                                          epoch)
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Validation Accuracy  = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
            print("Best val accuracy till now {}".format(best_accuracy))
    valid_dataset = dataset.BERTDataset(
        comment_text=df_valid.comment_text.values,
        target=df_valid.toxic.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)
    
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    model = nn.DataParallel(model)