Example #1
def run():
    df = pd.read_csv(config.TRAINING_FILE).fillna("none")
    df.sentiment = df.sentiment.apply(lambda x: 1 if x == "positive" else 0)
    df_train, df_valid = model_selection.train_test_split(
        df, test_size=0.1, random_state=42,
        stratify=df.sentiment.values)  # keep the same positive/negative ratio in both splits
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)
    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
    )

    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=4,
    )
    model = BERTBaseUncased()
    trainer = Trainer(gpus=1)
    trainer.fit(model,
                train_dataloader=train_data_loader,
                val_dataloaders=[valid_data_loader])
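
All of these examples build a dataset.BERTDataset that is defined in a separate dataset.py. A minimal sketch of what such a class is assumed to look like is below; it is inferred from the keyword arguments used on this page (review/comment_text/text plus target) and from the batch keys consumed later (ids, mask, token_type_ids, targets). The tokenizer and max_len defaults are placeholders, not values from any of the original repositories.

# Hedged sketch of dataset.py; argument names and defaults vary per repository.
import torch
import transformers

class BERTDataset:
    def __init__(self, review, target, tokenizer=None, max_len=512):
        self.review = review
        self.target = target
        # the original repositories typically read these from config
        self.tokenizer = tokenizer or transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True)
        self.max_len = max_len

    def __len__(self):
        return len(self.review)

    def __getitem__(self, item):
        review = " ".join(str(self.review[item]).split())
        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }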
Example #2
def run():
    dfx = pd.read_csv(config.TRAINING_FILE).fillna("none")
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)

    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")

    model = BERTBaseUncased()

    model.to(device)

    param_optimizer = list(model.named_parameters())

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    optimizer_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.001
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, scheduler, device)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")

        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
Example #3
def run():
    df1 = pd.read_csv("../input/jigsaw-toxic-comment-train.csv",
                      usecols=["comment_text", "toxic"])
    df2 = pd.read_csv("../input/jigsaw-unintended-bias-train.csv",
                      usecols=["comment_text", "toxic"])

    df_train = pd.concat([df1, df2], axis=0).reset_index(drop=True)
    df_valid = pd.read_csv("../input/validation.csv")

    train_dataset = dataset.BERTDataset(
        comment_text=df_train.comment_text.values,
        target=df_train.toxic.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.BERTDataset(
        comment_text=df_valid.comment_text.values,
        target=df_valid.toxic.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device(config.DEVICE)
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [{
        "params":
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        "weight_decay":
        0.001
    }, {
        "params":
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay":
        0.0
    }]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        targets = np.array(targets) >= 0.5
        accuracy = metrics.roc_auc_score(targets, outputs)
        print(f"AUC Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
Example #4
def run():
    df1 = pd.read_csv(config.TRAINING_FILE, usecols=["comment_text","toxic"])
    
    train_dataset = dataset.BERTDataset(
        review=df1.comment_text.values,
        target=df1.toxic.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )
    df2 = pd.read_csv("../input/validation.csv", usecols=["comment_text","toxic"])
    valid_dataset = dataset.BERTDataset(
        review=df2.comment_text.values,
        target=df2.toxic.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BERTBaseUncased()
    model.to(device)
    
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    num_train_steps = int(len(df1) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    model = nn.DataParallel(model)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
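
The engine.train_fn / engine.eval_fn helpers are likewise external. A hedged sketch is shown below, assuming the call signature used in the example directly above (data_loader, model, optimizer, device, scheduler) and assuming eval_fn returns sigmoid probabilities and targets as plain lists; the BCE-with-logits loss is an assumption consistent with the commented-out loss in Example #16.

# Hedged sketch of engine.py; not the original implementation.
import torch
import torch.nn as nn

def loss_fn(outputs, targets):
    return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))

def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()
    for d in data_loader:
        ids = d["ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        targets = d["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()

def eval_fn(data_loader, model, device):
    model.eval()
    fin_outputs, fin_targets = [], []
    with torch.no_grad():
        for d in data_loader:
            ids = d["ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            targets = d["targets"].to(device, dtype=torch.float)
            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            fin_outputs.extend(torch.sigmoid(outputs).cpu().numpy().ravel().tolist())
            fin_targets.extend(targets.cpu().numpy().tolist())
    return fin_outputs, fin_targets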
Example #5
def run():
    print('1.Loading data...')
    dfx = pd.read_csv(config.TRAINING_FILE).fillna("none")

    # only train 2000 entries
    dfx = dfx[:2000]
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    print('Creating dataset...')
    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)

    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)

    print('Creating dataloader...')
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    print('Building Bert Model...')
    model = BERTBaseUncased()

    print("Creating BERT Trainer...")
    trainer = BERTTrainer(model=model,
                          train_dataloader=train_data_loader,
                          test_dataloader=valid_data_loader,
                          lr=config.LR,
                          with_cuda=config.USE_CUDA)

    # model = nn.DataParallel(model)

    print('Training Start...')
    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        train_acc, train_loss = trainer.train_fn(epoch, len(df_train))
        print(f'Train loss: {train_loss} Train accuracy: {train_acc:.4%}')

        outputs, targets = trainer.eval_fn()
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy:.2%}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
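
BERTBaseUncased itself lives in each repository's model.py. A minimal sketch of the kind of model these examples assume is below: a BERT-base encoder, dropout, and a single-logit head (the 768 -> 1 classifier matches Example #16); the 0.3 dropout and the use of pooler_output are assumptions.

# Hedged sketch of model.py; not the original class.
import torch.nn as nn
import transformers

class BERTBaseUncased(nn.Module):
    def __init__(self, dropout=0.3):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained("bert-base-uncased")
        self.bert_drop = nn.Dropout(dropout)
        self.out = nn.Linear(768, 1)  # BERT-base hidden size -> one logit

    def forward(self, ids, mask, token_type_ids):
        # on transformers v4+ the encoder returns an object with pooler_output
        # (older versions return a tuple and would need indexing instead)
        outputs = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        return self.out(self.bert_drop(outputs.pooler_output))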
Example #6
def run():
  # Read in CSV
  df = pd.read_csv(config.TRAINING_FILE)
  print('Read In Complete!')

  # Split into Validation
  df_train, df_val = train_test_split(df, test_size=0.1, stratify=df.sentiment.values, random_state=config.RANDOM_SEED)
  df_train = df_train.reset_index(drop=True)
  df_val = df_val.reset_index(drop=True)
  print(df_train.shape, df_val.shape)
  print('Validation Split Complete!')

  # Create Dataset required for BERT Model
  train_dataset = dataset.BERTDataset(df_train.content.values, df_train.sentiment.values)
  val_dataset = dataset.BERTDataset(df_val.content.values, df_val.sentiment.values)

  train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
  val_data_loader = torch.utils.data.DataLoader(val_dataset, batch_size=config.VAL_BATCH_SIZE, num_workers=1)
  print('Dataset for Model Complete!')

  # Define Model and Hyperparameters
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  model = BERTBaseCased()
  model.to(device)

  num_training_steps = len(train_data_loader) * config.EPOCHS
  optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
  scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=num_training_steps, num_warmup_steps=0)

  # Train the Model, Print Accuracy, Save Model
  n_train_exp = len(df_train)
  n_val_exp = len(df_val)

  history = defaultdict(list)
  best_accuracy = 0

  for epoch in range(config.EPOCHS):
    print(f'\n{"#" * 10} Epoch: {epoch+1}/{config.EPOCHS} {"#" * 10}\n')
    train_acc, train_loss = engine.train_fn(train_data_loader, model, optimizer, device, scheduler, n_train_exp)    
    val_acc, val_loss = engine.eval_fn(val_data_loader, model, device, n_val_exp)

    print(f'\nTrain Loss: {train_loss:.4f}        Acc: {train_acc:.4f} \nVal   Loss: {val_loss:.4f}    Val Acc: {val_acc:.4f}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_accuracy:
        #!rm -rf /content/model*
        torch.save(model.state_dict(), config.MODEL_PATH)  # f'model/model_{val_acc:0.2f}.bin')
        best_accuracy = val_acc
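
Every example reads its hyperparameters from a config module that is not reproduced on this page. A sketch of the most commonly referenced attributes is below; all of the values are placeholders, not the originals.

# Hedged sketch of config.py; names come from the examples above, values are assumptions.
import transformers

DEVICE = "cuda"
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 10
LR = 3e-5
BERT_PATH = "bert-base-uncased"
MODEL_PATH = "model.bin"
TRAINING_FILE = "../input/imdb.csv"
TOKENIZER = transformers.BertTokenizer.from_pretrained(BERT_PATH, do_lower_case=True)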
Example #7
def sentence_prediction(sentence):
    sentence = preprocess(sentence)
    model_path = config.MODEL_PATH

    test_dataset = dataset.BERTDataset(
        review=[sentence],
        target=[0]
    )

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=3
    )

    device = config.device

    model = BERTBaseUncased()
    model.load_state_dict(torch.load(
        model_path, map_location=torch.device(device)))
    model.to(device)

    outputs, _ = engine.predict_fn(test_data_loader, model, device)
    print(outputs)
    return outputs[0]
Example #8
def generate_predictions(df):
    df.reset_index(drop=True, inplace=True)
    predict_dataset = dataset.BERTDataset(review=df.PROCESSED_TEXT.values)
    predict_data_loader = torch.utils.data.DataLoader(
        predict_dataset,
        batch_size=config.PREDICT_BATCH_SIZE,
        num_workers=config.NUM_WORKERS,
    )
    test_preds = np.zeros(df.shape[0])
    with torch.no_grad():
        for bi, d in enumerate(predict_data_loader):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]

            ids = ids.to(DEVICE, dtype=torch.long)
            token_type_ids = token_type_ids.to(DEVICE, dtype=torch.long)
            mask = mask.to(DEVICE, dtype=torch.long)
            preds = MODEL(ids=ids, mask=mask, token_type_ids=token_type_ids)
            test_preds[
                bi * config.PREDICT_BATCH_SIZE : (bi + 1) * config.PREDICT_BATCH_SIZE
            ] = (preds[:, 0].detach().cpu().squeeze().numpy())

    output = torch.sigmoid(torch.tensor(test_preds)).numpy().ravel()
    return output
Example #9
def process_dataset(df, batch_size, num_workers):
  df = df.reset_index(drop=True)
  this_dataset = dataset.BERTDataset(
      review=df.sentence.values, target=df.ENCODE_CAT.values)
  data_loader = torch.utils.data.DataLoader(
      this_dataset, batch_size=batch_size, num_workers=num_workers)
  return data_loader
Example #10
def create_data_loader(df, tokenizer, max_len, batch_size):

    ds = dataset.BERTDataset(
                            reviews=df.content.to_numpy(),
                            targets=df.category.to_numpy(),
                            tokenizer=tokenizer,
                            max_len=max_len
                            )

    return DataLoader(
                      ds,
                      batch_size=batch_size,
                      num_workers=4
                      )
Example #11
def main(_):
    input = config.EVAL_PROC
    output = 'predictions.csv'
    model_path = config.MODEL_PATH
    if FLAGS.input:
        input = FLAGS.input
    if FLAGS.output:
        output = FLAGS.output
    if FLAGS.model_path:
        model_path = FLAGS.model_path
    df_test = pd.read_fwf(input)

    logger.info(f"Bert Model: {config.BERT_PATH}")
    logger.info(
        f"Current date and time :{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} "
    )
    logger.info(f"Test file: {input}")
    logger.info(f"Test size : {len(df_test):.4f}")

    trg = [0] * len(df_test)

    test_dataset = dataset.BERTDataset(text=df_test.values, target=trg)

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=3)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = BERTBaseUncased(config.DROPOUT)
    model.load_state_dict(
        torch.load(model_path, map_location=torch.device(device)))
    model.to(device)

    outputs, extracted_features = engine.predict_fn(
        test_data_loader, model, device, extract_features=FLAGS.features)
    df_test["predicted"] = outputs
    # save file
    df_test.to_csv(output, header=None, index=False)
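
The main(_) signature and the FLAGS object in Example #11 suggest abseil-style command-line flags. A hedged sketch of the flag definitions the example appears to expect is below; only the flag names are taken from the code above, the defaults and help strings are invented.

# Assumed absl setup for Example #11; not part of the original file.
from absl import app, flags

flags.DEFINE_string("input", None, "evaluation file to read")
flags.DEFINE_string("output", None, "where to write the predictions CSV")
flags.DEFINE_string("model_path", None, "model checkpoint to load")
flags.DEFINE_boolean("features", False, "also extract encoder features")
FLAGS = flags.FLAGS

if __name__ == "__main__":
    app.run(main)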
Example #12
def run(opt_level="O2",
        keep_batchnorm_fp32=True,
        batch_size=5,
        nb_epochs=10,
        data_path="../inputs/IMDB_Dataset.csv",
        model_path="./"):

    df = pd.read_csv(data_path).fillna("none")[0:100]
    df.sentiment = df.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    df_train, df_valid = model_selection.train_test_split(
        df, test_size=0.1, random_state=42, stratify=df.sentiment.values)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)
    # Creating the datasets
    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)

    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)
    # Creating the dataloaders
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size,
                                                   num_workers=10,
                                                   drop_last=True)

    valid_dataloader = torch.utils.data.DataLoader(valid_dataset,
                                                   batch_size,
                                                   num_workers=10,
                                                   drop_last=True)
    # Defining the model and sending to the device
    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    parameters = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias",
                "LayerNorm.weight"]  # We don't want any decay for them
    optimizer_parameters = [{
        "params":
        [p for n, p in parameters if not any(nd in n for nd in no_decay)],
        "weight_decay":
        0.001
    }, {
        "params":
        [p for n, p in parameters if any(nd in n for nd in no_decay)],
        "weight_decay":
        0.0
    }]

    num_train_steps = int(len(df_train) * nb_epochs / batch_size)
    # Defining the optimizer and the scheduler
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    # Initialize the pytorch model and the optimizer to allow automatic mixed-precision training
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=opt_level,
                                      keep_batchnorm_fp32=keep_batchnorm_fp32,
                                      loss_scale="dynamic")

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # No warmup
        num_training_steps=num_train_steps)

    # Train the model
    engine.global_trainer(train_dataloader, valid_dataloader, model, optimizer,
                          scheduler, device, nb_epochs, model_path)
Example #13
def run():
    dfx = pd.read_csv(config.TRAINING_FILE).fillna("none")
    dfx.sentiment = dfx.sentiment.apply(  # can use label encoding
        lambda x: 1 if x == "positive" else 0  # can use map fn
    )

    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        stratify=dfx.sentiment.values  # train and val keep the same positive-to-negative sample ratio
    )

    df_train = df_train.reset_index(drop=True)  # 0 to length of df_train
    df_valid = df_valid.reset_index(drop=True)  # 0 to length of df_valid

    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")  # using cuda
    model = BERTBaseUncased()  # calling from model.py

    param_optimizer = list(
        model.named_parameters())  # specify parameters to train
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.001
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    """ These parameters are adjustable, we should take a look at different layers and
    the decay we want, how much learning rate etc."""

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # model = nn.DataParallel(model)              # converting to multi gpu model

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, target = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(target, outputs)
        print(f"Accuracy score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(
                model.state_dict(),
                config.MODEL_PATH)  # saving the model only if it improves
            best_accuracy = accuracy
Example #14
def run():
    print("---------- Starting Data Reading -------")
    df1 = pd.read_csv("../input/jigsaw-toxic-comment-train.csv",
                      usecols=["comment_text", "toxic"])
    df2 = pd.read_csv("../input/jigsaw-unintended-bias-train.csv",
                      usecols=["comment_text", "toxic"])

    df_train = pd.concat([df1, df2], axis=0).reset_index(drop=True)
    df_valid = pd.read_csv("../input/validation.csv")

    print("---- Data Read Sucessfully --- ")

    # # dfx = pd.read_csv(config.TRAINING_FILE).fillna("none")
    # # dfx["sentiment"] = dfx["sentiment"].apply(
    # #     lambda x : 1 if x == "positive" else 0
    # # )

    # # df_train, df_valid = model_selection.train_test_split(
    # #     dfx,
    # #     test_size=0.1,
    # #     random_state=42,
    # #     stratify=dfx["sentiment"].values
    # # )

    # df_train = df_train.reset_index(drop=True)
    # df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(
        comment_text=df_train["comment_text"].values,
        target=df_train["toxic"].values)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
    )

    valid_dataset = dataset.BERTDataset(
        comment_text=df_valid["comment_text"].values,
        target=df_train["toxic"].values)

    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALIDATION_BATCH_SIZE,
        num_workers=1,
    )
    print("---- DataLoaders Created Sucessfully --- ")

    device = torch.device("cuda")

    model = BERTBasedUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
        },
    ]

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_dataloader, model, optimizer, scheduler, device)
        outputs, targets = engine.eval_fn(valid_dataloader, model, device)
        targets = np.array(targets) >= 0.5
        accuracy = metrics.roc_auc_score(targets, outputs)
        print(f"AUC Score {accuracy}")

        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
Example #15
def train():
    # this function trains the model

    # read the training file and fill NaN values with "none"
    df = pd.read_csv(config.TRAINING_FILE).fillna("none")

    # map positive to 1 and negative to 0
    df.sentiment = df.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    # split data into single training and validation fold
    df_train, df_valid = model_selection.train_test_split(
        df, test_size=0.1, random_state=42, stratify=df.sentiment.values)
    # reset index
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    # initialize BERTDataset from dataset.py
    # for training dataset
    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)

    # create training dataloader
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
    # initialize BERTDataset from dataset.py
    # for validation dataset
    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)

    # create validation dataloader
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    # initialize the cuda device
    # use cpu if you don't have a GPU
    device = torch.device("cuda")

    # load model and send it to the device
    model = BERTBaseUncased()
    model.to(device)

    # create parameters we want to optimize
    # we generally don't use any decay for bias
    # and weight layers
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]

    # calculate the number of training steps
    # this is used by scheduler
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    # AdamW optimizer
    # AdamW is the most widely used optimizer
    # for transformer based networks
    optimizer = AdamW(optimizer_parameters, lr=3e-5)

    # fetch a scheduler
    # you can also try using reduce lr on plateau
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # if you have multiple GPUs
    # model to DataParallel
    # to use multiple GPUs
    model = nn.DataParallel(model)

    # start training the epochs
    best_accuracy = 0
    for epoch in range(config.EPOCHS):

        # train model
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)

        # test the model
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)

        # convert outputs to numpy array
        outputs = np.array(outputs) >= 0.5

        # calculate the accuracy
        accuracy = metrics.accuracy_score(targets, outputs)

        # print the accuracy
        print(f"Accuracy Score = {accuracy}")

        # save the model only if the accuracy is better than the best accuracy so far (initialized to 0)
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
Example #16
def run():
    df_train = pd.read_csv(config.TRAINING_FILE).fillna("none")
    df_train.label = df_train.label.apply(
        lambda x: 1 if x == "unscrambled" else 0)

    df_valid = pd.read_csv(config.VALID_FILE).fillna("none")
    df_valid.label = df_valid.label.apply(
        lambda x: 1 if x == "unscrambled" else 0)

    # df_train, df_valid = model_selection.train_test_split(
    #     dfx, test_size=0.1, random_state=42, stratify=dfx.label.values
    # )

    # df_train = df_train.reset_index(drop=True)
    # df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(
        text=df_train.text.values, target=df_train.label.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
    )

    valid_dataset = dataset.BERTDataset(
        text=df_valid.text.values, target=df_valid.label.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1
    )

    device = torch.device("cuda")
    # model = DistilBERTBaseUncased()
    configuration = transformers.DistilBertConfig()
    # Initializing a model from the configuration
    # model = transformers.DistilBertModel(configuration)
    model = transformers.DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased')
    model.classifier = nn.Linear(768, 1)
    print(model)
    # exit(0)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    best_f1 = 0
    es_patience = 3
    es = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(
            train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        # valid_loss = nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        f1_score = metrics.f1_score(targets, outputs)
        print(f"Accuracy Score = {accuracy} F1 Score = {f1_score}")
        if f1_score > best_f1:
            print(
                f'Saving model, F1 score improved from {best_f1} to {f1_score}')
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_f1 = f1_score
        else:
            es += 1
            print(f'Early Stop Counter {es} of {es_patience}')
            if es >= es_patience:
                print('Early stopping!')
                break
Example #17
def run():
    '''
    Entire training loop
        - Create DataLoaders
        - Define Training Configuration
        - Launch Training Loop
    '''

    # Num of available TPU cores
    if config.TPUs:
        n_TPUs = xm.xrt_world_size()
        DEVICE = xm.xla_device()
    else:
        DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(DEVICE)
    
    # Read Data
    
    # df1 = pd.read_csv('data/jigsaw-toxic-comment-train.csv', usecols=['comment_text', 'toxic'])
    # df2 = pd.read_csv('data/jigsaw-unintended-bias-train.csv', usecols=['comment_text', 'toxic'], engine='python') # don't know why it was breaking with default C parser
    # df_train = df1 # pd.concat([df1,df2], axis=0).reset_index(drop=True)
    # df_valid = pd.read_csv('data/validation.csv')
    
    # Subsample
    df_train = pd.read_csv('data/jigsaw-toxic-comment-train-small.csv', usecols=['comment_text', 'toxic'])
    df_valid = pd.read_csv('data/validation-small.csv', usecols=['comment_text', 'toxic']) 

    # Preprocess
    
    train_dataset = dataset.BERTDataset(
        comment=df_train.comment_text.values,
        target=df_train.toxic.values
    )

    valid_dataset = dataset.BERTDataset(
        comment=df_valid.comment_text.values,
        target=df_valid.toxic.values
    )

    drop_last=False
    train_sampler, valid_sampler = None, None
    if config.TPUs:
        drop_last=True
        train_sampler = DistributedSampler(
            train_dataset, 
            num_replicas=n_TPUs,
            rank=xm.get_ordinal(),
            shuffle=True
        )
        valid_sampler = DistributedSampler(
            valid_dataset, 
            num_replicas=n_TPUs,
            rank=xm.get_ordinal(),
            shuffle=True
        )


    # Create Data Loaders

    train_data_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
        drop_last=drop_last,
        sampler=train_sampler
    )


    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1,
        drop_last=drop_last,
        sampler=valid_sampler
    )

    # Machine Configuration

    if config.MODEL == 'bert':
        model = BERTBaseUncased()
    elif config.MODEL == 'distil-bert':
        model = DistilBERTBaseUncased()
    else:
        print('Model chosen in config not valid')
        exit()
    model.to(device)
    
    # Optimizer Configuration 

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    lr = config.LR
    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    # TODO: why does the learning rate increase for distributed training?
    if config.TPUs:
        num_train_steps = int(num_train_steps / n_TPUs)
        lr *= n_TPUs

    optimizer = AdamW(optimizer_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    if not config.TPUs:
        if N_GPU > 1:
            model = nn.DataParallel(model)
    
    # Training loop

    best_score = 0
    
    for epoch in range(config.EPOCHS):
    
        if config.TPUs:
            train_loader = pl.ParallelLoader(train_data_loader, [device])
            valid_loader = pl.ParallelLoader(valid_data_loader, [device])
            train_fn(train_loader.per_device_loader(device), model, optimizer, device, scheduler)
            outputs, targets = eval_fn(valid_loader.per_device_loader(device), model, device)

        else:
            train_fn(train_data_loader, model, optimizer, device, scheduler)
            outputs, targets = eval_fn(valid_data_loader, model, device)
        
        targets = np.array(targets) >= 0.5 # TODO: why ?
        auc_score = metrics.roc_auc_score(targets, outputs)
            
        # Save if best
        print(f"AUC Score = {auc_score}")
        if auc_score > best_score:
            if not config.TPUs:
                torch.save(model.state_dict(), config.MODEL_PATH)
            else:
                xm.save(model.state_dict(), config.MODEL_PATH)
            best_score = auc_score
Example #18
def run():
    dfx = pd.read_csv(configr.TRAINING_FILE).fillna('none')
    dfx.sentiment = dfx.sentiment.map({"positive": 1, "negative": 0})

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=configr.TRAIN_BATCH_SIZE, num_workers=1)

    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=configr.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device('cpu')
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / configr.TRAIN_BATCH_SIZE * configr.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(configr.EPOCHS):
        print("here")
        engine.train_fn(train_data_loader, model, optimizer,
                        configr.ACCUMULATION, device)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print("accuracy_score = {accuracy}".format(accuracy=accuracy))
        if (accuracy > best_accuracy):
            torch.save(model.state_dict(), configr.MODEL_PATH)
            best_accuracy = accuracy
Example #19
def run():
    dfx = pd.read_csv(config.TRAINING_FILE)
    print("Shape of datframe:",dfx.shape)
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.label.values
    )

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)
    print("Shape of train datframe:",df_train.shape)
    print("Shape of validation dataframe:",df_valid.shape)

    train_dataset = dataset.BERTDataset(
        sent=df_train.sentences.values, target=df_train.label.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=8
    )

    valid_dataset = dataset.BERTDataset(
        sent=df_valid.sentences.values, target=df_valid.label.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2
    )

    device = torch.device(config.DEVICE)
    model = BERT_CLASSIFIER()
    if config.RETRAIN:
        DEVICE = 'cuda'
        model.load_state_dict(torch.load(config.RETRAIN_MODEL_LOC))
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.1,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=config.LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    best_accuracy = 0
    best_eval_loss = np.inf

    for epoch in range(config.EPOCHS):
        epoch_train_loss = engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets, epoch_eval_loss = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= config.ACC_CUTOFF
        accuracy = metrics.accuracy_score(targets, outputs)
        print("Train loss = ", epoch_train_loss)
        print("Validation Loss = ", epoch_eval_loss)
        print("Accuracy Score =", accuracy)
        if config.TRAINING_MODE == 'ba':
            best_eval_loss = np.inf
        if accuracy > best_accuracy and epoch_eval_loss < best_eval_loss:
            print("Saving Model state")
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
            best_eval_loss = epoch_eval_loss
        else:
            print("Saving model in dump folder")
            torch.save(model.state_dict(), config.MODEL_PATH_2 + f"{epoch}.bin")
Example #20
def run():

    #df = preprocess()
    #df = pd.read_csv(config.PROCESSED_FILE)
    df = pd.read_csv('data/processed_train_data.csv')
    #print(df.columns)

    df_train, df_valid = model_selection.train_test_split(
        df, test_size=0.3, random_state=32, stratify=df.offensive.values)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.text.values,
                                        target=df_train.offensive.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.BERTDataset(review=df_valid.text.values,
                                        target=df_valid.offensive.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    model = BERTBaseUncased()
    model.to(config.DEVICE)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model = nn.DataParallel(model)

    best_accuracy = 0
    for epoch in range(5):
        engine.train_fn(train_data_loader, model, optimizer, config.DEVICE,
                        scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model,
                                          config.DEVICE)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
Example #21
def run():
    df1 = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-train.csv",
                      usecols=['comment_text', 'toxic'])
    df1 = pd.read_csv("../input/jigsaw-unintended-bias-train.csv",
                      usecols=['comment_text', 'toxic'])

    # combine df1 and df2 into one big dataframe
    df_train = pd.concat([df1, df2], axis=0).reset_index(drop=True)

    # the validation dataframe is provided by Kaggle
    df_valid = pd.read_csv("../input/validation.csv")

    train_dataset = dataset.BERTDataset(
        comment_text=df_train.comment_text.values,
        target=df_train.toxic.values)

    #--------------------------------------
    # use a distributed sampler when training on TPU, otherwise it is not needed
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True)
    #----------------------------------------

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
        sampler=train_sampler,
        # with torch_xla a TPU crashes if the last batch has a different size, so use drop_last
        drop_last=True)

    valid_dataset = dataset.BERTDataset(
        comment_text=df_valid.comment_text.values,
        target=df_valid.toxic.values)

    #--------------------------------------
    # use a distributed sampler for validation on TPU as well
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True)
    #----------------------------------------------

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1,
        sampler=valid_sampler,
        #no need of drop_last here
    )

    device = xm.xla_device()  #xla_device means tpu
    model = BERTBaseUncased()
    # model.to(device)  #no need to move data on device

    #specify what parameters you want to train
    param_optimizer = list(model.named_parameters())

    # we don't want any decay for these layer names, such as bias and the other items below
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    optimizer_parameters = [
        {
            #don't decay weight for above no_decay list else decay
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE / xm.xrt_world_size() *
        config.EPOCHS)

    lr = 3e-5 * xm.xrt_world_size()
    #experiment with lr
    optimizer = AdamW(optimizer_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):

        #parallel loader for tpus
        para_loader = pl.ParallelLoader(train_data_loader, [device])
        engine.train_fn(para_loader.per_device_loader(device), model,
                        optimizer, device, scheduler)

        para_loader = pl.ParallelLoader(valid_data_loader, [device])
        outputs, targets = engine.eval_fn(
            para_loader.per_device_loader(device), model, device)

        #threshold the target instead of output
        targets = np.array(targets) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:

            #instead of torch.save use xm.save
            xm.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
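
Examples #17 and #21 rely on the usual torch_xla aliases for xm and pl, which the snippets do not show. The standard imports are:

# Standard torch_xla imports implied by xm.* and pl.ParallelLoader above
# (requires the torch_xla package, i.e. a TPU runtime).
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
from torch.utils.data.distributed import DistributedSampler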
Example #22
def train():
    df = pd.read_csv(config.TRAINING_FILE).fillna("none")
    df['sentiment'] = df['sentiment'].map({"positive": 1, "negative": 0})

    df_train, df_valid = train_test_split(df,
                                          test_size=0.1,
                                          random_state=42,
                                          stratify=df.sentiment.values)

    # reset index of both splits
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=False,
        num_workers=4,
    )

    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)

    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        shuffle=False,
        num_workers=4,
    )

    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=int(len(df_train) / config.TRAIN_BATCH_SIZE) *
        config.EPOCHS)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_dataloader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_dataloader, model, device)

        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy: {accuracy:.3f}")
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), config.MODEL_PATH)
Example #23
def run():
    dfx = pd.read_csv(config.TRAINING_FILE).fillna("none")

    #convert positive to 1 and negative to 0
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    # stratified split so that the positive-to-negative ratio is the same in the train and validation sets
    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.review.values,
                                        target=df_train.sentiment.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.BERTDataset(review=df_valid.review.values,
                                        target=df_valid.sentiment.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    #specify what parameters you want to train
    param_optimizer = list(model.named_parameters())

    # we don't want any decay for these layer names, such as bias and the other items below
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    optimizer_parameters = [
        {
            #don't decay weight for above no_decay list else decay
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    #experiment with lr
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # convert the model to a multi-GPU model (skip this if you don't have multiple GPUs)

    model = nn.DataParallel(
        model)  # use @amp.autocast() in model.py if DataParallel() is enabled

    scaler = amp.GradScaler()  # from torch.cuda import amp; required for automatic mixed precision
    # the scaler is passed into train_fn below

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler,
                        scaler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
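
Example #23's comments say the GradScaler has to be passed into train_fn. A hedged sketch of how that training step might look with torch.cuda.amp is below; it mirrors the engine sketch shown earlier on this page with autocast and the scaler added, and is not the original engine code.

# Hedged AMP variant of train_fn; the loss and argument names are assumptions.
import torch
import torch.nn as nn
from torch.cuda import amp

def train_fn(data_loader, model, optimizer, device, scheduler, scaler):
    model.train()
    for d in data_loader:
        ids = d["ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        targets = d["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        with amp.autocast():  # run the forward pass in mixed precision
            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            loss = nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))
        scaler.scale(loss).backward()  # scale the loss to avoid fp16 underflow
        scaler.step(optimizer)         # unscales gradients, then steps
        scaler.update()
        scheduler.step()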
Example #24
def main(_):
    test_file = config.EVAL_PROC
    model_path = config.MODEL_PATH
    if FLAGS.test_file:
        test_file = FLAGS.test_file
    if FLAGS.model_path:
        model_path = FLAGS.model_path
    df_test = pd.read_csv(test_file).fillna("none")

    logger.info(f"Bert Model: {config.BERT_PATH}")
    logger.info(
        f"Current date and time :{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} "
    )
    logger.info(f"Test file: {test_file}")
    logger.info(f"Test size : {len(df_test):.4f}")

    test_dataset = dataset.BERTDataset(review=df_test.text.values,
                                       target=df_test.label.values)

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=3)

    device = config.device

    model = BERTBaseUncased()
    model.load_state_dict(
        torch.load(model_path, map_location=torch.device(device)))
    model.to(device)

    outputs, extracted_features = engine.predict_fn(
        test_data_loader, model, device, extract_features=FLAGS.features)
    df_test["predicted"] = outputs
    # save file
    df_test.to_csv(model_path.split("/")[-2] + '.csv',
                   header=None,
                   index=False)

    if FLAGS.features:
        pca = PCA(n_components=50, random_state=7)
        X1 = pca.fit_transform(extracted_features)
        tsne = TSNE(n_components=2,
                    perplexity=10,
                    random_state=6,
                    learning_rate=1000,
                    n_iter=1500)
        X1 = tsne.fit_transform(X1)
        # if row == 0: print("Shape after t-SNE: ", X1.shape)

        X = pd.DataFrame(np.concatenate([X1], axis=1), columns=["x1", "y1"])
        X = X.astype({"x1": float, "y1": float})

        # Plot for layer -1
        plt.figure(figsize=(20, 15))
        p1 = sns.scatterplot(x=X["x1"], y=X["y1"], palette="coolwarm")
        # p1.set_title("development-"+str(row+1)+", layer -1")
        x_texts = []
        for output, value in zip(outputs, df_test.label.values):
            if output == value:
                x_texts.append("@" + label_decoder(output)[0] +
                               label_decoder(output))
            else:
                x_texts.append(
                    label_decoder(value) + "-" + label_decoder(output))

        X["texts"] = x_texts
        # X["texts"] = ["@G" + label_decoder(output) if output == value else "@R-" + label_decoder(value) + "-" + label_decoder(output)
        #               for output, value in zip(outputs, df_test.label.values)]

        # df_test.label.astype(str)
        #([str(output)+"-" + str(value)] for output, value in zip(outputs, df_test.label.values))

        # Label each data point with its decoded label text and row index;
        # the "@U"/"@P"/"@N" marker set above selects the text colour.
        colors = {"@U": "blue", "@P": "green", "@N": "red"}
        for line in X.index:
            text = X.loc[line, "texts"] + "-" + str(line)
            prefix = next((p for p in colors if p in text), None)
            p1.text(X.loc[line, "x1"] + 0.2,
                    X.loc[line, "y1"],
                    text[2:] if prefix else text,
                    horizontalalignment='left',
                    size='medium',
                    color=colors.get(prefix, 'black'),
                    weight='semibold')
        plt.savefig(model_path.split("/")[-2] + '-figure.svg', format="svg")
        plt.show()
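
# Both `main` entry points above and below rely on absl-style command-line flags.
# The flag definitions are not part of this example; a minimal sketch of what they
# plausibly look like (names inferred from the FLAGS.* attributes used) is:
#
#     from absl import app, flags
#
#     FLAGS = flags.FLAGS
#     flags.DEFINE_string("test_file", None, "Optional override for config.EVAL_PROC")
#     flags.DEFINE_string("model_path", None, "Optional override for config.MODEL_PATH")
#     flags.DEFINE_boolean("features", False, "Extract features and plot a t-SNE map")
#     flags.DEFINE_float("lr", None, "Optional learning-rate override")
#     flags.DEFINE_float("dropout", None, "Optional dropout override")
#
#     if __name__ == "__main__":
#         app.run(main)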
def main(_):
    LEARNING_RATE = config.LEARNING_RATE
    DROPOUT = config.DROPOUT

    if FLAGS.lr:
        LEARNING_RATE = FLAGS.lr
    if FLAGS.dropout:
        DROPOUT = FLAGS.dropout

    train_file = config.TRAIN_PROC
    df_train = pd.read_csv(train_file).fillna("none")

    valid_file = config.DEVEL_PROC
    df_valid = pd.read_csv(valid_file).fillna("none")

    test_file = config.EVAL_PROC
    df_test = pd.read_csv(test_file).fillna("none")
    
    logger.info(f"Bert Model: {config.BERT_PATH}")
    logger.info(f"Current date and time :{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ")

    logger.info(f"Train file: {train_file}")
    logger.info(f"Valid file: {valid_file}")
    logger.info(f"Test file: {test_file}")

    logger.info(f"Train size : {len(df_train):.4f}")
    logger.info(f"Valid size : {len(df_valid):.4f}")
    logger.info(f"Test size : {len(df_test):.4f}")

    train_dataset = dataset.BERTDataset(
        review=df_train.text.values,
        target=df_train.label.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4, shuffle=True
    )

    valid_dataset = dataset.BERTDataset(
        review=df_valid.text.values,
        target=df_valid.label.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    test_dataset = dataset.BERTDataset(
        review=df_test.text.values,
        target=df_test.label.values
    )

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BERTBaseUncased(DROPOUT)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(
            nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(
            nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )
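    # With num_warmup_steps=0 the schedule is a plain linear decay of the learning
    # rate from LEARNING_RATE down to 0 over num_train_steps.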

    # model = nn.DataParallel(model)

    best_accuracy = 0
    es = 0  # early-stopping counter: epochs without improvement
    for epoch in range(config.EPOCHS):
        logger.info(f"Epoch = {epoch}")

        train_loss, train_acc = engine.train_fn(
            train_data_loader, model, optimizer, device, scheduler)

        for tag, parm in model.named_parameters():
            if parm.grad is not None:
                writer.add_histogram(tag, parm.grad.data.cpu().numpy(), epoch)

        outputs, targets, val_loss, val_acc = engine.eval_fn(
            valid_data_loader, model, device)
        val_mcc = metrics.matthews_corrcoef(outputs, targets)
        logger.info(f"val_MCC_Score = {val_mcc:.4f}")

        outputs, targets, test_loss, test_acc = engine.eval_fn(
            test_data_loader, model, device)
        test_mcc = metrics.matthews_corrcoef(outputs, targets)
        logger.info(f"test_MCC_Score = {test_mcc:.4f}")

        logger.info(
            f"train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, test_loss={test_loss:.4f}")
        writer.add_scalar('loss/train', train_loss, epoch) # data grouping by `slash`
        writer.add_scalar('loss/val', val_loss, epoch) # data grouping by `slash`
        writer.add_scalar('loss/test', test_loss, epoch) # data grouping by `slash`
        
        logger.info(
            f"train_acc={train_acc:.4f}, val_acc={val_acc:.4f}, test_acc={test_acc:.4f}")
        writer.add_scalar('acc/train', train_acc, epoch) # data grouping by `slash`
        writer.add_scalar('acc/val', val_acc, epoch) # data grouping by `slash`
        writer.add_scalar('acc/test', test_acc, epoch) # data grouping by `slash`
        
        logger.info(f"val_mcc={val_acc:.4f}, test_mcc={test_acc:.4f}")
        writer.add_scalar('mcc/val', val_mcc, epoch) # data grouping by `slash`
        writer.add_scalar('mcc/test', test_mcc, epoch) # data grouping by `slash`

        accuracy = metrics.accuracy_score(targets, outputs)
        logger.info(f"Accuracy Score = {accuracy:.4f}")
        
        if accuracy < 0.4:
            logger.info(f"Something is very wrong! Accuracy is only {accuracy:.4f} Stopping...")
            break

        if accuracy > best_accuracy:
            logger.info(f"Saving model with Accuracy Score = {accuracy:.4f}")
            torch.save(model.state_dict(), config.MODEL_PATH[:-4] + "." + str(round(accuracy*100, 2)) + ".bin")
            best_accuracy = accuracy
            es = 0
        else:
            es += 1
            logger.info(f"Not improved for {es} times of 5. Best so far - {best_accuracy:.4f}")

            if es > 4:
                logger.info(f"Early stopping with best accuracy: {best_accuracy:.4f} and accuracy for this epoch: {accuracy:.4f} ...")
                break
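
# The training loops above and below log through a module-level `logger` and a
# TensorBoard `writer` that are not shown in these examples. A minimal sketch of
# how they are presumably set up:
#
#     import logging
#     from torch.utils.tensorboard import SummaryWriter
#
#     logging.basicConfig(level=logging.INFO)
#     logger = logging.getLogger(__name__)
#     writer = SummaryWriter()  # event files go to ./runs/<timestamp> by default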
def run(dataset_index):

    datasets = [
        "gold.prep-auto.full.prep.{0}.csv",
        "gold.prep-auto.no-emoticons.prep.{0}.csv",
        "gold.prep-auto.prep.{0}.csv", "gold.prep-english.prep.{0}.csv",
        "gold.prep-peisenieks.prep.{0}.csv", "gold.prep.{0}.csv"
    ]
    # dataset_index selects one of the six preprocessed variants above (0-5)

    train_file = config.DATASET_LOCATION + datasets[dataset_index].format(
        "train")
    df_train = pd.read_csv(train_file).fillna("none")
    df_train.label = df_train.label.apply(label_encoder)

    valid_file = config.DATASET_LOCATION + datasets[dataset_index].format("dev")
    df_valid = pd.read_csv(valid_file).fillna("none")
    df_valid.label = df_valid.label.apply(label_encoder)

    test_file = config.DATASET_LOCATION + "eval.prep.test.csv"
    df_test = pd.read_csv(test_file).fillna("none")
    df_test.label = df_test.label.apply(label_encoder)

    logger.info(f"Bert Model: {config.BERT_PATH}")
    logger.info(
        f"Current date and time :{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} "
    )

    logger.info(f"Train file: {train_file}")
    logger.info(f"Valid file: {valid_file}")
    logger.info(f"Test file: {test_file}")

    logger.info(f"Train size : {len(df_train):.4f}")
    logger.info(f"Valid size : {len(df_valid):.4f}")
    logger.info(f"Test size : {len(df_test):.4f}")

    train_dataset = dataset.BERTDataset(review=df_train.text.values,
                                        target=df_train.label.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
        shuffle=True)

    valid_dataset = dataset.BERTDataset(review=df_valid.text.values,
                                        target=df_valid.label.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    test_dataset = dataset.BERTDataset(review=df_test.text.values,
                                       target=df_test.label.values)

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001,
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # model = nn.DataParallel(model)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        logger.info(f"epoch={epoch}")

        train_loss, train_acc = engine.train_fn(train_data_loader, model,
                                                optimizer, device, scheduler)

        for tag, parm in model.named_parameters():
            if parm.grad is not None:
                writer.add_histogram(tag, parm.grad.data.cpu().numpy(), epoch)

        outputs, targets, val_loss, val_acc = engine.eval_fn(
            valid_data_loader, model, device)
        val_mcc = metrics.matthews_corrcoef(outputs, targets)
        logger.info(f"val_MCC_Score = {val_mcc:.3f}")

        outputs, targets, test_loss, test_acc = engine.eval_fn(
            test_data_loader, model, device)
        test_mcc = metrics.matthews_corrcoef(outputs, targets)
        logger.info(f"test_MCC_Score = {test_mcc:.3f}")

        logger.info(
            f"train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, test_loss={test_loss:.4f}"
        )
        writer.add_scalar('loss/train', train_loss,
                          epoch)  # data grouping by `slash`
        writer.add_scalar('loss/val', val_loss,
                          epoch)  # data grouping by `slash`
        writer.add_scalar('loss/test', test_loss,
                          epoch)  # data grouping by `slash`

        logger.info(
            f"train_acc={train_acc:.3f}, val_acc={val_acc:.3f}, test_acc={test_acc:.3f}"
        )
        writer.add_scalar('acc/train', train_acc,
                          epoch)  # data grouping by `slash`
        writer.add_scalar('acc/val', val_acc,
                          epoch)  # data grouping by `slash`
        writer.add_scalar('acc/test', test_acc,
                          epoch)  # data grouping by `slash`

        logger.info(f"val_mcc={val_acc:.3f}, test_mcc={test_acc:.3f}")
        writer.add_scalar('mcc/val', val_mcc,
                          epoch)  # data grouping by `slash`
        writer.add_scalar('mcc/test', test_mcc,
                          epoch)  # data grouping by `slash`

        accuracy = metrics.accuracy_score(targets, outputs)
        logger.info(f"Accuracy Score = {accuracy:.3f}")

        if accuracy > best_accuracy:
            print(f"Saving model with Accuracy Score = {accuracy:.3f}")
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
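
# `run` takes an index into the six preprocessed dataset variants listed at the top
# of the function. An illustrative invocation (not part of the original example)
# that trains on every variant in turn:
#
#     for i in range(6):
#         run(i)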
def run():

    df_train = preprocess('./review-sentence_train_clean.csv')
    df_valid = preprocess('./review-sentence_dev_clean.csv')

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(review=df_train.sentence.values,
                                        target=df_train.ENCODE_CAT.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.BERTDataset(review=df_valid.sentence.values,
                                        target=df_valid.ENCODE_CAT.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device(config.DEVICE)
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler,
                        epoch)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device,
                                          epoch)
        accuracy = metrics.accuracy_score(outputs, targets)
        print(f"Validation Accuracy  = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
            print("Best val accuracy till now {}".format(best_accuracy))
Exemple #28
0
def run():
    dfx = pd.read_csv(
        config.TRAINING_FILE).fillna("none").reset_index(drop=True)
    # df_test = pd.read_csv(config.TESTING_FILE).fillna("none").reset_index(drop=True)

    df_train, df_test = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.label.values)

    df_train = df_train.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(text=df_train.title.values,
                                        label=df_train.label.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    test_dataset = dataset.BERTDataset(text=df_test.title.values,
                                       label=df_test.label.values)

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.TEST_BATCH_SIZE, num_workers=1)

    device = torch.device("cpu")
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model = nn.DataParallel(model)
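    # nn.DataParallel prefixes every state_dict key with "module."; with no visible
    # GPUs it simply forwards calls to the wrapped model.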

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, labels = engine.eval_fn(test_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(labels, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
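
# Because the model above is saved while wrapped in nn.DataParallel, its checkpoint
# keys carry a "module." prefix. A minimal sketch (reusing the same BERTBaseUncased
# class) of loading that checkpoint back into an unwrapped model:
#
#     state = torch.load(config.MODEL_PATH, map_location="cpu")
#     state = {k.replace("module.", "", 1): v for k, v in state.items()}
#     model = BERTBaseUncased()
#     model.load_state_dict(state)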