Example 1
def convert_to_features(df_data, save_path, is_train=False):
    """Build tokenized cloze features, caching them as pickle files.

    Training features are written in chunks of 50,000 examples under
    `save_path`; dev/test features are stored as a single pickle. If the
    cache already exists it is loaded instead of being rebuilt.
    """

    if is_train:
        if os.path.exists(save_path):
            dataset = []
            for root, dirs, files in os.walk(save_path):
                for file in files:
                    dataset.extend(load_pkl_data(os.path.join(root, file)))

        else:
            os.makedirs(save_path)
            dataset = ClozeDataset(
                tokenizer=config.TOKENIZER,
                data_id=df_data.data_id.values,
                tag=df_data.tag.values,
                text=df_data.text.values,
                candidate=df_data.candidate.values,
                groundTruth=df_data.groundTruth.values,
                max_len=config.MAX_LEN
            )
            datas = []
            data = []
            batch_id = 1
            tk = tqdm(dataset, total=len(dataset))
            for bi, item in enumerate(tk):
                data.append(item)
                if len(data) == 50000 or bi == len(dataset) - 1:
                    path = save_path + f"/train_features_{batch_id}.pkl"
                    save_pkl_data(data, path)
                    batch_id += 1
                    datas.extend(data)
                    data = []
            dataset = datas
    else:
        if os.path.exists(save_path):
            dataset = load_pkl_data(save_path)
        else:

            dataset = ClozeDataset(
                tokenizer=config.TOKENIZER,
                data_id=df_data.data_id.values,
                tag=df_data.tag.values,
                text=df_data.text.values,
                candidate=df_data.candidate.values,
                groundTruth=df_data.groundTruth.values,
                max_len=config.MAX_LEN
            )
            tk = tqdm(dataset, total=len(dataset))
            dataset = [item for item in tk]
            save_pkl_data(dataset, save_path)
    return dataset
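These examples cache features with `save_pkl_data` and `load_pkl_data`, which are defined elsewhere in the project. A minimal sketch of what such helpers might look like, assuming they are thin wrappers around Python's `pickle` module (the helpers in Examples 4 and 5 also take a `data_dir` keyword, so the real signatures may differ):

import pickle


def save_pkl_data(data, path):
    # Serialize `data` to `path` (hypothetical helper; the project's version may differ).
    with open(path, "wb") as f:
        pickle.dump(data, f)


def load_pkl_data(path):
    # Load a previously pickled object from `path` (hypothetical helper).
    with open(path, "rb") as f:
        return pickle.load(f)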
Example 2
def run():
    """
    Train model for a speciied fold
    """
    # Read train csv and dev csv
    df_train = pd.read_csv(config.TRAIN_FILE)
    df_valid = pd.read_csv(config.DEV_FILE)

    # Instantiate SiameseDataset with training data
    train_dataset = SiameseDataset(query=df_train.sentence1.values,
                                   question=df_train.sentence2.values,
                                   label=df_train.label.values)

    if os.path.exists(config.train_features):
        train_dataset = load_pkl_data(config.train_features)
    else:
        train_dataset = [item for item in train_dataset]
        save_pkl_data(train_dataset, config.train_features)

    # Instantiate DataLoader with `train_dataset`
    # This is a generator that yields the dataset in batches
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, shuffle=False, batch_size=config.TRAIN_BATCH_SIZE)

    # Instantiate SiameseDataset with validation data
    valid_dataset = SiameseDataset(
        query=df_valid.sentence1.values,
        question=df_valid.sentence2.values,
        label=df_valid.label.values,
    )

    if os.path.exists(config.valid_features):
        valid_dataset = load_pkl_data(config.valid_features)
    else:
        valid_dataset = [item for item in valid_dataset]
        save_pkl_data(valid_dataset, config.valid_features)

    # Instantiate DataLoader with `valid_dataset`
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, shuffle=False)

    # Set device as `cuda` (GPU)
    device = torch.device("cuda")
    # Load pretrained BERT (bert-base-uncased)
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    # Output hidden states
    # This is important to set since we want to concatenate the hidden states from the last 2 BERT layers
    model_config.output_hidden_states = True
    # Instantiate our model with `model_config`
    model = SiameseWmdModel(conf=model_config,
                            pretrained_model_path=config.BERT_PATH)
    # Move the model to the GPU
    model.to(device)

    # Evaluate the model on the training set and log the metrics
    pred_labels, wmd, acc, f1, auc = predict(train_data_loader, model, device)
    logger.info(f"train set : acc = {acc}, f1 score = {f1}, auc = {auc}")
    df_train["pred_label"] = pred_labels
    df_train["wmd"] = wmd
    df_train.to_csv("../output/train_predict.csv")

    thresholds = [0.25, 0.23]
    best_f1 = 0
    best_th = 0
    for threshold in thresholds:
        pred_labels, wmd, acc, f1, auc = predict(valid_data_loader, model,
                                                 device, threshold)
        logger.info(
            f"dev set :threshold={threshold}  acc = {acc}, f1 score = {f1}, auc = {auc}"
        )

        if f1 > best_f1:
            best_f1 = f1
            best_th = threshold
    print(f"best threshold: {best_th} with best f1 {best_f1}")

    df_valid["pred_label"] = pred_labels
    df_valid["wmd"] = wmd
    df_valid.to_csv("../output/dev_predict.csv")
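`predict` is not shown here; from the way it is called it returns predicted labels, WMD-style scores, and accuracy/F1/AUC for a given threshold. A rough sketch of the thresholding and scoring step, assuming scikit-learn metrics and that a smaller word-mover's-distance score means the sentence pair is more similar (the project's actual `predict` may differ):

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


def score_at_threshold(wmd_scores, labels, threshold=0.25):
    # Hypothetical helper: predict label 1 when the distance is below the threshold.
    pred_labels = [1 if d < threshold else 0 for d in wmd_scores]
    acc = accuracy_score(labels, pred_labels)
    f1 = f1_score(labels, pred_labels)
    # Negate distances so that higher values correspond to the positive class.
    auc = roc_auc_score(labels, [-d for d in wmd_scores])
    return pred_labels, acc, f1, auc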
Example 3
def train():
    """
    Train model for a speciied fold
    """
    # Read train csv and dev csv
    df_train = pd.read_csv(config.TRAIN_FILE)
    df_valid = pd.read_csv(config.DEV_FILE)

    # Instantiate SiameseDataset with training data
    train_dataset = SiameseDataset(query=df_train.sentence1.values,
                                   question=df_train.sentence2.values,
                                   label=df_train.label.values)

    if os.path.exists(config.train_features):
        train_dataset = load_pkl_data(config.train_features)
    else:
        train_dataset = [item for item in train_dataset]
        save_pkl_data(train_dataset, config.train_features)

    # Instantiate DataLoader with `train_dataset`
    # This is a generator that yields the dataset in batches
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, shuffle=True, batch_size=config.TRAIN_BATCH_SIZE)

    # Instantiate SiameseDataset with validation data
    valid_dataset = SiameseDataset(query=df_valid.sentence1.values,
                                   question=df_valid.sentence2.values,
                                   label=df_valid.label.values)

    if os.path.exists(config.valid_features):
        valid_dataset = load_pkl_data(config.valid_features)
    else:
        valid_dataset = [item for item in valid_dataset]
        save_pkl_data(valid_dataset, config.valid_features)

    # Instantiate DataLoader with `valid_dataset`
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE)

    # Set device as `cuda` (GPU)
    device = torch.device("cuda:2")
    # Load pretrained BERT (bert-base-uncased)
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    # Output hidden states
    # This is important to set since we want to concatenate the hidden states from the last 2 BERT layers
    model_config.output_hidden_states = True
    # Instantiate our model with `model_config`
    model = SiameseWmdModel(conf=model_config,
                            pretrained_model_path=config.BERT_PATH)
    # Move the model to the GPU
    model.to(device)

    # Calculate the number of training steps
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    # Get the list of named parameters
    param_optimizer = list(model.named_parameters())
    # Specify parameters where weight decay shouldn't be applied
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    # Define two sets of parameters: those with weight decay, and those without
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]
    # Instantiate AdamW optimizer with our two sets of parameters, and a learning rate of 3e-5
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    # Create a scheduler to set the learning rate at each training step
    # "Create a schedule with a learning rate that decreases linearly after linearly increasing during a warmup period." (https://pytorch.org/docs/stable/optim.html)
    # Since num_warmup_steps = 0, the learning rate starts at 3e-5, and then linearly decreases at each training step
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # Apply early stopping with patience of 2
    # This means to stop training new epochs when 2 rounds have passed without any improvement
    es = utils.EarlyStopping(patience=2, mode="max")

    thresholds = [0.1, 0.15, 0.20]
    best_f1 = 0
    best_th = 0
    for threshold in thresholds:

        # Early stopping (patience=2) may end training before config.EPOCHS epochs complete
        for epoch in range(config.EPOCHS):
            train_fn(train_data_loader,
                     model,
                     optimizer,
                     device,
                     scheduler=scheduler,
                     threshold=threshold)
            acc, f1, auc = eval_fn(valid_data_loader, model, device)

            # logger.info(f"acc = {acc}, f1 score = {f1}")
            es(f1, model, model_path=config.MODEL_SAVE_PATH)
            if es.early_stop:
                if f1 > best_f1:
                    best_f1 = f1
                    best_th = threshold
                print("Early stopping ********")
                break
    logger.info(f"best threshold:{best_th}, best f1 :{best_f1}")
Example 4
    def save_metrics(self, d):
        for k, v in d.items():
            utils.save_pkl_data(v, f'{k}.p', data_dir=self.model_dir)
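A plausible call site for `save_metrics`, assuming the metric histories have already been collected into a dict; `trainer` and the variable names are illustrative (Example 5 below pickles the same four quantities directly):

trainer.save_metrics({'train_loss': train_loss, 'val_loss': val_loss,
                      'train_acc': train_acc, 'val_acc': val_acc})
# writes train_loss.p, val_loss.p, train_acc.p and val_acc.p under trainer.model_dir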
Example 5
def main(premise_hidden_size,
         hypo_hidden_size,
         linear_hidden_size,
         interaction_type,
         device,
         kind,
         num_layers=1,
         bidirectional=True,
         kernel_size=3,
         lr=1e-4,
         test=False,
         model_dir='models'):
    """Train an SNLI model with an RNN or CNN sentence encoder and pickle its
    loss/accuracy history."""
    valid_types = ('cat', 'element_wise_mult')
    if interaction_type not in valid_types:
        raise ValueError(f'interaction_type can only be one of: {valid_types}')

    # data
    batch_size = 32
    save_freq = 500
    max_epochs = 40
    train_loader, val_loader = data.get_loaders(batch_size, test=test)

    # model
    embed_size = 300
    ind2vec = data.get_table_lookup()
    if kind == 'rnn':
        model = models.SNLI_Model(ind2vec,
                                  embed_size,
                                  premise_hidden_size,
                                  hypo_hidden_size,
                                  linear_hidden_size,
                                  interaction_type,
                                  device,
                                  kind='rnn',
                                  num_layers=num_layers,
                                  bidirectional=bidirectional)
    else:
        model = models.SNLI_Model(ind2vec,
                                  embed_size,
                                  premise_hidden_size,
                                  hypo_hidden_size,
                                  linear_hidden_size,
                                  interaction_type,
                                  device,
                                  kind='cnn',
                                  kernel_size=kernel_size)
    model = model.to(device)
    optimizer = torch.optim.Adam(
        [param for param in model.parameters() if param.requires_grad], lr=lr)
    loss_fn = torch.nn.CrossEntropyLoss()

    model_name = f'{kind}_model_{premise_hidden_size}_{interaction_type}'
    model_dir = os.path.join(model_dir, model_name)
    train_helper = train_helpers.TrainHelper(device, model, loss_fn, optimizer,
                                             models.batch_params_key,
                                             model_dir, test)
    train_loss, val_loss, train_acc, val_acc = train_helper.train_loop(
        train_loader, val_loader, max_epochs=max_epochs, save_freq=save_freq)

    if 'cpu' in device:
        os.makedirs('figures', exist_ok=True)
        path = f'figures/{model_name}'
        utils.plot_curves(train_loss, val_loss, train_acc, val_acc, path)

    utils.save_pkl_data(train_loss, 'train_loss.p', data_dir=model_dir)
    utils.save_pkl_data(val_loss, 'val_loss.p', data_dir=model_dir)
    utils.save_pkl_data(train_acc, 'train_acc.p', data_dir=model_dir)
    utils.save_pkl_data(val_acc, 'val_acc.p', data_dir=model_dir)
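A sketch of how `main` might be invoked; the hyperparameter values and the device string are illustrative and not taken from the project:

if __name__ == '__main__':
    main(premise_hidden_size=256,
         hypo_hidden_size=256,
         linear_hidden_size=128,
         interaction_type='cat',
         device='cuda:0',
         kind='rnn',
         num_layers=2,
         bidirectional=True,
         lr=1e-4,
         test=False)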