Example #1
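    # Constructor of a trainer class (the surrounding class definition is not shown in
    # this excerpt): seeds the RNGs, copies hyperparameters from opts, loads the
    # vocabulary and label map, and builds the model and optimizer.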
    def __init__(self, opts):

        random.seed(opts.seed)
        torch.manual_seed(opts.seed)
        self.opts = opts
        self.log = Log(opts)
        self.device = returnDevice(opts.cuda, opts.gpu_number)
        self.lr = opts.lr
        self.lr_decay_rate = opts.lr_decay_rate
        self.batch_size = opts.batch_size
        self.max_len = opts.max_len
        self.epochs = opts.epochs
        self.print_every_step = opts.print_every_step
        self.early_stop = opts.early_stop  # 0 disables early stopping
        self.lr_decay_every = opts.lr_decay_every
        self.weight_decay = opts.weight_decay
        self.shuffle = opts.shuffle
        self.best_model_name = ''
        self.vocab = load_pkl_data(opts.vocab_path)
        self.label_id = load_pkl_data(opts.label_id_path)

        self.best_score = 0
        self.best_score_epoch = 0

        self.model = self.get_model()
        self.optimizer = self.get_optim(opts.optims)
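Example #2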
def convert_to_features(df_data, save_path, is_train=False):
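    """
    Convert df_data into model features, caching them under save_path.
    Training features are written in pickle chunks of 50,000 items;
    non-training features are stored as a single pickle file.
    """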

    if is_train:
        if os.path.exists(save_path):
            dataset = []
            for root, dirs, files in os.walk(save_path):
                for file in files:
                    dataset.extend(load_pkl_data(os.path.join(root, file)))

        else:
            os.makedirs(save_path)
            dataset = ClozeDataset(
                tokenizer=config.TOKENIZER,
                data_id=df_data.data_id.values,
                tag=df_data.tag.values,
                text=df_data.text.values,
                candidate=df_data.candidate.values,
                groundTruth=df_data.groundTruth.values,
                max_len=config.MAX_LEN
            )
            datas = []
            data = []
            batch_id = 1
            tk = tqdm(dataset, total=len(dataset))
            for bi, item in enumerate(tk):
                data.append(item)
                if len(data) == 50000 or bi == len(dataset) - 1:
                    path = save_path + f"/train_features_{batch_id}.pkl"
                    save_pkl_data(data, path)
                    batch_id += 1
                    datas.extend(data)
                    data = []
            dataset = datas
    else:
        if os.path.exists(save_path):
            dataset = load_pkl_data(save_path)
        else:

            dataset = ClozeDataset(
                tokenizer=config.TOKENIZER,
                data_id=df_data.data_id.values,
                tag=df_data.tag.values,
                text=df_data.text.values,
                candidate=df_data.candidate.values,
                groundTruth=df_data.groundTruth.values,
                max_len=config.MAX_LEN
            )
            tk = tqdm(dataset, total=len(dataset))
            dataset = [item for item in tk]
            save_pkl_data(dataset, save_path)
    return dataset
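Example #3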
def just_test(model,
              filename,
              postfix=None,
              amount=None,
              use_softmax=False,
              indicator=False):
    """
    Given the model object and the previously stored weights file,
    this function just restore the weights, load testing data and
    predict the labels.

    Args:
        model(): Keras model object.
        filename(str): Filename of the trained weight file.
        amount(int): Use only first "amount" of data.
    """
    model_dir = "model/"
    print("Restoring best weights from: {:s}".format(filename))
    model.load_weights(filename)

    X, Z, y, d = load_pkl_data(model_dir,
                               "testing",
                               postfix,
                               indicator=indicator)

    predict(model,
            X,
            Z,
            y,
            model_file=filename,
            output="results-test.txt",
            amount=amount,
            use_softmax=use_softmax)
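A minimal usage sketch for just_test; the model constructor and weights filename below are illustrative assumptions, not taken from the original example:

# Hypothetical usage: build_model() and the weights filename are placeholders.
model = build_model()  # assumed helper returning a compiled Keras model
just_test(model,
          filename="BLSTM-weights-05.hdf5",
          postfix="_subword",
          use_softmax=False,
          indicator=True)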
Example #4
def get_loaders(batch_size, data_dir='hw2_data', test=False):
    train_ind = utils.load_pkl_data('snli_train_ind.p')
    val_ind = utils.load_pkl_data('snli_val_ind.p')
    train_target = utils.load_pkl_data('snli_train_target.p')
    val_target = utils.load_pkl_data('snli_val_target.p')
    if test:
        train_dataset = SNLI_Dataset(train_ind[:5 * batch_size], train_target)
    else:
        train_dataset = SNLI_Dataset(train_ind, train_target)
    val_dataset = SNLI_Dataset(val_ind, val_target)
    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            collate_fn=collate_fn)
    return train_loader, val_loader
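A quick usage sketch, assuming the pickled SNLI index/target files are present in the default data directory (the batch layout follows the collate_fn excerpt in Example #10):

# Smoke test: test=True restricts training data to the first 5 * batch_size examples.
train_loader, val_loader = get_loaders(batch_size=32, test=True)
premises, hypos, premise_lens, hypo_lens, targets = next(iter(train_loader))
print(premises.shape, targets.shape)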
Example #5
import sys

import torch
import utils
import models
import data
import train_helpers


model_dir = sys.argv[1]
hidden_size = int(sys.argv[2])
interaction_type = sys.argv[3]
kind = sys.argv[4]
epoch = sys.argv[5]
batch_ix = sys.argv[6]
batch_size = 32

ind2vec = utils.load_pkl_data('ind2vec.p', data_dir='vocab')
_, val_loader = data.get_loaders(batch_size, data_dir='hw2_data')
loss_fn = torch.nn.CrossEntropyLoss()
fmodel = f'epoch_{epoch}_batch_{batch_ix}.pt'
print('model: ' + fmodel)
model = models.SNLI_Model(ind2vec,
                          300,
                          hidden_size,
                          hidden_size,
                          80,
                          interaction_type,
                          'cpu',
                          kind,
                          num_layers=1,
                          bidirectional=True,
                          kernel_size=3)
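Example #6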
def run():
    """
    Train model for a speciied fold
    """
    # Read train csv and dev csv
    df_train = pd.read_csv(config.TRAIN_FILE)
    df_valid = pd.read_csv(config.DEV_FILE)

    # Instantiate TweetDataset with training data
    train_dataset = SiameseDataset(query=df_train.sentence1.values,
                                   question=df_train.sentence2.values,
                                   label=df_train.label.values)

    if os.path.exists(config.train_features):
        train_dataset = load_pkl_data(config.train_features)
    else:
        train_dataset = [item for item in train_dataset]
        save_pkl_data(train_dataset, config.train_features)

    # Instantiate DataLoader with `train_dataset`
    # This is a generator that yields the dataset in batches
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, shuffle=False, batch_size=config.TRAIN_BATCH_SIZE)

    # Instantiate TweetDataset with validation data
    valid_dataset = SiameseDataset(
        query=df_valid.sentence1.values,
        question=df_valid.sentence2.values,
        label=df_valid.label.values,
    )

    if os.path.exists(config.valid_features):
        valid_dataset = load_pkl_data(config.valid_features)
    else:
        valid_dataset = [item for item in valid_dataset]
        save_pkl_data(valid_dataset, config.valid_features)

    # Instantiate DataLoader with `valid_dataset`
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, shuffle=False)

    # Set device as `cuda` (GPU)
    device = torch.device("cuda")
    # Load pretrained BERT (bert-base-uncased)
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    # Output hidden states
    # This is important to set since we want to concatenate the hidden states from the last 2 BERT layers
    model_config.output_hidden_states = True
    # Instantiate our model with `model_config`
    model = SiameseWmdModel(conf=model_config,
                            pretrained_model_path=config.BERT_PATH)
    # Move the model to the GPU
    model.to(device)

    # Predict on the training set and log the metrics
    pred_labels, wmd, acc, f1, auc = predict(train_data_loader, model, device)
    logger.info(f"train set : acc = {acc}, f1 score = {f1}, auc = {auc}")
    df_train["pred_label"] = pred_labels
    df_train["wmd"] = wmd
    df_train.to_csv("../output/train_predict.csv")

    thresholds = [0.25, 0.23]
    best_f1 = 0
    best_th = 0
    for threshold in thresholds:
        pred_labels, wmd, acc, f1, auc = predict(valid_data_loader, model,
                                                 device, threshold)
        logger.info(
            f"dev set :threshold={threshold}  acc = {acc}, f1 score = {f1}, auc = {auc}"
        )

        if f1 > best_f1:
            best_f1 = f1
            best_th = threshold
    print(f"best threshold: {best_th} with best f1 {best_f1}")

    df_valid["pred_label"] = pred_labels
    df_valid["wmd"] = wmd
    df_valid.to_csv("../output/dev_predict.csv")
Example #7
def train():
    """
    Train model for a speciied fold
    """
    # Read train csv and dev csv
    df_train = pd.read_csv(config.TRAIN_FILE)
    df_valid = pd.read_csv(config.DEV_FILE)

    # Instantiate TweetDataset with training data
    train_dataset = SiameseDataset(query=df_train.sentence1.values,
                                   question=df_train.sentence2.values,
                                   label=df_train.label.values)

    if os.path.exists(config.train_features):
        train_dataset = load_pkl_data(config.train_features)
    else:
        train_dataset = [item for item in train_dataset]
        save_pkl_data(train_dataset, config.train_features)

    # Instantiate DataLoader with `train_dataset`
    # This is a generator that yields the dataset in batches
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, shuffle=True, batch_size=config.TRAIN_BATCH_SIZE)

    # Instantiate TweetDataset with validation data
    valid_dataset = SiameseDataset(query=df_valid.sentence1.values,
                                   question=df_valid.sentence2.values,
                                   label=df_valid.label.values)

    if os.path.exists(config.valid_features):
        valid_dataset = load_pkl_data(config.valid_features)
    else:
        valid_dataset = [item for item in valid_dataset]
        save_pkl_data(valid_dataset, config.valid_features)

    # Instantiate DataLoader with `valid_dataset`
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE)

    # Set device as `cuda` (GPU)
    device = torch.device("cuda:2")
    # Load pretrained BERT (bert-base-uncased)
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    # Output hidden states
    # This is important to set since we want to concatenate the hidden states from the last 2 BERT layers
    model_config.output_hidden_states = True
    # Instantiate our model with `model_config`
    model = SiameseWmdModel(conf=model_config,
                            pretrained_model_path=config.BERT_PATH)
    # Move the model to the GPU
    model.to(device)

    # Calculate the number of training steps
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    # Get the list of named parameters
    param_optimizer = list(model.named_parameters())
    # Specify parameters where weight decay shouldn't be applied
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    # Define two sets of parameters: those with weight decay, and those without
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.001,
        },
        {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0,
        },
    ]
    # Instantiate AdamW optimizer with our two sets of parameters, and a learning rate of 3e-5
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    # Create a scheduler to set the learning rate at each training step
    # "Create a schedule with a learning rate that decreases linearly after linearly increasing during a warmup period." (https://pytorch.org/docs/stable/optim.html)
    # Since num_warmup_steps = 0, the learning rate starts at 3e-5, and then linearly decreases at each training step
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
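    # Illustrative numbers (not from the original): with num_train_steps = 1000 and
    # num_warmup_steps = 0, the learning rate starts at 3e-5, is about 1.5e-5 halfway
    # through training, and decays to 0 by the final step.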

    # Apply early stopping with patience of 2
    # This means to stop training new epochs when 2 rounds have passed without any improvement
    es = utils.EarlyStopping(patience=2, mode="max")

    thresholds = [0.1, 0.15, 0.20]
    best_f1 = 0
    best_th = 0
    for threshold in thresholds:

        # I'm training only for 3 epochs even though I specified 5!!!
        for epoch in range(config.EPOCHS):
            train_fn(train_data_loader,
                     model,
                     optimizer,
                     device,
                     scheduler=scheduler,
                     threshold=threshold)
            acc, f1, auc = eval_fn(valid_data_loader, model, device)

            # logger.info(f"acc = {acc}, f1 score = {f1}")
            es(f1, model, model_path=config.MODEL_SAVE_PATH)
            if es.early_stop:
                if f1 > best_f1:
                    best_f1 = f1
                    best_th = threshold
                print("Early stopping ********")
                break
    logger.info(f"best threshold:{best_th}, best f1 :{best_f1}")
Example #8
def run(args):
    # Add underscore to the tag
    args.tag = ("_" + args.tag) if args.tag is not None else ""
    # Parse prefix and postfix
    prefix = "{}{}".format("-Subword" if args.subword else "", "-Attention"
                             if args.attention else "")

    postfix = "{}{}{}".format("_subword" if args.subword else "",
                             ("_" + args.data_tag) if args.data_tag is not None else "",
                             ("_d" if args.description else ""))

    # Parse directory name
    if not args.model_dir.endswith("/"):
        args.model_dir += "/"
    if args.matching:
        print("Matching problem.")
    #########################################
    # Load models (TO-BE-REVISED)
    tokenizers = pkl.load(open(args.tokenizers, "rb"))
    n_classes = len(tokenizers["mlb"].classes_)
    try:
        desc_tokenizer = tokenizers["description"]
    except KeyError:
        desc_tokenizer = None
    #########################################
    # Building Model
    print("Building computational graph...")

    model = EntityTypingNet(
        architecture=args.arch,
        n_classes=n_classes,
        context_tokenizer=tokenizers["context"],
        mention_tokenizer=tokenizers["mention"],
        desc_tokenizer=desc_tokenizer,
        context_emb=args.context_emb,
        context_embedding_dim=args.context_embedding_dim,
        mention_emb=args.mention_emb,
        mention_embedding_dim=args.mention_embedding_dim,
        desc_emb=args.desc_emb,
        desc_embedding_dim=args.desc_embedding_dim,
        same_emb=args.same_emb,
        n_words=MAX_NUM_WORDS,
        n_mention=MAX_NUM_MENTION_WORDS,
        n_description=MAX_NUM_DESCRIPTION_WORDS,
        len_context=MAX_SEQUENCE_LENGTH,
        len_mention=MAX_MENTION_LENGTH,
        len_description=MAX_DESCRIPTION_LENGTH,
        attention=args.attention,
        subword=args.subword,
        indicator=args.indicator,
        description=False, # args.description,
        matching=args.matching,
        merge_mode=args.merge_mode,
        dropout=args.dropout,
        use_softmax=args.use_softmax,
        optimizer=args.optimizer,
        learning_rate=args.learning_rate)

    model.summary()

    # Save weights at the end of each epoch
    save_prefix = "{:s}{:s}-weights{:s}".format(args.arch, prefix, args.tag)
    filename = save_prefix + "-{epoch:02d}.hdf5"

    checkpoint = ModelCheckpoint(
        filename,
        monitor="val_loss",
        verbose=1,
        save_best_only=False,
        mode="min")
    early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
    callbacks_list = [checkpoint, early]

    X_train, Z_train, y_train, D_train = load_pkl_data(
        args.model_dir, "training", postfix, indicator=args.indicator, matching=args.matching)
    ######################################################
    """
    print(X_train.shape, y_train.shape)
    print("Stacking positive samples")
    n_instance = X_train.shape[0] // 6
    idxs = [i * 6 for i in range(n_instance)]
    tmp = np.vstack([X_train[idxs] for _ in range(4)])
    X_train = np.vstack([X_train, tmp])
    del tmp
    tmp = np.vstack([Z_train[idxs] for _ in range(4)])
    Z_train = np.vstack([Z_train, tmp])
    del tmp
    tmp = np.hstack([y_train[idxs] for _ in range(4)])
    y_train = np.hstack([y_train, tmp])
    del tmp
    if args.description:
        tmp = np.vstack([D_train[idxs] for _ in range(4)])
        D_train = np.vstack([D_train, tmp])
    """
    ######################################################
    # input = [X_train, Z_train]
    print(X_train.shape, Z_train.shape, y_train.shape)

    #if args.use_softmax:
    #    y_train =  np.array(mlb.inverse_transform(y_train)).flatten()
    inputs = [X_train, Z_train, D_train] if args.description else [X_train, Z_train]
    print("Begin training...")
    model.fit(
        inputs,
        y_train,
        batch_size=args.batch_size,
        epochs=args.epochs,
        validation_split=0.01,
        callbacks=callbacks_list)

    # Evaluation
    record = 0
    index = 0

    X_val, Z_val, y_val, D_val = load_pkl_data(
        args.model_dir, "validation", postfix, indicator=args.indicator, description=args.description)

    print("Loading trained weights for validation...")
    for i in range(1, args.epochs + 1, 1):
        # Deal with model_name for each epoch
        model_name = "{:s}-{:02d}.hdf5".format(save_prefix, i)
        model.load_weights(model_name)

        f = predict(
            model,
            X_val,
            Z_val,
            y_val,
            model_name,
            "results.txt",
            return_mf1=True,
            use_softmax=args.use_softmax)

        # When F1 scores tie, always keep the model trained for more epochs
        if record <= f:
            record = f
            index = i

    print("\n * Best micro-F1 at Validation: epoch #{:02d}".format(index))
    # Test model with best micro F1 score
    model_name = "{:s}-{:02d}.hdf5".format(save_prefix, index)
    just_test(
        model=model,
        filename=model_name,
        postfix=postfix,
        use_softmax=args.use_softmax,
        indicator=args.indicator)

    K.clear_session()
Example #9
def get_table_lookup(data_dir='vocab'):
    return utils.load_pkl_data('ind2vec.p', data_dir=data_dir)
Example #10
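# Note: this excerpt begins midway through a collate_fn that pads the premise and
# hypothesis index arrays to a common length before converting them to tensors.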
                                  mode='constant',
                                  constant_values=0)

    return (torch.from_numpy(premise_data), torch.from_numpy(hypo_data),
            torch.LongTensor(premise_lens), torch.LongTensor(hypo_lens),
            torch.LongTensor(targets))


class SNLI_Dataset(Dataset):
    max_len = MAX_LEN

    def __init__(self, data, target):
        self.data = [[premise[:self.max_len], hypo[:self.max_len]]
                     for premise, hypo in data]
        self.target = target

    def __len__(self):
        return len(self.data)

    def __getitem__(self, ix):
        x = self.data[ix]
        lens = [len(x[0]), len(x[1])]
        target = self.target[ix]
        return x, lens, target


if __name__ == '__main__':
    train_data = utils.load_pkl_data('snli_train_ind.p')
    val_data = utils.load_pkl_data('snli_val_ind.p')
    print('Max sentence length:', get_max_len(train_data, val_data))