Example no. 1
    def loaders(self,
                batch_size: int,
                shuffle_train=False,
                shuffle_test=False,
                num_workers: int = 0) -> (DataLoader, DataLoader):

        # Use BucketBatchSampler so each batch groups texts of similar length
        train_sampler = BucketBatchSampler(self.train_set,
                                           batch_size=batch_size,
                                           drop_last=True,
                                           sort_key=lambda r: len(r['text']))
        test_sampler = BucketBatchSampler(self.test_set,
                                          batch_size=batch_size,
                                          drop_last=True,
                                          sort_key=lambda r: len(r['text']))

        print("trainset", self.train_set)

        train_loader = DataLoader(dataset=self.train_set,
                                  batch_sampler=train_sampler,
                                  collate_fn=collate_fn,
                                  num_workers=num_workers)
        test_loader = DataLoader(dataset=self.test_set,
                                 batch_sampler=test_sampler,
                                 collate_fn=collate_fn,
                                 num_workers=num_workers)
        return train_loader, test_loader
Example no. 2
def test_bucket_batch_sampler_uneven():
    data_source = [[1], [2], [3], [4], [5]]
    sort_key = lambda r: len(r)
    batch_size = 2
    sampler = BucketBatchSampler(
        data_source, batch_size, sort_key=sort_key, drop_last=False, bucket_size_multiplier=2)
    batches = list(sampler)
    assert len(batches) == 3
    assert len(sampler) == 3
    sampler = BucketBatchSampler(
        data_source, batch_size, sort_key=sort_key, drop_last=True, bucket_size_multiplier=2)
    batches = list(sampler)
    assert len(batches) == 2
    assert len(sampler) == 2
Example no. 3
def test_pickleable():
    sampler = SequentialSampler(list(range(10)))
    batch_sampler = BucketBatchSampler(sampler,
                                       batch_size=2,
                                       drop_last=False,
                                       bucket_size_multiplier=2)
    pickle.dumps(batch_sampler)
Example no. 4
def get_data_loader(sampler_name, dataset, batch_size, max_tokens, num_workers,
                    shuffle):
    kwargs_test = {}
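    # Pick a batch sampler: bucket by source-sentence length, cap tokens per batch,
    # or fall back to plain batch_size/shuffle kwargs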
    if sampler_name == "bucket":
        sampler = BucketBatchSampler(
            SequentialSampler(dataset),
            batch_size=batch_size,
            drop_last=False,
            sort_key=lambda i: len(dataset.datasets.iloc[i]["src"].split()))
    elif sampler_name == "maxtokens":
        sampler = MaxTokensBatchSampler(
            SequentialSampler(dataset),
            shuffle=shuffle,
            batch_size=batch_size,
            max_tokens=max_tokens,
            drop_last=False,
            sort_key=lambda i: len(dataset.datasets.iloc[i]["src"].split()))
    else:
        sampler = None
        kwargs_test = {"batch_size": batch_size, "shuffle": shuffle}

    # Define dataloader
    data_loader = DataLoader(
        dataset,
        num_workers=num_workers,
        collate_fn=lambda x: TranslationDataset.collate_fn(x, max_tokens),
        pin_memory=True,
        batch_sampler=sampler,
        **kwargs_test)
    return data_loader
Example no. 5
def test_bucket_batch_sampler__drop_last():
    sampler = SequentialSampler(list(range(10)))
    batch_sampler = BucketBatchSampler(sampler,
                                       batch_size=3,
                                       drop_last=True,
                                       bucket_size_multiplier=2)
    assert len(batch_sampler) == 3
    assert len(list(iter(batch_sampler))) == 3
Example no. 6
def test_bucket_batch_sampler():
    sampler = SequentialSampler(list(range(10)))
    batch_sampler = BucketBatchSampler(sampler,
                                       batch_size=3,
                                       drop_last=False,
                                       bucket_size_multiplier=2)
    assert len(batch_sampler) == 4
    assert list(batch_sampler) == [[0, 1, 2], [3, 4, 5], [9], [6, 7, 8]]
Example no. 7
def test_bucket_batch_sampler_last_batch_first():
    data_source = [torch.tensor([j for j in range(i)]) for i in range(100)]
    sort_key = lambda r: len(r)
    batch_size = 1
    batches = list(
        BucketBatchSampler(
            data_source, batch_size, sort_key=sort_key, drop_last=False, bucket_size_multiplier=2))
    # The longest sequence (index 99) should land in the first batch
    assert 99 == batches[0][0]
Example no. 8
def test_bucket_batch_sampler():
    data_source = [[1], [2], [3], [4], [5], [6]]
    sort_key = lambda r: len(r)
    batch_size = 2
    batches = list(
        BucketBatchSampler(data_source,
                           batch_size,
                           sort_key,
                           bucket_size_multiplier=2))
    assert len(batches) == 3
Example no. 9
def main(args):
    """
    Main training routine specific for this project
    :param args: parsed command-line arguments
    """
    if args.train_file is not None and args.dev_file is not None:
        if args.load_checkpoint:
            model = StreamingPunctuatorModel.load_from_checkpoint(args.load_checkpoint, **vars(args))
        else:
            model = StreamingPunctuatorModel(**vars(args))

        batch_size = args.batch_size

        train_dataset = PunctuationDataset(tokenizer=model.tokenizer, filename=args.train_file, label_delay=args.label_delay)
        dev_dataset = PunctuationDataset(tokenizer=model.tokenizer, filename=args.dev_file, label_delay=args.label_delay)

        random_sampler = RandomSampler(train_dataset)
        
        batch_iterator = BucketBatchSampler(random_sampler, batch_size=batch_size, drop_last=False, sort_key=lambda i: train_dataset[i]["length"], bucket_size_multiplier=100)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_sampler=batch_iterator, collate_fn=train_dataset.collate_batch, num_workers=8)


        dev_loader = torch.utils.data.DataLoader(
                dataset=dev_dataset,
                batch_size=batch_size,
                shuffle=False,
                collate_fn=dev_dataset.collate_batch,
                num_workers=2)



        checkpoint_callback = ModelCheckpoint(
                save_top_k=4,
                save_last=True,
                verbose=True,
                monitor='val_f1',
                mode='max',
                prefix=''
        )        
        trainer = Trainer.from_argparse_args(args,
                                             checkpoint_callback=checkpoint_callback,
                                             callbacks=[
                                             ])
        trainer.fit(model, train_dataloader=train_loader,
                    val_dataloaders=dev_loader)
    elif args.process_stdin and args.load_checkpoint is not None:
        model = StreamingPunctuatorModel.load_from_checkpoint(args.load_checkpoint)
        while True:
            l = sys.stdin.readline()
            if not l: break
            print(model.process_line(l.strip()))
            sys.stdout.flush()
        
    else:
        raise Exception("Either --train-file and --dev-file or --process-stdin and --load-checkpoint should be specified")
Example no. 10
def get_data_loader(data,
                    batch_size,
                    drop_last,
                    collate_fn=collate_fn_eval_base):
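    # The negative length sort key orders each bucket from longest to shortest example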
    sampler = BucketBatchSampler(data,
                                 batch_size,
                                 drop_last=drop_last,
                                 sort_key=lambda row: -len(row['word_ids']))

    loader = DataLoader(data, batch_sampler=sampler, collate_fn=collate_fn)

    return loader
Example no. 11
def test_bucket_batch_sampler_last_batch_first():
    data_source = [[1], [2], [3], [4], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
    sort_key = lambda r: len(r)
    batch_size = 2
    batches = list(
        BucketBatchSampler(data_source,
                           batch_size,
                           sort_key,
                           biggest_batches_first=True,
                           bucket_size_multiplier=2))
    # Largest batch (4) is in first batch
    assert 4 in batches[0]
Example no. 12
def test_bucket_batch_sampler_uneven_length():
    data_source = [[1], [2], [3], [4], [5]]
    sort_key = lambda i: len(data_source[i])
    batch_size = 2
    sampler = SequentialSampler(data_source)
    batch_sampler = BucketBatchSampler(sampler,
                                       batch_size,
                                       sort_key=sort_key,
                                       drop_last=False,
                                       bucket_size_multiplier=2)
    batches = list(batch_sampler)
    assert len(batches) == 3
    assert len(batch_sampler) == 3
    batch_sampler = BucketBatchSampler(sampler,
                                       batch_size,
                                       sort_key=sort_key,
                                       drop_last=True,
                                       bucket_size_multiplier=2)
    batches = list(batch_sampler)
    assert len(batches) == 2
    assert len(batch_sampler) == 2
Example no. 13
def test_bucket_batch_sampler_sorted():
    data_source = [[1], [2], [3], [4], [5]]
    sort_key = lambda i: data_source[i]
    batch_size = len(data_source)
    sampler = SequentialSampler(data_source)
    batches = list(
        BucketBatchSampler(sampler,
                           batch_size,
                           sort_key=sort_key,
                           drop_last=False,
                           bucket_size_multiplier=1))
    for i, batch in enumerate(batches):
        assert batch[0] == i
Example no. 14
def test_bucket_batch_sampler_sorted():
    data_source = [[1], [2], [3], [4], [5]]
    sort_key = lambda r: r[0]
    batch_size = len(data_source)
    batches = list(
        BucketBatchSampler(data_source,
                           batch_size,
                           sort_key,
                           biggest_batches_first=False,
                           bucket_size_multiplier=1))
    # With biggest_batches_first=False, batches come out in sorted order
    for i, batch in enumerate(batches):
        assert batch[0] == i
Example no. 15
def get_data_loader(data, batch_size, drop_last, use_rnn, is_train=False):

    # if is_train:
    sampler = BucketBatchSampler(data,
                                 batch_size,
                                 drop_last=drop_last,
                                 sort_key=lambda row: -len(row["sents"]))
    # else:
    # 	sampler = SequentialSampler(data)

    collate_fn = collate_fn_rnn if use_rnn else collate_fn_transformer

    loader = DataLoader(data,
                        batch_sampler=sampler,
                        collate_fn=collate_fn,
                        pin_memory=False)
    # shuffle=True,
    # num_workers=1)
    return loader
Example no. 16
    def test_dataloader(self):
        tokenizer = Tokenizer.from_file("test/tokenizer.json") 
        tokenizer.add_special_tokens(["<s>", "</s>"])

        dataset = PunctuationDataset(tokenizer, "test/dev.txt")
        
        batch_size = 8

        random_sampler = RandomSampler(dataset)
        
        batch_iterator = BucketBatchSampler(random_sampler, batch_size=batch_size, drop_last=False, sort_key=lambda x: dataset[x]["length"], bucket_size_multiplier=100)
        dataloader = torch.utils.data.DataLoader(dataset, batch_sampler=batch_iterator, collate_fn=dataset.collate_batch)

        for i in range(2):
            print(f"Testing epoch {i}")
            for j, batch in enumerate(dataloader):
                if j == 0:
                    # make sure that the length difference inside a batch is not > 20%
                    self.assertTrue((batch["lengths"].max() - batch["lengths"].min()) / batch["lengths"].max() < 0.2 )
Example no. 17
    def train_dataloader(self) -> torch.utils.data.DataLoader:
        """Return a PyTorch DataLoader for the training set.

        Requires calling ``prepare_data`` beforehand.

        Return:
            PyTorch DataLoader
        """
        sampler = BucketBatchSampler(
            RandomSampler(self.train_dataset),
            batch_size=self.config.batch_size.train,
            drop_last=False,
            sort_key=lambda sample: len(
                self.train_dataset[sample][const.TARGET].split()),
            # bucket_size_multiplier=100,
        )

        return torch.utils.data.DataLoader(
            self.train_dataset,
            batch_sampler=sampler,
            num_workers=self.config.num_data_workers,
            collate_fn=self.data_encoders.collate_fn,
            pin_memory=torch.cuda.is_initialized(),  # NOQA
        )
Example no. 18
dev_log_template = ' '.join(
    '{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{:8.6f},{:12.4f},{:12.4f}'
    .split(','))
log_template = ' '.join(
    '{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{},{:12.4f},{}'
    .split(','))
makedirs(args.save_path)
print(header)

for epoch in range(args.epochs):
    n_correct, n_total = 0, 0

    train_sampler = SequentialSampler(train)
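    # Re-bucket the sequential indices by premise length each epoch; the positional True enables drop_last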
    train_batch_sampler = BucketBatchSampler(
        train_sampler,
        args.batch_size,
        True,
        sort_key=lambda r: len(train[r]['premise']))
    train_iterator = DataLoader(train,
                                batch_sampler=train_batch_sampler,
                                collate_fn=collate_fn,
                                pin_memory=torch.cuda.is_available(),
                                num_workers=0)
    for batch_idx, (premise_batch, hypothesis_batch,
                    label_batch) in enumerate(train_iterator):

        # switch model to training mode, clear gradient accumulators
        model.train()
        torch.set_grad_enabled(True)
        opt.zero_grad()
Example no. 19
    TEXT_LENGTH = opt.text_length
    ROUTING_TYPE = opt.routing_type
    NUM_ITERATIONS = opt.num_iterations
    BATCH_SIZE = opt.batch_size
    MODEL_WEIGHT = opt.load_model_weight
    GPU = opt.gpu

    vocab_size, num_class, test_dataset = load_data(data_type='custom',
                                                    preprocessing=False,
                                                    fine_grained=FINE_GRAINED,
                                                    verbose=True,
                                                    text_length=TEXT_LENGTH)

    print("[!] vocab_size: {}, num_class: {}".format(vocab_size, num_class))
    test_sampler = BucketBatchSampler(test_dataset,
                                      BATCH_SIZE,
                                      False,
                                      sort_key=lambda row: len(row['text']))
    test_iterator = DataLoader(test_dataset,
                               batch_sampler=test_sampler,
                               collate_fn=collate_fn)

    model = Model(vocab_size,
                  num_class=num_class,
                  routing_type=ROUTING_TYPE,
                  num_iterations=NUM_ITERATIONS)
    if MODEL_WEIGHT is not None:
        model.load_state_dict(torch.load('epochs/' + MODEL_WEIGHT))
    margin_loss, focal_loss = MarginLoss(), FocalLoss()
    if torch.cuda.is_available():
        model, margin_loss, focal_loss = model.to(
            'cuda:{}'.format(GPU)), margin_loss.to(
Example no. 20
iterations = 0
start = time.time()
best_dev_acc = -1
header = '  Time Epoch Iteration Progress    (%Epoch)   Loss   Dev/Loss     Accuracy  Dev/Accuracy'
dev_log_template = ' '.join(
    '{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{:8.6f},{:12.4f},{:12.4f}'
    .split(','))
log_template = ' '.join(
    '{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{},{:12.4f},{}'.split(','))
makedirs(args.save_path)
print(header)

for epoch in range(args.epochs):
    n_correct, n_total = 0, 0

    train_sampler = BucketBatchSampler(
        train, args.batch_size, True, sort_key=lambda r: len(r['premise']))
    train_iterator = DataLoader(
        train,
        batch_sampler=train_sampler,
        collate_fn=collate_fn,
        pin_memory=torch.cuda.is_available(),
        num_workers=0)
    for batch_idx, (premise_batch, hypothesis_batch, label_batch) in enumerate(train_iterator):

        # switch model to training mode, clear gradient accumulators
        model.train()
        torch.set_grad_enabled(True)
        opt.zero_grad()

        iterations += 1
Example no. 21
def test_pickleable():
    data_source = [[1], [2], [3], [4], [5]]
    sampler = BucketBatchSampler(data_source, batch_size=2, drop_last=False)
    pickle.dumps(sampler)
Example no. 22
def run_experiment(datapath,
                   src,
                   trg,
                   model_name,
                   domain=None,
                   smart_batch=False):
    start_time = time.time()

    ###########################################################################
    ###########################################################################

    wandb.init(project=WANDB_PROJECT, entity='salvacarrion', reinit=True)
    config = wandb.config
    config.model_name = MODEL_NAME
    config.domain = domain
    config.max_epochs = MAX_EPOCHS
    config.learning_rate = LEARNING_RATE
    config.batch_size = BATCH_SIZE
    config.max_tokens = MAX_TOKENS
    config.warmup_updates = WARMUP_UPDATES
    config.patience = PATIENCE
    config.acc_gradients = ACC_GRADIENTS
    config.weight_decay = WEIGHT_DECAY
    config.clip_gradients = CLIP_GRADIENTS
    config.multigpu = MULTIGPU
    config.device1 = str(DEVICE1)
    config.device2 = str(DEVICE2)
    config.num_workers = NUM_WORKERS
    config.tok_model = TOK_MODEL
    config.tok_size = TOK_SIZE
    config.tok_folder = TOK_FOLDER
    config.lowercase = LOWERCASE
    config.truncate = TRUNCATE
    config.max_length_truncate = MAX_LENGTH_TRUNC
    config.sampler_name = str(SAMPLER_NAME)
    print(config)
    ###########################################################################
    ###########################################################################

    checkpoint_path = os.path.join(datapath, DATASET_CHECKPOINT_NAME,
                                   f"{model_name}_{domain}_acc")

    # Load tokenizers
    src_tok, trg_tok = helpers.get_tokenizers(os.path.join(
        datapath, DATASET_TOK_NAME, TOK_FOLDER),
                                              src,
                                              trg,
                                              tok_model=TOK_MODEL,
                                              lower=LOWERCASE,
                                              truncation=TRUNCATE,
                                              max_length=MAX_LENGTH_TRUNC)

    # Load dataset
    datapath_clean = DATASET_CLEAN_SORTED_NAME if smart_batch else DATASET_CLEAN_NAME
    if TOK_MODEL == "bpe":  # Do not preprocess again when using bpe
        src_tok.apply_bpe = False
        trg_tok.apply_bpe = False
        datapath_clean = os.path.join(DATASET_TOK_NAME, TOK_FOLDER)

    train_ds = TranslationDataset(os.path.join(datapath, datapath_clean),
                                  src_tok, trg_tok, "train")
    val_ds = TranslationDataset(os.path.join(datapath, datapath_clean),
                                src_tok, trg_tok, "val")

    # Build dataloaders
    kwargs_train = {}
    kwargs_val = {}
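    # Choose a batching strategy: length-bucketed batches, token-capped batches, or plain shuffled batches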
    if SAMPLER_NAME == "bucket":
        train_sampler = BucketBatchSampler(
            SequentialSampler(train_ds),
            batch_size=BATCH_SIZE,
            drop_last=False,
            sort_key=lambda i: len_func(train_ds, i))
        val_sampler = BucketBatchSampler(
            SequentialSampler(val_ds),
            batch_size=BATCH_SIZE,
            drop_last=False,
            sort_key=lambda i: len_func(val_ds, i))
    elif SAMPLER_NAME == "maxtokens":
        train_sampler = MaxTokensBatchSampler(
            SequentialSampler(train_ds),
            shuffle=True,
            batch_size=BATCH_SIZE,
            max_tokens=MAX_TOKENS,
            drop_last=False,
            sort_key=lambda i: len_func(train_ds, i))
        val_sampler = MaxTokensBatchSampler(
            SequentialSampler(val_ds),
            shuffle=False,
            batch_size=BATCH_SIZE,
            max_tokens=MAX_TOKENS,
            drop_last=False,
            sort_key=lambda i: len_func(val_ds, i))
    else:
        train_sampler = val_sampler = None
        kwargs_train = {"batch_size": BATCH_SIZE, "shuffle": True}
        kwargs_val = {"batch_size": BATCH_SIZE, "shuffle": False}

    # Define dataloader
    train_loader = DataLoader(
        train_ds,
        num_workers=NUM_WORKERS,
        collate_fn=lambda x: TranslationDataset.collate_fn(x, MAX_TOKENS),
        pin_memory=True,
        batch_sampler=train_sampler,
        **kwargs_train)
    val_loader = DataLoader(
        val_ds,
        num_workers=NUM_WORKERS,
        collate_fn=lambda x: TranslationDataset.collate_fn(x, MAX_TOKENS),
        pin_memory=True,
        batch_sampler=val_sampler,
        **kwargs_val)

    # Instantiate model #1
    model = Transformer(d_model=256,
                        enc_layers=3,
                        dec_layers=3,
                        enc_heads=8,
                        dec_heads=8,
                        enc_dff_dim=512,
                        dec_dff_dim=512,
                        enc_dropout=0.1,
                        dec_dropout=0.1,
                        max_src_len=2000,
                        max_trg_len=2000,
                        src_tok=src_tok,
                        trg_tok=trg_tok,
                        static_pos_emb=True)  #.to(DEVICE1)
    model.apply(initialize_weights)
    print(f'The model has {model.count_parameters():,} trainable parameters')
    criterion = nn.CrossEntropyLoss(
        ignore_index=trg_tok.word2idx[trg_tok.PAD_WORD])
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # Load weights
    # checkpoint_path = os.path.join(datapath, DATASET_CHECKPOINT_NAME, "transformer_multi30k_best_new.pt")
    # print(f"Loading weights from: {checkpoint_path}")
    # model.load_state_dict(torch.load(checkpoint_path))

    # Tensorboard (it needs some epochs to start working ~10-20)
    tb_writer = SummaryWriter(
        os.path.join(datapath, DATASET_LOGS_NAME, f"{model_name}"))
    wandb.watch(model)

    # Prepare model and data for acceleration
    model, optimizer, train_loader, val_loader = accelerator.prepare(
        model, optimizer, train_loader, val_loader)

    # Train and validate model
    fit(model,
        optimizer,
        train_loader=train_loader,
        val_loader=val_loader,
        epochs=MAX_EPOCHS,
        criterion=criterion,
        checkpoint_path=checkpoint_path,
        src_tok=src_tok,
        trg_tok=trg_tok,
        tb_writer=tb_writer)

    print("************************************************************")
    epoch_hours, epoch_mins, epoch_secs = helpers.epoch_time(
        start_time, end_time=time.time())
    print(f'Time experiment: {epoch_hours}h {epoch_mins}m {epoch_secs}s')
    print("************************************************************")
    print("Done!")
Example no. 23
def train(arg):
    version = arg.version
    device = arg.device
    block = arg.block
    batch_size = arg.batch_size
    eval_batch_size = arg.eval_batch_size
    epoch_num = arg.epoch_num
    learning_rate = arg.learning_rate
    early_stop_epoch = arg.early_stop_epoch
    valid_sample_num = arg.valid_sample_num
    train_sample_num = arg.train_sample_num
    max_len = arg.max_len
    POSSIBLE_BATCH_SIZE = arg.possible_batch_size

    # build collate_fn function
    def my_collate_fn(data):
        # x, pad_id = 1
        # bert, pad_id = 0, cls = 101, seq = 102
        length = max(d[0].shape[0] for d in data)
        length = min(max_len, length)
        x = np.empty([len(data), length + 2], dtype=np.int64)
        x.fill(0)
        x[:, 0] = 101
        x[:, -1] = 102
        for i, d in enumerate(data):
            l = min(d[0].shape[0], max_len)
            x[i, 1:l + 1] = d[0][-l:]

        y = np.vstack([d[1] for d in data])

        # turn to torch tensor
        x = torch.LongTensor(x)
        y = torch.FloatTensor(y)

        return x, y

    # load data
    dl_model_dir = os.path.join(model_dir, arg.data_name, "bert", version)
    data_cached_path = os.path.join(dl_model_dir, "data.h5")
    os.makedirs(dl_model_dir, exist_ok=True)
    print(f"output model and all the info to '{dl_model_dir}'")

    # save config
    with open(os.path.join(dl_model_dir, "config.json"), 'w',
              encoding='utf-8') as outfile:
        json.dump(
            {
                "block": block,
                "batch_size": batch_size,
                "epoch_num": epoch_num,
                "learning_rate": learning_rate,
                "early_stop_epoch": early_stop_epoch,
                "valid_sample_num": valid_sample_num,
                "train_sample_num": train_sample_num,
                "max_len": max_len,
            },
            outfile,
            indent=4)

    if arg.data_name == "bookcorpus":
        if arg.history == 1:
            _, x_train, y_train = load_text_data(block=block,
                                                 phase="train",
                                                 target_model=arg.model_type,
                                                 verbose=True)
            _, x_valid, y_valid = load_text_data(block=block,
                                                 phase="valid",
                                                 target_model=arg.model_type,
                                                 verbose=True)
            _, x_test, y_test = load_text_data(block=block,
                                               phase="test",
                                               target_model=arg.model_type,
                                               verbose=True)
        else:
            _, x_train, y_train = load_text_data_long(
                block=block,
                phase="train",
                target_model=arg.model_type,
                verbose=True,
                history=arg.history)
            _, x_valid, y_valid = load_text_data_long(
                block=block,
                phase="valid",
                target_model=arg.model_type,
                verbose=True,
                history=arg.history)
            _, x_test, y_test = load_text_data_long(
                block=block,
                phase="test",
                target_model=arg.model_type,
                verbose=True,
                history=arg.history)

    elif arg.data_name == "coda19":
        _, x_train, y_train = coda_load_text_data(block=block,
                                                  phase="train",
                                                  target_model=arg.model_type,
                                                  verbose=True)
        _, x_valid, y_valid = coda_load_text_data(block=block,
                                                  phase="valid",
                                                  target_model=arg.model_type,
                                                  verbose=True)
        _, x_test, y_test = coda_load_text_data(block=block,
                                                phase="test",
                                                target_model=arg.model_type,
                                                verbose=True)
    else:
        print(f"{arg.data_name} not supported yet!")
        quit()

    if arg.downsample != -1:
        random_index = np.random.RandomState(5516).permutation(
            x_train.shape[0])[:arg.downsample]
        x_train, y_train = x_train[random_index], y_train[random_index]

    random_index = np.random.permutation(x_valid.shape[0])[:valid_sample_num]
    x_valid, y_valid = x_valid[random_index], y_valid[random_index]
    random_index = np.random.permutation(x_test.shape[0])[:]
    x_test, y_test = x_test[random_index], y_test[random_index]

    print("Train", x_train.shape, y_train.shape)
    print("Test", x_test.shape, y_test.shape)
    print("Valid", x_valid.shape, y_valid.shape)

    x_valid, x_test = x_valid.tolist(), x_test.tolist()

    validation = data.DataLoader(
        StoryDataset(x_valid, y_valid),
        batch_sampler=BucketBatchSampler(
            torch.utils.data.sampler.SequentialSampler(x_valid),
            batch_size=batch_size,
            drop_last=True,
            sort_key=lambda i: x_valid[i].shape[0],
            bucket_size_multiplier=100),
        num_workers=3,
        collate_fn=my_collate_fn,
    )
    testing = data.DataLoader(
        StoryDataset(x_test, y_test),
        batch_sampler=BucketBatchSampler(
            torch.utils.data.sampler.SequentialSampler(x_test),
            batch_size=batch_size,
            drop_last=True,
            sort_key=lambda i: x_test[i].shape[0],
            bucket_size_multiplier=100),
        num_workers=3,
        collate_fn=my_collate_fn,
    )

    if arg.model_type == "bert":
        model = BertRegressor(output_size=y_train.shape[1])
    elif arg.model_type == "scibert":
        pretrained_model = AutoModel.from_pretrained(
            "allenai/scibert_scivocab_uncased")
        pretrained_config = AutoConfig.from_pretrained(
            "allenai/scibert_scivocab_uncased")
        model = BertRegressor(output_size=y_train.shape[1],
                              model=pretrained_model,
                              config=pretrained_config)

    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loss_function = lambda y_pred, y_batch: 1 - F.cosine_similarity(
        y_pred, y_batch).mean()

    best_epoch = 0
    best_cosine = 0.0
    stopper = EarlyStop(mode="max", history=early_stop_epoch)

    for epoch in range(1, epoch_num + 1):
        # generate data
        if arg.downsample == -1 or arg.downsample > train_sample_num:
            random_index = np.random.permutation(
                x_train.shape[0])[:train_sample_num]
            x_train_epoch, y_train_epoch = x_train[random_index], y_train[
                random_index]
            x_train_epoch = x_train_epoch.tolist()
        else:
            x_train_epoch, y_train_epoch = x_train, y_train
            x_train_epoch = x_train_epoch.tolist()

        training = data.DataLoader(
            StoryDataset(x_train_epoch, y_train_epoch),
            batch_sampler=BucketBatchSampler(
                torch.utils.data.sampler.SequentialSampler(x_train_epoch),
                batch_size=batch_size
                if POSSIBLE_BATCH_SIZE == -1 else POSSIBLE_BATCH_SIZE,
                drop_last=True,
                sort_key=lambda i: x_train_epoch[i].shape[0],
                bucket_size_multiplier=100),
            num_workers=3,
            collate_fn=my_collate_fn,
        )

        # training
        model.train()
        total_loss = 0
        total_acc = 0
        total_count = len(
            training.dataset) // training.batch_sampler.batch_size
        error_case = 0
        if POSSIBLE_BATCH_SIZE != -1:
            accumulation_steps = batch_size // POSSIBLE_BATCH_SIZE
        for count, (x_batch, y_batch) in enumerate(training, 1):
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            try:
                y_pred = model(x_batch)
                loss = loss_function(y_pred, y_batch)
                loss.backward()
                total_loss += loss.item()

                if POSSIBLE_BATCH_SIZE == -1:
                    optimizer.step()
                    optimizer.zero_grad()
                elif count % accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()

            except RuntimeError:
                #print(x_batch.shape, y_batch.shape)
                error_case += 1
                continue

            # compute cosine
            total_acc += F.cosine_similarity(y_pred, y_batch,
                                             dim=1).mean().item()

            print(
                "\x1b[2K\rEpoch: {} / {} [{:.2f}%] Loss: {:.5f} Acc: {:.5f} Error: {}"
                .format(epoch, epoch_num, 100.0 * count / total_count,
                        total_loss / count, total_acc / count, error_case),
                end="")

        print()
        if epoch % 1 == 0:
            cosine = evaluate(model, validation, device=device)

            if cosine > best_cosine:
                best_model = copy.deepcopy(model.state_dict())
                best_cosine = cosine
                best_epoch = epoch

            # check early stopping
            if stopper.check(cosine):
                print("Early Stopping at Epoch = ", epoch)
                break

    # finish training
    print("Loading model from epoch {}".format(best_epoch))
    torch.save(best_model, os.path.join(dl_model_dir, "best_model.pt"))
    model.load_state_dict(best_model)
    test_cosine = evaluate(model, testing, device)
    print("Testing Cosine = ", test_cosine)
    with open(os.path.join(dl_model_dir, "result.json"), 'w',
              encoding='utf-8') as outfile:
        json.dump(
            {
                "cosine": float(test_cosine),
                "best_cosine": float(best_cosine),
                "best_epoch": best_epoch,
                "max_len": max_len,
            },
            outfile,
            indent=4)
Example no. 24
    def train_f(config):

        run_name = 'run_%d' % run_config['run']
        run_config['run'] = run_config['run'] + 1

        visdom_logger.new_run(run_name)

        model_path = Path('/tmp/models/')

        delete_checkpoint(model_path)

        train_batch_sampler = FlexibleBucketBatchSampler(
            train,
            config.batch_size,
            sampler=train_sampler,
            drop_last=True,
            sort_key=lambda r: len(r['text']))

        train_loader = DataLoader(train,
                                  batch_sampler=train_batch_sampler,
                                  collate_fn=collate_fn,
                                  pin_memory=config.use_cuda,
                                  num_workers=0)

        dev_batch_sampler = FlexibleBucketBatchSampler(
            train,
            config.test_batch_size,
            drop_last=True,
            sampler=dev_sampler,
            sort_key=lambda r: len(r['text']))

        dev_loader = DataLoader(train,
                                batch_sampler=dev_batch_sampler,
                                collate_fn=collate_fn,
                                pin_memory=config.use_cuda,
                                num_workers=0)

        test_sampler = BucketBatchSampler(test,
                                          config.test_batch_size,
                                          drop_last=True,
                                          sort_key=lambda r: len(r['text']))

        test_loader = DataLoader(test,
                                 batch_sampler=test_sampler,
                                 collate_fn=collate_fn,
                                 pin_memory=config.use_cuda,
                                 num_workers=0)

        embedding = nn.Embedding(text_encoder.vocab_size, config.d_embedding)

        if config.word_vectors_freeze:
            embedding.weight.requires_grad = False

        if config.word_vectors:
            # Load word vectors
            word_vectors = word_to_vector.aliases[config.word_vectors](
                cache=config.vector_cache_dir)
            for i, token in enumerate(text_encoder.vocab):
                embedding.weight.data[i] = word_vectors[token]
            print(
                'Found vectors for %d tokens in vocabulary' %
                len([t for t in text_encoder.vocab if t in word_vectors.stoi]))

        model = LSTMClassifier(d_in=embedding.embedding_dim,
                               d_out=label_encoder.vocab_size,
                               d_hidden=config.d_hidden,
                               dropout=config.dropout,
                               embedding=embedding)
        model.to(device)

        optimizer_params = list(
            filter(lambda p: p.requires_grad, model.parameters()))

        optimizer = torch.optim.SGD(optimizer_params,
                                    lr=config.lr,
                                    momentum=config.momentum)

        trainer = create_supervised_trainer(model,
                                            optimizer,
                                            F.nll_loss,
                                            device=device)

        evaluator_train = \
            create_supervised_evaluator(model,
                                        metrics={
                                            'accuracy': CategoricalAccuracy(),
                                            'nll': Loss(F.nll_loss)},
                                        device=device)

        evaluator_dev = \
            create_supervised_evaluator(model,
                                        metrics={
                                            'accuracy': CategoricalAccuracy(),
                                            'nll': Loss(F.nll_loss)},
                                        device=device)

        visdom_logger.attach_trainer(trainer)
        visdom_logger.attach_evaluator(evaluator_train, trainer, phase='train')
        visdom_logger.attach_evaluator(evaluator_dev, trainer, phase='dev')

        lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer, lambda epoch_: 1. / (1 + config.lr_decay *
                                            (epoch_ - 1)))

        # scoring function for early stopping and checkpointing
        def score_function(engine):
            dev_loss = engine.state.metrics['nll']
            return -dev_loss

        early_stopping = EarlyStopping(patience=15,
                                       score_function=score_function,
                                       trainer=trainer)

        def checkpoint_score_function(engine):
            dev_accuracy = engine.state.metrics['accuracy']
            return dev_accuracy

        checkpoint = ModelCheckpoint('/tmp/models',
                                     'checkpoint',
                                     score_function=checkpoint_score_function,
                                     n_saved=1,
                                     create_dir=True,
                                     score_name="dev_accuracy")

        # lets train!
        train_model(
            model=model,
            trainer=trainer,
            epochs=config.epochs,
            evaluator_train=evaluator_train,
            evaluator_dev=evaluator_dev,
            train_loader=train_loader,
            dev_loader=dev_loader,
            lr_scheduler=lr_scheduler,
            early_stopping=early_stopping if config.early_stopping else None,
            checkpoint=checkpoint if config.checkpoint else None)

        # load checkpointed (best) model and evaluate on test loader
        model = torch.load(list(model_path.glob('checkpoint_model*.pth'))[0])

        test_evaluator = \
            create_supervised_evaluator(model,
                                        metrics={
                                            'accuracy': CategoricalAccuracy(),
                                            'nll': Loss(F.nll_loss)},
                                        device=device)

        test_evaluator.run(test_loader)
        metrics = test_evaluator.state.metrics
        print("Test Results: Avg accuracy: {:.2f} Avg loss: {:.2f}".format(
            metrics['accuracy'], metrics['nll']))

        test_evaluator.run(dev_loader)
        metrics = test_evaluator.state.metrics
        return metrics['nll']
Example no. 25
import torch
import xgboost as xgb
import pandas as pd
from torchnlp.encoders.text import WhitespaceEncoder
from torchnlp.samplers import BucketBatchSampler
from torchnlp.utils import collate_tensors
from torchnlp.encoders.text import stack_and_pad_tensors
from torchnlp.nn import LockedDropout

loaded_data = ["now this ain't funny", "so don't you dare laugh"]
encoder = WhitespaceEncoder(loaded_data)
encoded_data = [encoder.encode(example) for example in loaded_data]

print("encoded_data", encoded_data)

encoded_data = [torch.randn(2), torch.randn(3), torch.randn(4), torch.randn(5)]

train_sampler = torch.utils.data.sampler.SequentialSampler(encoded_data)
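# Group the sequential indices into batches of two, sorted by tensor length within each bucket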
train_batch_sampler = BucketBatchSampler(
    train_sampler,
    batch_size=2,
    drop_last=False,
    sort_key=lambda i: encoded_data[i].shape[0])

batches = [[encoded_data[i] for i in batch] for batch in train_batch_sampler]
batches = [
    collate_tensors(batch, stack_tensors=stack_and_pad_tensors)
    for batch in batches
]

print("batches=", batches)
Example no. 26
def main(params_file,
         batch_size,
         epochs,
         model_file_name,
         learning_rate=1e-3,
         weight_decay=1e-5,
         n_workers=6,
         use_pretrained_embs=False):

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    seed = 0
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Files to Load
    train_json_file = './cleaned_datasets/intro_1_para/train.txt'
    val_json_file = './cleaned_datasets/intro_1_para/val.txt'
    word2vec_model_path = '/word2vec_100D.w2v'

    print('[' + str(datetime.datetime.now()) + '] : Reading Files')

    if use_pretrained_embs:
        wordsModel = load_w2v_model(word2vec_model_path)
        vocab = sorted(list(wordsModel.wv.vocab))
    else:
        vocab = None

    print('[' + str(datetime.datetime.now()) + '] : Creating Dataset Objects')
    train_dataset = WikiDataset.fromJsonFile(train_json_file,
                                             vocab=vocab,
                                             mode='train')
    val_dataset = WikiDataset.fromJsonFile(
        val_json_file,
        text_encoder=train_dataset.text_encoder,
        label_encoder=train_dataset.label_encoder,
        vocab=train_dataset.text_encoder.vocab,
        mode='train')

    trainset = DataLoader(train_dataset,
                          num_workers=n_workers,
                          batch_sampler=BucketBatchSampler(
                              train_dataset.data['data'],
                              batch_size=batch_size,
                              drop_last=True,
                              sort_key=lambda a: -len(a['intro'].split())),
                          collate_fn=train_dataset.collate_fn)
    valset = DataLoader(val_dataset,
                        num_workers=n_workers,
                        batch_sampler=BucketBatchSampler(
                            val_dataset.data['data'],
                            batch_size=batch_size,
                            drop_last=True,
                            sort_key=lambda a: -len(a['intro'].split())),
                        collate_fn=val_dataset.collate_fn)

    print('[' + str(datetime.datetime.now()) + '] : Reading params_file')
    with open(params_file, 'r') as stream:
        params = yaml.safe_load(stream)

    params['emb_size'] = (train_dataset.vocab_size, 100)
    params['num_classes'] = train_dataset.label_encoder.vocab_size

    print('[' + str(datetime.datetime.now()) + '] : Creating Model Object')
    classifier = create_model(params)

    if use_pretrained_embs:
        print('[' + str(datetime.datetime.now()) +
              '] : Creating Embedding Matrix')
        embedding_matrix = create_embedding_matrix(wordsModel, train_dataset)

        classifier.embeddings.weight = nn.Parameter(embedding_matrix)

        del embedding_matrix

    classifier.to(device)

    criterion = nn.CrossEntropyLoss(
        weight=torch.tensor([0, 1.36 / 1, 1.36 / 0.36]).to(device))
    optimizer = optim.Adam(classifier.parameters(),
                           lr=learning_rate,
                           weight_decay=weight_decay)

    print('[' + str(datetime.datetime.now()) + '] : Training Model ...')
    classifier = train_model(classifier, epochs, trainset, valset, criterion,
                             optimizer, device, model_file_name)

    model_utils = {
        'text_encoder': train_dataset.text_encoder,
        'label_encoder': train_dataset.label_encoder
    }

    joblib.dump(model_utils, model_file_name + str('_model_utils.pkl'))

    with open(params_file, 'w') as stream:
        yaml.dump(params, stream)