Example 1
import random
from itertools import chain

from torch.utils.data import DataLoader, Subset


def cross_val(dataset, nb_fold, batch_size, collate_fn):
    """Given a dataset yields data for cross-validation.

    Parameters
    -----------
        dataset : torch.Dataset
            The dataset used
        
        nb_fold : int
            Number of fold used for cross-validation
        
        batch_size : int
            The batch size
    
    Yields
    ------
        trainloader : torch.Dataloader
        testloader : torch.Dataloader
    """
    indices = list(range(len(dataset)))
    random.shuffle(indices)

    for trainindices, testindices in all_but_one(indices, k=nb_fold):
        trainindices = chain(*trainindices)
        trainset = Subset(dataset, list(trainindices))
        testset = Subset(dataset, list(testindices))
        trainloader = DataLoader(trainset,
                                 batch_size,
                                 shuffle=True,
                                 collate_fn=collate_fn)
        testloader = DataLoader(testset,
                                batch_size,
                                shuffle=True,
                                collate_fn=collate_fn)

        yield trainloader, testloader
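
The all_but_one helper used above is not part of the excerpt. A minimal sketch of the leave-one-fold-out splitting it appears to implement, plus a hypothetical call to cross_val on a toy dataset (both the splitter body and the TensorDataset placeholder are assumptions, not the original code):

import torch
from torch.utils.data import TensorDataset


def all_but_one(indices, k):
    # Sketch: partition the indices into k folds and yield, for each fold,
    # (the other k - 1 folds, that fold) as (train, test).
    fold_size = len(indices) // k
    folds = [indices[i * fold_size:(i + 1) * fold_size] for i in range(k)]
    for i in range(k):
        yield folds[:i] + folds[i + 1:], folds[i]


# Hypothetical usage with a toy dataset and the default collate function.
toy = TensorDataset(torch.randn(100, 8), torch.randint(0, 2, (100,)))
for trainloader, testloader in cross_val(toy, nb_fold=5,
                                         batch_size=16, collate_fn=None):
    print(len(trainloader.dataset), len(testloader.dataset))  # 80 20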
Example 2
import random
from itertools import chain

import numpy as np
import torch
from torch import nn, optim
from torch.distributions import Bernoulli
from torch.utils.data import DataLoader, Subset


def learn(model_cls, model_args, device, k=5, batch_size=32,
          seed=666, smt_epoch=100, rl_epoch=1000):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # The problem comes from the count vectorizer, which drops some words
    print("Load Dataset")
    dataset, dataclasses = load(torch_dataset=True, dataclasses=True).values()
    dataclasses = {qt._id: qt for qt in dataclasses}
    engine = build_ir_engine()

    collate_fn = embedding_collate_decorator(sequence_collate_fn)

    indices = list(range(len(dataset)))
    random.shuffle(indices)
    for i, (trainindices, testindices) in enumerate(all_but_one(indices, k=k)):
        trainindices = chain(*trainindices)
        trainset = Subset(dataset, list(trainindices))
        testset = Subset(dataset, list(testindices))
        # batch_size, not the device, is the DataLoader's second argument
        trainloader = DataLoader(trainset, batch_size, shuffle=True,
                                 collate_fn=collate_fn)
        testloader = DataLoader(testset, batch_size, shuffle=True,
                                collate_fn=collate_fn)

        print("Build model")

        model = model_cls(*model_args)
        try:
            model = model.to(device)
        except RuntimeError:
            # Moving the model to the GPU can fail with a cudnn error; retry once.
            print("cudnn error")
            model = model.to(device)

        optimizer = optim.Adam(model.parameters())
        loss_function = nn.BCELoss()

        print("Train")
        best_map = 0  # best test MAP so far, used for early stopping
        delay = 0
        max_delay = 10
        print("Supervised Machine Translation")
        for epoch in range(smt_epoch):
            model.train()
            n, mean = 0, 0
            train_predictions = []
            train_ids = []
            for x, y, q_id, qrels, _ in trainloader:
                x = x.to(device)
                y = y.to(device)
                pred = model(x)

                pred__ = pred > 0.5
                pred_ = pred__.detach().cpu().long().t().numpy().tolist()
                train_predictions.extend(pred_)
                train_ids.extend(map(lambda q: q.long().tolist(), q_id))

                loss = loss_function(pred, y.float())
                n += 1
                mean = ((n - 1) * mean + loss.item()) / n
                print(f"\rFold {i}, Epoch {epoch}\tTrain : {mean}", end="")

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            train_queries = {
                id_: dataclasses[str(id_)].get_text(pred)
                for id_, pred in zip(train_ids, train_predictions)
            }
            train_qrel = {
                id_: dataclasses[str(id_)].qrels
                for id_ in train_ids
            }
            train_map = eval_queries(train_queries, train_qrel, engine)
            print(
                f"\rFold {i}, Epoch {epoch}\tTrain Loss: {mean}, Train MAP {train_map}",
                end="")

            model.eval()
            train_mean = mean
            n, mean = 0, 0
            test_predictions = []
            test_ids = []
            for x, y, q_id, qrels, _ in testloader:
                x = x.to(device)
                y = y.to(device)

                pred = model(x)
                pred__ = pred > 0.5
                pred_ = pred__.detach().cpu().long().t().numpy().tolist()
                test_predictions.extend(pred_)
                test_ids.extend(map(lambda q: q.long().tolist(), q_id))

                loss = loss_function(pred, y.float())

                n += 1
                mean = ((n - 1) * mean + loss.item()) / n
                print(
                    f"\rFold {i}, Epoch {epoch}\tTrain Loss: {train_mean}\tTest : {mean}",
                    end="")

            test_queries = {
                id_: dataclasses[str(id_)].get_text(pred)
                for id_, pred in zip(test_ids, test_predictions)
            }
            test_qrel = {
                id_: dataclasses[str(id_)].qrels
                for id_ in test_ids
            }
            test_map = eval_queries(test_queries, test_qrel, engine)

            dataset_queries = {**train_queries, **test_queries}
            dataset_qrel = {**train_qrel, **test_qrel}
            dataset_map = eval_queries(dataset_queries, dataset_qrel, engine)

            print(
                "\b" * 500 +
                f"\nFold {i}, Epoch {epoch}\tTrain MAP {train_map}\tTest MAP : {test_map}\tDataset MAP : {dataset_map}"
            )

            if test_map > best_map:
                best_map = test_map
                delay = 0
            elif test_map < best_map:
                delay += 1
                if delay > max_delay:
                    print(best_map)
                    break

        print("Reinforcement Learning")
        for epoch in range(rl_epoch):
            model.train()
            for x, y, q_id, qrels, seq_lens in trainloader:
                x = x.to(device)
                y = y.to(device)
                pred = model(x)

                # Treat the sigmoid outputs as Bernoulli parameters and
                # sample a binary selection mask from the policy.
                sampler = Bernoulli(pred)
                batch_pred = sampler.sample()
                log_probs = sampler.log_prob(batch_pred)

                batch_ids = list(map(lambda q: q.long().tolist(), q_id))

                batch_queries = {
                    id_: dataclasses[str(id_)].get_text(pred)
                    for id_, pred in zip(batch_ids, batch_pred)
                }
                batch_qrel = {
                    id_: dataclasses[str(id_)].qrels
                    for id_ in batch_ids
                }

                batch_map = eval_queries(batch_queries, batch_qrel, engine)
                print(f"\rTrain Map : {batch_map}", end="")
                # REINFORCE: minimise the negative reward-weighted
                # log-likelihood, summed to a scalar so backward() works.
                loss = -batch_map * log_probs.sum()

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            model.eval()
            print()
            for x, y, q_id, qrels, seq_lens in testloader:
                x = x.to(device)
                y = y.to(device)

                pred = model(x)

                # Sample a mask from the policy; no gradient step here.
                sampler = Bernoulli(pred)
                batch_pred = sampler.sample()

                # Rebuild the ids and queries for this batch rather than
                # reusing the stale variables from the training loop.
                batch_ids = list(map(lambda q: q.long().tolist(), q_id))
                batch_queries = {
                    id_: dataclasses[str(id_)].get_text(pred_)
                    for id_, pred_ in zip(batch_ids, batch_pred)
                }
                batch_qrel = {
                    id_: dataclasses[str(id_)].qrels
                    for id_ in batch_ids
                }

                batch_map = eval_queries(batch_queries, batch_qrel, engine)

                print(f"\rTest Map : {batch_map}", end="")
            print()
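
The reinforcement-learning phase above uses the score-function (REINFORCE) estimator: sample a binary mask from a Bernoulli policy, then weight the log-likelihood of the sample by the observed reward (here, MAP). A self-contained toy sketch of the same estimator, with purely illustrative names:

import torch
from torch.distributions import Bernoulli

# A single logit parameterises a Bernoulli policy over {0, 1}.
logit = torch.zeros(1, requires_grad=True)
opt = torch.optim.Adam([logit], lr=0.1)

for step in range(200):
    policy = Bernoulli(logits=logit)
    action = policy.sample()            # sampling blocks gradients
    reward = action.item()              # toy reward: action 1 pays 1
    # Minimise -reward * log pi(action); its gradient is the
    # score-function estimate of the expected-reward gradient.
    loss = -reward * policy.log_prob(action).sum()
    opt.zero_grad()
    loss.backward()
    opt.step()

print(torch.sigmoid(logit))  # approaches 1: the policy learns action 1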
Example 3
import random
from itertools import chain

from torch.utils.data import DataLoader, Subset

dataset = load(torch_dataset=True)["torch"]


def embedding_collate_decorator(collate_fn):
    def wrapper(batch):
        # Keep only the inputs and targets; drop the example ids.
        x, y, id_ = collate_fn(batch)
        return x, y

    return wrapper


collate_fn = embedding_collate_decorator(sequence_collate_fn)
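
sequence_collate_fn itself is not shown in these examples. A plausible sketch, assuming each dataset item is an (embedded sequence, label, id) triple padded with pad_sequence; the real function may differ:

import torch
from torch.nn.utils.rnn import pad_sequence


def sequence_collate_fn(batch):
    # Assumed item layout: (tensor [seq_len, dim], label, id).
    xs, ys, ids = zip(*batch)
    x = pad_sequence(xs, batch_first=True)  # [batch, max_len, dim]
    y = torch.stack([torch.as_tensor(label) for label in ys])
    return x, y, list(ids)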

indices = list(range(len(dataset)))
random.shuffle(indices)
for i, (trainindices, testindices) in enumerate(all_but_one(indices, k=10)):
    trainindices = chain(*trainindices)
    trainset = Subset(dataset, list(trainindices))
    testset = Subset(dataset, list(testindices))
    trainloader = DataLoader(trainset, batch_size=32, shuffle=True,
                             collate_fn=collate_fn)
    testloader = DataLoader(testset, batch_size=32, shuffle=True,
                            collate_fn=collate_fn)

    print("Build model")
    encoder_archi = {
        "input_size": embedding_size,
        "hidden_size": hidden_size,
        "num_layers": num_layers,
        "bidirectional": True,
        "dropout": 0.2
    }
    decoder_archi = {