Beispiel #1
0
def main():
    """Run 5-fold cross-validation of a keyword-selection model on Robust2004.

    Parses the command line with the module-level ``parser``, seeds the
    torch / numpy / stdlib random generators from ``args.seed``, then for each
    fold produced by ``cross_val`` builds a model, trains it with ``learn``
    and collects the returned train/test logs.  The logs of all folds are
    written to ``<run_name>.txt``; retrieval responses go to
    ``<run_name>.resp`` through ``eval_fn``.
    """
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    print("Building dataset")
    dataset = Robust2004.torch_dataset()
    dataclasses = {dc._id: dc for dc in Robust2004.dataclasses()}

    # Single source of truth for the output file names, so the ".resp" and
    # ".txt" files can never drift apart.
    run_name = (f"{args.model}{args.memory}-{args.smt_lambda}-"
                f"{args.reinforce_lambda}-{args.entropy_lambda}_"
                f"{args.batch}-{args.nlayers}-{args.hsize}")
    resp_filename = run_name + ".resp"
    device = torch.device("cuda:" + str(args.device))

    collate_fn = sequence_collate_fn_mask
    trainlogs, testlogs = {}, {}
    folds = cross_val(dataset, 5, args.batch, collate_fn)
    for i, (trainloader, testloader) in enumerate(folds):
        # SECURITY NOTE(review): the three names below are resolved by
        # eval() on strings taken from the command line.  This executes
        # arbitrary expressions; only run with trusted arguments, or replace
        # with an explicit name -> class registry.
        model_cls = eval("KeyWordSelectionModel_1" + args.model)
        memory = eval("memory_2" + args.memory)
        archi_function = eval(args.model + "_archi")

        # A fresh model per fold, wrapped by the selected memory decorator.
        model = memory(model_cls)(*archi_function(args.nlayers, args.hsize))
        model = model.to(device)

        optimizer = optim.Adam(model.parameters())
        print("Getting Engine")
        engine = get_engine(hosts=["localhost:9200"])

        initialized_eval_fn = partial(eval_fn,
                                      dataclasses=dataclasses,
                                      engine=engine,
                                      resp_filename=resp_filename)
        print("Training")
        model, train_logs, test_logs = learn(
            model,
            trainloader,
            testloader,
            optimizer,
            args.nb_epoch,
            device,
            initialized_eval_fn,
            50,
            args.entropy_lambda,
            args.smt_lambda,
            args.reinforce_lambda,
        )
        trainlogs[i] = train_logs
        testlogs[i] = test_logs

    # First line: per-fold train logs; second line: per-fold test logs.
    with open(run_name + ".txt", "w") as f:
        f.write(f"{trainlogs}\n{testlogs}")
Beispiel #2
0
def main():
    """Run 5-fold cross-validation of the attention model on Robust2004.

    Parses the command line with the module-level ``parser``, seeds the
    torch / numpy / stdlib random generators from ``args.seed``, then for each
    fold produced by ``cross_val`` builds an ``AttModel``, trains it with
    ``learn`` and collects the returned train/test logs.  The logs of all
    folds are written to ``<run_name>.txt``; retrieval responses go to
    ``<run_name>.resp`` through ``eval_fn``.
    """
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    print("Building dataset")
    dataset = Robust2004.torch_dataset()
    dataclasses = {dc._id: dc for dc in Robust2004.dataclasses()}

    # Single source of truth for the output file names, so the ".resp" and
    # ".txt" files can never drift apart.
    run_name = (f"transformer-{args.smt_lambda}-{args.reinforce_lambda}-"
                f"{args.entropy_lambda}_{args.batch}-{args.nlayers}-"
                f"{args.hsize}")
    resp_filename = run_name + ".resp"
    device = torch.device("cuda:" + str(args.device))

    collate_fn = sequence_collate_fn_mask
    trainlogs, testlogs = {}, {}
    folds = cross_val(dataset, 5, args.batch, collate_fn)
    for i, (trainloader, testloader) in enumerate(folds):
        model = AttModel(args.heads, 300, args.inner, args.hsize, args.hsize,
                         args.nlayers, 0.1)
        pytorch_total_params = sum(p.numel() for p in model.parameters())
        print(pytorch_total_params, "parameters")

        try:
            model = model.to(device)
        except RuntimeError:
            # The first transfer to the GPU is retried exactly once.  Only
            # RuntimeError is caught so that KeyboardInterrupt / SystemExit
            # are no longer swallowed; a second failure propagates.
            model = model.to(device)

        optimizer = optim.Adam(model.parameters())
        print("Getting Engine")
        engine = get_engine(hosts=["localhost:9200"])

        initialized_eval_fn = partial(eval_fn,
                                      dataclasses=dataclasses,
                                      engine=engine,
                                      resp_filename=resp_filename)
        print("Training")
        model, train_logs, test_logs = learn(model, trainloader, testloader,
                                             optimizer, args.nb_epoch, device,
                                             initialized_eval_fn, 50,
                                             args.entropy_lambda,
                                             args.smt_lambda,
                                             args.reinforce_lambda)
        trainlogs[i] = train_logs
        testlogs[i] = test_logs

    # First line: per-fold train logs; second line: per-fold test logs.
    with open(run_name + ".txt", "w") as f:
        f.write(f"{trainlogs}\n{testlogs}")
# Each split size is required to be even.
assert train_len % 2 == 0
assert val_len % 2 == 0
assert test_len % 2 == 0

# Carve the index list into three consecutive slices (train, then
# validation, then everything that is left for test) and wrap each slice in
# a Subset of the full dataset.
trainset = Subset(dataset, indices[:train_len])
valset = Subset(dataset, indices[train_len:train_len + val_len])
testset = Subset(dataset, indices[train_len + val_len:])
print(len(trainset), len(valset), len(testset))

# Shuffled loaders with 1024 examples per batch for every split.
trainloader = DataLoader(trainset,
                         batch_size=1024,
                         shuffle=True,
                         collate_fn=collate_fn)
valloader = DataLoader(valset,
                       batch_size=1024,
                       shuffle=True,
                       collate_fn=collate_fn)
testloader = DataLoader(testset,
                        batch_size=1024,
                        shuffle=True,
                        collate_fn=collate_fn)

# Robust2004: shuffled loader with batches of 64, plus the query
# dataclasses indexed by their id.
robset = Robust2004.torch_dataset()
rob_loader = DataLoader(robset,
                        batch_size=64,
                        shuffle=True,
                        collate_fn=collate_fn)
rob_dc = {q._id: q for q in Robust2004.dataclasses()}

print("Build model")
# Model hyper-parameters.
embedding_size = 300
hidden_size = 128
num_layers = 1
bidirectional = True

encoder_archi = {
    "input_size": embedding_size,
    "hidden_size": hidden_size,
    "num_layers": num_layers,
    "bidirectional": True
        res.append(" ".join(map(str, [qid, "Q0", docid, rank, score, "EARIA"])))
    return res


def retrieve_doc_ids(hits):
    """Map each hit's ``"_id"`` to its ``"_score"``.

    ``hits`` is an iterable of mappings that each carry an ``"_id"`` and a
    ``"_score"`` key.  When an id occurs more than once, the score of the
    last occurrence is kept.
    """
    scores_by_id = {}
    for hit in hits:
        scores_by_id[hit["_id"]] = hit["_score"]
    return scores_by_id


# Retrieval settings.
MAX_DOC = 3000
index = "robust2004"
doc_type = "trec"

engine = es.Elasticsearch()

# Robust2004 query dataclasses, indexed by their id.
dataclasses = {qt._id: qt for qt in Robust2004.dataclasses()}
print(len(dataclasses))

# Query text per (stringified) query id.
queries = {str(k): v.query for k, v in dataclasses.items()}

query_ids, query_texts = zip(*queries.items())
query_ids = [str(query_id) for query_id in query_ids]
# Relevance judgements per (stringified) query id.
qrel = {str(id_): dataclasses[id_].qrels for id_ in query_ids}

msearch_body = msearch_preprocess(query_texts, index, doc_type)

# Send the multi-search body in 8 consecutive slices of 50 entries and
# concatenate the responses.
# NOTE(review): anything beyond the first 400 entries of msearch_body is
# never sent - confirm this matches the size of the query set.
res = []
for start in range(0, 8 * 50, 50):
    chunk = msearch_body[start:start + 50]
    res.extend(engine.msearch(chunk, index)["responses"])
import sys
import os
from os import path

libpath = path.normpath(
    path.join(path.dirname(path.realpath(__file__)), os.pardir, "src"))
sys.path.append(libpath)

import pickle as pkl
import torch

import data
from datasets import Quora, Robust2004

# Make the ``data`` module importable under the name "dataset" while the
# datasets are loaded, then remove that alias again.
# NOTE(review): presumably the stored objects were pickled when the module
# was still called "dataset" - confirm.
sys.modules["dataset"] = data

quora_dc, quora_torch = Quora.dataclasses(), Quora.torch_dataset()
rb_dc, rb_torch = Robust2004.dataclasses(), Robust2004.torch_dataset()

del sys.modules["dataset"]

# Write the dataclasses back with pickle (Quora first, then Robust2004) ...
for source, dataclasses_obj in ((Quora, quora_dc), (Robust2004, rb_dc)):
    with open(source.dataclasses_path, "wb") as f:
        pkl.dump(dataclasses_obj, f)

# ... and the torch datasets with torch.save, in the same order.
for source, torch_obj in ((Quora, quora_torch), (Robust2004, rb_torch)):
    torch.save(torch_obj, source.torch_path)
Beispiel #6
0
def _queries_and_qrels(ids, predictions, dataclasses):
    """Build the reformulated query text and the qrels for each id.

    Args:
        ids: query ids, used both as result keys and (stringified) to look
            up the query in ``dataclasses``.
        predictions: one selection per id, handed to ``get_text``.
        dataclasses: mapping from ``str(id)`` to the query dataclass.

    Returns:
        A ``(queries, qrels)`` pair of dicts keyed by id.
    """
    queries, qrels = {}, {}
    for id_, pred in zip(ids, predictions):
        query = dataclasses[str(id_)]
        queries[id_] = query.get_text(pred)
        qrels[id_] = query.qrels
    return queries, qrels


def learn(model,
          model_args,
          device,
          k=5,
          batch_size=32,
          seed=666,
          smt_epoch=100,
          rl_epoch=1000):
    """Cross-validate a keyword-selection model on Robust2004.

    For every fold yielded by ``all_but_one`` a fresh network is built with
    ``model(*model_args)`` and trained in two phases:

    1. Supervised phase: binary cross-entropy between the network output and
       the target ``y``.  After each epoch the thresholded predictions
       (``> 0.5``) are turned into queries and scored with ``eval_queries``;
       training stops once the test MAP has been below the best test MAP
       more than 5 times.
    2. Reinforcement phase: selections are sampled from a Bernoulli
       distribution over the network output and the summed log-probability
       is weighted by the MAP returned by ``eval_queries``.

    Progress is reported with ``print``; nothing is returned.

    Args:
        model: callable that builds the network; called once per fold.
        model_args: positional arguments passed to ``model``.
        device: torch device the network and batches are moved to.
        k: number of folds, forwarded to ``all_but_one``.
        batch_size: currently unused; the loaders use a batch size of 1.
        seed: seed for the torch, numpy and stdlib random generators.
        smt_epoch: maximum number of supervised epochs per fold.
        rl_epoch: number of reinforcement epochs per fold.
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # The problem comes from the count vectorizer, which drops some words
    print("Load Dataset")
    dataset = Robust2004.torch_dataset()
    dataclasses = {qt._id: qt for qt in Robust2004.dataclasses()}
    engine = get_engine()

    collate_fn = embedding_collate_decorator(sequence_collate_fn)

    indices = list(range(len(dataset)))
    random.shuffle(indices)
    for i, (trainindices, testindices) in enumerate(all_but_one(indices, k=k)):
        trainset = Subset(dataset, list(chain(*trainindices)))
        testset = Subset(dataset, list(testindices))
        trainloader = DataLoader(trainset, 1, True, collate_fn=collate_fn)
        testloader = DataLoader(testset, 1, True, collate_fn=collate_fn)

        print("Build model")

        # Keep the ``model`` factory untouched: rebinding it to the instance
        # would make the next fold call the instance instead of building a
        # new network.
        net = model(*model_args)
        try:
            net = net.to(device)
        except RuntimeError:
            # The transfer is retried once; a second failure propagates.
            print("cudnn error")
            net = net.to(device)

        optimizer = optim.Adam(net.parameters())
        loss_function = nn.BCELoss()

        print("Train")
        best_model = 0
        delay = 0
        max_delay = 5
        print("Supervised Machine Translation")
        for epoch in range(smt_epoch):
            net.train()
            n, mean = 0, 0
            train_predictions = []
            train_ids = []
            for x, y, q_id, _, _ in trainloader:
                x = x.to(device)
                y = y.to(device)
                pred = net(x)

                # Hard 0/1 selection, transposed before being listed.
                hard = (pred > 0.5).detach().cpu().long().t().numpy().tolist()
                train_predictions.extend(hard)
                train_ids.extend(t.long().tolist() for t in q_id)

                loss = loss_function(pred, y.float())
                n += 1
                mean = ((n - 1) * mean + loss.item()) / n  # running mean
                print(f"\rFold {i}, Epoch {epoch}\tTrain : {mean}", end="")

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            train_queries, train_qrel = _queries_and_qrels(
                train_ids, train_predictions, dataclasses)
            train_map = eval_queries(train_queries, train_qrel, engine)
            print(
                f"\rFold {i}, Epoch {epoch}\tTrain Loss: {mean}, Train MAP {train_map}",
                end="")

            net.eval()
            train_mean = mean
            n, mean = 0, 0
            test_predictions = []
            test_ids = []
            with torch.no_grad():
                for x, y, q_id, _, _ in testloader:
                    x = x.to(device)
                    y = y.to(device)

                    pred = net(x)
                    # Threshold the *current* test output (the original
                    # reused the last training batch because of a typo).
                    hard = (pred > 0.5).cpu().long().t().numpy().tolist()
                    test_predictions.extend(hard)
                    test_ids.extend(t.long().tolist() for t in q_id)

                    loss = loss_function(pred, y.float())

                    n += 1
                    mean = ((n - 1) * mean + loss.item()) / n
                    print(
                        f"\rFold {i}, Epoch {epoch}\tTrain Loss: {train_mean}\tTest : {mean}",
                        end="")

            test_queries, test_qrel = _queries_and_qrels(
                test_ids, test_predictions, dataclasses)
            test_map = eval_queries(test_queries, test_qrel, engine)

            dataset_queries = {**train_queries, **test_queries}
            dataset_qrel = {**train_qrel, **test_qrel}
            dataset_map = eval_queries(dataset_queries, dataset_qrel, engine)

            print(
                "\b" * 500 +
                f"\nFold {i}, Epoch {epoch}\tTrain MAP {train_map}\tTest MAP : {test_map}\tDataset MAP : {dataset_map}"
            )

            # Early stopping on the test MAP.
            if test_map > best_model:
                best_model = test_map
                delay = 0
            elif test_map < best_model:
                delay += 1
                if delay > max_delay:
                    print(best_model)
                    break

        print("Reinforcement Learning")
        for epoch in range(rl_epoch):
            net.train()
            n, mean = 0, 0
            for x, _, q_id, _, _ in trainloader:
                x = x.to(device)
                pred = net(x)

                sampler = Bernoulli(pred)
                batch_pred = sampler.sample()
                log_prob = sampler.log_prob(batch_pred).sum()

                batch_ids = [t.long().tolist() for t in q_id]
                # NOTE(review): unlike the supervised phase, the sampled
                # selection is not transposed before being paired with the
                # ids - confirm the expected layout of ``batch_pred``.
                batch_queries, batch_qrel = _queries_and_qrels(
                    batch_ids, batch_pred, dataclasses)
                batch_map = eval_queries(batch_queries, batch_qrel, engine)

                n += 1
                mean = ((n - 1) * mean + batch_map) / n
                print(f"\rTrain Map : {mean: .3f}", end="")
                # Log-probability weighted by the reward (MAP).
                loss = -batch_map * log_prob

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            train_mean = mean
            n, mean = 0, 0
            print()
            net.eval()
            with torch.no_grad():
                for x, _, q_id, _, _ in testloader:
                    x = x.to(device)

                    batch_pred = Bernoulli(net(x)).sample()

                    # Build ids and queries from the test batch itself (the
                    # original evaluated the last training batch's queries).
                    batch_ids = [t.long().tolist() for t in q_id]
                    batch_queries, batch_qrel = _queries_and_qrels(
                        batch_ids, batch_pred, dataclasses)

                    batch_map = eval_queries(batch_queries, batch_qrel, engine)
                    n += 1
                    mean = ((n - 1) * mean + batch_map) / n
                    print(
                        f"\rTrain MAP : {train_mean: .3f}\tTest Map : {mean: .3f}",
                        end="")
            print()