def main():
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    print("Building dataset")
    # train_len = 200
    dataset = Robust2004.torch_dataset()
    dataclasses = Robust2004.dataclasses()
    dataclasses = {dc._id: dc for dc in dataclasses}
    collate_fn = sequence_collate_fn_mask

    trainlogs, testlogs = {}, {}
    for i, (trainloader, testloader) in enumerate(cross_val(dataset, 5, args.batch, collate_fn)):
        # Resolve the model class, the memory wrapper and the architecture
        # helper by name from the command-line arguments.
        model = eval("KeyWordSelectionModel_1" + args.model)
        memory = eval("memory_2" + args.memory)
        archi_function = eval(args.model + "_archi")
        model1 = memory(model)(*archi_function(args.nlayers, args.hsize))

        device = torch.device("cuda:" + str(args.device))
        model1 = model1.to(device)
        model = model1
        optimizer = optim.Adam(model.parameters())
        # optimizer = optim.SGD(model.parameters(), lr=1e-3)

        print("Getting Engine")
        engine = get_engine(hosts=["localhost:9200"])
        resp_filename = f"{args.model}{args.memory}-{args.smt_lambda}-{args.reinforce_lambda}-{args.entropy_lambda}_{args.batch}-{args.nlayers}-{args.hsize}.resp"
        initialized_eval_fn = partial(eval_fn, dataclasses=dataclasses, engine=engine, resp_filename=resp_filename)

        print("Training")
        model, train_logs, test_logs = learn(
            model, trainloader, testloader, optimizer, args.nb_epoch, device,
            initialized_eval_fn, 50, args.entropy_lambda, args.smt_lambda,
            args.reinforce_lambda)
        trainlogs[i] = train_logs
        testlogs[i] = test_logs

    with open(f"{args.model}{args.memory}-{args.smt_lambda}-{args.reinforce_lambda}-{args.entropy_lambda}_{args.batch}-{args.nlayers}-{args.hsize}.txt", "w") as f:
        f.write(str(trainlogs))
        f.write("\n" + str(testlogs))
def main():
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    print("Building dataset")
    dataset = Robust2004.torch_dataset()
    dataclasses = Robust2004.dataclasses()
    dataclasses = {dc._id: dc for dc in dataclasses}
    collate_fn = sequence_collate_fn_mask

    trainlogs, testlogs = {}, {}
    for i, (trainloader, testloader) in enumerate(cross_val(dataset, 5, args.batch, collate_fn)):
        model = AttModel(args.heads, 300, args.inner, args.hsize, args.hsize, args.nlayers, 0.1)
        pytorch_total_params = sum(p.numel() for p in model.parameters())
        print(pytorch_total_params, "parameters")

        device = torch.device("cuda:" + str(args.device))
        try:
            model = model.to(device)
        except RuntimeError:
            # Moving the model to the GPU occasionally fails with a spurious
            # cuDNN error; retry once.
            model = model.to(device)
        optimizer = optim.Adam(model.parameters())

        print("Getting Engine")
        engine = get_engine(hosts=["localhost:9200"])
        resp_filename = f"transformer-{args.smt_lambda}-{args.reinforce_lambda}-{args.entropy_lambda}_{args.batch}-{args.nlayers}-{args.hsize}.resp"
        initialized_eval_fn = partial(eval_fn, dataclasses=dataclasses, engine=engine, resp_filename=resp_filename)

        print("Training")
        model, train_logs, test_logs = learn(
            model, trainloader, testloader, optimizer, args.nb_epoch, device,
            initialized_eval_fn, 50, args.entropy_lambda, args.smt_lambda,
            args.reinforce_lambda)
        trainlogs[i] = train_logs
        testlogs[i] = test_logs

    with open(f"transformer-{args.smt_lambda}-{args.reinforce_lambda}-{args.entropy_lambda}_{args.batch}-{args.nlayers}-{args.hsize}.txt", "w") as f:
        f.write(str(trainlogs))
        f.write("\n" + str(testlogs))
assert train_len % 2 == 0
assert val_len % 2 == 0
assert test_len % 2 == 0

trainset = indices[:train_len]
valset = indices[train_len:train_len + val_len]
testset = indices[train_len + val_len:]
trainset, valset, testset = Subset(dataset, trainset), Subset(dataset, valset), Subset(dataset, testset)
print(len(trainset), len(valset), len(testset))
trainloader = DataLoader(trainset, 1024, True, collate_fn=collate_fn)
valloader = DataLoader(valset, 1024, True, collate_fn=collate_fn)
testloader = DataLoader(testset, 1024, True, collate_fn=collate_fn)

robset = Robust2004.torch_dataset()
rob_loader = DataLoader(robset, 64, True, collate_fn=collate_fn)
rob_dc = Robust2004.dataclasses()
rob_dc = {q._id: q for q in rob_dc}

print("Build model")
embedding_size = 300
hidden_size = 128
num_layers = 1
bidirectional = True
encoder_archi = {
    "input_size": embedding_size,
    "hidden_size": hidden_size,
    "num_layers": num_layers,
    "bidirectional": True,
}
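# --- Illustrative sketch (added; not part of the original source) ---
# encoder_archi's keys match torch.nn.LSTM's constructor, so the encoder can
# plausibly be built by unpacking the dict. Whether the project uses an LSTM,
# a GRU or another recurrent module here is an assumption, as is the
# availability of `torch` and `from torch import nn` at the top of this file.
encoder = nn.LSTM(**encoder_archi)
example_batch = torch.randn(7, 4, embedding_size)   # (seq_len, batch, embedding)
output, (h_n, c_n) = encoder(example_batch)          # output: (7, 4, 2 * hidden_size)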
res.append(" ".join(map(str, [qid, "Q0", docid, rank, score, "EARIA"]))) return res def retrieve_doc_ids(hits): ret = {hit["_id"]: hit["_score"] for hit in hits} return ret MAX_DOC = 3000 index = "robust2004" doc_type = "trec" engine = es.Elasticsearch() dataclasses = Robust2004.dataclasses() dataclasses = {qt._id: qt for qt in dataclasses} print(len(dataclasses)) queries = {str(k): v.query for k, v in dataclasses.items()} query_ids, query_texts = zip(*queries.items()) query_ids = list(map(str, query_ids)) qrel = {id_: dataclasses[id_].qrels for id_ in query_ids} qrel = {str(k): v for k, v in qrel.items()} msearch_body = msearch_preprocess(query_texts, index, doc_type) res = [] for i in range(8): res.extend(engine.msearch(msearch_body[i*50:i*50+50], index)["responses"])
import sys
import os
from os import path

libpath = path.normpath(path.join(path.dirname(path.realpath(__file__)), os.pardir, "src"))
sys.path.append(libpath)

import pickle as pkl

import torch

import data
from datasets import Quora, Robust2004

# Older pickles presumably reference a module named "dataset"; alias the
# renamed "data" module under that name while the cached datasets are loaded,
# then remove the alias.
sys.modules["dataset"] = data

quora_dc = Quora.dataclasses()
quora_torch = Quora.torch_dataset()
rb_dc = Robust2004.dataclasses()
rb_torch = Robust2004.torch_dataset()

del sys.modules["dataset"]

with open(Quora.dataclasses_path, "wb") as f:
    pkl.dump(quora_dc, f)
with open(Robust2004.dataclasses_path, "wb") as f:
    pkl.dump(rb_dc, f)

torch.save(quora_torch, Quora.torch_path)
torch.save(rb_torch, Robust2004.torch_path)
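# --- Illustrative sketch (added; not part of the original script) ---
# A quick round-trip check that the freshly written artifacts load back.
# The *_reloaded names below are placeholders introduced for this example.
with open(Robust2004.dataclasses_path, "rb") as f:
    rb_dc_reloaded = pkl.load(f)
rb_torch_reloaded = torch.load(Robust2004.torch_path)
print(len(rb_dc_reloaded), len(rb_torch_reloaded))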
def learn(model, model_args, device, k=5, batch_size=32, seed=666, smt_epoch=100, rl_epoch=1000):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # The problem comes from the count vectorizer, which drops some words
    print("Load Dataset")
    dataset = Robust2004.torch_dataset()
    dataclasses = Robust2004.dataclasses()
    dataclasses = {qt._id: qt for qt in dataclasses}
    engine = get_engine()
    collate_fn = embedding_collate_decorator(sequence_collate_fn)

    indices = list(range(len(dataset)))
    random.shuffle(indices)

    # `model` is the model class; keep it around so a fresh network can be
    # instantiated for every cross-validation fold.
    model_class = model
    for i, (trainindices, testindices) in enumerate(all_but_one(indices, k=k)):
        trainindices = chain(*trainindices)
        trainset = Subset(dataset, list(trainindices))
        testset = Subset(dataset, list(testindices))
        trainloader = DataLoader(trainset, 1, True, collate_fn=collate_fn)
        testloader = DataLoader(testset, 1, True, collate_fn=collate_fn)

        print("Build model")
        model = model_class(*model_args)
        try:
            model = model.to(device)
        except RuntimeError:
            print("cudnn error")
            model = model.to(device)
        optimizer = optim.Adam(model.parameters())
        loss_function = nn.BCELoss()

        print("Train")
        best_model = 0
        delay = 0
        max_delay = 5

        print("Supervised Machine Translation")
        for epoch in range(smt_epoch):
            model.train()
            n, mean = 0, 0
            train_predictions = []
            train_ids = []
            for x, y, q_id, qrels, _ in trainloader:
                x = x.to(device)
                y = y.to(device)
                pred = model(x)
                pred__ = pred > 0.5
                pred_ = pred__.detach().cpu().long().t().numpy().tolist()
                train_predictions.extend(pred_)
                train_ids.extend(map(lambda x: x.long().tolist(), q_id))
                loss = loss_function(pred, y.float())
                n += 1
                mean = ((n - 1) * mean + loss.item()) / n
                print(f"\rFold {i}, Epoch {epoch}\tTrain : {mean}", end="")
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            train_queries = {
                id_: dataclasses[str(id_)].get_text(pred)
                for id_, pred in zip(train_ids, train_predictions)
            }
            train_qrel = {
                id_: dataclasses[str(id_)].qrels
                for id_, pred in zip(train_ids, train_predictions)
            }
            train_map = eval_queries(train_queries, train_qrel, engine)
            print(f"\rFold {i}, Epoch {epoch}\tTrain Loss: {mean}, Train MAP {train_map}", end="")

            model.eval()
            train_mean = mean
            n, mean = 0, 0
            test_predictions = []
            test_ids = []
            for x, y, q_id, qrels, _ in testloader:
                x = x.to(device)
                y = y.to(device)
                pred = model(x)
                pred__ = pred > 0.5
                pred_ = pred__.detach().cpu().long().t().numpy().tolist()
                test_predictions.extend(pred_)
                test_ids.extend(map(lambda x: x.long().tolist(), q_id))
                loss = loss_function(pred, y.float())
                n += 1
                mean = ((n - 1) * mean + loss.item()) / n
                print(f"\rFold {i}, Epoch {epoch}\tTrain Loss: {train_mean}\tTest : {mean}", end="")

            test_queries = {
                id_: dataclasses[str(id_)].get_text(pred)
                for id_, pred in zip(test_ids, test_predictions)
            }
            test_qrel = {
                id_: dataclasses[str(id_)].qrels
                for id_, pred in zip(test_ids, test_predictions)
            }
            test_map = eval_queries(test_queries, test_qrel, engine)
            dataset_queries = {**train_queries, **test_queries}
            dataset_qrel = {**train_qrel, **test_qrel}
            dataset_map = eval_queries(dataset_queries, dataset_qrel, engine)
            print("\b" * 500 + f"\nFold {i}, Epoch {epoch}\tTrain MAP {train_map}\tTest MAP : {test_map}\tDataset MAP : {dataset_map}")

            # Early stopping on the test MAP.
            if test_map > best_model:
                best_model = test_map
                delay = 0
            elif test_map < best_model:
                delay += 1
                if delay > max_delay:
                    print(best_model)
                    break

        print("Reinforcement Learning")
        mean_maps = {id_: [] for id_ in dataclasses.keys()}
        for epoch in range(rl_epoch):
            model.train()
            n, mean = 0, 0
            train_predictions = []
            train_ids = []
            for x, y, q_id, qrels, seq_lens in trainloader:
                x = x.to(device)
                y = y.to(device)
                pred = model(x)
                # Sample a binary keep/drop decision per term and keep the
                # log-probabilities for the REINFORCE update.
                sampler = Bernoulli(pred)
                batch_pred = sampler.sample()
                log_probs = sampler.log_prob(batch_pred)
                loss = log_probs.sum()
                batch_ids = list(map(lambda x: x.long().tolist(), q_id))
                batch_queries = {
                    id_: dataclasses[str(id_)].get_text(pred)
                    for id_, pred in zip(batch_ids, batch_pred)
                }
                batch_qrel = {
                    id_: dataclasses[str(id_)].qrels
                    for id_, pred in zip(batch_ids, batch_pred)
                }
                batch_map = eval_queries(batch_queries, batch_qrel, engine)
                n += 1
                mean = ((n - 1) * mean + batch_map) / n
                print(f"\rTrain Map : {mean: .3f}", end="")
                # REINFORCE: the batch MAP acts as the reward scaling the
                # summed log-probabilities of the sampled decisions.
                loss = -batch_map * loss
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            train_mean = mean
            n, mean = 0, 0
            test_predictions = []
            test_ids = []
            print()
            # Evaluation pass only: sample queries on the test fold and track
            # their MAP, without taking an optimizer step.
            for x, y, q_id, qrels, seq_lens in testloader:
                x = x.to(device)
                y = y.to(device)
                pred = model(x)
                sampler = Bernoulli(pred)
                batch_pred = sampler.sample()
                log_probs = sampler.log_prob(batch_pred)
                loss = log_probs.sum()
                batch_ids = list(map(lambda x: x.long().tolist(), q_id))
                batch_queries = {
                    id_: dataclasses[str(id_)].get_text(pred)
                    for id_, pred in zip(batch_ids, batch_pred)
                }
                batch_qrel = {
                    id_: dataclasses[str(id_)].qrels
                    for id_, pred in zip(batch_ids, batch_pred)
                }
                batch_map = eval_queries(batch_queries, batch_qrel, engine)
                n += 1
                mean = ((n - 1) * mean + batch_map) / n
                print(f"\rTrain MAP : {train_mean: .3f}\tTest Map : {mean: .3f}", end="")
            print()
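# --- Illustrative usage sketch (added; not part of the original source) ---
# A plausible way to launch the SMT + REINFORCE training loop above. The model
# class and its constructor arguments are placeholders, not names taken from
# the project.
#
#     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#     learn(SomeSelectionModel, (300, 128, 1, True), device,
#           k=5, batch_size=32, seed=666, smt_epoch=100, rl_epoch=1000)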