Beispiel #1
0
def test_model(args):
    """Build the model, optionally restore weights, and score it on the test set.

    Reads from ``args``: ``weights``, ``train``, ``h1``/``h2``/``h3``, ``cuda``,
    ``checkpoint_file``, ``test`` and ``testkey``. The resulting CoNLL F1 is
    printed; nothing is returned.
    """
    # Prefix for the embedding files: args.weights when given, else args.train.
    if args.weights is None:
        embed_path = args.train
    else:
        embed_path = args.weights
    vectors, vocab = load_embeddings_from_file(embed_path + "tuned_word")
    print("Vocabulary:", len(vocab))

    # Construct model
    print("🏝 Build model")
    net = Model(len(vocab), SIZE_EMBEDDING, args.h1, args.h2, args.h3,
                SIZE_PAIR_IN, SIZE_SINGLE_IN)
    net.load_embeddings(vectors)
    if args.cuda:
        net.cuda()

    if args.weights is not None:
        print("🏝 Loading pre-trained weights")
        net.load_weights(args.weights)

    if args.checkpoint_file is not None:
        print("⛄️ Loading model from", args.checkpoint_file)
        if args.cuda:
            state = torch.load(args.checkpoint_file)
        else:
            # Keep every storage where it was deserialised (no GPU remap).
            state = torch.load(args.checkpoint_file,
                               map_location=lambda storage, loc: storage)
        net.load_state_dict(state)

    print("test folder is", args.test)
    test_data = NCDataset(args.test, args)
    evaluator = ConllEvaluator(net, test_data, args.test, args.testkey,
                               embed_path, args)

    # The evaluation is timed, but the duration is not reported anywhere.
    started = time.time()
    evaluator.build_test_file()
    result = evaluator.get_score()
    score, f1_conll, ident = result
    elapsed = time.time() - started

    print("TEST F1_CONLL SCORE IS,", f1_conll)
    del net, test_data
Beispiel #2
0
    def __init__(self, model_path):
        """Build the model and restore the newest ``*modelranking`` checkpoint.

        Args:
            model_path: Path prefix; ``model_path + '*modelranking'`` is
                globbed and the match with the greatest ``os.path.getctime``
                value is loaded into the model.

        Raises:
            ValueError: If no file matches the glob pattern.
        """
        pattern = model_path + '*modelranking'
        candidates = glob.glob(pattern)
        if not candidates:
            # max() on an empty list raises an uninformative ValueError; keep
            # the exception type but say what was actually missing.
            raise ValueError(
                "no checkpoint matching {!r} was found".format(pattern))
        checkpoint_file = max(candidates, key=os.path.getctime)

        embed_path = 'weights/'
        tensor_embeddings, voc = load_embeddings_from_file(embed_path +
                                                           "tuned_word")
        print("🏝 Build model")
        # Hidden layer sizes passed to Model.
        # NOTE(review): presumably these must match the sizes used to train
        # the checkpoint - confirm.
        h1 = 1000
        h2 = 500
        h3 = 500
        self.model = Model(len(voc), SIZE_EMBEDDING, h1, h2, h3, SIZE_PAIR_IN,
                           SIZE_SINGLE_IN)
        self.model.load_embeddings(tensor_embeddings)
        cuda = torch.cuda.is_available()

        if cuda:
            # The model is stored on the instance; the bare name ``model``
            # used previously is not defined in this method.
            self.model.cuda()

        print("⛄️ Loading model from", checkpoint_file)
        if cuda:
            state = torch.load(checkpoint_file)
        else:
            # Keep every storage where it was deserialised (no GPU remap).
            state = torch.load(checkpoint_file,
                               map_location=lambda storage, loc: storage)
        self.model.load_state_dict(state)
        self.model.eval()
Beispiel #3
0
from dataset import (NCDataset, NCBatchSampler, load_embeddings_from_file,
                     padder_collate, SIZE_PAIR_IN, SIZE_SINGLE_IN,
                     SIZE_EMBEDDING)
from model import Model

# Load datasets and embeddings
# Collect every path under checkpoints/ whose file name ends in "modelranking".
list_of_files = glob.glob('checkpoints/*modelranking'
                          )  # "*" matches any file-name prefix
# Choose the match with the greatest os.path.getctime value.
# max() raises ValueError if the glob above matched nothing.
latest_file = max(list_of_files, key=os.path.getctime)
#save_path = os.path.join('checkpoints', current_time + '_' + socket.gethostname() + '_')
#save_name = "ranking"
#best_model_path = save_path + "best_model" + save_name
# Directory prefix of the embedding files; "tuned_word" is appended below.
weights = 'weights/'
checkpoint_file = latest_file
embed_path = weights
tensor_embeddings, voc = load_embeddings_from_file(embed_path + "tuned_word")

# Construct model
print("🏝 Build model")
# Hidden layer sizes passed to Model.
# NOTE(review): presumably these must match the checkpoint that is loaded later - confirm.
h1 = 1000
h2 = 500
h3 = 500
model = Model(len(voc), SIZE_EMBEDDING, h1, h2, h3, SIZE_PAIR_IN,
              SIZE_SINGLE_IN)
model.load_embeddings(tensor_embeddings)

#print(model.state_dict)
#cklmvb = input("state dict printed")
# True when torch reports a usable CUDA device.
cuda = torch.cuda.is_available()
if cuda:
Beispiel #4
0
def run_model(args):
    """Train the model stage by stage, then score it on the test set.

    The stages run in the order "allpairs", "toppairs", "ranking";
    ``args.startstage`` selects the first stage to run and ``args.startstep``
    the first epoch of that stage. Each stage uses its own RMSprop optimizer
    and loss function. Scalars are logged to a ``SummaryWriter`` and model
    state dicts are written under the ``args.save_path`` prefix.

    Args:
        args: Parsed options. Fields read here include ``weights``, ``train``,
            ``eval``, ``test``, the ``*key`` files, ``h1``/``h2``/``h3``,
            ``cuda``, ``checkpoint_file``, ``batchsize``, ``numworkers``,
            ``log_interval``, the ``conll_*_interval`` values, ``patience``,
            ``on_eval_decrease``, ``min_lr``, ``save_path`` and the per-stage
            ``*_epoch``, ``*_lr`` and ``*_l2`` values.
    """
    # Local import: only needed to check that a best checkpoint exists.
    import os

    print("Training for", args.all_pairs_epoch, args.top_pairs_epoch,
          args.ranking_epoch, "epochs")
    # Tensorboard server
    writer = SummaryWriter()

    # Load datasets and embeddings
    embed_path = args.weights if args.weights is not None else args.train
    tensor_embeddings, voc = load_embeddings_from_file(embed_path +
                                                       "tuned_word")
    dataset = NCDataset(args.train, args)
    eval_dataset = NCDataset(args.eval, args)
    print("Vocabulary:", len(voc))

    # Construct model
    print("🏝 Build model")
    model = Model(len(voc), SIZE_EMBEDDING, args.h1, args.h2, args.h3,
                  SIZE_PAIR_IN, SIZE_SINGLE_IN)
    model.load_embeddings(tensor_embeddings)
    if args.cuda:
        # Batches are moved to the GPU in the training loop below, so the
        # model has to live there too (same as test_model does).
        model.cuda()
    if args.weights is not None:
        print("🏝 Loading pre-trained weights")
        model.load_weights(args.weights)
    if args.checkpoint_file is not None:
        print("⛄️ Loading model from", args.checkpoint_file)
        model.load_state_dict(torch.load(args.checkpoint_file) if args.cuda \
            else torch.load(args.checkpoint_file, map_location=lambda storage, loc: storage))

    print("🏝 Loading conll evaluator")
    eval_evaluator = ConllEvaluator(model, eval_dataset, args.eval,
                                    args.evalkey, embed_path, args)
    train_evaluator = ConllEvaluator(model, dataset, args.train, args.trainkey,
                                     embed_path, args)
    print("🏝 Testing evaluator and getting first eval score")
    eval_evaluator.test_model()
    start_time = time.time()
    eval_evaluator.build_test_file()
    _, f1_conll, _ = eval_evaluator.get_score()
    elapsed = time.time() - start_time
    print('|| s/evaluation {:5.2f}'.format(elapsed))
    writer.add_scalar("eval/" + "F1_conll", f1_conll, 0)

    # Preparing dataloader
    print("🏝 Preparing dataloader")
    print("Dataloader parameters: batchsize", args.batchsize, "numworkers",
          args.numworkers)
    batch_sampler = NCBatchSampler(dataset.mentions_pair_length,
                                   shuffle=True,
                                   batchsize=args.batchsize)
    # NOTE(review): num_workers is fixed to 0 although args.numworkers is
    # printed above - confirm whether that is intentional.
    dataloader = DataLoader(dataset,
                            collate_fn=padder_collate,
                            batch_sampler=batch_sampler,
                            num_workers=0,
                            pin_memory=args.cuda)
    mentions_idx, n_pairs = batch_sampler.get_batch_info()

    print("🏝 Start training")
    g_step = 0
    start_from = args.startstep if args.startstep is not None and args.startstage is not None else 0

    def run_epochs(start_epoch,
                   end_epoch,
                   loss_func,
                   optim_func,
                   save_name,
                   lr,
                   g_step,
                   debug=None):
        """Run one training stage and return the updated global step.

        Args:
            start_epoch: First epoch index (inclusive).
            end_epoch: Last epoch index (exclusive).
            loss_func: Callable ``loss_func(scores, targets)`` returning the loss.
            optim_func: Optimizer stepped after every batch.
            save_name: Stage name; used in checkpoint names and as the index
                into ``STAGES`` for logging.
            lr: Learning rate logged for this stage.
            g_step: Global step counter used as the x axis for the writer.
            debug: ``None`` for no batch dumps, ``-1`` to dump every batch, or
                a mention index to dump only the batches containing it.
        """
        if start_epoch >= end_epoch:
            # Without this guard ``epoch`` is never bound and the
            # "save last step" code below fails.
            print("No epochs to run for stage", save_name)
            return g_step

        best_model_path = args.save_path + "best_model" + save_name

        def reload_best():
            # The best checkpoint only exists once an evaluation has beaten
            # the initial best score of 0, so check before loading it.
            if os.path.isfile(best_model_path):
                load_model(model, best_model_path)
            else:
                print("No best model saved at", best_model_path,
                      "- keeping current weights")

        start_time_all = time.time()
        best_f1_conll = 0
        lower_eval = 0
        for epoch in tqdm(range(start_epoch, end_epoch)):
            # Run an epoch
            print("🚘 {} Epoch {:d}".format(save_name, epoch))
            model.train()
            start_time_log = time.time()
            start_time_epoch = time.time()
            epoch_loss = 0.0
            for batch_i, (m_idx, n_pairs_l, batch) in enumerate(
                    zip(mentions_idx, n_pairs, dataloader)):
                verbose = debug is not None and (debug == -1
                                                 or debug in m_idx)
                if verbose:
                    locs = [dataset.flat_m_loc[m][2:] for m in m_idx]
                    print(
                        "🏔 Batch", batch_i, "m_idx:",
                        "|".join(str(i) for i in m_idx), "mentions:",
                        "|".join(dataset.docs[d]['mentions'][i]
                                 for u, i, d in locs))
                    print("Batch n_pairs:",
                          "|".join(str(p) for p in n_pairs_l))
                inputs, targets = batch
                inputs = tuple(
                    Variable(inp, requires_grad=False) for inp in inputs)
                targets = tuple(
                    Variable(tar, requires_grad=False) for tar in targets)
                if args.cuda:
                    inputs = tuple(i.cuda() for i in inputs)
                    targets = tuple(t.cuda() for t in targets)
                scores = model(inputs)

                if verbose:
                    print("Scores:\n" +
                          "\n".join("|".join(str(s) for s in s_l)
                                    for s_l in scores.data.cpu().numpy()))
                    print("Labels:\n" +
                          "\n".join("|".join(str(s) for s in s_l)
                                    for s_l in targets[0].data.cpu().numpy()))
                loss = loss_func(scores, targets)
                del inputs, targets
                # Plain Python float: avoids accumulating and logging tensors.
                loss_value = loss.item()
                if verbose:
                    print('Loss', loss_value)
                # Zero gradients, perform a backward pass, and update the weights.
                optim_func.zero_grad()
                loss.backward()
                epoch_loss += loss_value
                optim_func.step()
                writer.add_scalar("train/" + save_name + "_loss", loss_value,
                                  g_step)
                writer.add_scalar("meta/" + "lr", lr, g_step)
                writer.add_scalar("meta/" + "stage", STAGES.index(save_name),
                                  g_step)
                g_step += 1
                if batch_i % args.log_interval == 0 and batch_i > 0:
                    elapsed = time.time() - start_time_log
                    print(
                        '| epoch {:3d} | {:5d}/{:5d} batches | lr {:.2e} | ms/batch {:5.2f} | '
                        'loss {:.2e}'.format(
                            epoch, batch_i, len(dataloader),
                            optim_func.param_groups[0]['lr'],
                            elapsed * 1000 / args.log_interval, loss_value))
                    start_time_log = time.time()
            elapsed_all = time.time() - start_time_all
            elapsed_epoch = time.time() - start_time_epoch
            print(
                '|| min/epoch {:5.2f} | est. remaining time (h) {:5.2f} | loss {:.2e}'
                .format(
                    elapsed_epoch / 60, elapsed_all / 3600 *
                    float(end_epoch - epoch) / float(epoch - start_epoch + 1),
                    epoch_loss))
            writer.add_scalar("epoch/" + "loss", epoch_loss, g_step)
            if epoch % args.conll_train_interval == 0:
                start_time = time.time()
                train_evaluator.build_test_file()
                _, f1_conll, _ = train_evaluator.get_score()
                elapsed = time.time() - start_time
                print('|| min/train evaluation {:5.2f} | F1_conll {:5.2f}'.
                      format(elapsed / 60, f1_conll))
                writer.add_scalar("epoch/" + "F1_conll", f1_conll, g_step)
            if epoch % args.conll_eval_interval == 0:
                start_time = time.time()
                eval_evaluator.build_test_file()
                _, f1_conll, _ = eval_evaluator.get_score()
                elapsed = time.time() - start_time
                print('|| min/evaluation {:5.2f}'.format(elapsed / 60))
                writer.add_scalar("eval/" + "F1_conll", f1_conll, g_step)
                g_step += 1
                save_path = args.save_path + save_name + "_" + str(epoch)
                torch.save(model.state_dict(), save_path)
                if f1_conll > best_f1_conll:
                    best_f1_conll = f1_conll
                    torch.save(model.state_dict(), best_model_path)
                    lower_eval = 0
                elif args.on_eval_decrease != 'nothing':
                    print("Evaluation metric decreases")
                    lower_eval += 1
                    if lower_eval >= args.patience:
                        if args.on_eval_decrease == 'divide_lr' or args.on_eval_decrease == 'divide_then_next':
                            print("reload best model and decrease lr")
                            reload_best()
                            lr = decrease_lr(optim_func)
                        if args.on_eval_decrease == 'next_stage' or lr <= args.min_lr:
                            print("Switch to next stage")
                            break
        # Save last step
        start_time = time.time()
        eval_evaluator.build_test_file()
        _, f1_conll, _ = eval_evaluator.get_score()
        elapsed = time.time() - start_time
        print('|| min/evaluation {:5.2f}'.format(elapsed / 60))
        writer.add_scalar("eval/" + "F1_conll", f1_conll, g_step)
        g_step += 1
        save_path = args.save_path + save_name + "_" + str(epoch)
        torch.save(model.state_dict(), save_path)
        # The next stage starts from the best checkpoint of this stage.
        reload_best()

        writer.add_scalar("test/" + "F1_conll", f1_conll, g_step)
        return g_step

    if args.startstage is None or args.startstage == "allpairs":
        optimizer = RMSprop(model.parameters(),
                            lr=args.all_pairs_lr,
                            weight_decay=args.all_pairs_l2)
        loss_func = get_all_pairs_loss(batch_sampler.pairs_per_batch)
        g_step = run_epochs(start_from, args.all_pairs_epoch, loss_func,
                            optimizer, "allpairs", args.all_pairs_lr, g_step)
        del optimizer, loss_func
        # args.startstep only applies to the first stage that is run.
        start_from = 0

    if args.startstage is None or args.startstage in ["allpairs", "toppairs"]:
        optimizer = RMSprop(model.parameters(),
                            lr=args.top_pairs_lr,
                            weight_decay=args.top_pairs_l2)
        loss_func = get_top_pair_loss(10 * batch_sampler.mentions_per_batch)
        g_step = run_epochs(start_from, args.top_pairs_epoch, loss_func,
                            optimizer, "toppairs", args.top_pairs_lr, g_step)
        del optimizer, loss_func
        start_from = 0

    if args.startstage is None or args.startstage in [
            "ranking", "allpairs", "toppairs"
    ]:
        optimizer = RMSprop(model.parameters(),
                            lr=args.ranking_lr,
                            weight_decay=args.ranking_l2)
        loss_func = get_ranking_loss(batch_sampler.mentions_per_batch)
        g_step = run_epochs(start_from, args.ranking_epoch, loss_func,
                            optimizer, "ranking", args.ranking_lr, g_step)
        del optimizer, loss_func
    del dataset, eval_dataset, train_evaluator, eval_evaluator

    # Final score on the test set.
    test_dataset = NCDataset(args.test, args)
    test_evaluator = ConllEvaluator(model, test_dataset, args.test,
                                    args.testkey, embed_path, args)
    test_evaluator.build_test_file()
    _, f1_conll, _ = test_evaluator.get_score()
    print("TEST F1_CONLL SCORE IS,", f1_conll)
    # Flush pending events and release the writer's file handle.
    writer.close()
    del model, test_dataset