Example #1
def train(args, data):
    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = BiDAF(args, data.WORD.vocab.vectors).to(device)

    ema = EMA(args.exp_decay_rate)
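    # keep a shadow (exponential moving average) copy of every trainable parameter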
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1

    iterator = data.train_iter
    for i, batch in enumerate(iterator):
        present_epoch = int(iterator.epoch)
        if present_epoch == args.epoch:
            break
        if present_epoch > last_epoch:
            print('epoch:', present_epoch + 1)
        last_epoch = present_epoch

        p1, p2 = model(batch)

        optimizer.zero_grad()
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.update(name, param.data)

        if (i + 1) % args.print_freq == 0:
            dev_loss, dev_exact, dev_f1 = test(model, ema, args, data)
            c = (i + 1) // args.print_freq

            print(f'train loss: {loss:.3f} / dev loss: {dev_loss:.3f}'
                  f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}')

            if dev_f1 > max_dev_f1:
                max_dev_f1 = dev_f1
                max_dev_exact = dev_exact
                best_model = copy.deepcopy(model)

            loss = 0
            model.train()

    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')
    return best_model
def train_pipeline(args):
    field = data.SpanFieldCollection()
    train_loader = data.SpanDataLoader(args.pre_train_path, field)
    dev_loader = data.SpanDataLoader(args.pre_dev_path, field)
    field.build_vocab(None, train_loader.dataset, dev_loader.dataset)
    model = BiDAF(**argument_wrapper.get_model_args(field))
    if args.pretrained_wv_path is not None:
        pretrained = load_pretrained_wv(args.pretrained_wv_path)
        model.set_word_embedding(pretrained)
    train_iter = train_loader.get_batchiter(args.batch_size, args.gpu_id)
    dev_iter = dev_loader.get_batchiter(args.batch_size, args.gpu_id)
    test_func = rc.create_test_func('foo')
    best_model = rc.train_by_span(model, train_iter, dev_iter,
                                  RawCorpus(args.dev_path).get_answers(), test_func,
                                  args.epoch_num, args.learning_rate,
                                  args.exp_decay_rate, args.save_freq)

    best_model.dump('model.pt')
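The snippets on this page all rely on an EMA helper whose definition is not shown here. A minimal sketch of the register/update interface they call, assuming it simply keeps an exponential moving average ("shadow") copy of each registered tensor:

class EMA:
    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}

    def register(self, name, value):
        # store an independent copy so later updates do not alias the live parameter
        self.shadow[name] = value.clone()

    def update(self, name, value):
        # shadow = decay * shadow + (1 - decay) * current value
        self.shadow[name] = self.decay * self.shadow[name] + (1.0 - self.decay) * value

    def get(self, name):
        return self.shadow[name]

In the training loops on this page, register(name, param.data) is called once per trainable parameter and update(name, param.data) after every optimizer step; the test() functions presumably evaluate with these shadow weights.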
Example #3
def load_mf_from_run_folder(folder_path):
    model_path = f'{folder_path}/model.pt'
    field_path = f'{folder_path}/field.pkl'
    model = BiDAF.load_model(model_path)
    field = SpanFieldCollection()
    field.load_fileds(field_path)
    return model, field
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--char-emb-dim', default=8, type=int)
    parser.add_argument('--char-channel-width', default=5, type=int)
    parser.add_argument('--char-channel-num', default=100, type=int)
    parser.add_argument('--dev-batch-size', default=60, type=int)
    parser.add_argument('--disable-c2q', type=ast.literal_eval)
    parser.add_argument('--disable-q2c', type=ast.literal_eval)
    parser.add_argument('--dropout', default=0.2, type=float)
    parser.add_argument('--epoch', default=12, type=int)
    parser.add_argument('--gpu', default=0, type=int)
    parser.add_argument('--hidden-size', default=100, type=int)
    parser.add_argument('--learning-rate', default=0.5, type=float)
    parser.add_argument('--moving-average-decay', default=0.999, type=float)
    parser.add_argument('--squad-version', default='1.1')
    parser.add_argument('--train-batch-size', default=60, type=int)
    parser.add_argument('--word-vec-dim', default=100, type=int)
    parser.add_argument('--validation-freq', default=500, type=int)
    args = parser.parse_args()

    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    print("Device is set to", device)

    print(f"Load SQuAD {args.squad_version}")
    data = SQuAD(device=device,
                 squad_version=args.squad_version,
                 word_vec_dim=args.word_vec_dim,
                 train_batch_size=args.train_batch_size,
                 dev_batch_size=args.dev_batch_size)
    setattr(args, 'char_vocab_size', len(data.CHAR_NESTING.vocab))
    setattr(args, 'word_vocab_size', len(data.WORD.vocab))

    print(f"Load BiDAF Model")
    model = BiDAF(pretrain_embedding=data.WORD.vocab.vectors,
                  char_vocab_size=len(data.CHAR_NESTING.vocab),
                  consider_na=True if args.squad_version == "2.0" else False,
                  enable_c2q= not args.disable_c2q if args.disable_c2q is not None else True,
                  enable_q2c= not args.disable_q2c if args.disable_q2c is not None else True,
                  hidden_size=args.hidden_size,
                  char_emb_dim=args.char_emb_dim,
                  char_channel_num=args.char_channel_num,
                  char_channel_width=args.char_channel_width,
                  dropout=args.dropout)

    print(f"Training start")
    trained_model = train(device=device,
                          data=data,
                          model=model,
                          epoch=args.epoch,
                          lr=args.learning_rate,
                          moving_average_decay=args.moving_average_decay,
                          validation_freq=args.validation_freq)
    model_time = int(time.time())
    if not os.path.exists('trained_models'):
        os.makedirs('trained_models')
    torch.save(trained_model.state_dict(), f'trained_models/BiDAF_{model_time}.pt')
Example #5
        setattr(options, 'dataset_file',
                '.data/squad/{}'.format(options.dev_file))
    setattr(options, 'prediction_file',
            os.path.join(root_dir, 'prediction.json'))
    setattr(options, 'model_time', strftime('%H:%M:%S', gmtime()))
    logger.info('data loading complete!')

    options.old_model = best_model_file_name
    options.old_ema = best_ema

    answer_append_sentences = joblib.load(
        'sampled_perturb_answer_sentences.pkl')
    question_append_sentences = joblib.load(
        'sampled_perturb_question_sentences.pkl')

    model = BiDAF(options, data.WORD.vocab.vectors).to(device)
    if options.old_model is not None:
        model.load_state_dict(
            torch.load(options.old_model,
                       map_location="cuda:{}".format(options.gpu)))
    if options.old_ema is not None:
        # ema = pickle.load(open(options.old_ema, "rb"))
        ema = torch.load(options.old_ema, map_location=device)
    else:
        ema = EMA(options.exp_decay_rate)
        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.register(name, param.data)

    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
def train(args, data):
    device = torch.device(
        "cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu")
    model = BiDAF(args, data.WORD.vocab.vectors).to(device)

    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    writer = SummaryWriter(log_dir='runs/' + args.model_time)

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1

    iterator = data.train_iter
    for i, batch in enumerate(iterator):
        present_epoch = int(iterator.epoch)
        if present_epoch == args.epoch:
            break
        if present_epoch > last_epoch:
            print('epoch:', present_epoch + 1)
        last_epoch = present_epoch

        p1, p2 = model(batch)

        optimizer.zero_grad()
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.update(name, param.data)

        if (i + 1) % args.print_freq == 0:
            dev_loss, dev_exact, dev_f1 = test(model, ema, args, data)
            c = (i + 1) // args.print_freq

            writer.add_scalar('loss/train', loss, c)
            writer.add_scalar('loss/dev', dev_loss, c)
            writer.add_scalar('exact_match/dev', dev_exact, c)
            writer.add_scalar('f1/dev', dev_f1, c)
            print('train loss: {} / dev loss: {}'.format(loss, dev_loss) +
                  ' / dev EM: {} / dev F1: {}'.format(dev_exact, dev_f1))

            if dev_f1 > max_dev_f1:
                max_dev_f1 = dev_f1
                max_dev_exact = dev_exact
                best_model = copy.deepcopy(model)

            loss = 0
            model.train()

    writer.close()
    print('max dev EM: {} / max dev F1: {}'.format(max_dev_exact, max_dev_f1))

    return best_model
Example #7
def train(args, data):
    model = BiDAF(args, data.WORD.vocab.vectors)
    if args.load_model != "":
        model.load_state_dict(torch.load(args.load_model))
    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    for name, i in model.named_parameters():
        if not i.is_leaf:
            print(name,i)

    writer = SummaryWriter(log_dir='runs/' + args.model_name)
    best_model = None

    for iterator, dev_iter, dev_file_name, index, print_freq, lr in zip(
            data.train_iter, data.dev_iter, args.dev_files,
            range(len(data.train)), args.print_freq, args.learning_rate):
        optimizer = optim.Adadelta(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()
        model.train()
        loss, last_epoch = 0, 0
        max_dev_exact, max_dev_f1 = -1, -1
        print(f"Training with {dev_file_name}")
        print()
        for i, batch in tqdm(enumerate(iterator), total=len(iterator) * args.epoch[index], ncols=100):
            present_epoch = int(iterator.epoch)
            eva = False
            if present_epoch == args.epoch[index]:
                break
            if present_epoch > last_epoch:
                print('epoch:', present_epoch + 1)
                eva = True
            last_epoch = present_epoch

            p1, p2 = model(batch)

            optimizer.zero_grad()
            batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
            loss += batch_loss.item()
            batch_loss.backward()
            optimizer.step()

            for name, param in model.named_parameters():
                if param.requires_grad:
                    ema.update(name, param.data)

            torch.cuda.empty_cache()
            if (i + 1) % print_freq == 0 or eva:
                dev_loss, dev_exact, dev_f1 = test(model, ema, args, data, dev_iter, dev_file_name)
                c = (i + 1) // print_freq

                writer.add_scalar('loss/train', loss, c)
                writer.add_scalar('loss/dev', dev_loss, c)
                writer.add_scalar('exact_match/dev', dev_exact, c)
                writer.add_scalar('f1/dev', dev_f1, c)
                print()
                print(f'train loss: {loss:.3f} / dev loss: {dev_loss:.3f}'
                      f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}')

                if dev_f1 > max_dev_f1:
                    max_dev_f1 = dev_f1
                    max_dev_exact = dev_exact
                    best_model = copy.deepcopy(model)

                loss = 0
                model.train()

    writer.close()
    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')
    print("testing with test batch on best model")
    test_loss, test_exact, test_f1 = test(best_model, ema, args, data, list(data.test_iter)[-1], args.test_files[-1])

    print(f'test loss: {test_loss:.3f}'
          f' / test EM: {test_exact:.3f} / test F1: {test_f1:.3f}')
    return best_model
Example #8
def train(args, data):
    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = BiDAF(args, data.CONTEXT_WORD.vocab.vectors).to(device)
    
    num = count_parameters(model)
    print(f'parameters: {num}')

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)

    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    writer = SummaryWriter(log_dir='runs/' + args.model_time)

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1
    print('training for {} epochs in total'.format(args.epoch))
    
    sys.stdout.flush()
    iterator = data.train_iter
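    # let the iterator cycle over the data indefinitely; the loop below breaks manually
    # once args.epoch full passes have been seen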
    iterator.repeat = True
    for i, batch in enumerate(iterator):

        present_epoch = int(iterator.epoch)
        if present_epoch == args.epoch:
            print('present_epoch value:',present_epoch)
            break
        if present_epoch > last_epoch:
            print('epoch:', present_epoch + 1)
        last_epoch = present_epoch

        p1, p2 = model(batch.c_char, batch.q_char, batch.c_word[0], batch.q_word[0],
                       batch.c_word[1], batch.q_word[1])
        optimizer.zero_grad()
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.update(name, param.data)

        if (i + 1) % args.print_freq == 0:
            (dev_loss, dev_exact, dev_f1, dev_hasans_exact, dev_hasans_f1,
             dev_noans_exact, dev_noans_f1) = test(model, ema, args, data)
            c = (i + 1) // args.print_freq

            writer.add_scalar('loss/train', loss, c)
            writer.add_scalar('loss/dev', dev_loss, c)
            writer.add_scalar('exact_match/dev', dev_exact, c)
            writer.add_scalar('f1/dev', dev_f1, c)
            print(f'train loss: {loss:.3f} / dev loss: {dev_loss:.3f}'
                  f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}'
                  f' / dev hasans EM: {dev_hasans_exact} / dev hasans F1: {dev_hasans_f1}'
                  f' / dev noans EM: {dev_noans_exact} / dev noans F1: {dev_noans_f1}')

            if dev_f1 > max_dev_f1:
                max_dev_f1 = dev_f1
                max_dev_exact = dev_exact
                best_model = copy.deepcopy(model)

            loss = 0
            model.train() 
        sys.stdout.flush()
    writer.close()
    args.max_f1 = max_dev_f1
    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')

    return best_model
def predict():
    parser = argparse.ArgumentParser()
    parser.add_argument('--char-dim', default=8, type=int)
    parser.add_argument('--char-channel-width', default=5, type=int)
    parser.add_argument('--char-channel-size', default=100, type=int)
    parser.add_argument('--context-threshold', default=400, type=int)
    parser.add_argument('--dev-batch-size', default=100, type=int)
    parser.add_argument('--test-batch-size', default=100, type=int)
    parser.add_argument('--dev-file', default='dev-v1.1.json')
    parser.add_argument('--test-file', default='test1.json')
    parser.add_argument('--dropout', default=0.2, type=float)
    parser.add_argument('--epoch', default=12, type=int)
    parser.add_argument('--exp-decay-rate', default=0.999, type=float)
    parser.add_argument('--gpu', default=0, type=int)
    parser.add_argument('--hidden-size', default=200, type=int)
    parser.add_argument('--learning-rate', default=0.5, type=float)
    parser.add_argument('--print-freq', default=250, type=int)
    parser.add_argument('--train-batch-size', default=60, type=int)
    parser.add_argument('--train-file', default='train-v1.1.json')
    parser.add_argument('--word-dim', default=300, type=int)
    args = parser.parse_args()

    print('loading SQuAD data...')
    current_dir = os.getcwd()
    current_dir = os.path.join(current_dir, 'BiDAF')
    path = os.path.join(current_dir, 'testing_files')
    data = SQuAD(args, path)
    setattr(args, 'char_vocab_size', len(data.CHAR.vocab))
    setattr(args, 'word_vocab_size', len(data.WORD.vocab))
    setattr(args, 'dataset_file', f'testing_files/{args.dev_file}')
    setattr(args, 'prediction_file', f'output/prediction{time.time()}.out')
    setattr(args, 'model_time', strftime('%H:%M:%S', gmtime()))
    print('data loading complete!')

    device = torch.device(
        f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = BiDAF(args, data.WORD.vocab.vectors).to(device)

    # load the trained parameters into the model
    model_path = os.path.join(current_dir, 'saved_models')
    model.load_state_dict(
        torch.load(os.path.join(model_path, 'BiDAF_tl_new.pt')))
    model.eval()  # switch to evaluation mode for inference

    answers = dict()  # answers to be saved in dictionary format
    with torch.set_grad_enabled(False):
        for batch in iter(data.test_iter):
            # print(batch)
            p1, p2 = model(batch)
            # batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
            # loss += batch_loss.item()

            batch_size, c_len = p1.size()
            ls = nn.LogSoftmax(dim=1)
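            # mask[s, e] = -inf wherever s > e, so after adding the start/end
            # log-probabilities no predicted span can end before it starts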
            mask = (torch.ones(c_len, c_len) *
                    float('-inf')).to(device).tril(-1).unsqueeze(0).expand(
                        batch_size, -1, -1)
            score = (ls(p1).unsqueeze(2) + ls(p2).unsqueeze(1)) + mask
            score, s_idx = score.max(dim=1)
            score, e_idx = score.max(dim=1)
            s_idx = torch.gather(s_idx, 1, e_idx.view(-1, 1)).squeeze()
            for i in range(batch_size):
                id = batch.id[i]
                answer = batch.c_word[0][i][s_idx[i]:e_idx[i] + 1]
                answer = ' '.join(
                    [data.WORD.vocab.itos[idx] for idx in answer])
                answers[id] = answer

    print(answers)
    return answers
Example #10
def train(args, data):
    device = torch.device(
        f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = BiDAF(args, data.WORD.vocab.vectors).to(device)

    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    writer = SummaryWriter(log_dir='runs/' + args.model_time)

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1

    iterator = data.train_iter
    num_batch = len(iterator)
    for present_epoch in range(args.epoch):
        print('epoch', present_epoch + 1)
        for i, batch in enumerate(iterator):
            # present_epoch = int(iterator.epoch)
            """
            if present_epoch == args.epoch:
                print(present_epoch)
                print()
                print(args.epoch)
                break
            if present_epoch > last_epoch:
                print('epoch:', present_epoch + 1)
            last_epoch = present_epoch
            """

            p1, p2 = model(batch)

            optimizer.zero_grad()
            """
            print(p1)
            print()
            print(batch.s_idx)
            """

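            # guard for the case where the model outputs come back 1-D (e.g. a final
            # batch of size 1); CrossEntropyLoss expects a (batch, positions) shape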
            if len(p1.size()) == 1:
                p1 = p1.reshape(1, -1)
            if len(p2.size()) == 1:
                p2 = p2.reshape(1, -1)
            batch_loss = criterion(p1, batch.s_idx) + criterion(
                p2, batch.e_idx)
            loss += batch_loss.item()
            batch_loss.backward()
            optimizer.step()

            for name, param in model.named_parameters():
                if param.requires_grad:
                    ema.update(name, param.data)

            best_model = copy.deepcopy(model)

            if i + 1 == num_batch:
                dev_loss, dev_exact, dev_f1 = test(model, ema, args, data)
                c = (i + 1) // args.print_freq

                writer.add_scalar('loss/train', loss / num_batch, c)
                writer.add_scalar('loss/dev', dev_loss, c)
                writer.add_scalar('exact_match/dev', dev_exact, c)
                writer.add_scalar('f1/dev', dev_f1, c)
                print(
                    f'train loss: {loss/num_batch:.3f} / dev loss: {dev_loss:.3f}'
                    f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}')

                if dev_f1 > max_dev_f1:
                    max_dev_f1 = dev_f1
                    max_dev_exact = dev_exact
                    best_model = copy.deepcopy(model)

                loss = 0
                model.train()

    writer.close()
    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')

    return best_model
Example #11
def train(args, data):
    device = torch.device(
        f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = BiDAF(args).to(device)

    D_batch = args.train_batch_size
    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    # writer = SummaryWriter(log_dir='runs/' + args.model_time)

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1
    i = 0
    # iterator = data.train_iter
    while i + D_batch < len(data.data):
        b_id = i
        e_id = i + D_batch
        # present_epoch = int(iterator.epoch)
        # if present_epoch == args.epoch:
        #     break
        # if present_epoch > last_epoch:
        #     print('epoch:', present_epoch + 1)
        # last_epoch = present_epoch

        p1, p2 = model(data, b_id, e_id)

        optimizer.zero_grad()
        s_idx, e_idx = data.get_targ(b_id, e_id)
        batch_loss = criterion(p1, s_idx) + criterion(p2, e_idx)
        loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.update(name, param.data)

        # if (i + 1) % args.print_freq == 0:
        #     dev_loss, dev_exact, dev_f1 = test(model, ema, args, data)
        #     c = (i + 1) // args.print_freq

        #     # writer.add_scalar('loss/train', loss, c)
        #     # writer.add_scalar('loss/dev', dev_loss, c)
        #     # writer.add_scalar('exact_match/dev', dev_exact, c)
        #     # writer.add_scalar('f1/dev', dev_f1, c)
        #     # print(f'train loss: {loss:.3f} / dev loss: {dev_loss:.3f}'
        #     #       f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}')

        #     if dev_f1 > max_dev_f1:
        #         max_dev_f1 = dev_f1
        #         max_dev_exact = dev_exact
        #         best_model = copy.deepcopy(model)

        #     loss = 0
        #     model.train()

        i += D_batch

    # writer.close()
    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')

    # the dev-evaluation block above is commented out, so best_model is never assigned;
    # return the final trained model instead
    return model
Example #12
def train(args, data):
    device = torch.device("cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu")
    model = BiDAF(args).to(device)

    # If some of the loaded parameters should not be updated (i.e. stay fixed and take
    # no part in training), their requires_grad attribute must be set to False manually,
    # and those parameters must be filtered out when they are passed to the optimizer:
    # ema = EMA(args.exp_decay_rate)
    # for name, param in model.named_parameters():
    #     if param.requires_grad:
    #         ema.register(name, param.data)
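    # For example (hypothetical attribute name, not taken from this repo): keeping the
    # pretrained word embedding frozen so the filter below skips it could look like
    #     model.word_emb.weight.requires_grad = False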
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adam(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    writer = SummaryWriter(logdir='runs/' + args.model_time)

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1

    iterator = data.train_iter
    for i, batch in enumerate(iterator):
        present_epoch = int(iterator.epoch)
        if present_epoch == args.epoch:
            break
        if present_epoch > last_epoch:
            print('epoch:', present_epoch + 1)
        last_epoch = present_epoch
        try:
            p1, p2 = model(batch)
        except OSError:
            # skip this batch instead of falling through with p1/p2 undefined
            continue
        optimizer.zero_grad()
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss = batch_loss.item()
        batch_loss.backward()
        optimizer.step()
        print("loss", loss)
        # for name, param in model.named_parameters():
        #     if param.requires_grad:
        #         ema.update(name, param.data)
    #     if (i + 1) % args.print_freq == 0:
    #         dev_loss, dev_exact, dev_f1 = test(model, args, data)
    #         c = (i + 1) // args.print_freq

    #         writer.add_scalar('loss/train', loss, c)
    #         writer.add_scalar('loss/dev', dev_loss, c)
    #         writer.add_scalar('exact_match/dev', dev_exact, c)
    #         writer.add_scalar('f1/dev', dev_f1, c)
    #         print('train loss: {:.3f} / dev loss: {:.3f} dev EM: {:.3f} dev F1: {:.3f}'.format(loss, dev_loss, dev_exact, dev_f1))

    #         if dev_f1 > max_dev_f1:
    #             max_dev_f1 = dev_f1
    #             max_dev_exact = dev_exact
    #             best_model = copy.deepcopy(model)

    #         loss = 0
    #         model.train()

    # writer.close()
    # print('max dev EM: {:.3f} / max dev F1: {:.3f}'.format(max_dev_exact, max_dev_f1))

    # the dev-evaluation block above is commented out, so best_model is never assigned;
    # return the trained model instead
    return model
Example #13
parser = argparse.ArgumentParser()
parser.add_argument('--context-threshold', default=400, type=int)
parser.add_argument('--dev-batch-size', default=100, type=int)
parser.add_argument('--dev-file', default='dev-v1.1.json')
parser.add_argument('--dropout', default=0.2, type=float)
parser.add_argument('--epoch', default=12, type=int)
parser.add_argument('--exp-decay-rate', default=0.999, type=float)
parser.add_argument('--gpu', default=0, type=int)
parser.add_argument('--hidden-size', default=100, type=int)
parser.add_argument('--learning-rate', default=0.5, type=float)
parser.add_argument('--print-freq', default=250, type=int)
parser.add_argument('--train-batch-size', default=60, type=int)
parser.add_argument('--train-file', default='train-v1.1.json')
parser.add_argument('--word-dim', default=100, type=int)
args = parser.parse_args()

print('loading SQuAD data...')
data = SQuAD(args)
setattr(args, 'char_vocab_size', len(data.CHAR.vocab))
setattr(args, 'word_vocab_size', len(data.WORD.vocab))
setattr(args, 'dataset_file', f'.data/squad/{args.dev_file}')
setattr(args, 'prediction_file', f'prediction{args.gpu}.out')
setattr(args, 'model_time', strftime('%H:%M:%S', gmtime()))
print('data loading complete!')

device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
model = BiDAF(args, data.WORD.vocab.vectors).to(device)

# load the trained parameters into the model
# model.load_state_dict(torch.load(r'saved_models/BiDAF_08:03:17.pt'))
model.eval()    # switch to evaluation mode
print(model)
parser.add_argument('--word-dim', default=100, type=int)
args = parser.parse_args()

path = r'testing_files'
print('loading SQuAD data...')
data = SQuAD(args, path)
setattr(args, 'char_vocab_size', len(data.CHAR.vocab))
setattr(args, 'word_vocab_size', len(data.WORD.vocab))
setattr(args, 'dataset_file', f'testing_files/{args.dev_file}')
setattr(args, 'prediction_file', f'output/prediction{time.time()}.out')
setattr(args, 'model_time', strftime('%H:%M:%S', gmtime()))
print('data loading complete!')

device = torch.device(
    f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
model = BiDAF(args, data.WORD.vocab.vectors).to(device)

# load the trained parameters into the model
model.load_state_dict(torch.load(r'saved_models/BiDAF_08:03:17.pt'))
model.eval()  # switch to evaluation mode

answers = dict()  # answers to be saved in dictionary format
with torch.set_grad_enabled(False):
    for batch in iter(data.dev_iter):
        # batch.to(device)
        p1, p2 = model(batch)
        # batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        # loss += batch_loss.item()

        # (batch, c_len, c_len)
        batch_size, c_len = p1.size()