Example #1
    def forward(self, y_z, mu_phi, log_var_phi, mu_theta, log_var_theta):
        std_theta = std(log_var_theta)
        std_phi = std(log_var_phi)
        N = Normal(mu_theta, std_theta)

        # if the input contains more than one time step per series, keep only
        # the last step of each window
        y_z = y_z.view(-1, self.n_samples_y, self.input_size)
        y_z = y_z[:, -1, :]

        # two clusters: duplicate the target so its log-density is evaluated
        # under both mixture components
        y_expanded = th.cat([y_z, y_z], dim=1)
        pdf_y = N.log_prob(y_expanded)
        pdf_y = reshape(pdf_y)
        # sample z to build an empirical mean over z for the likelihood;
        # only one sample of the mixture is drawn at a time, so each
        # component's likelihood is simply the Normal density
        loglikelihood = 0
        # for every sample compute the weighted mixture
        for sample in range(self.n_samples_z):
            eps = th.randn(y_expanded.size())
            # z_y acts as a selector/weight over the clusters (each z_i is a
            # three-dimensional Gaussian, so we can also measure uncertainty)
            z_y = eps * std_phi + mu_phi

            z_y = reshape(z_y)
            z_y = F.softmax(z_y, dim=2)
            # log-density of the mixture, weighted by z
            loglikelihood += th.sum(pdf_y * z_y, dim=2)

        loglikelihood /= self.n_samples_z
        # sum over the lidar beams, then average over the batch
        loglikelihood = th.sum(loglikelihood, dim=1)
        loglikelihood = th.mean(loglikelihood)
        loglikelihood = move_to_cuda(loglikelihood)

        # reduce over KLD
        # explicit form when q(z|x) is normal and N(0,I)
        # what about k? 9 or 27?
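        # note: for a k-dimensional diagonal Gaussian vs. N(0, I) the closed form is
        # KL = 0.5 * sum_i(exp(log_var_i) + mu_i**2 - 1 - log_var_i), so the "-1"
        # applies per dimension and the sum over dim=1 below accounts for k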
        k = 1  # z_y.size()[2]
        kld = 0.5 * ((log_var_phi.exp() + mu_phi.pow(2) - log_var_phi) - k)
        kld = th.sum(kld, dim=1)
        kld = th.mean(kld)

        # we want to maximize the ELBO
        elbo = loglikelihood - kld
        # so we need to negate the elbo to minimize
        return -elbo, kld, loglikelihood
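The forward() above relies on helpers that are defined elsewhere in the original module (std, reshape, and Normal). A minimal sketch of plausible implementations, with illustrative dimensions chosen to match the two-cluster concatenation and the dim=2 reductions above (the real code may differ):

import torch as th
from torch.distributions import Normal  # assumed source of the Normal used above


def std(log_var):
    # log-variance -> standard deviation (the usual VAE parameterisation)
    return th.exp(0.5 * log_var)


def reshape(t, n_clusters=2, input_size=9):
    # (batch, n_clusters * input_size) -> (batch, input_size, n_clusters),
    # so that softmax and sums over dim=2 act across the mixture components
    return t.view(-1, n_clusters, input_size).permute(0, 2, 1)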
Example #2
def train(args):
    ckpt = ckpt_utc()
    train_set = PandaDataSetImg(root_dir=args.data_dir, split=args.split)
    train_loader = DataLoader(dataset=train_set,
                              batch_size=args.batch_size,
                              shuffle=True)

    loss_fn = LossReconstruction()
    loss_fn = move_to_cuda(loss_fn)

    model = Autoencoder()
    model = move_to_cuda(model)
    model.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    epoch_loss_history = []
    for epoch in range(args.epochs):
        loss_batch_history = []
        for iter, x in enumerate(train_loader):

            x = tensor_to_variable(x)
            depth_pred, _ = model(x)
            loss = loss_fn(x, depth_pred)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            loss_batch_history.append(loss.item())

        epoch_loss = np.mean(loss_batch_history)
        epoch_loss_history.append(epoch_loss)

        print("train epoch: {} avg. loss: {:.4f}".format(epoch, epoch_loss))

        plot_eval(
            np.arange(len(epoch_loss_history)),
            np.array(epoch_loss_history),
            save_to=osp.join(args.result_dir, "train_loss.png"),
            title="train loss",
            xlabel="epochs",
            ylabel="loss",
        )
        th.save(model.state_dict(), osp.join(args.ckpt_dir, ckpt))
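For reference, train() above reads its configuration from an args object; a minimal, hypothetical invocation might look like the following (all paths and hyperparameters are placeholders, not values from the original project):

from argparse import Namespace

args = Namespace(
    data_dir="./data",
    split="train",
    batch_size=32,
    learning_rate=1e-3,
    epochs=50,
    result_dir="./results",
    ckpt_dir="./checkpoints",
)
# train(args)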
Example #3
def test(args):
    test_set = PandaDataSetImg(root_dir=args.data_dir, split=args.split)
    test_loader = DataLoader(dataset=test_set, batch_size=args.batch_size, shuffle=False)
    model = Autoencoder()
    model = move_to_cuda(model)
    model.eval()

    saved_state_dict = th.load(args.ckpt_dir + args.ckpt_test)
    model.load_state_dict(saved_state_dict)

    MSE, T = 0.0, 0
    for iter, x in enumerate(test_loader):
        x = move_to_cuda(x)
        with th.no_grad():
            x_pred, _ = model(x)

        b, _, n, m = x.size()
        T += n * m * b
        MSE += th.sum((x - x_pred) ** 2).item()

    print("RMSE: {:.4f}".format(np.sqrt(MSE / T)))
Example #4
def predict(args, model, eval_dataloader, device, logger):
    model.eval()
    num_correct = 0
    num_total = 0.0
    rrs = []  # reciprocal rank
    for batch in tqdm(eval_dataloader):
        batch_to_feed = move_to_cuda(batch)
        with torch.no_grad():
            outputs = model(batch_to_feed)

            q = outputs['q']
            c = outputs['c']
            neg_c = outputs['neg_c']

            product_in_batch = torch.mm(q, c.t())
            product_neg = (q * neg_c).sum(-1).unsqueeze(1)
            product = torch.cat([product_in_batch, product_neg], dim=-1)

            target = torch.arange(product.size(0)).to(product.device)
            ranked = product.argsort(dim=1, descending=True)
            prediction = product.argmax(-1)

            # MRR
            idx2rank = ranked.argsort(dim=1)
            for idx, t in enumerate(target.tolist()):
                rrs.append(1 / (idx2rank[idx][t].item() + 1))

            pred_res = prediction == target
            num_total += pred_res.size(0)
            num_correct += pred_res.sum().item()

    acc = num_correct / num_total
    mrr = np.mean(rrs)
    logger.info(f"evaluated {num_total} examples...")
    logger.info(f"avg. Acc: {acc}")
    logger.info(f'MRR: {mrr}')
    model.train()
    return mrr
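The reciprocal-rank computation in predict() uses a double argsort: the argsort of the descending argsort maps each candidate index back to its 0-based rank, so idx2rank[idx][t] is the position of the gold passage t in that row's ranking. A tiny worked example with made-up scores:

import torch

scores = torch.tensor([[0.2, 0.9, 0.5]])         # one query, three candidates
ranked = scores.argsort(dim=1, descending=True)  # tensor([[1, 2, 0]])
idx2rank = ranked.argsort(dim=1)                 # tensor([[2, 0, 1]])
# candidate 1 ranks first (rank 0), so its reciprocal rank is 1 / (0 + 1) = 1.0
rr = 1.0 / (idx2rank[0][1].item() + 1)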
Example #5
    logger.info("Encoding claims and searching")
    questions = [_["claim"] for _ in ds_items]
    metrics = []
    retrieval_outputs = []
    for b_start in tqdm(range(0, len(questions), args.batch_size)):
        with torch.no_grad():
            batch_q = questions[b_start:b_start + args.batch_size]
            batch_ann = ds_items[b_start:b_start + args.batch_size]
            bsize = len(batch_q)
            batch_q_encodes = tokenizer.batch_encode_plus(
                batch_q,
                max_length=args.max_q_len,
                pad_to_max_length=True,
                return_tensors="pt")
            batch_q_encodes = move_to_cuda(dict(batch_q_encodes))
            q_embeds = model.encode_q(
                batch_q_encodes["input_ids"],
                batch_q_encodes["attention_mask"],
                batch_q_encodes.get("token_type_ids", None))
            q_embeds_numpy = q_embeds.cpu().contiguous().numpy()

            D, I = index.search(q_embeds_numpy, args.topk)

            for b_idx in range(bsize):
                topk_docs = []
                for _, doc_id in enumerate(I[b_idx]):
                    doc = id2doc[str(doc_id)]
                    topk_docs.append({"title": doc[0], "text": doc[1]})

                # saving when there are no annotations
Example #6
def train(args):
    ckpt = ckpt_utc()

    loss_fn = nELBO(args.batch_size, args.n_samples_z, args.n_samples_y)

    model = VAE(
        encoder_layer_sizes=args.encoder_layer_sizes,
        decoder_layer_sizes=args.decoder_layer_sizes,
        latent_size=args.latent_size,
        batch_size=args.batch_size,
        conditional=args.conditional,
        num_labels=args.num_labels,
    )
    model = move_to_cuda(model)
    model.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    optimizer.zero_grad()

    dataset = Loader(split=args.split, samples=args.n_samples_y)
    # re-randomize an auxiliary index each epoch: we train on random windows of
    # the time series (10 time steps each) while keeping each window intact
    data_loader = DataLoader(dataset=dataset,
                             batch_size=args.batch_size,
                             shuffle=False)

    loss_list = []
    for epoch in range(args.epochs):
        dataset.generate_index()
        print("Epoch: ", epoch)
        L = []
        for itr, batch in enumerate(data_loader):
            # observable
            y, x = batch
            y = tensor_to_variable(y)
            x = tensor_to_variable(x)
            if y.size(0) != args.batch_size:
                continue
            else:
                mu_phi, log_var_phi, mu_theta, log_var_theta = model(y, x)

                loss, kld, ll = loss_fn(y, mu_phi, log_var_phi, mu_theta,
                                        log_var_theta)

                if args.split == "train":
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()

                # accumulate the per-batch loss; averaged over the epoch below
                L.append(loss.cpu().data.numpy())

        print("negative likelihood: ", -ll.cpu().data.numpy())
        print("kl: ", kld.cpu().data.numpy())
        print("loss:", loss.cpu().data.numpy())

        loss_list.append(np.mean(L) / (len(data_loader)))

    plt.plot(np.array(loss_list))
    plt.grid()
    plt.show()

    path_exists(args.ckpt_dir)
    th.save(model.state_dict(), args.ckpt_dir + ckpt)
    print("done!")
Example #7
def train(args):
    ckpt = ckpt_utc()
    loss_fn = Loss(
        args.batch_size,
        args.n_samples_y,
        args.lidar_input_size,
        args.n_clusters,
        model_type=args.model_type,
        is_entropy=args.is_entropy,
        lmbda=args.lmbda,
    )
    model = Model(
        encoder_layer_sizes=args.encoder_layer_sizes,
        latent_size=args.latent_size,
        n_clusters=args.n_clusters,
        batch_size=args.batch_size,
        model_type=args.model_type,
        is_multimodal=args.is_multimodal,
    )
    model = move_to_cuda(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    optimizer.zero_grad()

    dataset = Dataset(path=args.data_dir, split=args.split, n_samples=args.n_samples_y)
    # re-randomize an auxiliary index each epoch: we train on random windows of
    # the time series (10 time steps each), but each series has to stay intact
    data_loader = DataLoader(dataset=dataset, batch_size=args.batch_size, shuffle=False)

    dataset_val = Dataset(path=args.data_dir, split="val", n_samples=args.n_samples_y)
    data_loader_val = DataLoader(dataset=dataset_val, batch_size=args.batch_size, shuffle=False)

    loss_train, loss_val = [], []

    for epoch in range(args.epochs):
        model.train()
        dataset.generate_index()
        print("Epoch: ", epoch)
        loss_epoch = []
        for itr, batch in enumerate(data_loader):
            # observable
            y, x, depth = batch
            y = tensor_to_variable(y)
            x = tensor_to_variable(x)
            depth = tensor_to_variable(depth)
            mu_c, std_c, clusters = model(x)

            loss = loss_fn(y, mu_c, std_c, clusters)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # accumulate the per-batch loss; averaged over the epoch below
            loss_epoch.append(loss.cpu().data.numpy())

        print("train loss:", np.mean(loss_epoch))
        loss_train.append(np.mean(loss_epoch))

        if epoch % args.test_every_n_epochs == 0:
            model.eval()
            loss_epoch = []
            with th.no_grad():
                for itr, batch in enumerate(data_loader_val):
                    y, x, depth = batch
                    y = tensor_to_variable(y)
                    x = tensor_to_variable(x)
                    mu_c, std_c, clusters = model(x)
                    loss = loss_fn(y, mu_c, std_c, clusters)
                    loss_epoch.append(loss.cpu().data.numpy())

            print("val loss:", np.mean(loss_epoch))
            loss_val.append(np.mean(loss_epoch))

    plt.plot(np.array(loss_train))
    plt.plot(np.array(loss_val))
    plt.grid()
    plt.show()

    path_exists(args.ckpt_dir)
    th.save(model.state_dict(), args.ckpt_dir + ckpt)
    print("done!")
Example #8
def train(args):
    ckpt = ckpt_utc()
    loss_fn = Loss()
    model = Model(layer_sizes=args.encoder_layer_sizes,
                  latent_size=args.latent_size,
                  is_uq=False)
    model = move_to_cuda(model)

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    optimizer.zero_grad()

    # re-randomize an auxiliary index each epoch: we train on random windows of
    # the time series (10 time steps each), but each series has to stay intact
    dataset = Dataset(
        path=args.data_dir,
        path_images=args.data_dir + "TRAIN_DATA/DEPTH/",
        split=args.split,
        n_samples=args.n_samples_y,
        is_label_y=args.is_label_y,
        is_multimodal=args.is_multimodal,
    )
    data_loader = DataLoader(dataset=dataset,
                             batch_size=args.batch_size,
                             shuffle=False)

    dataset_val = Dataset(
        path=args.data_dir,
        path_images=args.data_dir + "TRAIN_DATA/DEPTH/",
        split="val",
        n_samples=args.n_samples_y,
        is_label_y=args.is_label_y,
        is_multimodal=args.is_multimodal,
    )
    data_loader_val = DataLoader(dataset=dataset_val,
                                 batch_size=args.batch_size,
                                 shuffle=False)

    loss_train, loss_val = [], []

    for epoch in range(args.epochs):
        model.train()
        dataset.generate_index()
        print("Epoch: ", epoch)
        loss_epoch = []
        for itr, batch in enumerate(data_loader):
            # observable
            y, x, lbl = batch
            y = tensor_to_variable(y)
            x = tensor_to_variable(x)
            lbl = tensor_to_variable(lbl)
            state = th.cat([y, x], dim=1)

            pred = model(state)
            pred = pred.reshape(-1, args.n_clusters,
                                args.lidar_input_size)  # .permute(0,2,1)

            loss = loss_fn(pred, lbl)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # accumulate the per-batch loss; averaged over the epoch below
            loss_epoch.append(loss.cpu().data.numpy())

        print("train loss:", np.mean(loss_epoch))
        loss_train.append(np.mean(loss_epoch))

        if epoch % args.test_every_n_epochs == 0:
            model.eval()
            loss_epoch = []
            with th.no_grad():
                for itr, batch in enumerate(data_loader_val):
                    y, x, lbl = batch
                    y = tensor_to_variable(y)
                    x = tensor_to_variable(x)
                    lbl = tensor_to_variable(lbl)
                    state = th.cat([y, x], dim=1)

                    pred = model(state)
                    pred = pred.reshape(
                        -1, args.n_clusters,
                        args.lidar_input_size)  # .permute(0,2,1)

                    loss = loss_fn(pred, lbl)
                    loss_epoch.append(loss.cpu().data.numpy())

            print("val loss:", np.mean(loss_epoch))
            loss_val.append(np.mean(loss_epoch))

    plt.plot(np.array(loss_train))
    plt.plot(np.array(loss_val))
    plt.grid()
    plt.show()

    path_exists(args.ckpt_dir)
    th.save(model.state_dict(), args.ckpt_dir + ckpt)
    print("done!")
Example #9
def main():
    args = train_args()

    if args.fp16:
        apex.amp.register_half_function(torch, 'einsum')

    date_curr = date.today().strftime("%m-%d-%Y")
    model_name = f"{args.prefix}-seed{args.seed}-bsz{args.train_batch_size}-fp16{args.fp16}-lr{args.learning_rate}-decay{args.weight_decay}-warm{args.warmup_ratio}-{args.model_name}"
    args.output_dir = os.path.join(args.output_dir, date_curr, model_name)
    tb_logger = SummaryWriter(
        os.path.join(args.output_dir.replace("logs", "tflogs")))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print(
            f"output directory {args.output_dir} already exists and is not empty."
        )
    os.makedirs(args.output_dir, exist_ok=True)

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO,
        handlers=[
            logging.FileHandler(os.path.join(args.output_dir, "log.txt")),
            logging.StreamHandler()
        ])
    logger = logging.getLogger(__name__)
    logger.info(args)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    bert_config = AutoConfig.from_pretrained(args.model_name)
    if args.momentum:
        model = MomentumRetriever(bert_config, args)
    elif "roberta" in args.model_name:
        model = RobertaRetrieverSingle(bert_config, args)
    else:
        model = BertRetrieverSingle(bert_config, args)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    collate_fc = partial(sp_collate, pad_id=tokenizer.pad_token_id)

    if args.do_train and args.max_c_len > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_c_len, bert_config.max_position_embeddings))

    if "fever" in args.predict_file:
        eval_dataset = FeverSingleDataset(tokenizer, args.predict_file,
                                          args.max_q_len, args.max_c_len)
    else:
        eval_dataset = SPDataset(tokenizer, args.predict_file, args.max_q_len,
                                 args.max_c_len)
    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=args.predict_batch_size,
                                 collate_fn=collate_fc,
                                 pin_memory=True,
                                 num_workers=args.num_workers)
    logger.info(f"Num of dev batches: {len(eval_dataloader)}")

    if args.init_checkpoint != "":
        model = load_saved(model, args.init_checkpoint)

    model.to(device)
    print(
        f"number of trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}"
    )

    if args.do_train:
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
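        # bias and LayerNorm weights are excluded from weight decay (a common
        # convention when fine-tuning BERT-style encoders); all other
        # parameters use args.weight_decay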
        optimizer = Adam(optimizer_parameters,
                         lr=args.learning_rate,
                         eps=args.adam_epsilon)

        if args.fp16:
            model, optimizer = apex.amp.initialize(
                model, optimizer, opt_level=args.fp16_opt_level)
    else:
        if args.fp16:
            model = apex.amp.initialize(model, opt_level=args.fp16_opt_level)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        global_step = 0  # gradient update step
        batch_step = 0  # forward batch count
        best_mrr = 0
        train_loss_meter = AverageMeter()
        model.train()
        if "fever" in args.predict_file:
            train_dataset = FeverSingleDataset(tokenizer,
                                               args.train_file,
                                               args.max_q_len,
                                               args.max_c_len,
                                               train=True)
        else:
            train_dataset = SPDataset(tokenizer,
                                      args.train_file,
                                      args.max_q_len,
                                      args.max_c_len,
                                      train=True)
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=args.train_batch_size,
                                      pin_memory=True,
                                      collate_fn=collate_fc,
                                      num_workers=args.num_workers,
                                      shuffle=True)

        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs
        warmup_steps = t_total * args.warmup_ratio
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_total)

        logger.info('Start training....')
        for epoch in range(int(args.num_train_epochs)):

            for batch in tqdm(train_dataloader):
                batch_step += 1
                batch = move_to_cuda(batch)
                loss = loss_single(model, batch, args.momentum)

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                train_loss_meter.update(loss.item())

                if (batch_step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        torch.nn.utils.clip_grad_norm_(
                            apex.amp.master_params(optimizer),
                            args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

                    tb_logger.add_scalar('batch_train_loss', loss.item(),
                                         global_step)
                    tb_logger.add_scalar('smoothed_train_loss',
                                         train_loss_meter.avg, global_step)

                    if args.eval_period != -1 and global_step % args.eval_period == 0:
                        mrr = predict(args, model, eval_dataloader, device,
                                      logger)
                        logger.info(
                            "Step %d Train loss %.2f MRR %.2f on epoch=%d" %
                            (global_step, train_loss_meter.avg, mrr * 100,
                             epoch))

                        if best_mrr < mrr:
                            logger.info(
                                "Saving model with best MRR %.2f -> MRR %.2f on epoch=%d"
                                % (best_mrr * 100, mrr * 100, epoch))
                            torch.save(
                                model.state_dict(),
                                os.path.join(args.output_dir,
                                             f"checkpoint_best.pt"))
                            model = model.to(device)
                            best_mrr = mrr

            mrr = predict(args, model, eval_dataloader, device, logger)
            logger.info("Step %d Train loss %.2f MRR %.2f on epoch=%d" %
                        (global_step, train_loss_meter.avg, mrr * 100, epoch))
            tb_logger.add_scalar('dev_mrr', mrr * 100, epoch)
            if best_mrr < mrr:
                torch.save(
                    model.state_dict(),
                    os.path.join(args.output_dir, f"checkpoint_last.pt"))
                logger.info(
                    "Saving model with best MRR %.2f -> MRR %.2f on epoch=%d" %
                    (best_mrr * 100, mrr * 100, epoch))
                torch.save(
                    model.state_dict(),
                    os.path.join(args.output_dir, f"checkpoint_best.pt"))
                model = model.to(device)
                best_mrr = mrr

        logger.info("Training finished!")

    elif args.do_predict:
        acc = predict(args, model, eval_dataloader, device, logger)
        logger.info(f"test performance {acc}")