            # Count down through topics: nonnegative counters go to train,
            # the next dev_topic_num topics to dev, and the following
            # test_topic_num topics to test.
            if topic_counter >= 0:
                new_data["data"].append(topic_dict)
            elif topic_counter >= -1 * dev_topic_num:
                new_dev_data["data"].append(topic_dict)
            elif topic_counter >= -1 * (dev_topic_num + test_topic_num):
                new_test_data["data"].append(topic_dict)
            else:
                break

            topic_counter -= 1

    logger.info(f"Saving new data to {train_file}")
    save(filename=train_file, obj=new_data)
    logger.info(f"Saving new dev data to {dev_file}")
    save(filename=dev_file, obj=new_dev_data)
    logger.info(f"Saving new test data to {test_file}")
    save(filename=test_file, obj=new_test_data)


if __name__ == "__main__":
    args = get_exp2_data_gen_args()
    log = get_logger(log_dir=args.logging_dir, name="data-gen")
    toy_transformer(in_file=args.raw_train_data,
                    train_file=args.train_data_src,
                    dev_file=args.dev_data_src,
                    test_file=args.test_data_src,
                    train_topic_num=args.train_topic_num,
                    dev_topic_num=args.dev_topic_num,
                    test_topic_num=args.test_topic_num,
                    logger=log)
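The save helper isn't shown in this excerpt; a minimal sketch, assuming it simply serializes a JSON-compatible object to disk the way the call sites above suggest:

import json

def save(filename, obj):
    # Hypothetical helper, inferred from the call sites above:
    # write a JSON-serializable object to disk.
    with open(filename, "w") as fh:
        json.dump(obj, fh)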
Example 2
def main(args):
    # setting up logging
    logger = get_logger(log_dir=args.logging_dir, name="exp3_evaluation")

    # grabbing GPU
    gpu_ids = []
    if torch.cuda.is_available():
        gpu_ids += [gpu_id for gpu_id in range(torch.cuda.device_count())]
        device = torch.device(f'cuda:{gpu_ids[0]}')
        torch.cuda.set_device(device)
    else:
        device = torch.device('cpu')

    logger.info(f"Using device type: {device}")

    # getting word embeddings
    with open(args.word_emb_file, 'r') as fh:
        word_vectors = np.array(json.load(fh))
    word_vectors = torch.from_numpy(word_vectors)

    # loading in the model
    model = classifier(args=args, word_vectors=word_vectors)
    if gpu_ids:
        model = nn.DataParallel(model, gpu_ids)

    ckpt_dict = torch.load("./checkpoints/train/exp3_train-34/best.pth.tar",
                           map_location=device)

    model.load_state_dict(ckpt_dict['model_state'])
    model = model.to(device)
    model.eval()  # disable dropout etc. for evaluation

    dataset = qcd(data_path=args.dev_feature_file,
                  num_categories=args.num_categories)
    loader = data.DataLoader(dataset,
                             shuffle=True,
                             batch_size=args.batch_size,
                             collate_fn=collate_fn)

    # loading eval_file
    with open(args.dev_eval_file, 'r') as fh:
        gold_dict = json.load(fh)
        all_predicted_indexes = {}
        predicted_indexes = {}
        with torch.no_grad():
            for qw_idxs, ids, topic_ids, lengths in loader:

                # Tensor.to() is not in-place; reassign the results.
                qw_idxs = qw_idxs.to(device)
                ids = ids.to(device)
                topic_ids = topic_ids.to(device)

                batch_size = qw_idxs.size(0)

                if batch_size != args.batch_size:
                    logger.info(
                        'Skipping batch: size did not match args.batch_size')
                    continue

                # Build one-hot targets (constructed here but unused by the
                # argmax evaluation below).
                targets = [torch.zeros(args.num_categories) for _ in topic_ids]
                targets = torch.stack(targets)
                for tid, t in zip(topic_ids, targets):
                    t[tid] = 1

                res = model(qw_idxs, lengths)

                predicted_indexes = {
                    int(idx): int(torch.argmax(i))
                    for i, idx in zip(res, ids)
                }
                all_predicted_indexes.update(predicted_indexes)

        logger.info(
            f"Was able to predict {len(all_predicted_indexes)}/{len(gold_dict)} total examples."
        )

        correct = 0
        total_eval = 0
        for i in all_predicted_indexes:
            # json.load produces string keys, so look ids up as strings.
            key = str(i)
            if key in gold_dict:
                if all_predicted_indexes[i] == gold_dict[key]:
                    correct += 1
                total_eval += 1
        logger.info(f"Got {correct}/{total_eval} correct")
Example 3
def main(args):

    # Set up logging and devices
    name = "train_exp2"
    args.save_dir = util.get_save_dir(args.logging_dir, name, training=True)
    log = get_logger(args.save_dir, name)
    tbx = SummaryWriter(args.save_dir)
    device, gpu_ids = util.get_available_devices()
    log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}")
    args.batch_size *= max(1, len(gpu_ids))

    # Set random seed
    log.info(f"Using random seed {args.random_seed}...")
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)

    # Get embeddings
    log.info(f"Loading embeddings from {args.word_emb_file}...")
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info("Building model...")
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, gpu_ids)
    if args.load_path:
        log.info(f"Loading checkpoint from {args.load_path}...")
        model, step = util.load_model(model, args.load_path, gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.learning_rate,
                               weight_decay=args.learning_rate_decay)
    # scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR
    scheduler = sched.ReduceLROnPlateau(optimizer=optimizer,
                                        mode="min", factor=0.1,
                                        patience=2, verbose=True, cooldown=0,
                                        min_lr=0.0005)


    for epoch in range(args.num_epochs):
        log.info(f"Starting epoch {epoch}...")
        for i in range(args.num_train_chunks):
            # Get data loader
            train_rec_file = f"{args.train_record_file_exp2}_{i}.npz"
            log.info(f'Building dataset from {train_rec_file} ...')
            train_dataset = SQuAD(train_rec_file, args.exp2_train_topic_contexts, use_v2=True)
            train_loader = data.DataLoader(train_dataset,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           num_workers=args.num_workers,
                                           collate_fn=collate_fn)

            # Train
            log.info('Training...')
            steps_till_eval = args.eval_steps
            # torch.set_num_threads(7)
            with torch.enable_grad(), tqdm(total=len(train_loader.dataset)) as progress_bar:
                for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                    # Setup for forward
                    cw_idxs = cw_idxs.to(device)
                    qw_idxs = qw_idxs.to(device)
                    batch_size = qw_idxs.size(0)
                    optimizer.zero_grad()

                    # Forward
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)
                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                    loss_val = loss.item()

                    # Backward
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                    optimizer.step()
                    # ReduceLROnPlateau expects the monitored metric, not a step count.
                    scheduler.step(loss_val)
                    ema(model, step // batch_size)

                    # Log info
                    step += batch_size
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch,
                                             NLL=loss_val)
                    tbx.add_scalar('train/NLL', loss_val, step)
                    tbx.add_scalar('train/LR',
                                   optimizer.param_groups[0]['lr'],
                                   step)

                    steps_till_eval -= batch_size
                    if steps_till_eval <= 0:
                        steps_till_eval = args.eval_steps

                        # Evaluate and save checkpoint
                        log.info(f"Evaluating at step {step}...")
                        ema.assign(model)

                        # Accumulate results across all dev chunks; use j so the
                        # train-chunk index i is not shadowed.
                        all_pred_dicts = {}
                        all_results = OrderedDict()
                        for j in range(args.num_dev_chunks):
                            # Get data loader
                            dev_rec_file = f"{args.dev_record_file_exp2}_{j}.npz"
                            log.info(f'Building evaluating dataset from {dev_rec_file} ...')
                            dev_dataset = SQuAD(dev_rec_file,
                                                args.exp2_dev_topic_contexts,
                                                use_v2=True)
                            dev_loader = data.DataLoader(dev_dataset,
                                                         batch_size=args.batch_size,
                                                         shuffle=False,
                                                         num_workers=args.num_workers,
                                                         collate_fn=collate_fn)
                            results, pred_dict = evaluate(model, dev_loader, device,
                                                          args.dev_eval_file,
                                                          args.max_ans_len,
                                                          use_squad_v2=True)
                            all_results.update(results)
                            all_pred_dicts.update(pred_dict)

                            del dev_dataset
                            del dev_loader
                            del results
                            del pred_dict
                            torch.cuda.empty_cache()

                        saver.save(step, model, all_results[args.metric_name], device)
                        ema.resume(model)

                        # Log to console
                        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in all_results.items())
                        log.info(f"Dev {results_str}")

                        # Log to TensorBoard
                        log.info('Visualizing in TensorBoard...')
                        for k, v in all_results.items():
                            tbx.add_scalar(f"dev/{k}", v, step)
                        util.visualize(tbx,
                                       pred_dict=all_pred_dicts,
                                       eval_path=args.dev_eval_file,
                                       step=step,
                                       split='dev',
                                       num_visuals=args.num_visuals)
                    torch.cuda.empty_cache()
            del train_dataset
            del train_loader
            torch.cuda.empty_cache()
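The training loops in this listing rely on an EMA class through the ema(model, num_updates) / ema.assign(model) / ema.resume(model) protocol, but its definition isn't shown. A minimal sketch, assuming it keeps an exponential moving average of the parameters and temporarily swaps the averaged weights in for evaluation:

import torch

class EMA:
    # Sketch of the assumed interface: __call__(model, num_updates),
    # assign(model) before evaluation, resume(model) after.
    def __init__(self, model, decay):
        self.decay = decay
        self.shadow = {n: p.data.clone()
                       for n, p in model.named_parameters() if p.requires_grad}
        self.original = {}

    def __call__(self, model, num_updates):
        # Bias-corrected decay early in training (a common convention).
        decay = min(self.decay, (1.0 + num_updates) / (10.0 + num_updates))
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = \
                    (1.0 - decay) * param.data + decay * self.shadow[name]

    def assign(self, model):
        # Swap the averaged weights in for evaluation.
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.original[name] = param.data.clone()
                param.data = self.shadow[name]

    def resume(self, model):
        # Restore the original training weights.
        for name, param in model.named_parameters():
            if param.requires_grad:
                param.data = self.original[name]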
Example 4
def main(args):

    exp_name = "exp3_train"

    # setting up logging
    log = get_logger(args.logging_dir, exp_name)

    # setting a save directory
    save_dir = get_save_dir("./checkpoints",
                            exp_name,
                            training=True,
                            id_max=200)

    # setting up tensor board
    tbx = SummaryWriter(save_dir)

    # setting up saver
    saver = CheckpointSaver(save_dir=save_dir,
                            max_checkpoints=args.max_checkpoints,
                            metric_name="BCELoss",
                            log=log)

    # setting the random seed
    log.info(f"Using random seed {args.random_seed}...")
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)

    # grabbing a gpu if one is available, falling back to cpu otherwise
    gpu_ids = list(range(torch.cuda.device_count()))
    if gpu_ids:
        device = torch.device(f'cuda:{gpu_ids[0]}')
        torch.cuda.set_device(device)
    else:
        device = torch.device('cpu')

    log.info(f"Using device type: {device}")

    # getting word embeddings
    with open(args.word_emb_file, 'r') as fh:
        word_vectors = np.array(json.load(fh))
    word_vectors = torch.from_numpy(word_vectors)

    # setting up the datasets
    train_dataset = qcd(data_path=args.train_feature_file,
                        num_categories=args.num_categories)
    train_loader = data.DataLoader(train_dataset,
                                   shuffle=True,
                                   batch_size=args.batch_size,
                                   collate_fn=collate_fn)
    dev_dataset = qcd(data_path=args.dev_feature_file,
                      num_categories=args.num_categories)
    dev_loader = data.DataLoader(dev_dataset,
                                 shuffle=False,
                                 batch_size=args.batch_size,
                                 collate_fn=collate_fn)

    # setting up the model
    model = classifier(args=args, word_vectors=word_vectors)
    if gpu_ids:
        model = nn.DataParallel(model, gpu_ids)
    model.to(device)
    model.train()
    ema = EMA(model, args.ema_decay)

    # optimizer = optim.Adadelta(model.parameters(), args.learning_rate,
    #                            weight_decay=args.learning_rate_decay)
    # scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR
    optimizer = optim.SGD(model.parameters(), lr=args.learning_rate)
    step = 0
    steps_till_eval = args.eval_steps

    log.info(f"Vars: {json.dumps(vars(args), indent=4, sort_keys=True)}")

    for epoch in range(args.num_epochs):
        log.info(f"Starting epoch {epoch+1}")
        with torch.enable_grad(), tqdm(
                total=len(train_loader.dataset)) as progress_bar:
            for qw_idxs, ids, topic_ids, lengths in train_loader:
                qw_idxs = qw_idxs.to(device)
                batch_size = qw_idxs.size(0)
                if batch_size != args.batch_size:
                    log.info(
                        'Skipping batch: size did not match args.batch_size')
                    continue
                topic_ids = topic_ids.to(device)
                lengths = lengths.to(device)
                optimizer.zero_grad()

                # targets = [torch.zeros(args.num_categories) for _ in topic_ids]
                # targets = torch.stack(targets).to(device)
                # for tid, t in zip(topic_ids, targets):
                #     t[tid] = 1
                res = model(qw_idxs, lengths)

                # F.nll_loss over log-softmax is equivalent to nn.CrossEntropyLoss
                # on raw logits; nn.BCELoss / nn.BCEWithLogitsLoss would instead
                # need the one-hot float targets sketched above.
                #         loss = nn.CrossEntropyLoss()
                #         loss = nn.BCELoss()
                #         loss = nn.BCEWithLogitsLoss()
                loss_output = F.nll_loss(F.log_softmax(res, dim=1), topic_ids)
                loss_output.backward()
                loss_val = loss_output.item()
                optimizer.step()
                # scheduler.step(step//batch_size)
                ema(model, step // batch_size)

                step += batch_size
                steps_till_eval -= batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(NLL=(loss_val), Epoch=(epoch + 1))

                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    log.info(f"Evaluating at step: {step}")

                    ema.assign(model)
                    perc_correct, vis_examples, avg_loss = evaluate(
                        model, dev_loader, device, args.dev_eval_file)
                    log.info(
                        f"Out-of-sample dev loss: {avg_loss} at step {step} in epoch {epoch+1}; {perc_correct} percent correct"
                    )

                    tbx.add_scalar("NLL Loss", loss_val, step)
                    tbx.add_scalar("Percent Accuracy", perc_correct, step)

                    for i, example in enumerate(vis_examples):
                        tbl_fmt = (
                            f'- **Question:** {example["question"]}\n' +
                            f'- **Topic ID:** {example["answer"]}\n' +
                            f'- **Prediction:** {example["prediction"]}')

                        tbx.add_text(tag=f'{i}_of_{len(vis_examples)}',
                                     text_string=tbl_fmt,
                                     global_step=step)

                    saver.save(model=model,
                               step=step,
                               epoch=epoch,
                               metric_val=loss_val,
                               device=device)
                    ema.resume(model)
                    model.to(device)
                    model.train()

                    log.info(f"resuming training on device {device}")
                                    "answer_start"]
                            answer_dict["text"] = answer["text"]

                            qas_dict["answers"].append(answer_dict)
                    paragraph["qas"].append(qas_dict)
                topic_dict["paragraphs"].append(paragraph)
            new_data["data"].append(topic_dict)

    logger.info(f"Processed {counter} question, answer pairs")
    logger.info(f"Saving to {out_file}")
    save(filename=out_file, obj=new_data)


if __name__ == "__main__":
    args = get_exp1_transform_args()
    logger = get_logger(log_dir=args.logging_dir,
                        name="exp_1 data transformer")

    # standard sanity check to run every time
    c, b = get_new_context("test", [
        "test1", "test2", "test3", "test4", "test5", "test6", "test7", "test8",
        "test9"
    ])
    test_val = "test" == c[b:b + 4]
    if test_val != True:
        raise ValueError('The get_new_context function is not working')

    if args.datasplit == "train" or args.datasplit == "all":
        exp_1_transformer(args.train_data_src, args.train_data_exp1, logger)
    if args.datasplit == "dev" or args.datasplit == "all":
        exp_1_transformer(args.dev_data_src, args.dev_data_exp1, logger)
    if args.datasplit == "test" or args.datasplit == "all":
Example 6
                                  out_file=args.dev_record_file_exp1,
                                  word2idx_dict=word2idx_dict,
                                  char2idx_dict=char2idx_dict,
                                  is_test=False)
    save(args.dev_meta_file, dev_meta)
    save(args.dev_eval_file, dev_eval)
    del dev_meta
    del dev_eval

    #    test_examples, test_eval = process_file(filename=args.test_data_exp1,
    #                                            data_type="test",
    #                                            word_counter=word_counter,
    #                                            char_counter=char_counter,
    #                                            logger=logger)
    #    test_meta = build_features(args=args, examples=test_examples, data_type="test",
    #                               out_file=args.test_record_file_exp1, word2idx_dict=word2idx_dict,
    #                               char2idx_dict=char2idx_dict, is_test=True)
    #    save(args.test_meta_file, test_meta)
    #    save(args.test_eval_file, test_eval)

    save(args.word2idx_file, word2idx_dict)
    save(args.char2idx_file, char2idx_dict)


if __name__ == '__main__':
    nlp = spacy.blank("en")
    args = get_exp1_setup_args()
    logger = get_logger(log_dir=args.logging_dir, name="exp1_setup")

    pre_process(args=args, logger=logger)
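word2idx_dict and char2idx_dict map tokens to embedding-matrix rows; they are built elsewhere in the setup, presumably from the word and character counters. A hedged sketch of how such a mapping is typically constructed, with hypothetical NULL/OOV conventions:

from collections import Counter

def build_token2idx(counter, min_count=1):
    # Hypothetical sketch: reserve 0 for padding and 1 for out-of-vocabulary
    # tokens, then index tokens seen at least min_count times.
    token2idx = {"--NULL--": 0, "--OOV--": 1}
    for token, count in counter.most_common():
        if count >= min_count:
            token2idx[token] = len(token2idx)
    return token2idx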
Example 7
def main(args):
    args.save_dir = util.get_save_dir(args.save_dir,
                                      "exp1_training",
                                      training=False)
    log = get_logger(args.logging_dir, "exp1_training")
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size)
    model = nn.DataParallel(model, gpu_ids)

    log.info(f'Loading checkpoint from {args.load_path}...')
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info('Building dataset...')
    dataset = SQuAD(args.test_record_file, True)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    log.info(f'Evaluating on {args.datasplit} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    with open(args.test_eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len, True)

            # Log info
            progress_bar.update(batch_size)

            # Not using the unlabeled test set
            #            if args.split != 'test':
            #                # No labels for the test set, so NLL would be invalid
            #                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(), True)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    results = util.eval_dicts(gold_dict, pred_dict, True)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM']),
                    ('AvNA', results['AvNA'])]
    results = OrderedDict(results_list)
    # Log to console
    results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
    log.info(f'{args.datasplit} {results_str}')
    # Log to TensorBoard
    tbx = SummaryWriter(args.save_dir)
    util.visualize(tbx,
                   pred_dict=pred_dict,
                   eval_path=args.test_eval_file,
                   step=0,
                   split=args.datasplit,
                   num_visuals=args.num_visuals)
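util.discretize converts the start/end probability vectors into concrete span indices. A minimal sketch of the assumed behavior, picking per example the (start, end) pair that maximizes p_start[i] * p_end[j] subject to 0 <= j - i < max_len (the real version also reserves position 0 for the SQuAD 2.0 no-answer case when its final flag is set):

import torch

def discretize(p_start, p_end, max_len=15):
    # p_start, p_end: (batch_size, c_len) probability tensors.
    # Joint probability of every (start, end) pair via an outer product.
    p_joint = torch.matmul(p_start.unsqueeze(2), p_end.unsqueeze(1))
    c_len = p_start.size(1)
    # Keep only pairs with start <= end and span length < max_len.
    ones = torch.ones(c_len, c_len, device=p_joint.device)
    band = torch.triu(ones) - torch.triu(ones, diagonal=max_len)
    p_joint = p_joint * band
    # Argmax over the flattened (start, end) grid.
    best = p_joint.view(p_joint.size(0), -1).argmax(dim=1)
    starts = torch.div(best, c_len, rounding_mode='floor')
    ends = best % c_len
    return starts, ends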