Example no. 1
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')

    model, step = get_model(word_vectors, char_vectors, log, args)
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    # optimizer = get_opt(model, args)
    # optimizer = optim.Adam(model.parameters(), lr=1, betas=[0.8, 0.999],
    #                        eps=1e-7, weight_decay=args.l2_wd)
    # warmup = args.warmup
    # cooldown = args.hidden_size * 100
    # scheduler = sched.LambdaLR(
    #     optimizer,
    #     lambda step: min(0.001, 0.001 * (step / warmup) ** 0.5) if step <= cooldown
    #     else max(1e-4, 1e-3 * (1 - min(1, (step - cooldown) / cooldown)) ** 0.5))
    # scheduler = sched.LambdaLR(
    #     optimizer,
    #     lambda step: min(0.001, 0.001 * (step / warmup) ** 0.5))
    # cr = 1.0 / math.log(warmup)
    # scheduler = sched.LambdaLR(
    #     optimizer,
    #     lr_lambda=lambda ee: cr * math.log(ee + 1) if ee < warmup else 1)
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda step: 1)

    # Get data loader
    log.info('Building dataset...')
    if args.model_name == "KnowGQA":
        train_dataset = SQUADwithGraph(args.train_record_file_graph,
                                       args.use_squad_v2)
        dev_dataset = SQUADwithGraph(args.dev_record_file_graph,
                                     args.use_squad_v2)
        train_loader = data.DataLoader(train_dataset,
                                       batch_size=args.batch_size,
                                       shuffle=True,
                                       num_workers=args.num_workers,
                                       collate_fn=collate_fn_graph)

        dev_loader = data.DataLoader(dev_dataset,
                                     batch_size=4,
                                     shuffle=False,
                                     num_workers=args.num_workers,
                                     collate_fn=collate_fn_graph)
    else:
        train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
        dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
        train_loader = data.DataLoader(train_dataset,
                                       batch_size=args.batch_size,
                                       shuffle=True,
                                       num_workers=args.num_workers,
                                       collate_fn=collate_fn)

        dev_loader = data.DataLoader(dev_dataset,
                                     batch_size=args.batch_size,
                                     shuffle=False,
                                     num_workers=args.num_workers,
                                     collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
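    # step counts training examples seen so far, so this recovers the approximate epoch
    # when resuming from a checkpoint (step > 0); a fresh run starts at epoch 0.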
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            if args.model_name == "KnowGQA":
                for cw_idxs, cc_idxs, qw_idxs, qc_idxs, co_idxs, adjs, y1, y2, ids in train_loader:
                    # Setup for forward
                    cw_idxs = cw_idxs.to(device)
                    cc_idxs = cc_idxs.to(device)
                    qw_idxs = qw_idxs.to(device)
                    qc_idxs = qc_idxs.to(device)
                    co_idxs = co_idxs.to(device)
                    adjs = adjs.to(device)
                    batch_size = cw_idxs.size(0)
                    optimizer.zero_grad()

                    # Forward
                    log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs,
                                           co_idxs, adjs)

                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                    loss_val = loss.item()

                    # Backward
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    ema(model, step // batch_size)

                    # Log info
                    step += batch_size
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                    tbx.add_scalar('train/NLL', loss_val, step)
                    tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                                   step)

                    steps_till_eval -= batch_size
                    if steps_till_eval <= 0:
                        steps_till_eval = args.eval_steps

                        # Evaluate and save checkpoint
                        log.info(f'Evaluating at step {step}...')
                        ema.assign(model)
                        results, pred_dict = evaluate(model, "KnowGQA",
                                                      dev_loader, device,
                                                      args.dev_eval_file,
                                                      args.max_ans_len,
                                                      args.use_squad_v2)
                        saver.save(step, model, results[args.metric_name],
                                   device)
                        ema.resume(model)

                        # Log to console
                        results_str = ', '.join(f'{k}: {v:05.2f}'
                                                for k, v in results.items())
                        log.info(f'Dev {results_str}')

                        # Log to TensorBoard
                        log.info('Visualizing in TensorBoard...')
                        for k, v in results.items():
                            tbx.add_scalar(f'dev/{k}', v, step)
                        util.visualize(tbx,
                                       pred_dict=pred_dict,
                                       eval_path=args.dev_eval_file,
                                       step=step,
                                       split='dev',
                                       num_visuals=args.num_visuals)
            elif args.model_name == "BiDAF_nochar":
                for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                    # Setup for forward
                    cw_idxs = cw_idxs.to(device)
                    qw_idxs = qw_idxs.to(device)
                    batch_size = cw_idxs.size(0)
                    optimizer.zero_grad()

                    # Forward
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)

                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                    loss_val = loss.item()

                    # Backward
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    ema(model, step // batch_size)

                    # Log info
                    step += batch_size
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                    tbx.add_scalar('train/NLL', loss_val, step)
                    tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                                   step)

                    steps_till_eval -= batch_size
                    if steps_till_eval <= 0:
                        steps_till_eval = args.eval_steps

                        # Evaluate and save checkpoint
                        log.info(f'Evaluating at step {step}...')
                        ema.assign(model)
                        results, pred_dict = evaluate(model, "BiDAF_nochar",
                                                      dev_loader, device,
                                                      args.dev_eval_file,
                                                      args.max_ans_len,
                                                      args.use_squad_v2)
                        saver.save(step, model, results[args.metric_name],
                                   device)
                        ema.resume(model)

                        # Log to console
                        results_str = ', '.join(f'{k}: {v:05.2f}'
                                                for k, v in results.items())
                        log.info(f'Dev {results_str}')

                        # Log to TensorBoard
                        log.info('Visualizing in TensorBoard...')
                        for k, v in results.items():
                            tbx.add_scalar(f'dev/{k}', v, step)
                        util.visualize(tbx,
                                       pred_dict=pred_dict,
                                       eval_path=args.dev_eval_file,
                                       step=step,
                                       split='dev',
                                       num_visuals=args.num_visuals)

            else:
                for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                    # Setup for forward
                    cw_idxs = cw_idxs.to(device)
                    cc_idxs = cc_idxs.to(device)
                    qw_idxs = qw_idxs.to(device)
                    qc_idxs = qc_idxs.to(device)
                    batch_size = cw_idxs.size(0)
                    optimizer.zero_grad()

                    # Forward
                    log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)

                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                    loss_val = loss.item()

                    # Backward
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    ema(model, step // batch_size)

                    # Log info
                    step += batch_size
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                    tbx.add_scalar('train/NLL', loss_val, step)
                    tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                                   step)

                    steps_till_eval -= batch_size
                    if steps_till_eval <= 0:
                        steps_till_eval = args.eval_steps

                        # Evaluate and save checkpoint
                        log.info(f'Evaluating at step {step}...')
                        ema.assign(model)
                        results, pred_dict = evaluate(model, "BiDAF",
                                                      dev_loader, device,
                                                      args.dev_eval_file,
                                                      args.max_ans_len,
                                                      args.use_squad_v2)
                        saver.save(step, model, results[args.metric_name],
                                   device)
                        ema.resume(model)

                        # Log to console
                        results_str = ', '.join(f'{k}: {v:05.2f}'
                                                for k, v in results.items())
                        log.info(f'Dev {results_str}')

                        # Log to TensorBoard
                        log.info('Visualizing in TensorBoard...')
                        for k, v in results.items():
                            tbx.add_scalar(f'dev/{k}', v, step)
                        util.visualize(tbx,
                                       pred_dict=pred_dict,
                                       eval_path=args.dev_eval_file,
                                       step=step,
                                       split='dev',
                                       num_visuals=args.num_visuals)
Example no. 2
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir) # Writes entries directly to event files in the logdir to be consumed by TensorBoard.
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))  # with a single GPU: batch_size = 64; adjust to the actual setup

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path: # default=None
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay) # ema_decay = 0.999
    # ema core =>  new_average = (1.0 - decay) * param.data + decay * self.shadow[name]
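    # A minimal sketch of what util.EMA is assumed to look like, reconstructed only from
    # the comment above and the ema(model, step // batch_size) calls below; names and
    # details are hypothetical, not the actual implementation:
    #
    #   class EMA:
    #       def __init__(self, model, decay):
    #           self.decay = decay
    #           self.shadow = {name: p.data.clone()
    #                          for name, p in model.named_parameters() if p.requires_grad}
    #
    #       def __call__(self, model, num_updates):
    #           for name, p in model.named_parameters():
    #               if p.requires_grad:
    #                   self.shadow[name] = ((1.0 - self.decay) * p.data
    #                                        + self.decay * self.shadow[name])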

    # Get saver
    # metric_name: NLL or EM F1
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints, # max_checkpoints = 5
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)  # lr : default=0.5    l2_wd : default=0
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR
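    # Note: LambdaLR multiplies the base lr (args.lr) by whatever the lambda returns at each
    # scheduler.step(), so returning 1. keeps the lr constant; Adadelta then adapts its
    # effective per-parameter step sizes internally.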

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)  # train_record_file = './data/train.npz'

    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size, # 64
                                   shuffle=True, # sampler = RandomSampler(dataset) batch_sampler = BatchSampler(sampler, batch_size, drop_last)
                                   num_workers=args.num_workers, # 4
                                   collate_fn=collate_fn) # merges a list of samples to form a mini-batch.
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) #  dev_record_file =  './data/dev.npz'
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)# Merge examples of different length by padding all examples to the maximum length in the batch.
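    # A minimal sketch of the padding behaviour described in the comment above (hypothetical;
    # the real collate_fn also stacks the label and id tensors):
    #
    #   def pad_batch(tensors, pad_value=0):
    #       max_len = max(t.size(0) for t in tensors)
    #       padded = torch.full((len(tensors), max_len), pad_value, dtype=tensors[0].dtype)
    #       for i, t in enumerate(tensors):
    #           padded[i, :t.size(0)] = t
    #       return padded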

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps # 50000
    epoch = step // len(train_dataset) # len(train_dataset)= 7   epoch=0
    while epoch != args.num_epochs: # 30
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)  # 64
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)   # max_grad_norm : default=5.0
                optimizer.step()  # perform one optimizer update
                scheduler.step(step // batch_size)  # train: step=0
                ema(model, step // batch_size)   # def __call__(self, model, num_updates):

                # Log info
                step += batch_size #step: 0   batch_size=64
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch,
                                         NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step) # Add scalar data to summary.
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps  # 50000

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model) ##
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file, # './data/dev_eval.json'
                                                  args.max_ans_len,  # 15
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Example no. 3
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get model
    log.info('Building model...')
    '''
        TODO: YOUR MODEL HERE
    '''
    model = None

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = MyDataset(args.train_record_file)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = MyDataset(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for features, ys, ids in train_loader:
                # Setup for forward
                features = features.to(device)
                batch_size = features.size(0)  # TODO: adjust if features is not a single tensor
                optimizer.zero_grad()

                # Forward
                outputs = model(features)
                ys = ys.to(device)
                loss = loss_fn(outputs, ys)  # TODO: define loss_fn for your model

                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Example no. 4
0
def main(args):

    # Set up logging and devices
    name = "train_exp2"
    args.save_dir = util.get_save_dir(args.logging_dir, name, training=True)
    log = get_logger(args.save_dir, name)
    tbx = SummaryWriter(args.save_dir)
    device, gpu_ids = util.get_available_devices()
    log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}")
    args.batch_size *= max(1, len(gpu_ids))

    # Set random seed
    log.info(f"Using random seed {args.random_seed}...")
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)

    # Get embeddings
    log.info(f"Loading embeddings from {args.word_emb_file}...")
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info("Building model...")
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, gpu_ids)
    if args.load_path:
        log.info(f"Loading checkpoint from {args.load_path}...")
        model, step = util.load_model(model, args.load_path, gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.learning_rate,
                               weight_decay=args.learning_rate_decay)
    # scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR
    scheduler = sched.ReduceLROnPlateau(optimizer=optimizer,
                                        mode="min", factor=0.1,
                                        patience=2, verbose=True, cooldown=0,
                                        min_lr=0.0005)
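    # Note: unlike LambdaLR, ReduceLROnPlateau.step() expects the monitored quantity
    # (mode="min" here, so typically a loss); if it fails to improve for `patience` steps
    # the lr is multiplied by `factor`, but never reduced below `min_lr`.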


    for epoch in range(args.num_epochs):
        log.info(f"Starting epoch {epoch}...")
        for i in range(args.num_train_chunks):
            # Get data loader
            train_rec_file = f"{args.train_record_file_exp2}_{i}.npz"
            log.info(f'Building dataset from {train_rec_file} ...')
            train_dataset = SQuAD(train_rec_file, args.exp2_train_topic_contexts, use_v2=True)
            train_loader = data.DataLoader(train_dataset,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           num_workers=args.num_workers,
                                           collate_fn=collate_fn)

            # Train
            log.info('Training...')
            steps_till_eval = args.eval_steps
            # torch.set_num_threads(7)
            with torch.enable_grad(), tqdm(total=len(train_loader.dataset)) as progress_bar:
                for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                    # Setup for forward
                    cw_idxs = cw_idxs.to(device)
                    qw_idxs = qw_idxs.to(device)
                    batch_size = qw_idxs.size(0)
                    optimizer.zero_grad()

                    # Forward
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)
                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                    loss_val = loss.item()

                    # Backward
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                    optimizer.step()
                    scheduler.step(loss_val)  # ReduceLROnPlateau expects the monitored metric, not a step count
                    ema(model, step // batch_size)

                    # Log info
                    step += batch_size
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch,
                                             NLL=loss_val)
                    tbx.add_scalar('train/NLL', loss_val, step)
                    tbx.add_scalar('train/LR',
                                   optimizer.param_groups[0]['lr'],
                                   step)

                    steps_till_eval -= batch_size
                    if steps_till_eval <= 0:
                        steps_till_eval = args.eval_steps

                        # Evaluate and save checkpoint
                        log.info(f"Evaluating at step {step}...")
                        ema.assign(model)

                        all_pred_dicts = {}
                        all_results = OrderedDict()
                        for j in range(args.num_dev_chunks):
                            # Get data loader
                            dev_rec_file = f"{args.dev_record_file_exp2}_{j}.npz"
                            log.info(f'Building evaluating dataset from {dev_rec_file} ...')
                            dev_dataset = SQuAD(dev_rec_file, 
                                                args.exp2_dev_topic_contexts, 
                                                use_v2=True)
                            dev_loader = data.DataLoader(dev_dataset,
                                                           batch_size=args.batch_size,
                                                           shuffle=True,
                                                           num_workers=args.num_workers,
                                                           collate_fn=collate_fn)
                            results, pred_dict = evaluate(model, dev_loader, device,
                                                          args.dev_eval_file,
                                                          args.max_ans_len,
                                                          use_squad_v2=True)
                            all_results.update(results)
                            all_pred_dicts.update(pred_dict)

                            del dev_dataset
                            del dev_loader
                            del results
                            del pred_dict
                            torch.cuda.empty_cache()

                        saver.save(step, model, all_results[args.metric_name], device)
                        ema.resume(model)

                        # Log to console
                        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in all_results.items())
                        log.info(f"Dev {results_str}")

                        # Log to TensorBoard
                        log.info('Visualizing in TensorBoard...')
                        for k, v in all_results.items():
                            tbx.add_scalar(f"dev/{k}", v, step)
                        util.visualize(tbx,
                                       pred_dict=all_pred_dicts,
                                       eval_path=args.dev_eval_file,
                                       step=step,
                                       split='dev',
                                       num_visuals=args.num_visuals)
                    torch.cuda.empty_cache()
            del train_dataset
            del train_loader
            torch.cuda.empty_cache()
Example no. 5
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # ######################################
    # tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)
    # train_examples = None
    # train_examples = read_squad_examples(
    #     input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
    # train_features = convert_examples_to_features(
    #     examples=train_examples,
    #     tokenizer=tokenizer,
    #     max_seq_length=args.max_seq_length,
    #     doc_stride=args.doc_stride,
    #     max_query_length=args.max_query_length,
    #     is_training=True)
    # if args.local_rank == -1 or torch.distributed.get_rank() == 0:
    #     logger.info("  Saving train features into cached file %s", cached_train_features_file)
    #     with open(cached_train_features_file, "wb") as writer:
    #         pickle.dump(train_features, writer)
    # all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    # x = all_input_ids
    ###########################################

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  char_vectors=char_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)

                # added_flag
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)

                optimizer.zero_grad()

                # Forward
                # log_p1, log_p2 = model(cw_idxs, qw_idxs)
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Example no. 6
0
def main(course_dir,
         text_embedding_size,
         audio_embedding_size,
         image_embedding_size,
         hidden_size,
         drop_prob,
         max_text_length,
         out_heatmaps_dir,
         args,
         batch_size=3,
         num_epochs=100):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Create Dataset objects
    text_dataset = TextDataset(course_dir, max_text_length)
    audio_dataset = AudioDataset(course_dir)
    target_dataset = TargetDataset(course_dir)
    # Preprocess the image in prescribed format
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    transform = transforms.Compose([
        transforms.RandomResizedCrop(256),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    image_dataset = ImageDataset(course_dir, transform)

    assert len(text_dataset) == len(audio_dataset) and len(
        audio_dataset) == len(image_dataset) and len(image_dataset) == len(
            target_dataset), "Unequal dataset lengths"

    # Creating data indices for training and validation splits:
    train_indices, val_indices = gen_train_val_indices(text_dataset)

    # Creating PT data samplers and loaders:
    train_sampler = torch.utils.data.SequentialSampler(train_indices)
    val_sampler = torch.utils.data.SequentialSampler(val_indices)

    # Get sentence embeddings
    train_text_loader = torch.utils.data.DataLoader(text_dataset,
                                                    batch_size=batch_size,
                                                    shuffle=False,
                                                    num_workers=2,
                                                    collate_fn=collator,
                                                    sampler=train_sampler)
    val_text_loader = torch.utils.data.DataLoader(text_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  num_workers=2,
                                                  collate_fn=collator,
                                                  sampler=val_sampler)

    # Get Audio embeddings
    train_audio_loader = torch.utils.data.DataLoader(audio_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=False,
                                                     num_workers=2,
                                                     collate_fn=collator,
                                                     sampler=train_sampler)
    val_audio_loader = torch.utils.data.DataLoader(audio_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=False,
                                                   num_workers=2,
                                                   collate_fn=collator,
                                                   sampler=val_sampler)

    # Get images
    train_image_loader = torch.utils.data.DataLoader(image_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=False,
                                                     num_workers=2,
                                                     collate_fn=collator,
                                                     sampler=train_sampler)
    val_image_loader = torch.utils.data.DataLoader(image_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=False,
                                                   num_workers=2,
                                                   collate_fn=collator,
                                                   sampler=val_sampler)

    # Load Target text
    train_target_loader = torch.utils.data.DataLoader(
        target_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2,
        collate_fn=target_collator,
        sampler=train_sampler)
    val_target_loader = torch.utils.data.DataLoader(target_dataset,
                                                    batch_size=batch_size,
                                                    shuffle=False,
                                                    num_workers=2,
                                                    collate_fn=target_collator,
                                                    sampler=val_sampler)

    # print("lens - train_text_loader {}, val_text_loader {}".format(len(train_text_loader), len(val_text_loader)))
    # print("lens - train_audio_loader {}, val_audio_loader {}".format(len(train_audio_loader), len(val_audio_loader)))
    # print("lens - train_image_loader {}, val_image_loader {}".format(len(train_image_loader), len(val_image_loader)))
    # print("lens - train_target_loader {}, val_target_loader {}".format(len(train_target_loader), len(val_target_loader)))

    # Create model
    model = MMBiDAF(hidden_size, text_embedding_size, audio_embedding_size,
                    image_embedding_size, device, drop_prob, max_text_length)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)  # For exponential moving average

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)  # Need to change the metric name

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Let's do this!
    loss = 0
    eps = 1e-8
    log.info("Training...")
    steps_till_eval = args.eval_steps
    epoch = step // len(text_dataset)

    while epoch != args.num_epochs:
        epoch += 1
        log.info("Starting epoch {epoch}...")
        count_item = 0
        loss_epoch = 0
        with torch.enable_grad(), tqdm(
                total=len(train_text_loader.dataset)) as progress_bar:
            for (batch_text, original_text_lengths), (
                    batch_audio, original_audio_lengths), (
                        batch_images,
                        original_img_lengths), (batch_target_indices,
                                                batch_source_paths,
                                                batch_target_paths,
                                                original_target_len) in zip(
                                                    train_text_loader,
                                                    train_audio_loader,
                                                    train_image_loader,
                                                    train_target_loader):
                loss = 0
                max_dec_len = torch.max(
                    original_target_len
                )  # TODO check error : max decoder timesteps for each batch

                # Transfer tensors to GPU
                batch_text = batch_text.to(device)
                log.info("Loaded batch text")
                batch_audio = batch_audio.to(device)
                log.info("Loaded batch audio")
                batch_images = batch_images.to(device)
                log.info("Loaded batch image")
                batch_target_indices = batch_target_indices.to(device)
                log.info("Loaded batch targets")

                # Setup for forward
                batch_size = batch_text.size(0)
                optimizer.zero_grad()

                log.info("Starting forward pass")
                # Forward
                batch_out_distributions, loss = model(
                    batch_text, original_text_lengths, batch_audio,
                    original_audio_lengths, batch_images, original_img_lengths,
                    batch_target_indices, original_target_len, max_dec_len)
                loss_val = loss.item()  # numerical value of loss
                loss_epoch = loss_epoch + loss_val
                count_item += 1  # number of batches seen this epoch (used for the epoch-average loss below)

                log.info("Starting backward")

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(
                    model.parameters(),
                    args.max_grad_norm)  # To tackle exploding gradients
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    # TODO
                    # scores, results = evaluate(model, dev_loader, device,
                    #                               args.dev_eval_file,
                    #                               args.max_ans_len,
                    #                               args.use_squad_v2)
                    saver.save(step, model, device)
                    ema.resume(model)

                # Generate summary
                print('Generated summary for iteration {}: '.format(epoch))
                summaries = get_generated_summaries(batch_out_distributions,
                                                    original_text_lengths,
                                                    batch_source_paths)
                print(summaries)

                # Evaluation
                # rouge = Rouge()
                # rouge_scores = rouge.get_scores(batch_source_paths, batch_target_paths, avg=True)
                # print('Rouge score at iteration {} is {}: '.format(epoch, rouge_scores))

                # Generate Output Heatmaps
                # sns.set()
                # for idx in range(len(out_distributions)):
                #     out_distributions[idx] = out_distributions[idx].squeeze(0).detach().numpy()      # Converting each timestep distribution to numpy array
                # out_distributions = np.asarray(out_distributions)   # Converting the timestep list to array
                # ax = sns.heatmap(out_distributions)
                # fig = ax.get_figure()
                # fig.savefig(out_heatmaps_dir + str(epoch) + '.png')
            print("Epoch loss is : {}".format(loss_epoch / count_item))
Example no. 7
0
def main(args):
    if args.large:
        args.train_record_file += '_large'
        args.dev_eval_file += '_large'
        model_name = "albert-xlarge-v2"
    else:
        model_name = "albert-base-v2"
    if args.xxlarge:
        args.train_record_file += '_xxlarge'
        args.dev_eval_file += '_xxlarge'
        model_name = "albert-xxlarge-v2"
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get model
    log.info('Building model...')
    if args.bidaf:
        char_vectors = util.torch_from_json(args.char_emb_file)

    if args.model_name == 'albert_highway':
        model = models.albert_highway(model_name)
    elif args.model_name == 'albert_lstm_highway':
        model = models.LSTM_highway(model_name, hidden_size=args.hidden_size)
    elif args.model_name == 'albert_bidaf':
        model = models.BiDAF(char_vectors=char_vectors,
                             hidden_size=args.hidden_size,
                             drop_prob=args.drop_prob)
    elif args.model_name == 'albert_bidaf2':
        model = models.BiDAF2(model_name=model_name,
                              char_vectors=char_vectors,
                              hidden_size=args.hidden_size,
                              drop_prob=args.drop_prob)
    else:
        model = AlbertForQuestionAnswering.from_pretrained(args.model_name)

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2,
                          args.bidaf)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers)
    dev_dataset = SQuAD(args.dev_eval_file, args.use_squad_v2, args.bidaf)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers)

    with open(args.dev_gold_file) as f:
        gold_dict = json.load(f)

    tokenizer = AlbertTokenizer.from_pretrained(model_name)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for batch in train_loader:
                batch = tuple(t.to(device) for t in batch)
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    'start_positions': batch[3],
                    'end_positions': batch[4],
                }
                if args.bidaf:
                    inputs['char_ids'] = batch[6]
                y1 = batch[3]
                y2 = batch[4]
                # Setup for forward
                batch_size = inputs["input_ids"].size(0)
                optimizer.zero_grad()

                # Forward
                # log_p1, log_p2 = model(**inputs)
                y1, y2 = y1.to(device), y2.to(device)
                outputs = model(**inputs)
                loss = outputs[0]
                loss = loss.mean()
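                # With start_positions/end_positions supplied, this transformers QA model
                # returns the loss as the first element of its output tuple, and .mean()
                # averages the per-replica losses produced under nn.DataParallel.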
                # loss_fct = nn.CrossEntropyLoss()
                # loss = loss_fct(log_p1, y1) + loss_fct(log_p2, y2)
                # loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(args, model, dev_dataset,
                                                  dev_loader, gold_dict,
                                                  tokenizer, device,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
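A note on the ema.assign / ema.resume pattern used throughout these examples: util.EMA itself is not shown on this page. The sketch below is only an assumption of the usual design (keep an exponential moving average of the parameters, swap it in for evaluation, then restore the training weights); the method names mirror the calls above, everything else is illustrative.

# Minimal sketch of an EMA helper compatible with the calls above:
# util.EMA(model, decay), ema(model, num_updates), ema.assign(model),
# ema.resume(model). An assumption, not the project's actual util.EMA.
class EMA:
    def __init__(self, model, decay):
        self.decay = decay
        self.shadow = {}    # moving-average copies of the parameters
        self.original = {}  # backup used while the EMA weights are assigned
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def __call__(self, model, num_updates):
        # Bias-corrected decay so the first updates do not over-weight the init.
        decay = min(self.decay, (1.0 + num_updates) / (10.0 + num_updates))
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = \
                    (1.0 - decay) * param.data + decay * self.shadow[name]

    def assign(self, model):
        # Swap the EMA weights in for evaluation.
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.original[name] = param.data.clone()
                param.data.copy_(self.shadow[name])

    def resume(self, model):
        # Restore the training weights after evaluation.
        for name, param in model.named_parameters():
            if param.requires_grad:
                param.data.copy_(self.original[name])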
Example no. 8
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)


    # Get size of char vocab
    with open(args.char2idx_file, 'r') as fh:
        char_vocab_size = len(json_load(fh))

    # Get model
    log.info('Building model...')
    model = QANet(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  char_vocab_size=char_vocab_size,
                  char_emb_size=args.char_emb_size,
                  word_char_emb_size=args.word_char_emb_size,
                  drop_prob=args.drop_prob,
                  num_blocks_embd=args.num_blocks_embd,
                  num_conv_embd=args.num_conv_embd,
                  kernel_size=args.kernel_size,
                  num_heads=args.num_heads,
                  num_blocks_model=args.num_blocks_model,
                  num_conv_model=args.num_conv_model,
                  dropout_char=args.dropout_char,
                  dropout_word=args.dropout_word,
                  survival_prob=args.survival_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    # params = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(model.parameters(),
                           lr=1,
                           betas=(args.beta1, args.beta2),
                           eps=args.adam_eps,
                           weight_decay=args.l2_wd)
    cr = args.lr / math.log2(args.warm_up)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log2(ee + 1) if ee < args.warm_up else args.lr)


    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)

    torch.autograd.set_detect_anomaly(True)

    
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)

                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch,
                                         NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
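The optimizer in Example no. 8 above is created with a base learning rate of 1 so that the LambdaLR multiplier becomes the effective learning rate: it ramps logarithmically from 0 up to args.lr over the first args.warm_up scheduler steps and then stays at args.lr. A self-contained sketch of the same schedule follows; the concrete values (lr=1e-3, warm_up=1000) are illustrative, not taken from the example.

import math

import torch
from torch import optim

# Standalone sketch of the log-warmup schedule used in Example no. 8; the
# numbers are made up for illustration.
lr, warm_up = 1e-3, 1000
params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = optim.Adam(params, lr=1)  # base LR of 1, rescaled by the lambda below
cr = lr / math.log2(warm_up)
scheduler = optim.lr_scheduler.LambdaLR(
    optimizer,
    lr_lambda=lambda ee: cr * math.log2(ee + 1) if ee < warm_up else lr)

for _ in range(5):
    optimizer.step()   # no gradients here, so this is only a placeholder step
    scheduler.step()
    print(optimizer.param_groups[0]['lr'])  # climbs from 0 toward lr during warm-up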
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')

    # set device
    if args.use_gpu and torch.cuda.is_available():
        device = torch.device("cuda:{}".format(args.gpu_ids[0]))
        args.batch_size *= max(1, len(args.gpu_ids))
        print(f"device is cuda: gpu_ids = {args.gpu_ids}")
    else:
        device = torch.device("cpu")
        print("device is cpu")

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    model = NAQANet(
        device,
        word_vectors,
        char_vectors,
        c_max_len=args.context_limit,
        q_max_len=args.question_limit,
        answering_abilities=['passage_span_extraction', 'counting'],
        max_count=args.max_count
    )  # wouldn't a large max_count lead to meaningless count probabilities?

    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0

    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    lr = args.lr
    base_lr = 1.0
    warm_up = args.lr_warm_up_num
    params = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(lr=base_lr,
                                 betas=(args.beta1, args.beta2),
                                 eps=1e-7,
                                 weight_decay=3e-7,
                                 params=params)
    cr = lr / math.log(warm_up)
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log(ee + 1) if ee < warm_up else lr)

    # Get data loader
    log.info('Building dataset...')
    train_dataset = DROP(args.train_record_file)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = DROP(args.dev_record_file)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, \
                    qw_idxs, qc_idxs, \
                    start_idxs, end_idxs, \
                    counts, ids  in train_loader:

                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                start_idxs = start_idxs.to(device)
                end_idxs = end_idxs.to(device)
                counts = counts.to(device)
                ids = ids.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                output_dict = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs, ids,
                                    start_idxs, end_idxs, counts)

                loss = output_dict["loss"]
                loss = torch.sum(loss, dim=0) / len(args.gpu_ids)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
                optimizer.step()
                scheduler.step()
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

    # Save the model
    print("Saving the model ...")
    torch.save(model.state_dict(), args.model_dir)
    print("Done!")
Example no. 10
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    """
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    """
    '''
    model = charBiDAF(word_vectors=word_vectors,
                      char_vectors=char_vectors,
                      emb_size=char_vectors.size(1),
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob)
    '''
    model = QANet(word_vectors=word_vectors,
                  char_vectors=char_vectors,
                  emb_size=char_vectors.size(1),
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    log.info('Entering QANet model training...')
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    #print("Model status - ", cuda_or_cpu(model))
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    #torch.utils.data._utils.MP_STATUS_CHECK_INTERVAL = 300
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)

    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:

                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()

                #print(torch.cuda.memory_allocated(device=None))
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)
                #torch.cuda.empty_cache()

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    # print('Memory 1: ', torch.cuda.memory_allocated())
                    ema.assign(model)
                    # print('Memory 2: ', torch.cuda.memory_allocated())
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
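Most of these examples pass a collate_fn to DataLoader so that variable-length index sequences get merged into padded batch tensors. The project's own collate_fn is not reproduced on this page; the sketch below only illustrates the pad-to-longest idea for a single field, and the function name and pad index are assumptions.

import torch

def pad_collate(batch, pad_idx=0):
    """Hedged sketch of a pad-to-longest collate function for 1-D index tensors."""
    max_len = max(seq.size(0) for seq in batch)
    out = torch.full((len(batch), max_len), pad_idx, dtype=torch.long)
    for i, seq in enumerate(batch):
        out[i, :seq.size(0)] = seq
    return out

# Usage sketch: two context-word index sequences of different lengths.
padded = pad_collate([torch.tensor([4, 8, 15]), torch.tensor([16, 23])])
print(padded)  # second row is padded with 0 to the length of the first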
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, type="train")
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get your model
    log.info('Building model...')
    model, step = get_model(log, args)
    model = model.to(device)
    model.train()

    #Exponential moving average
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           betas=[0.8, 0.999],
                           eps=1e-7,
                           weight_decay=args.l2_wd)

    scheduler = sched.LambdaLR(optimizer, lambda step: 1)

    #get loss computer
    cri = FocalLoss(alpha=torch.tensor([args.alpha, 1]).to(device),
                    gamma=args.gamma)

    # Get data loader
    log.info('Building dataset...')

    dev_dataset = util.load_dataset(args.dev_file,
                                    args.PPI_dir,
                                    args.PPI_gene_feature_dir,
                                    args.PPI_gene_query_dict_dir,
                                    args.max_nodes,
                                    train=False)

    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=util.collate_fn)

    train_dataset = util.load_dataset(args.train_file, args.PPI_dir,
                                      args.PPI_gene_feature_dir,
                                      args.PPI_gene_query_dict_dir,
                                      args.max_nodes)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=util.collate_fn)
    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = 0
    while epoch != args.num_epochs:
        epoch += 1

        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for batch_a, batch_bio_a, batch_A, batch_b, batch_bio_b, batch_B, batch_y in train_loader:
                # Setup for forward
                batch_a = batch_a.to(device)
                batch_bio_a = batch_bio_a.to(device)
                batch_A = batch_A.to(device)
                batch_bio_b = batch_bio_b.to(device)
                batch_b = batch_b.to(device)
                batch_B = batch_B.to(device)
                batch_y = batch_y.to(device)
                batch_y = batch_y.long()
                batch_size = batch_bio_a.size(0)
                optimizer.zero_grad()
                # Forward
                output = model(batch_a, batch_bio_a, batch_A, batch_b,
                               batch_bio_b, batch_B)
                loss = cri(output, batch_y)
                #loss = F.nll_loss(output, batch_y)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/Loss', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results = evaluate(model, dev_loader, cri, device)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.5f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
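FocalLoss above stands in for the commented-out F.nll_loss, adding a per-class alpha weight and a focusing parameter gamma. Its definition is not part of this snippet, so the following is only a minimal sketch of the standard formulation for raw logits (Lin et al., 2017), matching the constructor call FocalLoss(alpha=..., gamma=...); the example's own implementation may differ.

import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    """Minimal focal-loss sketch for multi-class logits (an assumption; the
    example's actual FocalLoss is not shown on this page)."""

    def __init__(self, alpha=None, gamma=2.0):
        super().__init__()
        self.alpha = alpha  # optional per-class weights, shape (num_classes,)
        self.gamma = gamma

    def forward(self, logits, targets):
        # Per-example cross entropy, then down-weight easy examples by (1 - p)^gamma.
        ce = F.cross_entropy(logits, targets, reduction='none')
        p = torch.exp(-ce)  # probability assigned to the true class
        focal = (1.0 - p) ** self.gamma * ce
        if self.alpha is not None:
            focal = self.alpha.to(logits.device)[targets] * focal
        return focal.mean()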
Example no. 12
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    log.info('Loading word2Idx...')
    word2Idx = json.loads(open(args.word2idx_file).read())
    Idx2Word = {v: k for (k, v) in word2Idx.items()}

    vocab_size = len(word2Idx)
    print(f"Vocab Size is : {vocab_size}")

    def getWords(idxList):
        words = []
        for i in idxList:
            words.append(Idx2Word[i])
        return words

    def create_new_model():
        if args.model_type == "seq2seq":
            return Seq2Seq(word_vectors=word_vectors,
                           hidden_size=args.hidden_size,
                           output_size=vocab_size,
                           device=device)
        elif args.model_type == "seq2seq_attn":
            return Seq2SeqAttn(word_vectors=word_vectors,
                               hidden_size=args.hidden_size,
                               output_size=vocab_size,
                               device=device)
        elif args.model_type == "transformer":
            return TransformerModel(vocab_size,
                                    device,
                                    num_encoder_layers=2,
                                    num_decoder_layers=2,
                                    dropout=0.1)
            #return make_model(vocab_size, vocab_size, N=2, dropout=0.0)

    # Get model
    log.info('Building model...')
    model = create_new_model()
    model = nn.DataParallel(model, args.gpu_ids)

    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 args.best_model_name,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    model_save_path = os.path.join(args.save_dir, args.best_model_name)

    # Initialize optimizer and loss function
    optimizer = NoamOpt(
        model.module.src_embed[0].d_model, 1, 2000,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98),
                         eps=1e-9))

    criterion = nn.NLLLoss(ignore_index=PAD, reduction='sum')
    loss_compute = SimpleLossCompute(criterion, optimizer)

    # Default project starter code uses Adadelta, but we're going to use Adam
    # optimizer = torch.optim.Adam(model.parameters(), lr=float(args.lr))

    num_trial = 0
    train_iter = patience = total_loss = report_loss = total_words = report_words = 0
    total_examples = report_examples = epoch = valid_num = 0
    train_time = begin_time = time.time()

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:

                train_iter += 1

                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)

                batch_size = cw_idxs.size(0)

                # Setup for forward
                src_idxs = torch.cat(
                    (torch.zeros((batch_size, 1), device=device, dtype=torch.long),
                     cw_idxs,
                     torch.zeros((batch_size, 1), device=device, dtype=torch.long)),
                    dim=-1)
                src_idxs[:, 0] = SOS
                src_idxs[:, -1] = EOS

                #optimizer.zero_grad()

                tgt_idxs = qw_idxs[:, :-1]
                tgt_idxs_y = qw_idxs[:, 1:]

                #c_mask = (cw_idxs != pad).unsqueeze(-2)
                src_mask = src_idxs == PAD
                tgt_mask = tgt_idxs == PAD
                #copy_idxs_tgt_mask = make_std_mask(copy_idxs_tgt, pad)

                # Forward

                if args.model_type in ['seq2seq', 'seq2seq_attn']:
                    log_p = model(src_idxs,
                                  tgt_idxs)  #(batch_size, q_len, vocab_size)
                elif args.model_type == 'transformer':
                    log_p = model(src_idxs, tgt_idxs, src_mask,
                                  tgt_mask)  #(batch_size, q_len, vocab_size)
                '''
                print("Context:")
                print(src_idxs[0])
                print("Question:")
                print(tgt_idxs[0])
                print("Predicted:")
                print(log_p[0].argmax(-1))
                '''

                log_p = log_p.contiguous().view(-1, log_p.size(-1))

                #qw_idxs_tgt = qw_idxs[:, 1:]     # omitting leading `SOS`
                #qw_idxs_tgt = copy_idxs[:, 1:]
                #print(qw_idxs_tgt.shape)
                #qw_idxs_tgt = qw_idxs_tgt.contiguous().view(qw_idxs_tgt.size(0) * qw_idxs_tgt.size(1))
                #print(qw_idxs_tgt.shape)
                #q_tgt_mask = torch.zeros_like(qw_idxs_tgt) != qw_idxs_tgt
                #q_len = q_tgt_mask.sum(-1)

                tgt_idxs_y = tgt_idxs_y.contiguous().view(-1)
                tgt_no_pad = tgt_idxs_y != PAD
                tgt_len = tgt_no_pad.sum(-1)

                #batch_loss = F.nll_loss(log_p, qw_idxs_tgt, ignore_index=0, reduction='sum')

                #loss = batch_loss / batch_size
                #loss_val = loss.item()

                # Backward
                #loss.backward()
                #nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                #optimizer.step()

                batch_words = torch.sum(tgt_len).item()
                report_words += batch_words
                total_words += batch_words
                report_examples += batch_size
                total_examples += batch_size

                batch_loss = loss_compute(log_p, tgt_idxs_y, batch_words,
                                          model.training)
                '''
                model_opt.optimizer.zero_grad()
                loss = criterion(log_p, copy_idxs_tgt_y) / batch_words
                loss.backward()
                model_opt.step()
                '''

                #batch_loss = loss.item() * batch_words
                report_loss += batch_loss
                total_loss += batch_loss

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=batch_loss)

                if train_iter % args.log_every == 0:
                    '''
                    log.info('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         math.exp(report_loss / report_tgt_words),
                                                                                         cum_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time))
                    '''
                    '''
                    print("Context Words:")
                    print(getWords(src_idxs[0].squeeze().tolist()))

                    #util.evaluateRandomly(model, word2Idx, Idx2Word, re_cw_idxs[batch_size-1].unsqueeze(0), device)
                    
                    print("Question Words:")
                    print(getWords(tgt_idxs[0].squeeze().tolist()))

                    print("Predicted Words:")
                    model.eval()
                    predicted_words = util.greedy_decode(model, src_idxs[0].unsqueeze(0), src_mask[0].unsqueeze(0), max_len=30, start_symbol=2)
                    print(predicted_words)
                    print(getWords(predicted_words.squeeze().tolist()))
                    model.train()
                    '''



                    log.info('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_words,
                                                                                         math.exp(report_loss / report_words),
                                                                                         total_examples,
                                                                                         report_words / (time.time() - train_time),
                                                                                         time.time() - begin_time))

                    train_time = time.time()
                    report_loss = report_words = report_examples = 0.

                # perform validation
                if train_iter % args.valid_niter == 0:
                    print('begin validation ...', file=sys.stderr)
                    # compute dev metrics
                    results = evaluate(model, dev_loader, device,
                                       args.use_squad_v2)
                    log.info(
                        'validation: iter %d, dev avg. loss %.2f, dev. ppl %f'
                        % (train_iter, results['NLL'], results['PPL']))

                    if saver.is_best(results[args.metric_name]):
                        log.info('save currently the best model to [%s]' %
                                 model_save_path)
                        saver.save(step, model, results[args.metric_name],
                                   device)
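NoamOpt and SimpleLossCompute above follow the "Annotated Transformer" recipe: the wrapper rescales the Adam learning rate on every step as factor * d_model**-0.5 * min(step**-0.5, step * warmup**-1.5). Neither class is defined in this snippet, so the sketch below is only an assumption of the standard NoamOpt, included to make the schedule explicit.

class NoamOpt:
    """Sketch of the Annotated-Transformer LR wrapper assumed by the call
    NoamOpt(d_model, factor, warmup, optimizer) above; not the example's own code."""

    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self.model_size = model_size
        self.factor = factor
        self.warmup = warmup
        self._step = 0

    def rate(self, step=None):
        step = self._step if step is None else step
        return self.factor * (self.model_size ** -0.5 *
                              min(step ** -0.5, step * self.warmup ** -1.5))

    def step(self):
        # Update the learning rate of every parameter group, then step the optimizer.
        self._step += 1
        rate = self.rate()
        for group in self.optimizer.param_groups:
            group['lr'] = rate
        self.optimizer.step()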
Example no. 13
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # NEW : load the tag embeddings
    pos_vectors = util.torch_from_json(args.pos_emb_file)
    ner_vectors = util.torch_from_json(args.ner_emb_file)

    # Add loss
    if 'loss' in args.name:
        distance_criterion = DistanceFromAnswerLoss(coefficient=.5, device=device,
                                                    normalization=True, penalization_type='quadratic',
                                                    reduction='mean')

    # Choose model
    log.info('Building model {}...'.format(args.name))

    if 'baseline' in args.name:
        model = BiDAF(word_vectors=word_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob)

    elif args.name == 'BiDAF_char':
        model = BiDAF_char(word_vectors=word_vectors,
                           char_vectors=char_vectors,
                           hidden_size=args.hidden_size,
                           drop_prob=args.drop_prob)

    elif (args.name == 'BiDAF_tag') or (args.name == 'BiDAF_tag_loss'):
        model = BiDAF_tag(word_vectors=word_vectors,
                          char_vectors=char_vectors,
                          pos_vectors=pos_vectors,
                          ner_vectors=ner_vectors,
                          hidden_size=args.hidden_size,
                          drop_prob=args.drop_prob)

    elif (args.name == 'BiDAF_tag_unfrozen') or (args.name == 'BiDAF_tag_unfrozen_loss'):
        model = BiDAF_tag(word_vectors=word_vectors,
                          char_vectors=char_vectors,
                          pos_vectors=pos_vectors,
                          ner_vectors=ner_vectors,
                          hidden_size=args.hidden_size,
                          drop_prob=args.drop_prob,
                          freeze_tag=False)

    elif args.name == 'BiDAF_tag_ext':
        model = BiDAF_tag_ext(word_vectors=word_vectors,
                              char_vectors=char_vectors,
                              pos_vectors=pos_vectors,
                              ner_vectors=ner_vectors,
                              hidden_size=args.hidden_size,
                              drop_prob=args.drop_prob)

    elif args.name == 'BiDAF_tag_ext_unfrozen':
        model = BiDAF_tag_ext(word_vectors=word_vectors,
                              char_vectors=char_vectors,
                              pos_vectors=pos_vectors,
                              ner_vectors=ner_vectors,
                              hidden_size=args.hidden_size,
                              drop_prob=args.drop_prob,
                              freeze_tag=False)

    elif args.name == 'coattn':
        model = CoattentionModel(hidden_dim=args.hidden_size,
                                 embedding_matrix=word_vectors,
                                 train_word_embeddings=False,
                                 dropout=0.35,
                                 pooling_size=16,
                                 number_of_iters=4,
                                 number_of_layers=2,
                                 device=device)

    else:
        raise NameError('No model named ' + args.name)

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0

    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn,
                                   drop_last=True)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn,
                                 drop_last=True)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, cpos_idxs, cner_idxs, cw_ems, cw_tfs, qw_idxs, qc_idxs, qpos_idxs, qner_idxs, qw_ems, qw_tfs, y1, y2, ids in train_loader: # NEW
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)

                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                if 'baseline' in args.name:
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)

                elif args.name == 'BiDAF_char':
                    # Additional setup for forward
                    cc_idxs = cc_idxs.to(device)
                    qc_idxs = qc_idxs.to(device)
                    log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)

                elif (args.name == 'BiDAF_tag') or (args.name == 'BiDAF_tag_unfrozen') or (args.name == 'BiDAF_tag_loss') or (args.name == 'BiDAF_tag_unfrozen_loss'):
                    # Additional setup for forward
                    cc_idxs = cc_idxs.to(device)
                    cpos_idxs = cpos_idxs.to(device)
                    cner_idxs = cner_idxs.to(device)
                    qc_idxs = qc_idxs.to(device)
                    qpos_idxs = qpos_idxs.to(device)
                    qner_idxs = qner_idxs.to(device)

                    log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs, cpos_idxs, qpos_idxs, cner_idxs, qner_idxs)

                elif (args.name == 'BiDAF_tag_ext') or (args.name == 'BiDAF_tag_ext_unfrozen'):
                    # Additional setup for forward
                    cc_idxs = cc_idxs.to(device)
                    cpos_idxs = cpos_idxs.to(device)
                    cner_idxs = cner_idxs.to(device)
                    cw_ems = cw_ems.to(device)
                    cw_tfs = cw_tfs.to(device)
                    qc_idxs = qc_idxs.to(device)
                    qpos_idxs = qpos_idxs.to(device)
                    qner_idxs = qner_idxs.to(device)
                    qw_ems = qw_ems.to(device)
                    qw_tfs = qw_tfs.to(device)

                    log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs, cpos_idxs, qpos_idxs, cner_idxs, qner_idxs, cw_ems, qw_ems, cw_tfs, qw_tfs)

                elif args.name == 'coattn':
                    max_c_len = cw_idxs.size(1)
                    max_q_len = qw_idxs.size(1)

                    c_len = []
                    q_len = []

                    for i in range(cw_idxs.size(0)):
                        if len((cw_idxs[i] == 0).nonzero()) != 0:
                            c_len_i = (cw_idxs[i] == 0).nonzero()[0].item()
                        else:
                            c_len_i = cw_idxs.size(1)

                        if len((qw_idxs[i] == 0).nonzero()) != 0:
                            q_len_i = (qw_idxs[i] == 0).nonzero()[0].item()
                        else:
                            q_len_i = qw_idxs.size(1)

                        c_len.append(int(c_len_i))
                        q_len.append(int(q_len_i))

                    c_len = torch.Tensor(c_len).int()
                    q_len = torch.Tensor(q_len).int()

                    num_examples = int(cw_idxs.size(0) / len(args.gpu_ids))

                    log_p1, log_p2 = model(max_c_len, max_q_len, cw_idxs, qw_idxs, c_len, q_len, num_examples, True, True)

                else:
                    raise NameError('No model named ' + args.name)

                y1, y2 = y1.to(device), y2.to(device)

                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)

                # Add distance penalization
                if 'loss' in args.name:
                    loss += distance_criterion(log_p1, y1) + distance_criterion(log_p2, y2)

                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch,
                                         NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2,
                                                  args.name,
                                                  args.gpu_ids)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
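The per-example loop that computes c_len and q_len for the coattention branch can be written without Python-level iteration: assuming index 0 is used only for trailing padding (which is how these records are padded), counting the non-zero entries per row gives the same lengths. A small self-contained sketch:

import torch

# Vectorized equivalent of the c_len / q_len loop above, assuming index 0 is
# only used for trailing padding.
cw_idxs = torch.tensor([[5, 7, 9, 0, 0],
                        [3, 4, 0, 0, 0]])
c_len = (cw_idxs != 0).sum(dim=1).int()  # tensor([3, 2], dtype=torch.int32)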
Example no. 14
def main(args):
    IMAGE_TRAIN_PIXEL_FILE = "./data/train.csv"
    MODEL_SAVED_DIR = "./save"
    MODEL_SAVED_DIR = util.get_save_dir(MODEL_SAVED_DIR, args[1], True)
    LOG_DIR = "./log"
    dr = digit_recognizer()
    log = util.get_basic_logger(LOG_DIR, "train")
    #Process the data before giving input to the model
    images_data, label_data = load_image_binary_data(IMAGE_TRAIN_PIXEL_FILE)
    load_first_image = images_data.iloc[:1, :]
    load_first_image = np.array(load_first_image).reshape(28, 28)
    pdf = PdfPages(
        './data/first_image_pass_first_cnnlayer_with_relu_maxpool_outchannel_5_with_topmost_original_image.pdf'
    )
    log.info("Display original image")
    display_image(load_first_image, pdf)
    load_first_image = torch.from_numpy(load_first_image).type(
        torch.FloatTensor)
    load_first_image = load_first_image.unsqueeze(0).unsqueeze(0)
    #output = dr(load_first_image)

    # Display the first image after passing through one CNN layer with ReLU and max-pool
    #    for filter_no in range(OUT_CHANNEL):
    #        display_image(output[0][filter_no].detach().numpy(),pdf)
    #    pdf.close()

    # Divide the train data into train data and dev eval data
    # It seems adding 1 as an extra dimension does not have any impact
    images_data = np.array(images_data).reshape(-1, 1, 28, 28)
    label_data = np.array(label_data)
    x_train, x_valid, y_train, y_valid = train_test_split(images_data,
                                                          label_data,
                                                          test_size=0.2)
    log.info('The shape of training data: {}'.format(x_train.shape))
    log.info("The shape of testing  data: {}".format(x_valid.shape))
    log.info("The shape of training label data: {}".format(y_train.shape))
    log.info("The shape of testing  label data: {}".format(y_valid.shape))

    #Initialize the model
    model = digit_recognizer()

    device, gpu_ids = util.get_available_devices()

    log.info(
        "The device on which the model is running is :: {}".format(device))

    model.to(device)

    train_dataset = torch.utils.data.TensorDataset(
        torch.from_numpy(x_train).float(),
        torch.from_numpy(y_train).long())

    eval_dataset = torch.utils.data.TensorDataset(
        torch.from_numpy(x_valid).float(),
        torch.from_numpy(y_valid).long())

    #create train dataloader object
    train_loader = data.DataLoader(dataset=train_dataset,
                                   batch_size=150,
                                   shuffle=True)

    #create eval dataloader object
    eval_loader = data.DataLoader(dataset=eval_dataset,
                                  batch_size=150,
                                  shuffle=False)

    #Initialize the optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.0001)

    #initialize the cross entropy loss
    loss_fn = nn.CrossEntropyLoss()

    #Declare the checkpointsaver to save the model

    saver = util.CheckpointSaver(MODEL_SAVED_DIR,
                                 max_checkpoints=4,
                                 metric_name='Accuracy',
                                 maximize_metric=True,
                                 log=log)

    # Initialize TensorBoard with the path of the model save dir
    tbx = SummaryWriter(MODEL_SAVED_DIR)

    epoch = int(args[2])
    global_step_train = 0
    global_step_eval = 0

    for i in range(epoch):

        log.info("Starting training")
        global_step_train = train(train_loader, model, loss_fn, device,
                                  optimizer, tbx, log, global_step_train)

        log.info("Starting evaluation ")
        accuracy, global_step_eval = evaluate(eval_loader, model, loss_fn,
                                              device, tbx, log,
                                              global_step_eval)

        log.info("Completed epoch {} ".format(i))
        saver.save(i, model, accuracy, device)
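The train and evaluate helpers called in this example are not shown on this page. Purely as an illustration, here is a minimal sketch of a training pass that matches the call train(train_loader, model, loss_fn, device, optimizer, tbx, log, global_step_train) and its returned step counter; everything beyond that signature is an assumption.

def train(train_loader, model, loss_fn, device, optimizer, tbx, log, global_step):
    """Hedged sketch of a one-epoch training helper matching the call above."""
    model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(images)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        global_step += images.size(0)
        tbx.add_scalar('train/Loss', loss.item(), global_step)
    log.info('Finished training pass at step {}'.format(global_step))
    return global_step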
Example no. 15
def train(args):
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    if args.gpu_ids == 'cpu':
        device, args.gpu_ids = torch.device('cpu'), []
    else:
        device, args.gpu_ids = util.get_available_devices()
    log.info('training on device {} with gpu_id {}'.format(str(device), str(args.gpu_ids)))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    log.info('Building model...')
    if args.task == 'tag':
        model = SummarizerLinear()
#        model = SummarizerLinearAttended(128, 256)
#        model = SummarizerRNN(128, 256)
    else:
        model = SummarizerAbstractive(128, 256, device)
    if len(args.gpu_ids) > 0:
        model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()

    ## get a saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    optimizer = optim.Adam(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)

    log.info('Building dataset...')
    data_path = PROCESSED_DATA_SUPER_TINY if args.split == 'super_tiny' else PROCESSED_DATA
    with open(data_path, 'rb') as f:
        all_data = pickle.load(f)
    if 'tiny' in args.split:
        train_split = all_data['tiny']
        dev_split = all_data['tiny']
    else:
        train_split = all_data['train']
        dev_split = all_data['dev']
    train_dataset = SummarizationDataset(
            train_split['X'], train_split['y'], train_split['gold'])
    dev_dataset = SummarizationDataset(
            dev_split['X'], dev_split['y'], dev_split['gold'])
    collate_fn = tag_collate_fn if args.task == 'tag' else decode_collate_fn
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers,
                                   shuffle=True,
                                   collate_fn=collate_fn)
    dev_loader = data.DataLoader(dev_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers,
                                   shuffle=False,
                                   collate_fn=collate_fn)
    ## Train!
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        batch_num = 0
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for X, y, _ in train_loader:
                batch_size = X.size(0)
                batch_num += 1
                X = X.to(device)
                y = y.float().to(device) # (batch_size, max_len) for tag, (batch_size, 110) for decode
                optimizer.zero_grad()
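                # Two objectives: per-token tagging uses a masked BCE over real
                # (non-padded) tokens; abstractive decoding uses teacher forcing
                # with a per-example cross-entropy over the shifted gold summary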
                if args.task == 'tag':
                    logits = model(X) # (batch_size, max_len)
                    mask = (X != PAD_VALUE).float() # 1 for real data, 0 for pad, size of (batch_size, max_len)
                    loss = (F.binary_cross_entropy_with_logits(logits, y, reduction='none') * mask).mean()
                    loss_val = loss.item()
                else:
                    logits = model(X, y[:, :-1]) # (batch_size, 109, max_len)
                    # Targets must be long for cross_entropy (y was cast to float above)
                    loss = sum(F.cross_entropy(logits[i], y[i, 1:].long(), ignore_index=-1, reduction='mean')
                               for i in range(batch_size)) / batch_size
                    loss_val = loss.item()
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                # scheduler.step(step // batch_size)

                # Log info
                step += args.batch_size
                progress_bar.update(args.batch_size)
                progress_bar.set_postfix(epoch=epoch,
                                         Loss=loss_val)
                tbx.add_scalar('train/Loss', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    results, pred_dict = evaluate(args, model, dev_loader, device)
                    if results is None:
                        log.info('Model predicted no selection for any example in the batch; skipping eval')
                        continue
                    saver.save(step, model, results[args.metric_name], device)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
Esempio n. 16
0
def main(args):
    # Set up faulthandler
    faulthandler.enable()
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    model_params = {
        'word_vectors': word_vectors,
        'char_vectors': char_vectors,
        'args': args
    }
    model = get_model(args.model, model_params)
    print('Model size: {:f} MB'.format(
        sum(p.nelement() * p.element_size()
            for p in model.parameters()) / (1024 * 1024)))
    # model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                progress_bar.set_description(
                    'Batch data_loading finished'.ljust(30))
                progress_bar.refresh()

                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()
                progress_bar.set_description(
                    'Batch initialization finished'.ljust(30))
                progress_bar.refresh()

                # Forward
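                # Dump all thread stack traces if the forward pass hangs for more
                # than 3 seconds (cancelled as soon as the call returns)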
                faulthandler.dump_traceback_later(timeout=3)
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                faulthandler.cancel_dump_traceback_later()
                progress_bar.set_description(
                    'Batch forward finished'.ljust(30))
                progress_bar.refresh()
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                faulthandler.dump_traceback_later(timeout=3)
                loss.backward()
                faulthandler.cancel_dump_traceback_later()
                progress_bar.set_description(
                    'Batch backward finished'.ljust(30))
                progress_bar.refresh()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                progress_bar.set_description('Optimization finished'.ljust(30))
                progress_bar.refresh()
                scheduler.step()
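                # Update the exponential moving average of the parameters, keyed by
                # the approximate number of optimizer updates so far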
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    progress_bar.set_description(
                        'Evaluation finished'.ljust(30))
                    progress_bar.refresh()
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Esempio n. 17
0
def main(data, flags):
    # Set up logging and devices
    log_dir = data.logging_dir
    log = util.get_logger(log_dir, "toy")
    tbx = SummaryWriter(data.logging_dir)
    device, data.gpu_ids = util.get_available_devices()
    log.info('Config: {}'.format(dumps(vars(data), indent=4, sort_keys=True)))
    data.batch_size *= max(1, len(data.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(data.random_seed))
    random.seed(data.random_seed)
    np.random.seed(data.random_seed)
    torch.manual_seed(data.random_seed)
    torch.cuda.manual_seed_all(data.random_seed)

    if flags[1] == "toy":
        word_emb_file = data.toy_word_emb_file
        training_data = data.toy_record_file_exp3
        test_data = data.dev_record_file_exp3
        eval_file = data.toy_eval_exp3
    elif flags[1] == "train":
        word_emb_file = data.word_emb_file
        training_data = data.train_record_file_exp3
        test_data = data.dev_record_file_exp3
        eval_file = data.train_eval_exp3
    elif flags[1] == "dev":
        word_emb_file = data.word_emb_file
        training_data = data.dev_record_file_exp3
        test_data = data.toy_record_file_exp3
        eval_file = data.dev_eval_exp3
    else:
        raise ValueError('Unexpected flag: {}'.format(flags[1]))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=data.hidden_size,
                  drop_prob=data.drop_prob)
    model = nn.DataParallel(model, data.gpu_ids)
    if data.load_path:
        log.info('Loading checkpoint from {}...'.format(data.load_path))
        model, step = util.load_model(model, data.load_path, data.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, data.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(data.logging_dir,
                                 max_checkpoints=10,
                                 metric_name=data.metric_name,
                                 maximize_metric=data.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               data.learning_rate,
                               weight_decay=data.learning_weight_decay)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    # np.load(data.toy_record_file_exp3)
    train_dataset = SQuAD3(training_data, use_v2=True)
    train_loader = torchdata.DataLoader(train_dataset,
                                        batch_size=data.batch_size,
                                        shuffle=True,
                                        num_workers=data.num_workers,
                                        collate_fn=collate_fn)

    test_dataset = SQuAD3(test_data, use_v2=True)
    test_loader = torchdata.DataLoader(test_dataset,
                                       batch_size=data.batch_size,
                                       shuffle=False,
                                       num_workers=data.num_workers,
                                       collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = data.eval_steps
    epoch = step // len(train_dataset)
    while epoch != data.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
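                # Note: sys.getsizeof below reports only the Python wrapper object,
                # not the size of the underlying tensor storage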
                log.info("cw_idxs length: {}".format(str(len(cw_idxs))))
                log.info("qw_idxs length: {}".format(str(len(qw_idxs))))
                log.info("cw_idxs size: {}".format(str(
                    sys.getsizeof(cw_idxs))))
                log.info("qw_idxs size: {}".format(str(
                    sys.getsizeof(qw_idxs))))
                log.info("cw_idxs shape: {}".format(str(cw_idxs.shape)))
                log.info("qw_idxs shape: {}".format(str(qw_idxs.shape)))

                log_p1, log_p2 = model(cw_idxs, qw_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         data.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('toy/NLL', loss_val, step)
                tbx.add_scalar('toy/LR', optimizer.param_groups[0]['lr'], step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = data.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model,
                                                  test_loader,
                                                  device,
                                                  eval_path=eval_file,
                                                  max_len=sys.maxsize,
                                                  use_squad_v2=True)
                    saver.save(step, model, results[data.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=data.num_visuals)
Esempio n. 18
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    # Make data generation reproducible across experiments
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)
    print(word_vectors.size())
    print(char_vectors.size())

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  char_vectors=char_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    # Each example in the data loader is a tuple of:
    #   context_idxs (context_len,): word indices of the context
    #   context_char_idxs (context_len, max_word_len): character indices of the context
    #   question_idxs (question_len,): word indices of the question
    #   question_char_idxs (question_len, max_word_len): character indices of the question
    #   y1: answer start index (-1 if no answer)
    #   y2: answer end index (-1 if no answer)
    #   id: ID of the example
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                # cw_idxs: Indices of the words in the context
                # cc_idxs: Indices of the characters in the context
                # qw_idxs: Indices of the words in the query
                # qc_idxs: Indices of the characters in the query
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                # cw_idxs has shape (batch_size, context_len)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                # L(theta) = - 1/N * sum(log(P1_yi_1) + log(P2_yi_2))
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Esempio n. 19
0
    def __init__(self,
                 args,
                 gpu_num: int,
                 train_loader,
                 val_loader,
                 crop_sz=125,
                 output_sz=121,
                 lambda0=1e-4,
                 padding=2.0,
                 output_sigma_factor=0.1):
        self.crop_sz = crop_sz
        self.output_sz = output_sz
        self.lambda0 = lambda0
        self.padding = padding
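        # Bandwidth of the Gaussian target response, relative to the un-padded target size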
        output_sigma = crop_sz / (1 + padding) * output_sigma_factor

        self.args = args
        self.gpu_num = gpu_num
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.batch_size = args.batch_size * gpu_num

        self.best_loss = 1e6

        # shape: 121, 121
        self.y = torch.tensor(
            util.gaussian_shaped_labels(
                output_sigma,
                [self.output_sz, self.output_sz]).astype(np.float32)).cuda()
        # shape: 1, 1, 121, 61 (complex)
        self.yf = fft.rfftn(self.y.view(1, 1, self.output_sz, self.output_sz),
                            dim=[-2, -1])
        # Shape: 121, 121
        self.initial_y = self.y.clone()
        # Shape: batch, 1, 121, 61
        self.label = self.yf.repeat(self.batch_size, 1, 1, 1)

        self.model = DCFNet(lambda0=self.lambda0).cuda()

        print('GPU NUM: {:2d}'.format(gpu_num))
        if gpu_num > 1:
            self.model = torch.nn.DataParallel(self.model,
                                               list(range(gpu_num))).cuda()

        self.criterion = nn.MSELoss(reduction='sum').cuda()
        self.optimizer = torch.optim.SGD(self.model.parameters(),
                                         lr=args.lr,
                                         momentum=args.momentum,
                                         weight_decay=args.weight_decay)

        self.lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer,
            gamma=util.compute_lr_gamma(args.lr, 1e-5, args.epochs))

        # Fast-forward the LR scheduler to the starting epoch (e.g. when resuming)
        for epoch in range(args.start_epoch):
            self.lr_scheduler.step()

        # for training
        self.target = self.y.unsqueeze(0).unsqueeze(0).repeat(
            args.batch_size * gpu_num, 1, 1, 1)

        # optionally resume from a checkpoint
        if args.resume:
            if isfile(args.resume):
                print(f"=> loading checkpoint '{args.resume}'")
                checkpoint = torch.load(args.resume)
                self.args.start_epoch = checkpoint['epoch']
                self.best_loss = checkpoint['best_loss']
                self.model.load_state_dict(checkpoint['state_dict'])
                self.optimizer.load_state_dict(checkpoint['optimizer'])
                print(
                    f"=> loaded checkpoint '{args.resume}' (epoch {checkpoint['epoch']})"
                )
            else:
                print(f"=> no checkpoint found at '{args.resume}'")
        cudnn.benchmark = True

        checkpoint_path = args.save if args.save else config.checkpoint_root
        self.checkpoint_saver = util.CheckpointSaver(save_path=os.path.join(
            checkpoint_path, f'crop_{args.input_sz:d}_{args.padding:1.1f}'),
                                                     verbose=True)
Esempio n. 20
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))
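    # Maximum answer span length considered by the length-conditioned loss below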
    max_len = 10

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    ch_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  ch_vectors=ch_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
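                # Length-conditioned span loss: slice i of log_p scores spans of
                # length i, so only examples whose gold span has length i keep their
                # true start index as the target (all others are masked to index 0)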
                #print("ckpt 1")
                ans_lens = y2 - y1
                loss = 0
                for i in range(max_len):
                    mask = ((torch.ones_like(y1) * i) == ans_lens).type(
                        torch.cuda.LongTensor)
                    y = y1 * mask
                    loss += F.nll_loss(log_p[:, :, i], y)
                #print("ckpt 2")
                loss_val = loss.item()
                #print("ckpt 3")
                # Backward
                loss.backward()
                #print("ckpt 4")
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                #print("ckpt 5")
                optimizer.step()
                #print("ckpt 6")
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)
                #print("ckpt 7")

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                if step % (50 * batch_size) == 0:
                    print(loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Esempio n. 21
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    # Comment out to only use 1 GPU on nv12
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings

    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = None
    max_context_len, max_question_len = args.para_limit, args.ques_limit
    if (args.model_type == "bidaf" or args.model_type == "bert-bidaf"):
        model = BiDAF(word_vectors=word_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob)
    elif (args.model_type == "dcn" or args.model_type == "bert-dcn"):
        model = DCN(word_vectors=word_vectors,
                    hidden_size=args.hidden_size,
                    max_context_len=max_context_len,
                    max_question_len=max_question_len,
                    drop_prob=args.drop_prob)
    elif (args.model_type == "bert-basic"):
        model = BERT(word_vectors=word_vectors,
                     hidden_size=args.hidden_size,
                     drop_prob=args.drop_prob)

    if model is None:
        raise ValueError('Model is unassigned. Please ensure --model_type '
                         'chooses between {bidaf, bert-bidaf, dcn, bert-dcn, bert-basic}')

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    count_skip = 0
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                batch_size = cw_idxs.size(0)
                count_skip += 1
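                # With --skip_examples, only every fifth batch is trained on; the
                # other four still advance the step and progress counters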
                if args.skip_examples == True and count_skip % 5 != 0:
                    step += batch_size
                    progress_bar.update(batch_size)
                    steps_till_eval -= batch_size
                    continue
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                ## Additions for BERT ##
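                # For the BERT variants, look up contextual embeddings for this
                # batch of example ids; otherwise no extra embeddings are passed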
                max_context_len, max_question_len = args.para_limit, args.ques_limit

                if "bert" in args.model_type:
                    bert_train_embeddings = get_embeddings(
                        "train", ids, args.para_limit, args.ques_limit)
                else:
                    bert_train_embeddings = None

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs, bert_train_embeddings, \
                max_context_len, max_question_len, device)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2, args)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Esempio n. 22
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vec = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    if args.name == 'baseline':
        model = BiDAF(word_vectors=word_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob)
    elif args.name == 'charembeddings':
        model = BiDAFChar(word_vectors=word_vectors,
                          char_vec=char_vec,
                          word_len=16,
                          hidden_size=args.hidden_size,
                          drop_prob=args.drop_prob)
    elif args.name == 'charembeddings2':
        model = BiDAFChar2(word_vectors=word_vectors,
                           char_vec=char_vec,
                           word_len=16,
                           hidden_size=args.hidden_size,
                           drop_prob=args.drop_prob)
    elif args.name == 'qanet':
        model = QANet(word_vectors=word_vectors,
                      char_vec=char_vec,
                      word_len=16,
                      emb_size=args.hidden_size,
                      drop_prob=args.drop_prob,
                      enc_size=args.enc_size,
                      n_head=args.n_head,
                      LN_train=args.ln_train,
                      DP_residual=args.dp_res,
                      mask_pos=args.mask_pos,
                      two_pos=args.two_pos,
                      total_prob=args.total_drop,
                      final_prob=args.final_prob)
    elif args.name == 'qanet2':
        model = QANet2(word_vectors=word_vectors,
                       char_vec=char_vec,
                       word_len=16,
                       emb_size=args.hidden_size,
                       drop_prob=args.drop_prob,
                       enc_size=args.enc_size,
                       n_head=args.n_head,
                       LN_train=args.ln_train,
                       DP_residual=args.dp_res,
                       mask_pos=args.mask_pos,
                       two_pos=args.two_pos,
                       rel=args.rel_att,
                       total_prob=args.total_drop,
                       final_prob=args.final_prob,
                       freeze=args.freeze_emb)
    elif args.name == 'qanet3':
        model = QANet3(word_vectors=word_vectors,
                       char_vec=char_vec,
                       word_len=16,
                       emb_size=args.hidden_size,
                       drop_prob=args.drop_prob,
                       enc_size=args.enc_size,
                       n_head=args.n_head,
                       LN_train=args.ln_train,
                       DP_residual=args.dp_res,
                       mask_pos=args.mask_pos,
                       two_pos=args.two_pos,
                       rel=args.rel_att,
                       total_prob=args.total_drop,
                       final_prob=args.final_prob,
                       freeze=args.freeze_emb)
    elif args.name == 'qanet4':
        model = QANet4(word_vectors=word_vectors,
                       char_vec=char_vec,
                       word_len=16,
                       emb_size=args.hidden_size,
                       drop_prob=args.drop_prob,
                       enc_size=args.enc_size,
                       n_head=args.n_head,
                       LN_train=args.ln_train,
                       DP_residual=args.dp_res,
                       mask_pos=args.mask_pos,
                       two_pos=args.two_pos,
                       rel=args.rel_att,
                       total_prob=args.total_drop,
                       final_prob=args.final_prob,
                       freeze=args.freeze_emb)
    else:
        raise ValueError('Wrong model name')

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
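    # QANet always uses Adam with a warmup schedule; other models choose the
    # optimizer via --opt (adam / adadelta / sgd)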

    if args.name == 'qanet':
        optimizer = optim.Adam(model.parameters(),
                               args.lr,
                               betas=(0.8, 0.999),
                               weight_decay=3 * 1e-7,
                               eps=1e-7)
        scheduler = warmup(optimizer, 1, 2000)
    elif args.opt == 'adam':
        if args.grad_cent:
            optimizer = AdamWGC(model.parameters(),
                                args.lr,
                                betas=(0.9, 0.999),
                                weight_decay=3 * 1e-7,
                                eps=1e-7,
                                use_gc=True)
        else:
            optimizer = AdamW(model.parameters(),
                              args.lr,
                              betas=(0.8, 0.999),
                              weight_decay=3 * 1e-7,
                              eps=1e-7)
        scheduler = warmup(optimizer, 1, 2000)
    elif args.opt == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(),
                                   args.lr,
                                   weight_decay=3 * 1e-7)
        scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR
    elif args.opt == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              args.lr,
                              weight_decay=3 * 1e-7)
        scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    i = 0
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()
                i += 1
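                # Gradient accumulation: scale the loss so that acc_step successive
                # micro-batches add up to one effective optimizer update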
                loss /= args.acc_step

                # Backward
                loss.backward()
                if i % args.acc_step == 0:
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)
                    optimizer.step()
                    scheduler.step(i // (args.acc_step))
                    ema(model, i // (args.acc_step))
                    optimizer.zero_grad()

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0 and i % args.acc_step == 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Esempio n. 23
0
def main():
    torch.backends.cudnn.enabled = False

    # Make save dir for logfiles and state_dicts
    run_name = "optflow_nvidia"
    save_path = osp.join("save", run_name)
    save_dir = util.get_save_dir(save_path, run_name,
                                 training=True)  # unique save dir
    log = util.get_logger(save_dir, run_name)  # logger
    saver = util.CheckpointSaver(
        save_dir,  # save model
        max_checkpoints=10,
        maximize_metric=False,
        metric_name="MSE",
        log=log)
    # Data, batches & epochs
    max_epoch = 25
    batch_size = 32
    window = 7
    train_loader, val_loader = create_datasplit(batch_size=batch_size,
                                                window=window)

    # Model Creation
    log.info("Building model")
    # model = resnet18(sample_size=(240, 640), sample_duration=2*window+1, shortcut_type="A", num_classes=1)
    # model = C3D()
    model = nvidia()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Wrap in DataParallel so checkpoints saved with "module."-prefixed keys
    # (e.g. unexpected keys "module.convX.X") load without a KeyError
    model = nn.DataParallel(model)
    model.to(device)
    model.train()

    # Possibly load model state_dict here
    step = 0
    # load_path = 'c3d.pickle'
    # log.info('Loading checkpoint from {}...'.format(load_path))
    # model.load_state_dict(torch.load(load_path))
    # model = util.load_model(model, load_path, 0)  # uses the saved step num

    # Loss & Optimizer
    criterion = nn.MSELoss()
    lr = 1e-4
    weight_decay = 1e-5
    # optimizer = optim.SGD(model.parameters(), lr=lr, momentum=.9, weight_decay=weight_decay)
    optimizer = optim.Adam(model.parameters(),
                           lr=lr,
                           weight_decay=weight_decay)

    # Log args
    # log.info('Args: {}'.format(dumps(vars({"lr": lr, "max_epoch": max_epoch, "batch_size": batch_size,
    #                                        "window_size": window}), indent=4, sort_keys=True)))

    # Initialize epoch and step_to_eval counters
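    # Evaluate on the validation set roughly every 40% of the training set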
    steps_till_eval = int(.4 * len(train_loader.dataset))
    epoch = step // len(train_loader.dataset)

    while epoch != max_epoch:
        epoch += 1
        log.info("=============Epoch %i=============" % epoch)
        with torch.enable_grad(), tqdm(
                total=len(train_loader.dataset)) as progress_bar:

            for sample_batch in train_loader:
                model.zero_grad()  # Zero gradients at the start of each batch

                x, y = sample_batch

                x = x.to(device)
                y = y.to(device)

                f = model(x)

                loss = criterion(f, y)
                loss.backward()
                optimizer.step()

                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, MSE=loss.item())

                steps_till_eval -= batch_size

                if steps_till_eval <= 0:
                    steps_till_eval = int(.4 * len(train_loader.dataset))

                    # Eval on validation set
                    val_mse = evalu(model, val_loader, device)

                    # Save checkpoint
                    saver.save(step, model, val_mse, device)

                    # Print to console
                    results_str = "MSE: " + str(val_mse)
                    log.info('Dev {}'.format(results_str))
Esempio n. 24
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    if args.model_name == 'sketchy':
        model = SketchyReader(word_vectors=word_vectors,
                              char_vectors=char_vectors,
                              hidden_size=args.hidden_size,
                              char_embed_drop_prob=args.char_embed_drop_prob,
                              num_heads=args.num_heads,
                              drop_prob=args.drop_prob)  # SKETCHY
    elif args.model_name == 'intensive':

        model = IntensiveReader(word_vectors=word_vectors,
                                char_vectors=char_vectors,
                                num_heads=args.num_heads,
                                char_embed_drop_prob=args.char_embed_drop_prob,
                                hidden_size=args.hidden_size,
                                drop_prob=args.drop_prob)  # INTENSIVE
    elif args.model_name == 'retro':

        model = RetroQANet(word_vectors=word_vectors,
                           char_vectors=char_vectors,
                           hidden_size=args.hidden_size,
                           num_heads=args.num_heads,
                           char_embed_drop_prob=args.char_embed_drop_prob,
                           intensive_path=args.load_path_i,
                           sketchy_path=args.load_path_s,
                           gpu_ids=args.gpu_ids,
                           drop_prob=args.drop_prob)  # Outer

    model = nn.DataParallel(model, args.gpu_ids)

    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
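    # Keep an exponential moving average of the weights for evaluation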
    ema = util.EMA(model, args.ema_decay)

    # setup losses
    bceLoss = nn.BCELoss()

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    if args.optim == "adam":
        optimizer = optim.Adam(
            model.parameters(), 0.001, betas=(0.8, 0.999), eps=1e-7, weight_decay=3e-7)

    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)

    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
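    # step counts examples seen, so the epoch can be recovered when resuming from a checkpoint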
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        counter = 0
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                counter += 1
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                y1, y2 = y1.to(device), y2.to(device)
                if args.model_name == 'sketchy':
                    yi = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
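                    # Sketchy reader: binary answerability target (1 iff a gold start index exists)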
                    loss = bceLoss(yi, (y1 != 0).float())
                elif args.model_name == 'intensive':
                    yi, log_p1, log_p2 = model(
                        cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                    # Weighted sum of the answerability (BCE) and span-prediction (NLL) losses
                    loss = (args.alpha_1 * bceLoss(yi, (y1 != 0).float())
                            + args.alpha_2 * (F.nll_loss(log_p1, y1)
                                              + F.nll_loss(log_p2, y2)))
                elif args.model_name == 'retro':
                    log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                else:
                    raise ValueError(
                        'Invalid --model_name: expected sketchy, intensive, or retro')

                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(
                    model.parameters(), args.max_grad_norm)
                optimizer.step()
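                # Scheduler and EMA are stepped by the number of batches processed so far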
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch,
                                         NLL=loss_val)
                tbx.add_scalar('train/' + args.model_name, loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2,
                                                  model_name=args.model_name,
                                                  a1=args.alpha_1,
                                                  a2=args.alpha_2)
                    saver.save(
                        step, model, results[args.metric_name], device, model_name=args.model_name)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(
                        f'{k}: {v:05.2f}' for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
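
Note: the examples above call util.EMA but never show it. The following is a minimal sketch of an EMA helper compatible with the call pattern used in these snippets (ema(model, num_updates), ema.assign(model), ema.resume(model)); it is an assumption about the helper's internals, not the project's actual util module.

class EMA:
    """Exponential moving average of model parameters (minimal sketch)."""

    def __init__(self, model, decay):
        self.decay = decay
        self.shadow = {}
        self.original = {}
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def __call__(self, model, num_updates):
        # Use a smaller effective decay early in training (warm-up / bias correction).
        decay = min(self.decay, (1.0 + num_updates) / (10.0 + num_updates))
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = (1.0 - decay) * param.data + decay * self.shadow[name]

    def assign(self, model):
        # Swap the averaged weights in before evaluation.
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.original[name] = param.data.clone()
                param.data.copy_(self.shadow[name])

    def resume(self, model):
        # Restore the training weights after evaluation.
        for name, param in model.named_parameters():
            if param.requires_grad:
                param.data.copy_(self.original[name])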
Example no. 25
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Load embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Build QA model
    log.info('Building model...')
    model = QA_Model(word_vectors=word_vectors,
                     hidden_size=args.hidden_size,
                     drop_prob=args.drop_prob,
                     attention_type=args.attention_type,
                     train_embeddings=args.train_embeddings)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        # Load QA model from file
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    #optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')

                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')
Example no. 26
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    # Args:  word_vectors: word vector tensor of dimension [vocab_size * wemb_dim]
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get Model
    log.info('Building Model...')
    model = QANet(word_vectors,
                  char_vectors,
                  args.para_limit,
                  args.ques_limit,
                  args.f_model,
                  num_head=args.num_head,
                  train_cemb=(not args.pretrained_char))
    model = nn.DataParallel(model, args.gpu_ids)
    
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adam(
        params=parameters,
        lr=args.lr,
        betas=(args.beta1, args.beta2),
        eps=1e-8,
        weight_decay=3e-7)
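    # Logarithmic warm-up: scale the base LR by cr * log(step + 1) for the
    # first lr_warm_up_num updates, then hold it at the base LR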
    cr = 1.0 / math.log(args.lr_warm_up_num)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log(ee + 1)
        if ee < args.lr_warm_up_num else 1)
    loss_f = torch.nn.CrossEntropyLoss()

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)

                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
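                # CrossEntropyLoss already averages over the batch, so torch.mean is a no-op here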
                loss = torch.mean(loss_f(log_p1, y1) + loss_f(log_p2, y2))
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch,
                                         NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Example no. 27
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:

            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)
            optimizer.zero_grad()

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            loss_val = loss.item()

            # Backward
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step(step // batch_size)
            ema(model, step // batch_size)

            # Log info
            step += batch_size
            # Log roughly every 1000 examples (step advances by batch_size)
            if step % 1000 < batch_size and step > 0:
                log.info(f'Step {step}: training loss {loss_val}...')

            steps_till_eval -= batch_size
            if steps_till_eval <= 0:
                steps_till_eval = args.eval_steps

                # Evaluate and save checkpoint
                log.info(f'Evaluating at step {step}...')
                ema.assign(model)
                results, pred_dict = evaluate(model, dev_loader, device,
                                              args.dev_eval_file,
                                              args.max_ans_len,
                                              args.use_squad_v2)
                saver.save(step, model, results[args.metric_name], device)
                ema.resume(model)

                # Log to console
                results_str = ', '.join(f'{k}: {v:05.2f}'
                                        for k, v in results.items())
                log.info(f'Dev {results_str}')

                # Log to TensorBoard
                log.info('Visualizing in TensorBoard...')
                for k, v in results.items():
                    tbx.add_scalar(f'dev/{k}', v, step)
                util.visualize(tbx,
                               pred_dict=pred_dict,
                               eval_path=args.dev_eval_file,
                               step=step,
                               split='dev',
                               num_visuals=args.num_visuals)
Example no. 28
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')

    model = BiDAF(vectors=(word_vectors, char_vectors),
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob,
                  p_sdd=args.p_sdd,
                  char_limit=args.char_limit,
                  use_transformer=args.use_transformer,
                  inter_size=args.inter_size,
                  heads=args.heads,
                  c2w_size=args.c2w_size,
                  enc_blocks=args.enc_blocks,
                  enc_convs=args.enc_convs,
                  mod_blocks=args.mod_blocks,
                  mod_convs=args.mod_convs,
                  use_GRU=args.use_GRU)

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path,
                                      args.gpu_ids)  # uses the saved step num
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    # optimizer = optim.Adadelta(model.parameters(), args.lr,
    #                            weight_decay=args.l2_wd)

    # The scheduler MULTIPLIES the base LR, NOT replaces
    optimizer = optim.Adam(model.parameters(),
                           1.,
                           betas=(.9, .98),
                           eps=1e-9,
                           weight_decay=args.l2_wd)

    scheduler = sched.LambdaLR(
        optimizer, lambda s: 0.001 * math.log(s + 1) / math.log(1000 - 1)
        if s < 1000 else 0.001)  # Chute (must use math.log, else TypeError)
    # scheduler = sched.LambdaLR(optimizer, lambda s: (args.hidden_size**(-.5)) *
    #                            min((s+1e-9)**(-.5), s*(4000**(-1.5)))
    #                            )  # From Vaswani et. al 2017

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:

                # Setup for forward
                optimizer.zero_grad()
                batch_size = cw_idxs.size(0)

                cc_idxs = cc_idxs.to(device)  # (batch, c_limit, char_limit)
                qc_idxs = qc_idxs.to(device)
                cw_idxs = cw_idxs.to(device)  # (batch, c_limit)
                qw_idxs = qw_idxs.to(device)

                c_idxs, q_idxs = (cw_idxs, cc_idxs), (qw_idxs, qc_idxs)
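                # Each side is passed as a (word indices, char indices) pair and unpacked inside the model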

                # Forward
                log_p1, log_p2 = model(c_idxs, q_idxs)

                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(
                    step // batch_size
                )  # By default, schedules per epoch; pass in step # as "epoch"
                ema(model, step // batch_size)

                # Log info
                step += batch_size  # step counts examples here (elsewhere it often counts mini-batches)
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Example no. 29
def main(args):
  print("in main")
  print("args: ", args)

  if True: 
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')

    # Only word-level embeddings are used by this BiDAF variant
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR


    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    print("train dataset!: ", train_dataset)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)


    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch,
                                         NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
def main(args):
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    device, args.gpu_ids = util.get_available_devices()
    print(args.gpu_ids)
    tbx = SummaryWriter(args.save_dir)

    # This lets us save model checkpoints
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=15,
                                 metric_name='accuracy',
                                 maximize_metric=True,
                                 log=log)

    #build model here
    log.info("Building model")
    #model = VGGLinear()
    #model = Baseline(8 * 96 * 64)
    #model = VGGLSTM()
    #model = Resnet()
    model = TimeCNN()
    model = model.double()
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info("Loading checkpoints")
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    #optimizer = optim.Adam(model.parameters(), lr = 0.001, betas=(.9,.999), eps=1e-08)

    # These are the parameters with the VGGLinear
    #optimizer = optim.Adam(model.parameters(), lr = 0.001, betas=(.9,.999), eps=1e-08, weight_decay=.001)
    log.info("Building Dataset")

    # These are the parameters that worked with the best TimeCNN model
    optimizer = optim.Adam(model.parameters(),
                           lr=1e-3,
                           betas=(.9, .999),
                           eps=1e-08,
                           weight_decay=.005)
    train_dataset = Shots("videos/train.h5py", "labels/train.npy")
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=BATCH_SIZE,
                                   shuffle=True,
                                   num_workers=4,
                                   collate_fn=collate_fn)

    dev_dataset = Shots("videos/dev.h5py", "labels/dev.npy")
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=BATCH_SIZE,
                                 shuffle=False,
                                 num_workers=4,
                                 collate_fn=collate_fn)

    #print(len(train_loader.dataset))
    log.info("Training")
    steps_til_eval = 2000

    for epoch in range(30):
        with torch.enable_grad(), tqdm(
                total=len(train_loader.dataset)) as progress_bar:
            for frames, ys in train_loader:
                batch_size = frames.shape[0]
                step += batch_size
                frames = frames.to(device)
                ys = ys.to(device)
                optimizer.zero_grad()

                # Forward pass
                scores = model(frames)
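                # Multi-class cross-entropy over the shot labels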
                loss = F.cross_entropy(scores, ys)
                loss_val = loss.item()

                # Backward pass
                loss.backward()
                optimizer.step()

                # Logging
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                steps_til_eval -= batch_size
                if steps_til_eval <= 0:
                    steps_til_eval = 2000
                    results, loss = evaluate(model, dev_loader, device)
                    # save checkpoint
                    saver.save(step, model, results, device)
                    log.info("Dev Accuracy " + str(results))
                    log.info("Dev loss " + str(loss))
                    #logging to tensorboard
                    tbx.add_scalar('dev_accuracy', results, step)
                    tbx.add_scalar("dev_loss", loss, step)

    tbx.close()