def train_worker(dataset, device, rank=0, world_size=None):
    torch.cuda.set_device(device)
    criterion = TripletMarginRankingLoss(args.loss_margin)
    model = TransformerPool(args.vocab_size, args.embedding_dim,
                            args.hidden_dim, pre_trained=GLOVE)
    if args.re_train:
        model.load_state_dict(torch.load(
            args.train_model, map_location='cuda:{}'.format(device)))
    else:
        model.apply(init_weights)
    model, criterion = model.to(device), criterion.to(device)
    triplet_dataset = TripletDataset(dataset)

    in_distributed_mode = bool(world_size)
    if in_distributed_mode:
        rank, device = torch.distributed.get_rank(), torch.cuda.current_device()
        print("rank:{}, device:{}".format(rank, device))

    if in_distributed_mode:
        model = DistributedDataParallel(
            model, device_ids=[device])
        datasampler = DistributedSampler(triplet_dataset)
        dataloader = DataLoader(triplet_dataset, shuffle=False,
                                pin_memory=True, num_workers=0,
                                batch_size=args.batch_size, sampler=datasampler)
    else:
        dataloader = DataLoader(triplet_dataset, shuffle=True,
                                pin_memory=True, num_workers=4,
                                batch_size=args.batch_size)

    optimizer = RAdam(
        model.parameters(), lr=args.learning_rate)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=args.t_max, eta_min=args.eta_min)

    model.train()
    best_avg_loss = None
    t1 = time.time()
    for epoch in range(args.epoch):
        if in_distributed_mode:
            datasampler.set_epoch(epoch)
        total_loss = []
        bar = tqdm(desc='EPOCH {:02d}'.format(epoch), total=len(
            dataloader), leave=False) if rank == 0 else None

        for triplet in dataloader:
            optimizer.zero_grad()
            anchor, positive, negative = model(triplet)
            loss = criterion(anchor, positive, negative)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
            total_loss.append(loss.item())
            if rank == 0:
                bar.update()

        if rank == 0:
            bar.close()
            epoch_avg_loss = np.mean(total_loss)
            print("Epoch {:02d}, Time {:.02f}s, AvgLoss {:.08f}, lr {:.8f}".format(
                epoch, time.time()-t1, epoch_avg_loss, optimizer.param_groups[0]['lr']))
            if best_avg_loss is None or epoch_avg_loss < best_avg_loss:
                best_avg_loss = epoch_avg_loss
                state_dict = model.module.state_dict() if in_distributed_mode else model.state_dict()
                torch.save(state_dict, args.model_path)
            t1 = time.time()
        scheduler.step()
        torch.cuda.empty_cache()
    return
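train_worker expects the process group to exist before it queries its rank. A minimal launch sketch, assuming one process per GPU, a reachable TCP rendezvous address, and the same module-level args/GLOVE globals used above (all assumptions, not part of the original):

import torch
import torch.multiprocessing as mp


def _launch_worker(rank, world_size, dataset):
    # hypothetical entry point: initialise NCCL, then hand off to train_worker
    torch.distributed.init_process_group(
        backend='nccl', init_method='tcp://127.0.0.1:23456',
        world_size=world_size, rank=rank)
    train_worker(dataset, device=rank, rank=rank, world_size=world_size)


# n_gpus = torch.cuda.device_count()
# mp.spawn(_launch_worker, args=(n_gpus, dataset), nprocs=n_gpus)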
def run(proc_id, n_gpus, args, devices, dataset):
    dev_id = devices[proc_id]
    train_labels = dataset.train_labels
    train_truths = dataset.train_truths
    num_edges = train_truths.shape[0]

    reverse_types = {
        to_etype_name(k): 'rev-' + to_etype_name(k)
        for k in dataset.possible_rating_values
    }
    reverse_types.update({v: k for k, v in reverse_types.items()})
    sampler = dgl.dataloading.MultiLayerNeighborSampler([None],
                                                        return_eids=True)
    dataloader = dgl.dataloading.EdgeDataLoader(dataset.train_enc_graph, {
        to_etype_name(k): th.arange(
            dataset.train_enc_graph.number_of_edges(etype=to_etype_name(k)))
        for k in dataset.possible_rating_values
    },
                                                sampler,
                                                batch_size=args.minibatch_size,
                                                shuffle=True,
                                                drop_last=False)

    if proc_id == 0:
        valid_dataloader = dgl.dataloading.EdgeDataLoader(
            dataset.valid_dec_graph,
            th.arange(dataset.valid_dec_graph.number_of_edges()),
            sampler,
            g_sampling=dataset.valid_enc_graph,
            batch_size=args.minibatch_size,
            shuffle=False,
            drop_last=False)
        test_dataloader = dgl.dataloading.EdgeDataLoader(
            dataset.test_dec_graph,
            th.arange(dataset.test_dec_graph.number_of_edges()),
            sampler,
            g_sampling=dataset.test_enc_graph,
            batch_size=args.minibatch_size,
            shuffle=False,
            drop_last=False)

    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        world_size = n_gpus
        th.distributed.init_process_group(backend="nccl",
                                          init_method=dist_init_method,
                                          world_size=world_size,
                                          rank=dev_id)
    if n_gpus > 0:
        th.cuda.set_device(dev_id)

    nd_possible_rating_values = \
        th.FloatTensor(dataset.possible_rating_values)
    nd_possible_rating_values = nd_possible_rating_values.to(dev_id)

    start = time.time()
    net = Net(args=args, dev_id=dev_id)
    net = net.to(dev_id)
    if n_gpus > 1:
        net = DistributedDataParallel(net,
                                      device_ids=[dev_id],
                                      output_device=dev_id)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(),
                                                    lr=learning_rate)
    print("Loading network finished ...\n")

    ### declare the loss information
    best_valid_rmse = np.inf
    no_better_valid = 0
    best_epoch = -1
    count_rmse = 0
    count_num = 0
    count_loss = 0
    print("Start training ...")
    dur = []
    iter_idx = 1
    logging_str = None

    ### prepare the logger
    train_loss_logger = MetricLogger(
        ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(
                                         args.save_dir,
                                         'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(
                                        args.save_dir,
                                        'test_loss%d.csv' % args.save_id))

    for epoch in range(1, args.train_max_epoch):
        if epoch == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(
                torch_net_info(net,
                               save_path=os.path.join(
                                   args.save_dir, 'net%d.txt' % args.save_id)))
        if epoch > 1:
            t0 = time.time()
        net.train()
        with tqdm.tqdm(dataloader) as tq:
            for step, (input_nodes, pair_graph, blocks) in enumerate(tq):
                head_feat, tail_feat, blocks = load_subtensor(
                    input_nodes, pair_graph, blocks, dataset,
                    dataset.train_enc_graph)
                frontier = blocks[0]
                compact_g = flatten_etypes(pair_graph, dataset,
                                           'train').to(dev_id)
                true_relation_labels = compact_g.edata['label']
                true_relation_ratings = compact_g.edata['rating']

                head_feat = head_feat.to(dev_id)
                tail_feat = tail_feat.to(dev_id)
                frontier = frontier.to(dev_id)

                pred_ratings = net(compact_g, frontier, head_feat, tail_feat,
                                   dataset.possible_rating_values)
                loss = rating_loss_net(pred_ratings,
                                       true_relation_labels.to(dev_id)).mean()
                count_loss += loss.item()
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(net.parameters(),
                                         args.train_grad_clip)
                optimizer.step()

                if proc_id == 0 and iter_idx == 1:
                    print("Total #Param of net: %d" %
                          (torch_total_param_num(net)))

                real_pred_ratings = (
                    th.softmax(pred_ratings, dim=1) *
                    nd_possible_rating_values.view(1, -1)).sum(dim=1)
                rmse = ((real_pred_ratings -
                         true_relation_ratings.to(dev_id))**2).sum()
                count_rmse += rmse.item()
                count_num += pred_ratings.shape[0]

                if iter_idx % args.train_log_interval == 0:
                    train_loss_logger.log(iter=iter_idx,
                                          loss=count_loss / (iter_idx + 1),
                                          rmse=count_rmse / count_num)

                tq.set_postfix(
                    {
                        'loss': '{:.4f}'.format(count_loss / iter_idx),
                        'rmse': '{:.4f}'.format(count_rmse / count_num)
                    },
                    refresh=False)

                iter_idx += 1

        if epoch > 1:
            epoch_time = time.time() - t0
            print("Epoch {} time {}".format(epoch, epoch_time))

        if epoch % args.train_valid_interval == 0:
            if n_gpus > 1:
                th.distributed.barrier()
            if proc_id == 0:
                valid_rmse = evaluate(args=args,
                                      dev_id=dev_id,
                                      net=net,
                                      dataset=dataset,
                                      dataloader=valid_dataloader,
                                      segment='valid')
                valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
                logging_str = 'Val RMSE={:.4f}'.format(valid_rmse)

                if valid_rmse < best_valid_rmse:
                    best_valid_rmse = valid_rmse
                    no_better_valid = 0
                    best_epoch = epoch
                    test_rmse = evaluate(args=args,
                                         dev_id=dev_id,
                                         net=net,
                                         dataset=dataset,
                                         dataloader=test_dataloader,
                                         segment='test')
                    best_test_rmse = test_rmse
                    test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                    logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
                else:
                    no_better_valid += 1
                    if no_better_valid > args.train_early_stopping_patience\
                        and learning_rate <= args.train_min_lr:
                        logging.info(
                            "Early stopping threshold reached. Stop training.")
                        break
                    if no_better_valid > args.train_decay_patience:
                        new_lr = max(
                            learning_rate * args.train_lr_decay_factor,
                            args.train_min_lr)
                        if new_lr < learning_rate:
                            logging.info("\tChange the LR to %g" % new_lr)
                            learning_rate = new_lr
                            for p in optimizer.param_groups:
                                p['lr'] = learning_rate
                            no_better_valid = 0
                            print("Change the LR to %g" % new_lr)
            # sync on evaluation
            if n_gpus > 1:
                th.distributed.barrier()

        if logging_str is not None:
            print(logging_str)
    if proc_id == 0:
        print(
            'Best epoch Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.
            format(best_epoch, best_valid_rmse, best_test_rmse))

        train_loss_logger.close()
        valid_loss_logger.close()
        test_loss_logger.close()

        with open(
                os.path.join(args.save_dir, f'duration_{args.save_id:d}.txt'),
                'a') as f:
            print(f'wall: {time.time() - start}')
            f.write(f'wall: {time.time() - start}')
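The prediction step above treats every discrete rating value as a class and collapses the class probabilities into one scalar rating with a softmax-weighted sum. A tiny illustration with toy numbers (not from the dataset):

import torch as th

possible_ratings = th.tensor([1., 2., 3., 4., 5.])
logits = th.tensor([[0.1, 0.2, 2.0, 0.3, 0.1]])   # one edge, five rating classes
expected = (th.softmax(logits, dim=1) * possible_ratings.view(1, -1)).sum(dim=1)
# expected is roughly 3.0 here; its squared error against the true rating feeds the RMSE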
Example #3
def run(args, device, data):
    g, node_feats, num_of_ntype, num_classes, num_rels, \
        train_nid, val_nid, test_nid, labels, global_val_nid, global_test_nid = data

    fanouts = [int(fanout) for fanout in args.fanout.split(',')]
    val_fanouts = [int(fanout) for fanout in args.validation_fanout.split(',')]
    sampler = NeighborSampler(g, fanouts, dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    dataloader = DistDataLoader(dataset=train_nid.numpy(),
                                batch_size=args.batch_size,
                                collate_fn=sampler.sample_blocks,
                                shuffle=True,
                                drop_last=False)

    valid_sampler = NeighborSampler(g, val_fanouts,
                                    dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    valid_dataloader = DistDataLoader(dataset=val_nid.numpy(),
                                      batch_size=args.batch_size,
                                      collate_fn=valid_sampler.sample_blocks,
                                      shuffle=False,
                                      drop_last=False)

    test_sampler = NeighborSampler(g, [-1] * args.n_layers,
                                   dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    test_dataloader = DistDataLoader(dataset=test_nid.numpy(),
                                     batch_size=args.batch_size,
                                     collate_fn=test_sampler.sample_blocks,
                                     shuffle=False,
                                     drop_last=False)

    embed_layer = DistEmbedLayer(device,
                                 g,
                                 num_of_ntype,
                                 args.n_hidden,
                                 sparse_emb=args.sparse_embedding,
                                 dgl_sparse_emb=args.dgl_sparse)

    model = EntityClassify(device,
                           args.n_hidden,
                           num_classes,
                           num_rels,
                           num_bases=args.n_bases,
                           num_hidden_layers=args.n_layers - 2,
                           dropout=args.dropout,
                           use_self_loop=args.use_self_loop,
                           low_mem=args.low_mem,
                           layer_norm=args.layer_norm)
    model = model.to(device)
    if not args.standalone:
        model = th.nn.parallel.DistributedDataParallel(model)
        if args.sparse_embedding and not args.dgl_sparse:
            embed_layer = DistributedDataParallel(embed_layer,
                                                  device_ids=None,
                                                  output_device=None)

    if args.sparse_embedding:
        if args.dgl_sparse:
            emb_optimizer = dgl.distributed.SparseAdagrad(
                [embed_layer.node_embeds], lr=args.sparse_lr)
        else:
            emb_optimizer = th.optim.SparseAdam(
                embed_layer.module.node_embeds.parameters(), lr=args.sparse_lr)
        optimizer = th.optim.Adam(model.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.l2norm)
    else:
        all_params = list(model.parameters()) + list(embed_layer.parameters())
        optimizer = th.optim.Adam(all_params,
                                  lr=args.lr,
                                  weight_decay=args.l2norm)

    # training loop
    print("start training...")
    for epoch in range(args.n_epochs):
        tic = time.time()

        sample_time = 0
        copy_time = 0
        forward_time = 0
        backward_time = 0
        update_time = 0
        number_train = 0

        step_time = []
        iter_t = []
        sample_t = []
        feat_copy_t = []
        forward_t = []
        backward_t = []
        update_t = []
        iter_tput = []

        start = time.time()
        # Loop over the dataloader to sample the computation dependency graph as a list of
        # blocks.
        step_time = []
        for step, sample_data in enumerate(dataloader):
            seeds, blocks = sample_data
            number_train += seeds.shape[0]
            tic_step = time.time()
            sample_time += tic_step - start
            sample_t.append(tic_step - start)

            feats = embed_layer(blocks[0].srcdata[dgl.NID],
                                blocks[0].srcdata[dgl.NTYPE], node_feats)
            label = labels[seeds]
            copy_time = time.time()
            feat_copy_t.append(copy_time - tic_step)

            # forward
            logits = model(blocks, feats)
            loss = F.cross_entropy(logits, label)
            forward_end = time.time()

            # backward
            optimizer.zero_grad()
            if args.sparse_embedding and not args.dgl_sparse:
                emb_optimizer.zero_grad()
            loss.backward()
            if args.sparse_embedding:
                emb_optimizer.step()
            compute_end = time.time()
            forward_t.append(forward_end - copy_time)
            backward_t.append(compute_end - forward_end)

            # Aggregate gradients in multiple nodes.
            optimizer.step()
            update_t.append(time.time() - compute_end)
            step_t = time.time() - start
            step_time.append(step_t)

            if step % args.log_every == 0:
                print('[{}] Epoch {:05d} | Step {:05d} | Loss {:.4f} | time {:.3f} s' \
                        '| sample {:.3f} | copy {:.3f} | forward {:.3f} | backward {:.3f} | update {:.3f}'.format(
                    g.rank(), epoch, step, loss.item(), np.sum(step_time[-args.log_every:]),
                    np.sum(sample_t[-args.log_every:]), np.sum(feat_copy_t[-args.log_every:]), np.sum(forward_t[-args.log_every:]),
                    np.sum(backward_t[-args.log_every:]), np.sum(update_t[-args.log_every:])))
            start = time.time()

        print(
            '[{}]Epoch Time(s): {:.4f}, sample: {:.4f}, data copy: {:.4f}, forward: {:.4f}, backward: {:.4f}, update: {:.4f}, #number_train: {}'
            .format(g.rank(), np.sum(step_time), np.sum(sample_t),
                    np.sum(feat_copy_t), np.sum(forward_t), np.sum(backward_t),
                    np.sum(update_t), number_train))
        epoch += 1

        start = time.time()
        g.barrier()
        val_acc, test_acc = evaluate(g, model, embed_layer, labels,
                                     valid_dataloader, test_dataloader,
                                     node_feats, global_val_nid,
                                     global_test_nid)
        if val_acc >= 0:
            print('Val Acc {:.4f}, Test Acc {:.4f}, time: {:.4f}'.format(
                val_acc, test_acc,
                time.time() - start))
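The loop above accumulates per-phase timings (sample, copy, forward, backward, update) with repeated time.time() calls. A hypothetical alternative, not used by the original code, expresses the same bookkeeping with a small context manager:

import time
from contextlib import contextmanager


@contextmanager
def phase_timer(bucket):
    # append the wall-clock time of the wrapped block to `bucket`
    t0 = time.time()
    yield
    bucket.append(time.time() - t0)


# usage sketch:
# forward_t = []
# with phase_timer(forward_t):
#     logits = model(blocks, feats)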
Example #4
    def __init__(self, cfg, model_build_func):
        """
        Args:
            cfg (BaseConfig):
        """
        logger = logging.getLogger("cvpods")
        if not logger.isEnabledFor(
                logging.INFO):  # setup_logger is not called for d2
            setup_logger()

        self.start_iter = 0

        data_loader = self.build_train_loader(cfg)
        maybe_adjust_epoch_and_iter(cfg, data_loader)
        self.max_iter = cfg.SOLVER.LR_SCHEDULER.MAX_ITER
        self.max_epoch = cfg.SOLVER.LR_SCHEDULER.MAX_EPOCH

        model = model_build_func(cfg)
        model = maybe_convert_module(model)
        logger.info(f"Model structure: {model}")

        # Assume these objects must be constructed in this order.
        optimizer = self.build_optimizer(cfg, model)

        # For training, wrap with DDP. But don't need this for inference.
        if comm.get_world_size() > 1:
            if cfg.TRAINER.FP16.ENABLED:
                if cfg.TRAINER.FP16.TYPE == "APEX":
                    model, optimizer = amp.initialize(
                        model,
                        optimizer,
                        opt_level=cfg.TRAINER.FP16.OPTS.OPT_LEVEL)
            model = DistributedDataParallel(model,
                                            device_ids=[comm.get_local_rank()],
                                            broadcast_buffers=False,
                                            find_unused_parameters=True)
        # TODO: @wangfeng02, `batch_subdivisions`
        super().__init__(model, data_loader, optimizer,
                         cfg.SOLVER.BATCH_SUBDIVISIONS)

        if not cfg.SOLVER.LR_SCHEDULER.get("EPOCH_WISE", False):
            epoch_iters = -1
        else:
            epoch_iters = cfg.SOLVER.LR_SCHEDULER.get("EPOCH_ITERS")
            logger.warning(f"Setup LR Scheduler in EPOCH mode: {epoch_iters}")

        self.scheduler = self.build_lr_scheduler(cfg,
                                                 optimizer,
                                                 epoch_iters=epoch_iters)
        # Assume no other objects need to be checkpointed.
        # We can later make it checkpoint the stateful hooks
        optional = {}
        if cfg.TRAINER.FP16.ENABLED:
            optional["amp"] = amp
        self.checkpointer = DefaultCheckpointer(
            # Assume you want to save checkpoints together with logs/statistics
            model,
            cfg.OUTPUT_DIR,
            optimizer=optimizer,
            scheduler=self.scheduler,
            **optional,
        )

        self.cfg = cfg
        self.register_hooks(self.build_hooks())
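The FP16 branch above calls amp.initialize before the DDP wrap, which is the order APEX expects: mixed-precision patching first, distributed wrapper last. A condensed sketch of that ordering, assuming APEX is installed and that model, optimizer, and local_rank already exist:

from apex import amp
from torch.nn.parallel import DistributedDataParallel

model, optimizer = amp.initialize(model, optimizer, opt_level="O1")  # patch for mixed precision first
model = DistributedDataParallel(model,                               # wrap with DDP last
                                device_ids=[local_rank],
                                broadcast_buffers=False,
                                find_unused_parameters=True)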
Example #5
def main_worker(gpu, ngpus_per_node, args, cfg):
    args.gpu = gpu
    args.rank = args.rank * ngpus_per_node + gpu
    logger.info(f'rank: {args.rank} / {args.world_size}')
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=args.rank)
    torch.cuda.set_device(args.gpu)
    if args.gpu == 0:
        mkdir(args.outdir)
        filename = os.path.join(args.outdir, 'log.txt')
        fh = logging.StreamHandler(cached_log_stream(filename))
        fh.setLevel(logging.INFO)
        logger.addHandler(fh)
        plain_formatter = logging.Formatter(
            "[%(asctime)s] %(name)s %(levelname)s: %(message)s",
            datefmt="%m/%d %H:%M:%S")
        fh.setFormatter(plain_formatter)
        logger.info(args)

    # init the global
    global best_pred, acclist_train, acclist_val

    # seed
    torch.manual_seed(cfg.SEED)
    torch.cuda.manual_seed(cfg.SEED)

    # init dataloader
    transform_train, transform_val = get_transform(cfg.DATA.DATASET)(
        cfg.DATA.BASE_SIZE, cfg.DATA.CROP_SIZE, cfg.DATA.RAND_AUG)
    trainset = get_dataset(cfg.DATA.DATASET)(root=cfg.DATA.ROOT,
                                             transform=transform_train,
                                             train=True,
                                             download=True)
    valset = get_dataset(cfg.DATA.DATASET)(root=cfg.DATA.ROOT,
                                           transform=transform_val,
                                           train=False,
                                           download=True)

    train_sampler = torch.utils.data.distributed.DistributedSampler(trainset)
    train_loader = torch.utils.data.DataLoader(
        trainset,
        batch_size=cfg.TRAINING.BATCH_SIZE,
        shuffle=False,
        num_workers=cfg.TRAINING.WORKERS,
        pin_memory=True,
        sampler=train_sampler)

    val_sampler = torch.utils.data.distributed.DistributedSampler(
        valset, shuffle=False)
    val_loader = torch.utils.data.DataLoader(
        valset,
        batch_size=cfg.TRAINING.TEST_BATCH_SIZE,
        shuffle=False,
        num_workers=cfg.TRAINING.WORKERS,
        pin_memory=True,
        sampler=val_sampler)

    # init the model
    model_kwargs = {}
    if cfg.MODEL.FINAL_DROP > 0.0:
        model_kwargs['final_drop'] = cfg.MODEL.FINAL_DROP

    if cfg.TRAINING.LAST_GAMMA:
        model_kwargs['last_gamma'] = True

    model = get_model(cfg.MODEL.NAME)(**model_kwargs)

    if args.gpu == 0:
        logger.info(model)

    criterion, train_loader = get_criterion(cfg, train_loader, args.gpu)

    model.cuda(args.gpu)
    criterion.cuda(args.gpu)
    model = DistributedDataParallel(model, device_ids=[args.gpu])

    # criterion and optimizer
    if cfg.OPTIMIZER.DISABLE_BN_WD:
        parameters = model.named_parameters()
        param_dict = {}
        for k, v in parameters:
            param_dict[k] = v
        bn_params = [
            v for n, v in param_dict.items() if ('bn' in n or 'bias' in n)
        ]
        rest_params = [
            v for n, v in param_dict.items() if not ('bn' in n or 'bias' in n)
        ]
        if args.gpu == 0:
            logger.info(" Weight decay NOT applied to BN parameters ")
            logger.info(
                f'len(parameters): {len(list(model.parameters()))} = {len(bn_params)} + {len(rest_params)}'
            )
        optimizer = torch.optim.SGD([{
            'params': bn_params,
            'weight_decay': 0
        }, {
            'params': rest_params,
            'weight_decay': cfg.OPTIMIZER.WEIGHT_DECAY
        }],
                                    lr=cfg.OPTIMIZER.LR,
                                    momentum=cfg.OPTIMIZER.MOMENTUM,
                                    weight_decay=cfg.OPTIMIZER.WEIGHT_DECAY)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=cfg.OPTIMIZER.LR,
                                    momentum=cfg.OPTIMIZER.MOMENTUM,
                                    weight_decay=cfg.OPTIMIZER.WEIGHT_DECAY)
    # check point
    if args.resume is not None:
        if os.path.isfile(args.resume):
            if args.gpu == 0:
                logger.info(f"=> loading checkpoint '{args.resume}'")
            with PathManager.open(args.resume, "rb") as f:
                checkpoint = torch.load(f)
            cfg.TRAINING.START_EPOCHS = checkpoint['epoch'] + 1 if cfg.TRAINING.START_EPOCHS == 0 \
                    else cfg.TRAINING.START_EPOCHS
            best_pred = checkpoint['best_pred']
            acclist_train = checkpoint['acclist_train']
            acclist_val = checkpoint['acclist_val']
            model.module.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if args.gpu == 0:
                logger.info(
                    f"=> loaded checkpoint '{args.resume}' (epoch {checkpoint['epoch']})"
                )
        else:
            raise RuntimeError(
                f"=> no resume checkpoint found at '{args.resume}'")

    scheduler = LR_Scheduler(cfg.OPTIMIZER.LR_SCHEDULER,
                             base_lr=cfg.OPTIMIZER.LR,
                             num_epochs=cfg.TRAINING.EPOCHS,
                             iters_per_epoch=len(train_loader),
                             warmup_epochs=cfg.OPTIMIZER.WARMUP_EPOCHS)

    def train(epoch):
        train_sampler.set_epoch(epoch)
        model.train()
        losses = AverageMeter()
        top1 = AverageMeter()
        global best_pred, acclist_train
        for batch_idx, (data, target) in enumerate(train_loader):
            scheduler(optimizer, batch_idx, epoch, best_pred)
            if not cfg.DATA.MIXUP:
                data, target = data.cuda(args.gpu), target.cuda(args.gpu)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            if not cfg.DATA.MIXUP:
                acc1 = accuracy(output, target, topk=(1, ))
                top1.update(acc1[0], data.size(0))

            losses.update(loss.item(), data.size(0))
            if batch_idx % 100 == 0 and args.gpu == 0:
                if cfg.DATA.MIXUP:
                    logger.info('Batch: %d| Loss: %.3f' %
                                (batch_idx, losses.avg))
                else:
                    logger.info('Batch: %d| Loss: %.3f | Top1: %.3f' %
                                (batch_idx, losses.avg, top1.avg))

        acclist_train += [top1.avg]

    def validate(epoch):
        model.eval()
        top1 = AverageMeter()
        top5 = AverageMeter()
        global best_pred, acclist_train, acclist_val
        is_best = False
        for batch_idx, (data, target) in enumerate(val_loader):
            data, target = data.cuda(args.gpu), target.cuda(args.gpu)
            with torch.no_grad():
                output = model(data)
                acc1, acc5 = accuracy(output, target, topk=(1, 5))
                top1.update(acc1[0], data.size(0))
                top5.update(acc5[0], data.size(0))

        # sum all
        sum1, cnt1, sum5, cnt5 = torch_dist_sum(args.gpu, top1.sum, top1.count,
                                                top5.sum, top5.count)
        top1_acc = sum(sum1) / sum(cnt1)
        top5_acc = sum(sum5) / sum(cnt5)

        if args.gpu == 0:
            logger.info('Validation: Top1: %.3f | Top5: %.3f' %
                        (top1_acc, top5_acc))
            if args.eval_only:
                return top1_acc, top5_acc

            # save checkpoint
            acclist_val += [top1_acc]
            if top1_acc > best_pred:
                best_pred = top1_acc
                is_best = True
            save_checkpoint(
                {
                    'epoch': epoch,
                    'state_dict': model.module.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'best_pred': best_pred,
                    'acclist_train': acclist_train,
                    'acclist_val': acclist_val,
                },
                directory=args.outdir,
                is_best=False,
                filename=f'checkpoint_{epoch}.pth')
        return top1_acc.item(), top5_acc.item()

    if args.export:
        if args.gpu == 0:
            with PathManager.open(args.export + '.pth', "wb") as f:
                torch.save(model.module.state_dict(), f)
        return

    if args.eval_only:
        top1_acc, top5_acc = validate(cfg.TRAINING.START_EPOCHS)
        metrics = {
            "top1": top1_acc,
            "top5": top5_acc,
        }
        if args.gpu == 0:
            with PathManager.open(os.path.join(args.outdir, 'metrics.json'),
                                  "w") as f:
                json.dump(metrics, f)
        return

    for epoch in range(cfg.TRAINING.START_EPOCHS, cfg.TRAINING.EPOCHS):
        tic = time.time()
        train(epoch)
        if epoch % 10 == 0:
            top1_acc, top5_acc = validate(epoch)
        elapsed = time.time() - tic
        if args.gpu == 0:
            logger.info(f'Epoch: {epoch}, Time cost: {elapsed}')

    # final evaluation
    top1_acc, top5_acc = validate(cfg.TRAINING.START_EPOCHS - 1)
    if args.gpu == 0:
        # save final checkpoint
        save_checkpoint(
            {
                'epoch': cfg.TRAINING.EPOCHS - 1,
                'state_dict': model.module.state_dict(),
                'optimizer': optimizer.state_dict(),
                'best_pred': best_pred,
                'acclist_train': acclist_train,
                'acclist_val': acclist_val,
            },
            directory=args.outdir,
            is_best=False,
            filename='checkpoint_final.pth')

        # save final model weights
        with PathManager.open(os.path.join(args.outdir, 'model_weights.pth'),
                              "wb") as f:
            torch.save(model.module.state_dict(), f)

        metrics = {
            "top1": top1_acc,
            "top5": top5_acc,
        }
        with PathManager.open(os.path.join(args.outdir, 'metrics.json'),
                              "w") as f:
            json.dump(metrics, f)
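The DISABLE_BN_WD branch above splits parameters by name so BatchNorm and bias terms get no weight decay. A condensed sketch of the same split as a hypothetical helper (the name-based rule matches the one used above):

def split_wd_params(model):
    no_decay, decay = [], []
    for name, p in model.named_parameters():
        (no_decay if ('bn' in name or 'bias' in name) else decay).append(p)
    return [{'params': no_decay, 'weight_decay': 0.0}, {'params': decay}]


# optimizer = torch.optim.SGD(split_wd_params(model), lr=0.1,
#                             momentum=0.9, weight_decay=1e-4)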
Example #6
    def prepare(self,
                ckpt_dir: str,
                optimizer: str = 'lars',
                learning_rate: float = 0.2,
                weight_decay: float = 1.5 * 1e-6,
                cosine_warmup: int = 10,
                cosine_cycles: int = 1,
                cosine_min_lr: float = 0.,
                epochs: int = 1000,
                batch_size: int = 256,
                num_workers: int = 0,
                distributed: bool = False,
                local_rank: int = 0,
                mixed_precision: bool = True,
                resume: str = None):
        """Prepare BYOL pre-training."""

        # Set attributes
        self.ckpt_dir = ckpt_dir
        self.epochs = epochs
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.distributed = distributed
        self.local_rank = local_rank
        self.mixed_precision = mixed_precision
        self.resume = resume

        self.optimizer = get_optimizer(
            params=[
                {
                    'params': self.online_net.parameters()
                },
                {
                    'params': self.online_predictor.parameters()
                },
            ],
            name=optimizer,
            lr=learning_rate,
            weight_decay=weight_decay  # TODO: remove params from batch norm
        )

        self.scheduler = get_cosine_scheduler(
            self.optimizer,
            epochs=self.epochs,
            warmup_steps=cosine_warmup,
            cycles=cosine_cycles,
            min_lr=cosine_min_lr,
        )

        # Resuming from previous checkpoint (optional)
        if resume is not None:
            if not os.path.exists(resume):
                raise FileNotFoundError
            self.load_model_from_checkpoint(resume)

        # Distributed training (optional, disabled by default.)
        if distributed:
            self.online_net = DistributedDataParallel(
                module=self.online_net.to(local_rank), device_ids=[local_rank])
            self.online_predictor = DistributedDataParallel(
                module=self.online_predictor.to(local_rank),
                device_ids=[local_rank])
        else:
            self.online_net.to(local_rank)
            self.online_predictor.to(local_rank)

        # No DDP wrapping for target network; no gradient updates
        self.target_net.to(local_rank)

        # Mixed precision training (optional, enabled by default)
        self.scaler = torch.cuda.amp.GradScaler() if mixed_precision else None

        # TensorBoard
        self.writer = SummaryWriter(ckpt_dir) if local_rank == 0 else None

        # Ready to train
        self.prepared = True
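prepare creates a GradScaler when mixed precision is enabled, but the training step is defined elsewhere. A self-contained sketch of how such a scaler is typically used together with autocast (the linear layer and squared-output loss below are stand-ins, not the BYOL objective):

import torch
import torch.nn as nn

net = nn.Linear(8, 8).cuda()                       # stand-in for the online network
opt = torch.optim.SGD(net.parameters(), lr=0.1)
scaler = torch.cuda.amp.GradScaler()

for _ in range(3):                                 # a few dummy steps
    x = torch.randn(4, 8, device='cuda')
    opt.zero_grad()
    with torch.cuda.amp.autocast():
        loss = net(x).pow(2).mean()                # stand-in loss
    scaler.scale(loss).backward()                  # scale the loss to avoid FP16 underflow
    scaler.step(opt)                               # unscales gradients, then opt.step()
    scaler.update()                                # adjust the scale for the next step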
Example #7
def train():

    print("started")

    args = parser.parse_args()
    if not os.path.exists(
            args.output_dir
    ):  # !!!NOTICE: change the output dir for each different setting.
        os.makedirs(args.output_dir)
        print(args.output_dir)

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", args.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning"
    )

    # Load pretrained model and tokenizer
    config_class, model_class, tokenizer_class = MODEL_CLASSES["gpt2"]
    config = config_class.from_pretrained(args.model_name_or_path)
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    tokenizer.add_tokens(SPECIAL_TOKENS)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config)
    # model.set_num_special_tokens(len(SPECIAL_TOKENS))
    model.resize_token_embeddings(len(tokenizer))
    model.to(args.device)
    # Prepare optimizer and schedule (linear warmup and decay)
    # optimizer = OpenAIAdam(model.parameters(), lr=args.lr)
    optimizer = AdamW(model.parameters(), lr=args.lr)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        cur_input_ids = batch[0]
        cur_lm_labels = batch[1]
        cur_token_type_ids = batch[2]
        model_outputs = model(input_ids=cur_input_ids,
                              labels=cur_lm_labels,
                              token_type_ids=cur_token_type_ids)
        lm_loss = model_outputs[0]
        loss = lm_loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            input_ids, lm_labels, token_type_ids = batch

            # logger.info(tokenizer.decode(input_ids[0, :].tolist()))
            model_outputs = model(input_ids, token_type_ids=token_type_ids)
            lm_logits = model_outputs[0]

            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)

            return lm_logits_flat_shifted, lm_labels_flat_shifted

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))}
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        # tb_logger = TensorboardLogger(log_dir=args.output_dir)
        # tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
        # tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        # tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(
            args.output_dir, 'checkpoint', save_interval=1, n_saved=3
        )  # !!!NOTICE: if checkpoint files already exist, this raises an error; set require_empty=False to avoid it.
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" take care of distributed encapsulation

        torch.save(args, args.output_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(args.output_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(args.output_dir)

    # Run the training
    print(train_loader)
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            checkpoint_handler._saved[-1][1][-1],
            os.path.join(args.output_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
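update above implements gradient accumulation: the loss is divided by gradient_accumulation_steps and the optimizer only steps once every that many iterations. A stripped-down, self-contained sketch of the same pattern with a toy model (not the GPT-2 setup):

import torch
import torch.nn as nn

model = nn.Linear(16, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
accum_steps = 4                                      # toy value

for it in range(1, 17):
    x, y = torch.randn(8, 16), torch.randint(0, 2, (8,))
    loss = nn.functional.cross_entropy(model(x), y) / accum_steps
    loss.backward()                                  # gradients add up across iterations
    if it % accum_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        optimizer.zero_grad()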
Example #8
def train(proc_id, n_gpus, args, dataset, g, feats, paper_offset):
    dev_id = devices[proc_id]
    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12346')
        world_size = n_gpus
        torch.distributed.init_process_group(backend='nccl',
                                             init_method=dist_init_method,
                                             world_size=world_size,
                                             rank=proc_id)

    torch.cuda.set_device(dev_id)

    print('Loading masks and labels')
    train_idx = torch.LongTensor(dataset.get_idx_split('train')) + paper_offset
    valid_idx = torch.LongTensor(dataset.get_idx_split('valid')) + paper_offset
    label = dataset.paper_label

    print('Initializing dataloader...')
    sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 25])

    train_collator = ExternalNodeCollator(g, train_idx, sampler, paper_offset,
                                          feats, label)
    valid_collator = ExternalNodeCollator(g, valid_idx, sampler, paper_offset,
                                          feats, label)
    # Necessary according to https://yangkky.github.io/2019/07/08/distributed-pytorch-tutorial.html
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_collator.dataset,
        num_replicas=world_size,
        rank=proc_id,
        shuffle=True,
        drop_last=False)
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_collator.dataset,
        num_replicas=world_size,
        rank=proc_id,
        shuffle=True,
        drop_last=False)

    train_dataloader = torch.utils.data.DataLoader(
        train_collator.dataset,
        batch_size=1024,
        collate_fn=train_collator.collate,
        num_workers=4,
        sampler=train_sampler)

    valid_dataloader = torch.utils.data.DataLoader(
        valid_collator.dataset,
        batch_size=1024,
        collate_fn=valid_collator.collate,
        num_workers=2,
        sampler=valid_sampler)

    print('Initializing model...')
    model = RGAT(dataset.num_paper_features, dataset.num_classes, 1024, 5, 2,
                 4, 0.5, 'paper').to(dev_id)

    # convert BN to SyncBatchNorm. see https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html
    model = nn.SyncBatchNorm.convert_sync_batchnorm(model)

    model = DistributedDataParallel(model,
                                    device_ids=[dev_id],
                                    output_device=dev_id)
    opt = torch.optim.Adam(model.parameters(), lr=0.001)
    sched = torch.optim.lr_scheduler.StepLR(opt, step_size=25, gamma=0.25)

    best_acc = 0

    for epoch in range(args.epochs):
        # make shuffling work properly across multiple epochs.
        # see https://pytorch.org/docs/stable/data.html#torch.utils.data.distributed.DistributedSampler
        train_sampler.set_epoch(epoch)
        model.train()
        with tqdm.tqdm(train_dataloader) as tq:
            for i, (input_nodes, output_nodes, mfgs) in enumerate(tq):
                mfgs = [g.to(dev_id) for g in mfgs]
                x = mfgs[0].srcdata['x']
                y = mfgs[-1].dstdata['y']
                y_hat = model(mfgs, x)
                loss = F.cross_entropy(y_hat, y)
                opt.zero_grad()
                loss.backward()
                opt.step()
                acc = (y_hat.argmax(1) == y).float().mean()
                tq.set_postfix(
                    {
                        'loss': '%.4f' % loss.item(),
                        'acc': '%.4f' % acc.item()
                    },
                    refresh=False)

        # eval in each process
        model.eval()
        correct = torch.LongTensor([0]).to(dev_id)
        total = torch.LongTensor([0]).to(dev_id)
        for i, (input_nodes, output_nodes,
                mfgs) in enumerate(tqdm.tqdm(valid_dataloader)):
            with torch.no_grad():
                mfgs = [g.to(dev_id) for g in mfgs]
                x = mfgs[0].srcdata['x']
                y = mfgs[-1].dstdata['y']
                y_hat = model(mfgs, x)
                correct += (y_hat.argmax(1) == y).sum().item()
                total += y_hat.shape[0]

        # `reduce` data into process 0
        torch.distributed.reduce(correct,
                                 dst=0,
                                 op=torch.distributed.ReduceOp.SUM)
        torch.distributed.reduce(total,
                                 dst=0,
                                 op=torch.distributed.ReduceOp.SUM)
        acc = (correct / total).item()

        sched.step()

        # process 0 print accuracy and save model
        if proc_id == 0:
            print('Validation accuracy:', acc)

            if best_acc < acc:
                best_acc = acc
                print('Updating best model...')
                torch.save(model.state_dict(), args.model_path)
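Validation above keeps per-process correct/total counters and reduces them onto rank 0 before computing accuracy. The reduction in isolation looks like the sketch below (it assumes a process group is already initialised and that dev_id is this process's GPU; the counts are dummies):

import torch
import torch.distributed as dist

correct = torch.tensor([123], dtype=torch.long, device=dev_id)   # dummy per-process count
total = torch.tensor([456], dtype=torch.long, device=dev_id)
dist.reduce(correct, dst=0, op=dist.ReduceOp.SUM)
dist.reduce(total, dst=0, op=dist.ReduceOp.SUM)
if dist.get_rank() == 0:
    print('global accuracy:', (correct.float() / total.float()).item())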
def run(proc_id, n_gpus, args, devices, data):
    # Unpack data
    device = th.device(devices[proc_id])
    if n_gpus > 0:
        th.cuda.set_device(device)
    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        world_size = n_gpus
        th.distributed.init_process_group(backend="nccl",
                                          init_method=dist_init_method,
                                          world_size=world_size,
                                          rank=proc_id)
    train_nid, val_nid, test_nid, n_classes, g, nfeat, labels = data

    if args.data_device == 'gpu':
        nfeat = nfeat.to(device)
        labels = labels.to(device)
    elif args.data_device == 'uva':
        nfeat = dgl.contrib.UnifiedTensor(nfeat, device=device)
        labels = dgl.contrib.UnifiedTensor(labels, device=device)
    in_feats = nfeat.shape[1]

    # Create PyTorch DataLoader for constructing blocks
    n_edges = g.num_edges()
    train_seeds = th.arange(n_edges)

    if args.graph_device == 'gpu':
        train_seeds = train_seeds.to(device)
        g = g.to(device)
        args.num_workers = 0
    elif args.graph_device == 'uva':
        train_seeds = train_seeds.to(device)
        g.pin_memory_()
        args.num_workers = 0

    # Create sampler
    sampler = dgl.dataloading.MultiLayerNeighborSampler(
        [int(fanout) for fanout in args.fan_out.split(',')])
    dataloader = dgl.dataloading.EdgeDataLoader(
        g,
        train_seeds,
        sampler,
        exclude='reverse_id',
        # For each edge with ID e in Reddit dataset, the reverse edge is e ± |E|/2.
        reverse_eids=th.cat(
            [th.arange(n_edges // 2, n_edges),
             th.arange(0, n_edges // 2)]).to(train_seeds),
        negative_sampler=NegativeSampler(
            g, args.num_negs, args.neg_share,
            device if args.graph_device == 'uva' else None),
        device=device,
        use_ddp=n_gpus > 1,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=False,
        num_workers=args.num_workers)

    # Define model and optimizer
    model = SAGE(in_feats, args.num_hidden, args.num_hidden, args.num_layers,
                 F.relu, args.dropout)
    model = model.to(device)
    if n_gpus > 1:
        model = DistributedDataParallel(model,
                                        device_ids=[device],
                                        output_device=device)
    loss_fcn = CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Training loop
    avg = 0
    iter_pos = []
    iter_neg = []
    iter_d = []
    iter_t = []
    best_eval_acc = 0
    best_test_acc = 0
    for epoch in range(args.num_epochs):
        tic = time.time()

        # Loop over the dataloader to sample the computation dependency graph as a list of
        # blocks.
        tic_step = time.time()
        for step, (input_nodes, pos_graph, neg_graph,
                   blocks) in enumerate(dataloader):
            batch_inputs = nfeat[input_nodes].to(device)
            pos_graph = pos_graph.to(device)
            neg_graph = neg_graph.to(device)
            blocks = [block.int().to(device) for block in blocks]
            d_step = time.time()

            # Compute loss and prediction
            batch_pred = model(blocks, batch_inputs)
            loss = loss_fcn(batch_pred, pos_graph, neg_graph)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            t = time.time()
            pos_edges = pos_graph.num_edges()
            neg_edges = neg_graph.num_edges()
            iter_pos.append(pos_edges / (t - tic_step))
            iter_neg.append(neg_edges / (t - tic_step))
            iter_d.append(d_step - tic_step)
            iter_t.append(t - d_step)
            if step % args.log_every == 0 and proc_id == 0:
                gpu_mem_alloc = th.cuda.max_memory_allocated(
                ) / 1000000 if th.cuda.is_available() else 0
                print(
                    '[{}]Epoch {:05d} | Step {:05d} | Loss {:.4f} | Speed (samples/sec) {:.4f}|{:.4f} | Load {:.4f}| train {:.4f} | GPU {:.1f} MB'
                    .format(proc_id, epoch, step, loss.item(),
                            np.mean(iter_pos[3:]), np.mean(iter_neg[3:]),
                            np.mean(iter_d[3:]), np.mean(iter_t[3:]),
                            gpu_mem_alloc))
            tic_step = time.time()

            if step % args.eval_every == 0 and proc_id == 0:
                eval_acc, test_acc = evaluate(model, g, nfeat, labels,
                                              train_nid, val_nid, test_nid,
                                              device)
                print('Eval Acc {:.4f} Test Acc {:.4f}'.format(
                    eval_acc, test_acc))
                if eval_acc > best_eval_acc:
                    best_eval_acc = eval_acc
                    best_test_acc = test_acc
                print('Best Eval Acc {:.4f} Test Acc {:.4f}'.format(
                    best_eval_acc, best_test_acc))
        toc = time.time()
        if proc_id == 0:
            print('Epoch Time(s): {:.4f}'.format(toc - tic))
        if epoch >= 5:
            avg += toc - tic
        if n_gpus > 1:
            th.distributed.barrier()

    if proc_id == 0:
        print('Avg epoch time: {}'.format(avg / (epoch - 4)))
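The reverse_eids tensor above encodes the Reddit convention that the reverse of edge e is stored at e ± |E|/2: the first half of the tensor points into the second half and vice versa. A toy check with six edges:

import torch as th

n_edges = 6
reverse_eids = th.cat([th.arange(n_edges // 2, n_edges),
                       th.arange(0, n_edges // 2)])
# reverse_eids == tensor([3, 4, 5, 0, 1, 2]); edge 1's reverse is 4 and edge 4's is 1
assert reverse_eids[1] == 4 and reverse_eids[4] == 1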
Example #10
def main(config, embedding, model_path, run, imagedataset, local_rank, resnet,
         bkg):

    rank, world_size, device_id, device = setup(local_rank)
    print("Local rank: {} Rank: {} World Size: {} Device_id: {} Device: {}".
          format(local_rank, rank, world_size, device_id, device))
    pth_extn = '.pth.tar'

    # Configuration
    CONFIG = Dict(yaml.safe_load(open(config)))

    datadir = os.path.join('data/datasets', imagedataset)
    print("Split dir: ", datadir)
    savedir = osp.dirname(model_path)
    epoch = re.findall(r"checkpoint_(.*)\." + pth_extn[1:],
                       osp.basename(model_path))[-1]

    if run == 'zlss' or run == 'flss':
        val = np.load(datadir + '/split/test_list.npy')
        visible_classes = np.load(datadir + '/split/novel_cls.npy')
        if bkg:
            visible_classes = np.asarray(np.concatenate(
                [np.array([0]), visible_classes]),
                                         dtype=int)
    elif run == 'gzlss' or run == 'gflss':
        val = np.load(datadir + '/split/test_list.npy')

        vals_cls = np.asarray(np.concatenate([
            np.load(datadir + '/split/seen_cls.npy'),
            np.load(datadir + '/split/val_cls.npy')
        ]),
                              dtype=int)

        if bkg:
            vals_cls = np.asarray(np.concatenate([np.array([0]), vals_cls]),
                                  dtype=int)
        valu_cls = np.load(datadir + '/split/novel_cls.npy')
        visible_classes = np.concatenate([vals_cls, valu_cls])
    else:
        print("invalid run ", run)
        sys.exit()

    cls_map = np.array([255] * 256)
    for i, n in enumerate(visible_classes):
        cls_map[n] = i

    if run == 'gzlss' or run == 'gflss':

        novel_cls_map = np.array([255] * 256)
        for i, n in enumerate(list(valu_cls)):
            novel_cls_map[cls_map[n]] = i

        seen_cls_map = np.array([255] * 256)
        for i, n in enumerate(list(vals_cls)):
            seen_cls_map[cls_map[n]] = i

    all_labels = np.genfromtxt(datadir + '/labels_2.txt',
                               delimiter='\t',
                               usecols=1,
                               dtype='str')

    print("Visible Classes: ", visible_classes)

    # Dataset
    dataset = get_dataset(CONFIG.DATASET)(
        train=None,
        test=val,
        root=CONFIG.ROOT,
        split=CONFIG.SPLIT.TEST,
        base_size=CONFIG.IMAGE.SIZE.TEST,
        mean=(CONFIG.IMAGE.MEAN.B, CONFIG.IMAGE.MEAN.G, CONFIG.IMAGE.MEAN.R),
        warp=CONFIG.WARP_IMAGE,
        scale=None,
        flip=False,
    )

    random.seed(42)

    if embedding == 'word2vec':
        class_emb = pickle.load(
            open(datadir + '/word_vectors/word2vec.pkl', "rb"))
    elif embedding == 'fasttext':
        class_emb = pickle.load(
            open(datadir + '/word_vectors/fasttext.pkl', "rb"))
    elif embedding == 'fastnvec':
        class_emb = np.concatenate([
            pickle.load(open(datadir + '/word_vectors/fasttext.pkl', "rb")),
            pickle.load(open(datadir + '/word_vectors/word2vec.pkl', "rb"))
        ],
                                   axis=1)
    else:
        print("invalid emb ", embedding)
        sys.exit()

    class_emb = class_emb[visible_classes]
    class_emb = F.normalize(torch.tensor(class_emb), p=2, dim=1).cuda()

    print("Embedding dim: ", class_emb.shape[1])
    print("# Visible Classes: ", class_emb.shape[0])

    # DataLoader
    loader = torch.utils.data.DataLoader(dataset=dataset,
                                         batch_size=CONFIG.BATCH_SIZE.TEST,
                                         num_workers=CONFIG.NUM_WORKERS,
                                         shuffle=False,
                                         sampler=DistributedSampler(
                                             dataset,
                                             num_replicas=world_size,
                                             rank=rank,
                                             shuffle=False),
                                         pin_memory=True,
                                         drop_last=True)

    torch.set_grad_enabled(False)

    # Model
    model = DeepLabV2_ResNet101_MSC(class_emb.shape[1],
                                    class_emb,
                                    resnet=resnet)

    state_dict = torch.load(model_path, map_location='cpu')
    model = DistributedDataParallel(model.to(device), device_ids=[rank])
    new_state_dict = OrderedDict()
    if resnet == 'spnet':
        for k, v in state_dict['state_dict'].items():
            name = k.replace("scale", "base")  # 'scale'->base
            name = name.replace("stages.", "")
            new_state_dict[name] = v
    else:
        new_state_dict = state_dict['state_dict']
    model.load_state_dict(new_state_dict)
    del state_dict

    model.eval()
    targets, outputs = [], []

    iterations = len(loader)
    print("Iterations: {}".format(iterations))

    # Iterate the loader once through tqdm instead of keeping a second
    # iterator alive; the original next(loader_iter) pattern loaded every
    # batch twice.
    pbar = tqdm(loader,
                total=iterations,
                leave=False,
                dynamic_ncols=True,
                position=rank)
    for data, target, img_id in pbar:
        # Image
        data = data.to(device)
        # Forward propagation
        output = model(data)
        output = F.interpolate(output,
                               size=data.shape[2:],
                               mode="bilinear",
                               align_corners=False)

        output = F.softmax(output, dim=1)
        target = cls_map[target.numpy()]

        remote_target = torch.tensor(target).to(device)
        if rank == 0:
            remote_target = torch.zeros_like(remote_target).to(device)

        output = torch.argmax(output, dim=1).cpu().numpy()

        remote_output = torch.tensor(output).to(device)
        if rank == 0:
            remote_output = torch.zeros_like(remote_output).to(device)

        for o, t in zip(output, target):
            outputs.append(o)
            targets.append(t)

        torch.distributed.reduce(remote_output, dst=0)
        torch.distributed.reduce(remote_target, dst=0)

        torch.distributed.barrier()

        if rank == 0:
            remote_output = remote_output.cpu().numpy()
            remote_target = remote_target.cpu().numpy()
            for o, t in zip(remote_output, remote_target):
                outputs.append(o)
                targets.append(t)

    if rank == 0:

        if run == 'gzlss' or run == 'gflss':
            score, class_iou = scores_gzsl(targets,
                                           outputs,
                                           n_class=len(visible_classes),
                                           seen_cls=cls_map[vals_cls],
                                           unseen_cls=cls_map[valu_cls])
        else:
            score, class_iou = scores(targets,
                                      outputs,
                                      n_class=len(visible_classes))

        for k, v in score.items():
            print(k, v)

        score["Class IoU"] = {}
        for i in range(len(visible_classes)):
            score["Class IoU"][all_labels[visible_classes[i]]] = class_iou[i]

        name = model_path.replace(pth_extn, "_" + run + ".json")

        if bkg:
            with open(name.replace('.json', '_bkg.json'), "w") as f:
                json.dump(score, f, indent=4, sort_keys=True)
        else:
            with open(name, "w") as f:
                json.dump(score, f, indent=4, sort_keys=True)

        print(score["Class IoU"])

    return
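
A minimal sketch (not part of the original example) of an alternative way to collect the per-rank predictions for scoring on rank 0, using torch.distributed.all_gather instead of the zero-and-reduce pattern above. It assumes every rank contributes tensors of identical shape, which holds here because the loader uses drop_last=True; the helper name is hypothetical.

import torch
import torch.distributed as dist

def gather_predictions(local_output, local_target):
    # Hypothetical helper: every rank contributes its prediction/target batch
    # and receives the batches of all ranks; rank 0 can then score everything.
    # With the NCCL backend the tensors must live on the current CUDA device.
    world_size = dist.get_world_size()
    gathered_outputs = [torch.zeros_like(local_output) for _ in range(world_size)]
    gathered_targets = [torch.zeros_like(local_target) for _ in range(world_size)]
    dist.all_gather(gathered_outputs, local_output)
    dist.all_gather(gathered_targets, local_target)
    return gathered_outputs, gathered_targets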
Example #11
def train():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default=
        "/Users/tetracycline/repos/datascience/datascience/projects/counsel_chat_all_data_300-tokens.json",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--num_candidates",
                        type=int,
                        default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_history",
                        type=int,
                        default=2,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-5,
                        help="Learning rate")
    parser.add_argument("--lm_coef",
                        type=float,
                        default=1.0,
                        help="LM loss coefficient")
    parser.add_argument("--mc_coef",
                        type=float,
                        default=1.0,
                        help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=3,
                        help="Number of training epochs")
    parser.add_argument("--personality_permutations",
                        type=int,
                        default=1,
                        help="Number of permutations of personality sentences")
    parser.add_argument(
        "--eval_before_start",
        action='store_true',
        help="If true start with a first evaluation before training")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--fp16",
        type=str,
        default="",
        help=
        "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", args.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer  # can't use AutoTokenizer because the checkpoint could be a Path
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)

    model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        (lm_loss), (mc_loss), *_ = model(input_ids,
                                         token_type_ids=token_type_ids,
                                         mc_token_ids=mc_token_ids,
                                         mc_labels=mc_labels,
                                         lm_labels=lm_labels)
        loss = (lm_loss * args.lm_coef +
                mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we don't send labels to the model, it doesn't return losses
            lm_logits, mc_logits, *_ = model(
                input_ids,
                token_type_ids=token_type_ids,
                mc_token_ids=mc_token_ids,
            )
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted,
                    mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-100),
             output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy":
        Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args),
        "average_accuracy":
        MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint)
        tb_logger = TensorboardLogger(log_dir)

        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(
                                                       metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        getattr(model, 'module',
                model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
            os.path.join(log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
Example #12
    def __init__(self, args, train_samples, dev_samples, dev_ace_samples,
                 test_ace_samples, cpt_model, id2cpt, id2et, cpt_id2et_id):
        self.id2cpt = id2cpt
        self.id2et = id2et
        self.cpt_id2et_id = cpt_id2et_id

        # init distributed
        self.device = torch.device("cuda:{}".format(args.local_rank) if torch.
                                   cuda.is_available() else "cpu")

        # Setup logging: INFO on rank 0, WARNING on all other processes
        # (setLevel returns None, so its result must not be assigned)
        logger.setLevel(
            logging.INFO if dist.get_rank() == 0 else logging.WARNING)
        # synchronize start_time across processes
        sync_time = torch.tensor(time.time(),
                                 dtype=torch.double).to(self.device)
        dist.broadcast(sync_time, src=0)
        # self.start_time = datetime.fromtimestamp(sync_time.item()).strftime('%Y-%m-%d-%H-%M-%S-%f')

        self.n_gpu = len(args.device_id.split(','))
        self.rank = args.local_rank
        self.world_size = dist.get_world_size()

        # self.model = cpt_model
        self.logger = logger
        self.logger.info("model name: {}".format("CptNllODEE"))
        self.logger.info("add_mlm_object tag: {}".format(args.add_mlm_object))
        self.args = args

        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

        # init data loader
        # ************** train data ************************
        # train_sampler = DistributedSampler(train_samples)
        self.train_loader = train_samples
        # ************** dev data ************************
        self.dev_loader = dev_samples
        self.dev_ace_loader = dev_ace_samples
        # ************** test data ************************
        # self.test_loader = DataLoader(test_ace_samples, batch_size=args.per_gpu_eval_batch_size,
        #                               collate_fn=test_ace_samples.collate_fn)

        cpt_model.to(self.device)

        self.optim_mode = 'AdamW'

        self.n_steps = len(self.train_loader) * args.train_epoch_num
        self.logger.info("dataloader length: {}".format(len(
            self.train_loader)))
        self.print_step = self.args.train_record_steps
        self.update_step = self.args.gradient_accumulation_steps
        self.dev_step = self.args.dev_record_steps * self.args.gradient_accumulation_steps / self.args.gradient_average
        self.test_step = self.args.test_record_steps * self.args.gradient_accumulation_steps / self.args.gradient_average

        self.optimizer, self.scheduler = adam_optimizer(
            args,
            cpt_model,
            self.optim_mode,
            t_total=self.n_steps,
            warmup_steps=int(self.n_steps * args.warmup_ratio))

        # init fp16, must before DataParallel init
        if len(args.fp16):
            assert isinstance(
                args.fp16, str
            ), "Please set Apex AMP optimization level selected in ['O0', 'O1', 'O2', 'O3']"
            cpt_model, self.optimizer = amp.initialize(cpt_model,
                                                       self.optimizer,
                                                       opt_level=args.fp16)

        # init DataParallel
        self.ddp_model = DistributedDataParallel(cpt_model,
                                                 device_ids=[args.local_rank],
                                                 output_device=args.local_rank,
                                                 find_unused_parameters=True)

        self.model = self.ddp_model.module

        self.logger.info("Setup Distributed Trainer")
        self.logger.warning(
            "Process pid: {}, rank: {}, local rank: {}, device: {}, fp16: {}".
            format(os.getpid(), self.rank, args.local_rank, self.device,
                   args.fp16 if args.fp16 else False))
        self.logger.info("Num of processes: {}".format(self.world_size))
        self.logger.info("Use device: {}".format(self.device))
        self.logger.info(
            "Training with fp16: {}, optimization level: {}".format(
                len(args.fp16) > 0, args.fp16 if args.fp16 else None))
Example #13
def train_detector(model, dataset, cfg, distributed=False, validate=False, logger=None):
    if logger is None:
        logger = get_root_logger(cfg.log_level)

    # start training
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, dist=distributed
        )
        for ds in dataset
    ]

    total_steps = cfg.total_epochs * len(data_loaders[0])
    # print(f"total_steps: {total_steps}")

    if cfg.lr_config.type == "one_cycle":
        # build trainer
        optimizer = build_one_cycle_optimizer(model, cfg.optimizer)
        lr_scheduler = _create_learning_rate_scheduler(
            optimizer, cfg.lr_config, total_steps
        )
        cfg.lr_config = None
    else:
        optimizer = build_optimizer(model, cfg.optimizer)
        lr_scheduler = None

    # put model on gpus
    if distributed:
        # model = apex.parallel.convert_syncbn_model(model)
        model = DistributedDataParallel(
            model.cuda(cfg.local_rank),
            device_ids=[cfg.local_rank],
            output_device=cfg.local_rank,
            # broadcast_buffers=False,
            find_unused_parameters=True,
        )
    else:
        model = model.cuda()

    logger.info(f"model structure: {model}")

    trainer = Trainer(
        model, batch_processor, optimizer, lr_scheduler, cfg.work_dir, cfg.log_level
    )

    if distributed:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    trainer.register_training_hooks(
        cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config
    )

    if distributed:
        trainer.register_hook(DistSamplerSeedHook())

    # # register eval hooks
    # if validate:
    #     val_dataset_cfg = cfg.data.val
    #     eval_cfg = cfg.get('evaluation', {})
    #     dataset_type = DATASETS.get(val_dataset_cfg.type)
    #     trainer.register_hook(
    #         KittiEvalmAPHookV2(val_dataset_cfg, **eval_cfg))

    if cfg.resume_from:
        trainer.resume(cfg.resume_from)
    elif cfg.load_from:
        trainer.resume(cfg.load_from, load_only_weights=True)

    # import ipdb; ipdb.set_trace()

    trainer.run(data_loaders, cfg.workflow, cfg.total_epochs, local_rank=cfg.local_rank)
Example #14
def main(args):

    # model = modeling.VOSNet(model=args.model).cuda()
    # model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = modeling.VOSNet(model=args.model, sync_bn=True).cuda()
    model = DistributedDataParallel(model,
                                    device_ids=[args.local_rank],
                                    broadcast_buffers=False)

    criterion = CrossEntropy(temperature=args.temperature).cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=0.9,
                                nesterov=True,
                                weight_decay=args.wd)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                           args.epochs,
                                                           eta_min=4e-5)
    if args.dataset == 'davis':
        train_dataset = dataset.DavisTrain(
            os.path.join(args.data, 'DAVIS_train/JPEGImages/480p'),
            os.path.join(args.data, 'DAVIS_train/Annotations/480p'),
            frame_num=args.frame_num,
            color_jitter=args.cj)
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.bs // dist.get_world_size(),
            shuffle=False,
            sampler=train_sampler,
            pin_memory=True,
            num_workers=4 // dist.get_world_size(),
            drop_last=True)
        val_dataset = dataset.DavisTrain(
            os.path.join(args.data, 'DAVIS_val/JPEGImages/480p'),
            os.path.join(args.data, 'DAVIS_val/Annotations/480p'),
            frame_num=args.frame_num,
            color_jitter=args.cj)
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset)
        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=args.bs // dist.get_world_size(),
            shuffle=False,
            sampler=val_sampler,
            pin_memory=True,
            num_workers=4 // dist.get_world_size(),
            drop_last=True)
    else:
        raise NotImplementedError
    start_epoch = 0
    if args.resume:
        if os.path.isfile(args.resume):
            logger.info("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            logger.info("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            logger.info("=> no checkpoint found at '{}'".format(args.resume))

    for epoch in range(start_epoch, start_epoch + args.epochs):

        train_loss = train(train_loader, model, criterion, optimizer, epoch,
                           args)

        with torch.no_grad():
            val_loss = validate(val_loader, model, criterion, args)

        scheduler.step()

        if dist.get_rank() == 0:
            os.makedirs(args.save_model, exist_ok=True)
            checkpoint_name = 'checkpoint-epoch-{}.pth.tar'.format(epoch)
            save_path = os.path.join(args.save_model, checkpoint_name)
            torch.save(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict(),
                }, save_path)
def main():
    args = parse_args()
    utils.gpu_affinity.set_affinity(args.local_rank)

    # Initialize device and distributed backend
    torch.cuda.set_device(args.local_rank)
    l2_promote()
    device = torch.device('cuda' if args.cuda else 'cpu')
    utils.distributed.init_distributed(args.cuda)

    args.work_dir = utils.exp_utils.build_work_dir_name(
        args.work_dir,
        args.dataset,
        args.append_dataset,
        args.append_time,
    )

    with utils.distributed.sync_workers() as rank:
        if rank == 0:
            create_exp_dir(args.work_dir,
                           scripts_to_save=['train.py', 'mem_transformer.py'],
                           debug=args.debug)

    # Setup logging
    if args.log_all_ranks:
        log_file = f'train_log_rank_{utils.distributed.get_rank()}.log'
    else:
        log_file = args.txtlog_file
    dllog_file = args.dllog_file
    log_file = os.path.join(args.work_dir, log_file)
    dllog_file = os.path.join(args.work_dir, dllog_file)

    if args.debug:
        log_file = os.devnull
        dllog_file = os.devnull

    utils.exp_utils.setup_logging(
        log_all_ranks=args.log_all_ranks,
        filename=log_file,
    )
    utils.exp_utils.setup_dllogger(enabled=True, filename=dllog_file)

    if args.local_batch_size is not None:
        world_size = utils.distributed.get_world_size()
        args.batch_size = world_size * args.local_batch_size
        logging.info(f'--local_batch_size was set, adjusting global batch size'
                     f' to {args.batch_size} (local_batch_size * world_size)')

    logging.info(args)
    dllogger.log(step='PARAMETER', data=vars(args))

    logging.info(f'world size: {utils.distributed.get_world_size()}')

    if not args.no_env:
        log_env_info()

    register_ignoring_timeout_handler()

    # Set the random seed manually for reproducibility.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    ###########################################################################
    # Load data
    ###########################################################################
    corpus = get_lm_corpus(args.data, args.dataset, args.vocab)
    ntokens = len(corpus.vocab)
    vocab = corpus.vocab
    args.n_token = ntokens

    if args.mem_len == 0:
        eval_mem_len = 0
    else:
        eval_mem_len = args.mem_len + args.tgt_len - args.eval_tgt_len

    tr_iter = corpus.get_iterator('train',
                                  args.batch_size,
                                  args.tgt_len,
                                  device=device,
                                  ext_len=args.ext_len)
    va_iter = corpus.get_iterator('valid',
                                  args.eval_batch_size,
                                  args.eval_tgt_len,
                                  device=device,
                                  mem_len=eval_mem_len,
                                  ext_len=args.ext_len)
    te_iter = corpus.get_iterator('test',
                                  args.eval_batch_size,
                                  args.eval_tgt_len,
                                  device=device,
                                  mem_len=eval_mem_len,
                                  ext_len=args.ext_len)

    # adaptive softmax / embedding
    cutoffs, tie_projs = [], [False]
    if args.adaptive:
        assert args.dataset in ['wt103', 'lm1b']
        if args.dataset == 'wt103':
            cutoffs = [19997, 39997, 199997]
            tie_projs += [True] * len(cutoffs)
        elif args.dataset == 'lm1b':
            cutoffs = [59997, 99997, 639997]
            tie_projs += [False] * len(cutoffs)

    ###########################################################################
    # Build the model
    ###########################################################################
    model_config = {
        'n_token': ntokens,
        'n_layer': args.n_layer,
        'n_head': args.n_head,
        'd_model': args.d_model,
        'd_head': args.d_head,
        'd_inner': args.d_inner,
        'dropout': args.dropout,
        'dropatt': args.dropatt,
        'dtype': None,
        'tie_weight': args.tied,
        'd_embed': args.d_embed,
        'div_val': args.div_val,
        'tie_projs': tie_projs,
        'pre_lnorm': args.pre_lnorm,
        'tgt_len': args.tgt_len,
        'ext_len': args.ext_len,
        'mem_len': args.mem_len,
        'cutoffs': cutoffs,
        'same_length': args.same_length,
        'attn_type': args.attn_type,
        'clamp_len': args.clamp_len,
        'sample_softmax': args.sample_softmax,
    }

    model = MemTransformerLM(**model_config)

    model.apply(functools.partial(weights_init, args=args))
    # ensure embedding init is not overridden by out_layer in case of weight sharing
    model.word_emb.apply(functools.partial(weights_init, args=args))

    args.n_all_param = sum([p.nelement() for p in model.parameters()])
    args.n_nonemb_param = sum(
        [p.nelement() for p in model.layers.parameters()])

    # optimizer
    if args.optim.lower() == 'sgd':
        if args.sample_softmax > 0:
            dense_params, sparse_params = [], []
            for param in model.parameters():
                if param.size() == model.word_emb.weight.size():
                    sparse_params.append(param)
                else:
                    dense_params.append(param)
            optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2)
            optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom)
        else:
            optimizer = optim.SGD(model.parameters(),
                                  lr=args.lr,
                                  momentum=args.mom)
            optimizer_sparse = None
    elif args.optim.lower() == 'adam':
        if args.sample_softmax > 0:
            dense_params, sparse_params = [], []
            for param in model.parameters():
                if param.size() == model.word_emb.weight.size():
                    sparse_params.append(param)
                else:
                    dense_params.append(param)
            optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr)
            optimizer = optim.Adam(dense_params,
                                   lr=args.lr,
                                   weight_decay=args.weight_decay)
        else:
            optimizer = optim.Adam(model.parameters(),
                                   lr=args.lr,
                                   weight_decay=args.weight_decay)
            optimizer_sparse = None
    elif args.optim.lower() == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr=args.lr)
        optimizer_sparse = None
    elif args.optim.lower() == 'lamb':
        optimizer = lamb.Lamb(model.parameters(),
                              lr=args.lr,
                              weight_decay=args.weight_decay)
        optimizer_sparse = None
    elif args.optim.lower() == 'jitlamb':
        optimizer = lamb.JITLamb(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
        optimizer_sparse = None

    model = model.to(device)

    scaler = None
    if args.fp16:
        if args.amp == 'pytorch':
            scaler = torch.cuda.amp.GradScaler()
        elif args.amp == 'apex':
            model, optimizer = amp.initialize(
                model,
                optimizer,
                opt_level=args.apex_amp_opt_level,
            )

    if args.multi_gpu == 'ddp' and torch.distributed.is_initialized():
        para_model = DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            broadcast_buffers=False,
            find_unused_parameters=True,
        )
    elif args.multi_gpu == 'dp':
        if args.gpu0_bsz >= 0:
            para_model = BalancedDataParallel(args.gpu0_bsz //
                                              args.batch_chunk,
                                              model,
                                              dim=1).to(device)
        else:
            para_model = nn.DataParallel(model, dim=1).to(device)
    else:
        para_model = model

    # scheduler
    if args.scheduler == 'cosine':
        if args.max_step_scheduler:
            max_step = args.max_step_scheduler
        else:
            max_step = args.max_step

        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         max_step -
                                                         args.warmup_step,
                                                         eta_min=args.eta_min)
        if args.sample_softmax > 0 and optimizer_sparse is not None:
            scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR(
                optimizer_sparse,
                max_step - args.warmup_step,
                eta_min=args.eta_min)
        else:
            scheduler_sparse = None
    elif args.scheduler == 'inv_sqrt':
        # originally used for Transformer (in Attention is all you need)
        def lr_lambda(step):
            # return a multiplier instead of a learning rate
            if step == 0 and args.warmup_step == 0:
                return 1.
            else:
                return 1. / (step ** 0.5) if step > args.warmup_step \
                    else step / (args.warmup_step ** 1.5)

        scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
        if args.sample_softmax > 0 and optimizer_sparse is not None:
            scheduler_sparse = optim.lr_scheduler.LambdaLR(optimizer_sparse,
                                                           lr_lambda=lr_lambda)
        else:
            scheduler_sparse = None
    elif args.scheduler == 'dev_perf':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            factor=args.decay_rate,
            patience=args.patience,
            min_lr=args.lr_min,
        )
        if args.sample_softmax > 0 and optimizer_sparse is not None:
            scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer_sparse,
                factor=args.decay_rate,
                patience=args.patience,
                min_lr=args.lr_min,
            )
        else:
            scheduler_sparse = None
    elif args.scheduler == 'constant':
        pass

    logging.info('=' * 100)
    for k, v in args.__dict__.items():
        logging.info('    - {} : {}'.format(k, v))
    logging.info('=' * 100)
    logging.info('#params = {}'.format(args.n_all_param))
    logging.info('#non emb params = {}'.format(args.n_nonemb_param))

    train_step = 0
    start_epoch = 1
    last_batch = 0
    last_iter = 0
    best_val_loss = None

    if args.restart:
        try:
            checkpoint = load_checkpoint(args.restart)
            model.load_state_dict(checkpoint['model_state'])
            optimizer.load_state_dict(checkpoint['optimizer_state'])
            scheduler.load_state_dict(checkpoint['scheduler_state'])
            if args.fp16:
                if args.amp == 'pytorch':
                    scaler.load_state_dict(checkpoint['amp_state'])
                elif args.amp == 'apex':
                    amp.load_state_dict(checkpoint['amp_state'])
            train_step = checkpoint['train_step']
            start_epoch = checkpoint['epoch']
            last_batch = checkpoint['batch']
            last_iter = checkpoint['last_iter']
            best_val_loss = checkpoint['best_val_loss']

            if train_step >= args.max_step:
                logging.info(
                    f'Loaded checkpoint after {train_step} steps, but '
                    f'this run was scheduled for a total of '
                    f'{args.max_step} steps, exiting')
                sys.exit(1)

            model.apply(functools.partial(update_dropout, args=args))
            model.apply(functools.partial(update_dropatt, args=args))
        except FileNotFoundError:
            logging.info(f'Could not load checkpoint from {args.restart}, '
                         f'starting training from random init')

    meters = {}
    warmup = args.mem_len // args.tgt_len + 2
    meters['train_throughput'] = AverageMeter(warmup=warmup)
    ###########################################################################
    # Train
    ###########################################################################
    # Loop over epochs.
    # At any point you can hit Ctrl + C to break out of training early.
    start_time = time.time()
    with TimeoutHandler() as timeout_handler:
        try:
            for epoch in itertools.count(start=start_epoch):
                if args.roll:
                    tr_iter.roll(seed=args.seed + epoch)
                train_step, best_val_loss = train(
                    tr_iter, va_iter, model, para_model, model_config,
                    optimizer, optimizer_sparse, scheduler, scheduler_sparse,
                    scaler, vocab, epoch, last_batch, last_iter, train_step,
                    best_val_loss, meters, timeout_handler, device, args)

                last_batch = 0
                last_iter = 0

                if train_step == args.max_step:
                    logging.info('-' * 100)
                    logging.info('End of training')
                    break
        except KeyboardInterrupt:
            logging.info('-' * 100)
            logging.info('Exiting from training early')
    elapsed = time.time() - start_time

    ###########################################################################
    # Test
    ###########################################################################
    summary = {}
    test_path = os.path.join(args.work_dir, 'checkpoint_best.pt')
    if not args.debug and not args.no_eval and os.path.exists(test_path):
        # Load the best saved model.
        checkpoint = load_checkpoint(test_path)
        model.load_state_dict(checkpoint['model_state'])

        # Run on test data.
        test_start_time = time.time()
        test_loss = evaluate(te_iter, model, args)
        test_loss = utils.distributed.all_reduce_item(test_loss, 'mean')
        test_elapsed = time.time() - test_start_time

        logging.info('=' * 100)
        if args.dataset in ['enwik8', 'text8']:
            logging.info(
                '| End of training | test time: {:5.2f}s | test loss {:5.2f} | test bpc {:9.5f}'
                .format(test_elapsed, test_loss, test_loss / math.log(2)))
        else:
            logging.info(
                '| End of training | test time: {:5.2f}s | test loss {:5.2f} | test ppl {:9.3f}'
                .format(test_elapsed, test_loss, math.exp(test_loss)))
        logging.info('=' * 100)

        summary.update({
            'test_elapsed': test_elapsed,
            'test_loss': test_loss,
        })

        if args.dataset in ['enwik8', 'text8']:
            summary['test_bits_per_character'] = test_loss / math.log(2)
        else:
            summary['test_perplexity'] = math.exp(test_loss)

    logging.info(f'Training time: {(elapsed / 60):.2f} minutes')
    logging.info(
        f'Training throughput: {meters["train_throughput"].avg:.2f} tok/s')

    if best_val_loss:
        val_perplexity = math.exp(best_val_loss)
    else:
        val_perplexity = None

    summary.update({
        'train_throughput': meters['train_throughput'].avg,
        'train_elapsed': elapsed / 60,
        'valid_loss': best_val_loss,
        'valid_perplexity': val_perplexity,
    })
    dllogger.log(step=tuple(), data=summary)

    passed = benchmark(target_perplexity=args.target_perplexity,
                       test_perplexity=val_perplexity,
                       target_throughput=args.target_throughput,
                       test_throughput=meters['train_throughput'].avg)
    if not passed:
        sys.exit(1)
Example #16
def train_ai2thor(model, args, rank=0, b=None):

    seed = args.seed + 10000 * rank
    torch.manual_seed(seed)
    np.random.seed(seed)

    # torch.cuda.set_device(rank)
    device = torch.device(f'cuda:{rank}')
    os.environ['DISPLAY'] = f':{rank}'

    model = model.to(device)
    model.share_memory()

    # Experience buffer
    storage = PPOBuffer(model.obs_shape,
                        args.steps,
                        args.num_workers,
                        args.state_size,
                        args.gamma,
                        device=device)
    storage.share_memory()

    #torch.multiprocessing.set_start_method('spawn')
    # start multiple processes
    ready_to_works = [Event() for _ in range(args.num_workers)]
    exit_flag = Value('i', 0)
    queue = SimpleQueue()

    processes = []
    task_config_file = "config_files/multiMugTaskTrain.json"
    # start workers
    for worker_id in range(args.num_workers):
        p = Process(target=worker,
                    args=(worker_id, model, storage, ready_to_works[worker_id],
                          queue, exit_flag, task_config_file))
        p.start()
        processes.append(p)

    # start trainer
    train_params = {
        "epochs": args.epochs,
        "steps": args.steps,
        "world_size": args.world_size,
        "num_workers": args.num_workers
    }
    ppo_params = {
        "clip_param": args.clip_param,
        "train_iters": args.train_iters,
        "mini_batch_size": args.mini_batch_size,
        "value_loss_coef": args.value_loss_coef,
        "entropy_coef": args.entropy_coef,
        "rnn_steps": args.rnn_steps,
        "lr": args.lr,
        "max_kl": args.max_kl
    }

    distributed = False
    if args.world_size > 1:
        distributed = True
        # Initialize Process Group, distributed backend type
        dist_backend = 'nccl'
        # Url used to setup distributed training
        dist_url = "tcp://127.0.0.1:23456"
        print("Initialize Process Group... pid:", os.getpid())
        dist.init_process_group(backend=dist_backend,
                                init_method=dist_url,
                                rank=rank,
                                world_size=args.world_size)
        # Make model DistributedDataParallel
        model = DistributedDataParallel(model,
                                        device_ids=[rank],
                                        output_device=rank)

    learner(model, storage, train_params, ppo_params, ready_to_works, queue,
            exit_flag, rank, distributed, b)

    for p in processes:
        print("process ", p.pid, " joined")
        p.join()
Example #17
def train(procid, args):
    # load and preprocess dataset
    assert procid >= 0
    os.environ['MASTER_ADDR'] = args.MASTER_ADDR
    os.environ['MASTER_PORT'] = args.MASTER_PORT

    if args.dataset == 'cora':
        data = CoraGraphDataset()
    elif args.dataset == 'citeseer':
        data = CiteseerGraphDataset()
    elif args.dataset == 'pubmed':
        data = PubmedGraphDataset()
    elif args.dataset == 'reddit':
        data = RedditDataset()
    else:
        raise ValueError('Unknown dataset: {}'.format(args.dataset))

    g = data[0]

    #data = args.data
    #g = args.data[0]
    #g.create_formats_()
    print("New Proc! ", procid)
    #return g
    device = torch.device(args.devices_name_list[procid])
    torch.cuda.set_device(device)
    dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
        master_ip=args.MASTER_ADDR, master_port=args.MASTER_PORT)
    world_size = args.ngpus
    torch.distributed.init_process_group(backend="nccl",
                                         init_method=dist_init_method,
                                         world_size=world_size,
                                         rank=procid)
    #torch.cuda.set_device(device)
    #st = pg.Storage(g,[device],[args.PV_list[procid]],[args.TV_list[procid]])

    # use pagraph
    st = pg.Storage(g=g,
                    data=g.ndata,
                    cache_rate=args.cache_rate,
                    nodes=args.PV_list[procid],
                    gpu=args.devices_name_list[procid],
                    cpu='cpu')
    features = g.ndata.pop('feat')
    labels = g.ndata.pop('label')
    train_mask = g.ndata.pop('train_mask')
    val_mask = g.ndata.pop('val_mask')
    test_mask = g.ndata.pop('test_mask')
    in_feats = features.shape[1]
    n_classes = data.num_labels
    n_edges = data.graph.number_of_edges()

    print("""----Data statistics------
      #Edges %d
      #Classes %d
      #Train samples %d
      #Val samples %d
      #Test samples %d""" %
          (n_edges, n_classes, train_mask.int().sum().item(),
           val_mask.int().sum().item(), test_mask.int().sum().item()))

    del features  #release memory

    # add self loop
    '''
    if args.self_loop:
        g = dgl.remove_self_loop(g)
        g = dgl.add_self_loop(g)

    '''
    # create GCN model
    model = MyGCN(
        in_feats,
        args.n_hidden,
        n_classes,
        args.n_layers,
        F.relu,
        args.dropout,
    )
    model = model.to(device)
    model = DistributedDataParallel(
        model, device_ids=[device],
        output_device=device)  #device_ids = [device], output_device = device

    # set sampler
    fanouts = []
    # example: fanouts = [2, 2, 2, 2] or [3, 3, 3] ...
    for i in range(args.n_layers):
        fanouts.append(args.neighbor_number)
    sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
    train_nids = args.TV_list[procid]
    dataloader = dgl.dataloading.NodeDataLoader(g,
                                                train_nids,
                                                sampler,
                                                batch_size=args.batch_size,
                                                shuffle=False,
                                                drop_last=True,
                                                num_workers=0)

    # set loss function
    loss_fcn = torch.nn.CrossEntropyLoss()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # initialize graph
    dur = []

    # Sync
    #if(args.ngpus > 1):
    #    torch.distributed.barrier()

    # Start training
    model.train()

    for epoch in range(args.n_epochs):
        # time record
        #if epoch >= 3:
        tS = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        t0 = time.time()

        # forward

        #Loss=torch.tensor([0.0],device=device,required_grad=False)

        for count, (in_nodes, out_nodes, blocks) in enumerate(dataloader):

            t1 = time.time()
            blocks = [b.to(device) for b in blocks]

            t2 = time.time()
            feat_in = st.Query(fname='feat', nodes=in_nodes)
            labels_out = st.Query(fname='label', nodes=out_nodes)

            t3 = time.time()
            # forward
            feat_out = model(blocks, feat_in)
            t4 = time.time()

            loss = loss_fcn(feat_out, labels_out)
            #Loss=Loss+loss.detach()
            t5 = time.time()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            t6 = time.time()

            tS[1] = tS[1] + t2 - t1
            tS[2] = tS[2] + t3 - t2
            tS[3] = tS[3] + t4 - t3
            tS[4] = tS[4] + t5 - t4
            tS[5] = tS[5] + t6 - t5

        tE = time.time()
        #logits = model(features)
        #loss = loss_fcn(logits[train_mask], labels[train_mask])
        #optimizer.zero_grad()
        #loss.backward()
        #optimizer.step()

        #if epoch >= 3:
        dur.append(time.time() - t0)

        acc = 0.0  #evaluate(model, features, labels, val_mask)
        if (procid >= 0):
            print(
                "Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
                "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur), loss.item(),
                                              acc,
                                              n_edges / np.mean(dur) / 1000))
            #for i in range(1,6):
            print(tS[1:], '\nTotal:', tE - t0, " s ")

    # Finish training

    # Sync
    #if(args.ngpus > 1000):
    #    torch.distributed.barrier()
    model.eval()
    time.sleep(3)

    print("____________________________")
def DistributedFairseqModel(args, model, process_group, device):
    """
    Wrap a *model* to support distributed data parallel training.

    This is similar to the built-in DistributedDataParallel, but allows
    additional configuration of the DistributedDataParallel class to
    use, and also provides easier access to the wrapped model by
    forwarding requests for missing attributes to the wrapped model.

    Args:
        args (argparse.Namespace): fairseq args
        model (BaseFairseqModel): model to wrap
        process_group: the c10d process group to be used for distributed data
            parallel all-reduction.
        device: device to move model to
    """
    assert isinstance(model, nn.Module)
    if args.tpu:
        wrapped_model = TPUDistributedDataParallel(
            module=model.to(device),
            process_group=process_group,
        )
        # forward missing getattr and state_dict/load_state_dict to orig model
        wrapped_model = ModuleProxyWrapper(wrapped_model)
    elif args.ddp_backend in {"c10d", "pytorch_ddp"}:
        wrapped_model = DistributedDataParallel(
            module=model.to(device),
            device_ids=[args.device_id],
            output_device=args.device_id,
            broadcast_buffers=args.broadcast_buffers,
            bucket_cap_mb=args.bucket_cap_mb,
            process_group=process_group,
            find_unused_parameters=args.find_unused_parameters,
        )
        # forward missing getattr and state_dict/load_state_dict to orig model
        wrapped_model = ModuleProxyWrapper(wrapped_model)
    elif args.ddp_backend in {"no_c10d", "legacy_ddp"}:
        wrapped_model = LegacyDistributedDataParallel(
            module=model.to(device),
            buffer_size=2**28,
            process_group=process_group,
        )
        # forward missing getattr and state_dict/load_state_dict to orig model
        wrapped_model = ModuleProxyWrapper(wrapped_model)
    elif args.ddp_backend == "slow_mo":
        if _GOSSIP_DISABLED:
            raise ImportError(
                "Cannot find gossip library. Please install from: "
                "github.com/facebookresearch/stochastic_gradient_push")

        # The values of slowmo_momentum below were obtained by tuning on the
        # En-De 16 dataset by training the transformer_wmt_en_de_large model
        if args.slowmo_momentum is None:
            if args.distributed_world_size <= 16:
                args.slowmo_momentum = 0.0
            elif args.distributed_world_size <= 32:
                args.slowmo_momentum = 0.2
            elif args.distributed_world_size <= 64:
                args.slowmo_momentum = 0.5
            else:
                args.slowmo_momentum = 0.6

        wrapped_model = gossip.GossipDataParallel(
            module=model.to(device),
            device_ids=[args.device_id],
            output_device=args.device_id,
            broadcast_buffers=args.broadcast_buffers,
            nprocs_per_node=args.nprocs_per_node,
            slowmo_momentum=args.slowmo_momentum,
            localsgd=(args.slowmo_algorithm == "LocalSGD"),
            localsgd_frequency=args.localsgd_frequency,
        )
        # forward missing getattr and state_dict/load_state_dict to orig model
        wrapped_model = ModuleProxyWrapper(wrapped_model)
    elif args.ddp_backend == "fully_sharded":
        try:
            from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP
        except ImportError:
            raise ImportError(
                "Cannot find FullyShardedDataParallel. "
                "Please install fairscale with: pip install fairscale")
        assert isinstance(model,
                          FSDP), "expected model to already be wrapped in FSDP"
        wrapped_model = model
        if args.memory_efficient_fp16:
            wrapped_model = wrapped_model.half()
        if not args.cpu_offload:
            wrapped_model = wrapped_model.to(device=device)
    else:
        raise ValueError("Unknown --ddp-backend: " + args.ddp_backend)

    # kill hung distributed jobs after a timeout
    if getattr(args, "heartbeat_timeout", -1) > 0:
        wrapped_model = DistributedTimeoutWrapper(wrapped_model,
                                                  timeout=getattr(
                                                      args,
                                                      "heartbeat_timeout", -1))

    return wrapped_model
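
A minimal usage sketch (not part of the original code), assuming an argparse-style namespace that carries the fields read above (tpu, ddp_backend, device_id, broadcast_buffers, bucket_cap_mb, find_unused_parameters, heartbeat_timeout); get_args and build_model are hypothetical placeholders.

import torch
import torch.distributed as dist

args = get_args()                  # hypothetical: namespace with the fields above
model = build_model(args)          # hypothetical: any nn.Module
device = torch.device("cuda", args.device_id)
process_group = dist.new_group()   # or the default c10d process group

wrapped = DistributedFairseqModel(args, model, process_group, device)

# ModuleProxyWrapper forwards missing attributes to the wrapped module, so
# state_dict()/load_state_dict() behave as they do on the original model.
state = wrapped.state_dict()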
Example #19
def worker(proc_id, gpu_ranks, args, model):
    """
    Args:
        proc_id: The id of GPU for single GPU mode;
                 The id of process (and GPU) for multiprocessing distributed mode.
        gpu_ranks: List of ranks of each process.
    """
    set_seed(args.seed)

    if args.dist_train:
        rank = gpu_ranks[proc_id]
        gpu_id = proc_id
    elif args.single_gpu:
        rank = None
        gpu_id = proc_id
    else:
        rank = None
        gpu_id = None

    if args.dist_train:
        train_loader = globals()[args.target.capitalize() + "DataLoader"](
            args, args.dataset_path, args.batch_size, rank, args.world_size,
            True)
    else:
        train_loader = globals()[args.target.capitalize() + "DataLoader"](
            args, args.dataset_path, args.batch_size, 0, 1, True)

    if gpu_id is not None:
        torch.cuda.set_device(gpu_id)
        model.cuda(gpu_id)

    # Build optimizer.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      correct_bias=False)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=train_steps * args.warmup,
                                     t_total=train_steps)

    if args.dist_train:
        # Initialize multiprocessing distributed training environment.
        dist.init_process_group(backend=args.backend,
                                init_method=args.master_ip,
                                world_size=args.world_size,
                                rank=rank)
        model = DistributedDataParallel(model, device_ids=[gpu_id])
        print("Worker %d is training ... " % rank)
    else:
        print("Worker is training ...")

    globals().get("train_" + args.target)(args, gpu_id, rank, train_loader,
                                          model, optimizer, scheduler)
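The first argument of worker is supplied by the multiprocessing launcher. A minimal sketch of how it might be spawned, reusing the same args namespace and model defined elsewhere (the launcher itself is not part of this snippet):

import torch.multiprocessing as mp

def launch(args, model):
    if args.dist_train:
        # One process per entry of gpu_ranks on this node; mp.spawn passes the
        # process index as the first positional argument (proc_id) to worker().
        mp.spawn(worker,
                 args=(args.gpu_ranks, args, model),
                 nprocs=len(args.gpu_ranks),
                 join=True)
    else:
        # Single-GPU (or CPU) mode: call the worker directly in this process.
        worker(0, None, args, model)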
Example #20
    def register(self, *, models, optimizers, criterion=None, schedulers=None):
        """Registers parameters with Ray SGD and sets up training components.

        By calling this method to register your models, optimizers,
        criterion, and schedulers, Ray SGD will automatically handle
        necessary setup such as GPU/devices, Distributed Data Parallel, and
        Fp16. The registered components are returned and should be set as
        instance attributes to access during training/validation.

        If more than one model, optimizer, or scheduler is passed in,
        you should implement your own custom training loop.

        .. code-block:: python

            class MyTrainingOperator(TrainingOperator):
                def setup(self, config):
                    model = ...
                    optimizer = ...
                    train_loader = ...
                    val_loader = ...
                    loss = ...

                    self.model, self.optimizer, self.criterion = self.register(
                    models=model, optimizers=optimizer, criterion=loss)

                    # At this point DDP, Cuda, and Fp16
                    # are set up for all our components. We then use
                    # self.model, self.optimizer, etc. in our training loop.

                    self.register_data(train_loader=train_loader,
                    validation_loader=val_loader)


        Args:
            models (torch.nn.Module or Iterable[nn.Module]): Pytorch model or
                multiple Pytorch models to use for training. If
                `use_gpu=True` is passed into ``TorchTrainer``, and Cuda is
                available, models will automatically be placed on GPU.
                If ``wrap_ddp=True`` is passed into ``TorchTrainer``,
                models will be wrapped in DDP. If wrap_ddp is False,
                you should handle DDP for your models in setup.
            optimizers (torch.optim.Optimizer or Iterable[
                torch.optim.Optimizer]): Pytorch optimizer or multiple Pytorch
                optimizers to use for training.
            criterion (Callable, optional): Function to return loss
                metric given features and target. If not provided,
                must implement a custom training loop.
            schedulers (torch.optim.lr_scheduler or Iterable[
                torch.optim.lr_scheduler], optional): A learning rate
                scheduler or multiple learning rate schedulers.

        Returns:
            Tuple of model, optimizer, criterion if not None, and scheduler
            if not None.

        """
        return_vals = []
        logger.debug("Registering models.")
        self._original_models = models
        if not isinstance(self._original_models, Iterable):
            self._original_models = [self._original_models]
        assert all(
            isinstance(model, nn.Module) for model in self._original_models), (
                f"All models must be PyTorch models: {self._original_models}.")
        if self.use_gpu and torch.cuda.is_available():
            self._original_models = [
                model.cuda() for model in self._original_models
            ]

        logger.debug("Registering optimizers.")
        self._optimizers = optimizers
        if not isinstance(self._optimizers, Iterable):
            self._optimizers = [self._optimizers]

        if schedulers:
            logger.debug("Registering scheduler.")
            self._schedulers = schedulers
            if not isinstance(self._schedulers, Iterable):
                self._schedulers = [self._schedulers]
        else:
            self._schedulers = None

        if criterion:
            logger.debug("Registering loss.")
            self._criterion = criterion
            if self.use_gpu and torch.cuda.is_available():
                if hasattr(self._criterion, "cuda"):
                    self._criterion = self._criterion.cuda()
        else:
            self._criterion = None

        if self.use_fp16 and amp:
            logger.debug("Setting up Apex.")
            # Apex must patch the models and optimizers before they are
            # wrapped in DistributedDataParallel below.
            self._original_models, self._optimizers = amp.initialize(
                self._original_models, self._optimizers, **self._apex_args)
            self._amp = amp

        if self._wrap_ddp:
            logger.debug("Setting up DDP for models.")
            self._models = [
                DistributedDataParallel(model, device_ids=self.device_ids)
                for model in self._original_models
            ]
        else:
            self._models = self._original_models

        if len(self._models) == 1:
            return_vals.append(self._models[0])
        else:
            return_vals.append(self._models)

        if len(self._optimizers) == 1:
            return_vals.append(self._optimizers[0])
        else:
            return_vals.append(self._optimizers)

        if self._criterion is not None:
            return_vals.append(self._criterion)

        if self._schedulers is not None:
            if self.scheduler_step_freq is None:
                raise ValueError("scheduler_step_freq passed into "
                                 "TorchTrainer cannot be None if you "
                                 "are registering schedulers. Set this to "
                                 "'manual' if you will be manually stepping "
                                 "the schedulers.")
            if len(self._schedulers) == 1:
                return_vals.append(self._schedulers[0])
            else:
                return_vals.append(self._schedulers)

        return tuple(return_vals)
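A minimal sketch of a custom operator built on the register/register_data API documented above, assuming Ray 1.x's ray.util.sgd; the toy model, data, and config keys are placeholders:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from ray.util.sgd.torch import TrainingOperator


class SketchOperator(TrainingOperator):
    def setup(self, config):
        model = nn.Linear(10, 1)
        optimizer = torch.optim.SGD(model.parameters(), lr=config.get("lr", 1e-2))
        criterion = nn.MSELoss()
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)

        # register() returns only the components that were passed in, in the
        # order model(s), optimizer(s), criterion, scheduler(s).
        self.model, self.optimizer, self.criterion, self.scheduler = self.register(
            models=model, optimizers=optimizer,
            criterion=criterion, schedulers=scheduler)

        data = TensorDataset(torch.randn(64, 10), torch.randn(64, 1))
        self.register_data(train_loader=DataLoader(data, batch_size=8),
                           validation_loader=DataLoader(data, batch_size=8))

Registering a scheduler this way also requires passing scheduler_step_freq to TorchTrainer, as the check at the end of register enforces.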
Example #21
    def make_model_env(self, gpu, ngpus_per_node):
        if self.args.distributed:
            self.args.gpu = self.args.devices[gpu]
        else:
            self.args.gpu = 0

        if self.args.use_cuda and self.args.distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            self.args.rank = self.args.rank * ngpus_per_node + gpu
            dist.init_process_group(backend=self.args.dist_backend,
                                    init_method=self.args.dist_url,
                                    world_size=self.args.world_size,
                                    rank=self.args.rank)

        self.model = DomainQA(self.args.bert_model, self.args.num_classes,
                              self.args.hidden_size, self.args.num_layers,
                              self.args.dropout, self.args.dis_lambda,
                              self.args.concat, self.args.anneal)

        if self.args.load_model is not None:
            print("Loading model from ", self.args.load_model)
            self.model.load_state_dict(
                torch.load(self.args.load_model,
                           map_location=lambda storage, loc: storage))

        if self.args.freeze_bert:
            for param in self.model.bert.parameters():
                param.requires_grad = False

        max_len = max([len(f) for f in self.features_lst])
        num_train_optimization_steps = math.ceil(
            max_len / self.args.batch_size) * self.args.epochs * len(
                self.features_lst)

        qa_params = list(self.model.bert.named_parameters()) + list(
            self.model.qa_outputs.named_parameters())
        dis_params = list(self.model.discriminator.named_parameters())
        self.qa_optimizer = get_opt(qa_params, num_train_optimization_steps,
                                    self.args)
        self.dis_optimizer = get_opt(dis_params, num_train_optimization_steps,
                                     self.args)

        if self.args.use_cuda:
            if self.args.distributed:
                torch.cuda.set_device(self.args.gpu)
                self.model.cuda(self.args.gpu)
                self.args.batch_size = int(self.args.batch_size /
                                           ngpus_per_node)
                self.args.workers = int(
                    (self.args.workers + ngpus_per_node - 1) / ngpus_per_node)
                self.model = DistributedDataParallel(
                    self.model,
                    device_ids=[self.args.gpu],
                    find_unused_parameters=True)
            else:
                self.model.cuda()
                self.model = DataParallel(self.model,
                                          device_ids=self.args.devices)

        cudnn.benchmark = True
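The rank computation above turns a per-node rank and a local GPU index into a unique global rank. A tiny standalone illustration with made-up node and GPU counts:

# Two nodes with four GPUs each: node_rank is the rank each node is launched
# with, gpu is the local index handed to the spawned process.
ngpus_per_node = 4
for node_rank in range(2):
    for gpu in range(ngpus_per_node):
        global_rank = node_rank * ngpus_per_node + gpu
        print(f"node {node_rank}, local gpu {gpu} -> global rank {global_rank}")
# node 0 covers ranks 0-3, node 1 covers ranks 4-7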
Example #22
    def __init__(self, opt, edge_enhance=True):
        super(SRGANModel, self).__init__(opt)
        self.edge_enhance = edge_enhance
        if opt['dist']:
            self.rank = torch.distributed.get_rank()
        else:
            self.rank = -1  # non dist training
        train_opt = opt['train']

        self.netG = networks.define_G(opt).to(self.device)
        if opt['dist']:
            self.netG = DistributedDataParallel(
                self.netG, device_ids=[torch.cuda.current_device()])
        else:
            self.netG = DataParallel(self.netG)
        if self.is_train:
            self.netD = networks.define_D(opt).to(self.device)
            if opt['dist']:
                self.netD = DistributedDataParallel(
                    self.netD, device_ids=[torch.cuda.current_device()])
            else:
                self.netD = DataParallel(self.netD)

            self.netG.train()
            self.netD.train()

        if self.is_train:

            if train_opt['pixel_weight'] > 0:
                l_pix_type = train_opt['pixel_criterion']
                if l_pix_type == 'l1':
                    self.cri_pix = nn.L1Loss().to(self.device)
                elif l_pix_type == 'l2':
                    self.cri_pix = nn.MSELoss().to(self.device)
                else:
                    raise NotImplementedError(
                        'Loss type [{:s}] not recognized.'.format(l_pix_type))
                self.l_pix_w = train_opt['pixel_weight']
            else:
                logger.info('Remove pixel loss.')
                self.cri_pix = None

            if train_opt['feature_weight'] > 0:
                l_fea_type = train_opt['feature_criterion']
                if l_fea_type == 'l1':
                    self.cri_fea = nn.L1Loss().to(self.device)
                elif l_fea_type == 'l2':
                    self.cri_fea = nn.MSELoss().to(self.device)
                else:
                    raise NotImplementedError(
                        'Loss type [{:s}] not recognized.'.format(l_fea_type))
                self.l_fea_w = train_opt['feature_weight']
            else:
                logger.info('Remove feature loss.')
                self.cri_fea = None
            if self.cri_fea:  # load VGG perceptual loss
                self.netF = networks.define_F(opt,
                                              use_bn=False).to(self.device)
                if opt['dist']:
                    self.netF = DistributedDataParallel(
                        self.netF, device_ids=[torch.cuda.current_device()])
                else:
                    self.netF = DataParallel(self.netF)

            self.cri_gan = GANLoss(train_opt['gan_type'], 1.0,
                                   0.0).to(self.device)
            self.l_gan_w = train_opt['gan_weight']
            self.D_update_ratio = train_opt['D_update_ratio'] if train_opt[
                'D_update_ratio'] else 1
            self.D_init_iters = train_opt['D_init_iters'] if train_opt[
                'D_init_iters'] else 0

            self.WGAN_QC_regul = QC_GradientPenaltyLoss()

            if self.edge_enhance:
                self.l_edge_w = train_opt['edge_weight']
                if train_opt['edge_type'] == 'sobel':
                    self.cril_edge = sobel
                elif train_opt['edge_type'] == 'canny':
                    self.cril_edge = canny
                elif train_opt['edge_type'] == 'hednet':
                    self.netEdge = HedNet().cuda()
                    for p in self.netEdge.parameters():
                        p.requires_grad = False
                    self.cril_edge = self.netEdge
                else:
                    raise NotImplementedError(
                        'Loss type [{:s}] not recognized.'.format(
                            train_opt['edge_type']))
            else:
                logger.info('Remove edge loss.')
                self.cril_edge = None

            wd_G = train_opt['weight_decay_G'] if train_opt[
                'weight_decay_G'] else 0
            optim_params = []
            for k, v in self.netG.named_parameters(
            ):  # can optimize for a part of the model
                if v.requires_grad:
                    optim_params.append(v)
                else:
                    if self.rank <= 0:
                        logger.warning(
                            'Params [{:s}] will not optimize.'.format(k))
            self.optimizer_G = torch.optim.Adam(optim_params,
                                                lr=train_opt['lr_G'],
                                                weight_decay=wd_G,
                                                betas=(train_opt['beta1_G'],
                                                       train_opt['beta2_G']))
            self.optimizers.append(self.optimizer_G)
            wd_D = train_opt['weight_decay_D'] if train_opt[
                'weight_decay_D'] else 0
            self.optimizer_D = torch.optim.Adam(self.netD.parameters(),
                                                lr=train_opt['lr_D'],
                                                weight_decay=wd_D,
                                                betas=(train_opt['beta1_D'],
                                                       train_opt['beta2_D']))
            self.optimizers.append(self.optimizer_D)

            if train_opt['lr_scheme'] == 'MultiStepLR':
                for optimizer in self.optimizers:
                    self.schedulers.append(
                        lr_scheduler.MultiStepLR_Restart(
                            optimizer,
                            train_opt['lr_steps'],
                            restarts=train_opt['restarts'],
                            weights=train_opt['restart_weights'],
                            gamma=train_opt['lr_gamma'],
                            clear_state=train_opt['clear_state']))
            elif train_opt['lr_scheme'] == 'CosineAnnealingLR_Restart':
                for optimizer in self.optimizers:
                    self.schedulers.append(
                        lr_scheduler.CosineAnnealingLR_Restart(
                            optimizer,
                            train_opt['T_period'],
                            eta_min=train_opt['eta_min'],
                            restarts=train_opt['restarts'],
                            weights=train_opt['restart_weights']))
            else:
                raise NotImplementedError(
                    'Unsupported lr_scheme: only MultiStepLR and '
                    'CosineAnnealingLR_Restart are implemented.')

            self.log_dict = OrderedDict()

        self.load()
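D_update_ratio and D_init_iters above control how often the generator is updated relative to the discriminator. A hedged sketch of the usual gating check (illustrative; the model's actual training step is not shown in this snippet):

def should_update_generator(step, D_update_ratio, D_init_iters):
    # Skip generator updates for the first D_init_iters iterations, then
    # update it only every D_update_ratio-th iteration so the discriminator
    # is optimized more frequently than the generator.
    return step % D_update_ratio == 0 and step > D_init_iters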
Example #23
    def __init__(self, opt):
        super(SRGANModel, self).__init__(opt)
        if opt['dist']:
            self.rank = torch.distributed.get_rank()
        else:
            self.rank = -1  # non dist training
        train_opt = opt['train']

        # define networks and load pretrained models
        self.netG = networks.define_G(opt).to(self.device)
        if opt['dist']:
            self.netG = DistributedDataParallel(
                self.netG, device_ids=[torch.cuda.current_device()])
        else:
            self.netG = DataParallel(self.netG)
        if self.is_train:
            self.netD = networks.define_D(opt).to(self.device)
            if opt['dist']:
                self.netD = DistributedDataParallel(
                    self.netD, device_ids=[torch.cuda.current_device()])
            else:
                self.netD = DataParallel(self.netD)

            self.netG.train()
            self.netD.train()

        # define losses, optimizer and scheduler
        if self.is_train:
            # G pixel loss
            if train_opt['pixel_weight'] > 0:
                l_pix_type = train_opt['pixel_criterion']
                if l_pix_type == 'l1':
                    self.cri_pix = nn.L1Loss().to(self.device)
                elif l_pix_type == 'l2':
                    self.cri_pix = nn.MSELoss().to(self.device)
                else:
                    raise NotImplementedError(
                        'Loss type [{:s}] not recognized.'.format(l_pix_type))
                self.l_pix_w = train_opt['pixel_weight']
            else:
                logger.info('Remove pixel loss.')
                self.cri_pix = None

            # G feature loss
            if train_opt['feature_weight'] > 0:
                l_fea_type = train_opt['feature_criterion']
                if l_fea_type == 'l1':
                    self.cri_fea = nn.L1Loss().to(self.device)
                elif l_fea_type == 'l2':
                    self.cri_fea = nn.MSELoss().to(self.device)
                else:
                    raise NotImplementedError(
                        'Loss type [{:s}] not recognized.'.format(l_fea_type))
                self.l_fea_w = train_opt['feature_weight']
            else:
                logger.info('Remove feature loss.')
                self.cri_fea = None
            if self.cri_fea:  # load VGG perceptual loss
                self.netF = networks.define_F(opt,
                                              use_bn=False).to(self.device)
                if opt['dist']:
                    pass  # do not need to use DistributedDataParallel for netF
                else:
                    self.netF = DataParallel(self.netF)

            # GD gan loss
            self.cri_gan = GANLoss(train_opt['gan_type'], 1.0,
                                   0.0).to(self.device)
            self.l_gan_w = train_opt['gan_weight']
            # D_update_ratio and D_init_iters
            self.D_update_ratio = train_opt['D_update_ratio'] if train_opt[
                'D_update_ratio'] else 1
            self.D_init_iters = train_opt['D_init_iters'] if train_opt[
                'D_init_iters'] else 0

            # optimizers
            # G
            wd_G = train_opt['weight_decay_G'] if train_opt[
                'weight_decay_G'] else 0
            optim_params = []
            for k, v in self.netG.named_parameters(
            ):  # can optimize for a part of the model
                if v.requires_grad:
                    optim_params.append(v)
                else:
                    if self.rank <= 0:
                        logger.warning(
                            'Params [{:s}] will not optimize.'.format(k))
            self.optimizer_G = torch.optim.Adam(optim_params,
                                                lr=train_opt['lr_G'],
                                                weight_decay=wd_G,
                                                betas=(train_opt['beta1_G'],
                                                       train_opt['beta2_G']))
            self.optimizers.append(self.optimizer_G)
            # D
            wd_D = train_opt['weight_decay_D'] if train_opt[
                'weight_decay_D'] else 0
            self.optimizer_D = torch.optim.Adam(self.netD.parameters(),
                                                lr=train_opt['lr_D'],
                                                weight_decay=wd_D,
                                                betas=(train_opt['beta1_D'],
                                                       train_opt['beta2_D']))
            self.optimizers.append(self.optimizer_D)

            # schedulers
            if train_opt['lr_scheme'] == 'MultiStepLR':
                for optimizer in self.optimizers:
                    self.schedulers.append(
                        lr_scheduler.MultiStepLR_Restart(
                            optimizer,
                            train_opt['lr_steps'],
                            restarts=train_opt['restarts'],
                            weights=train_opt['restart_weights'],
                            gamma=train_opt['lr_gamma'],
                            clear_state=train_opt['clear_state']))
            elif train_opt['lr_scheme'] == 'CosineAnnealingLR_Restart':
                for optimizer in self.optimizers:
                    self.schedulers.append(
                        lr_scheduler.CosineAnnealingLR_Restart(
                            optimizer,
                            train_opt['T_period'],
                            eta_min=train_opt['eta_min'],
                            restarts=train_opt['restarts'],
                            weights=train_opt['restart_weights']))
            else:
                raise NotImplementedError(
                    'Unsupported lr_scheme: only MultiStepLR and '
                    'CosineAnnealingLR_Restart are implemented.')

            self.log_dict = OrderedDict()

        self.print_network()  # print network
        self.load()  # load G and D if needed
        if self.opt['use_wandb_logger'] and 'debug' not in self.opt['name']:
            wandb.watch(self.netG)
            wandb.watch(self.netD)
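Example #24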
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', type=Path, required=True)
    parser.add_argument('--dev', type=Path, required=True)
    parser.add_argument('--model-name', type=str, required=True)

    parser.add_argument('--ckpt', type=str, default='ckpt')
    parser.add_argument('--batch_size', type=int, default=8)
    parser.add_argument('--epochs', type=int, default=3)
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('-a', '--accumulation_steps', type=int, default=1)
    parser.add_argument('--neg_sample', type=bool, default=False)

    parser.add_argument('--fp16', action='store_true')
    # Automatically supplied by torch.distributed.launch
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    
    # Based on code from SciFact : 
    class FeverLabelPredictionDataset(Dataset):
        def __init__(self, file):
            
            claims, rationales, labels = self._read(file)
          
            self._claims = claims
            self._rationales = rationales
            self._labels = labels
            
        def _read(self, file):
            claims = []
            rationales = []
            labels = []
            #labels = {'SUPPORTS': 2, 'NOT ENOUGH INFO': 1, 'REFUTES': 0} # From SciFact
            
            label_idx = {'SUPPORTS': 0, 'REFUTES': 1, 'NOT ENOUGH INFO': 2} # To Match COVIDLies
            for data in jsonlines.open(file):
                if data['label'] == 'NOT ENOUGH INFO':
                    if data['sentences']:
                        indices = sorted(random.sample(range(len(data['sentences'])), k=1))
                        sentences = [data['sentences'][i] for i in indices]
                        claims.append(data['claim'])
                        rationales.append(' '.join(sentences))
                        labels.append(label_idx['NOT ENOUGH INFO'])
    
                else:
                    for evidence_set in data['evidence_sets']:
                        claims.append(data['claim'])
                        rationales.append(' '.join([data['sentences'][i] for i in evidence_set]))
                        labels.append(label_idx[data['label']])
                
                    if args.neg_sample:
                        # Add negative samples
                        non_evidence_indices = set(range(len(data['sentences']))) - set(
                            s for es in data['evidence_sets'] for s in es)
                        if non_evidence_indices:
                            non_evidence_indices = random.sample(non_evidence_indices,
                                                                 k=random.randint(1, min(1, len(non_evidence_indices))))
                            sentences = [data['sentences'][i] for i in non_evidence_indices]
                            claims.append(data['claim'])
                            rationales.append(' '.join(sentences))
                            labels.append(label_idx['NOT ENOUGH INFO'])
                            
            return claims, rationales, labels
    

        def __len__(self):
            return len(self._labels)
    
        def __getitem__(self, index):
            claim = self._claims[index]
            rationale = self._rationales[index]
            label = self._labels[index]
            return claim, rationale, label


    # Additional janky distributed stuff
    args.distributed = False
    world_size = int(os.environ.get('WORLD_SIZE', 1))
    args.distributed = world_size > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info('Loading training data')
    train_dataset = FeverLabelPredictionDataset(args.train)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        sampler=get_sampler(train_dataset, world_size, args.local_rank)
    )

    logger.info('Loading dev data')
    dev_dataset = FeverLabelPredictionDataset(args.dev)
    dev_dataloader = DataLoader(
        dev_dataset,
        batch_size=args.batch_size,
        sampler=get_sampler(dev_dataset, world_size, args.local_rank),
        shuffle=False  # Seems weird but the HuggingFace guys do it so...
    )
    

    model = SentenceBertClassifier(model_name=args.model_name, num_classes=3).cuda()
    optimizer = transformers.AdamW(model.parameters(), lr=args.lr)
    if args.fp16:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    if args.distributed:
        model = DistributedDataParallel(model)
    loss_fn = torch.nn.CrossEntropyLoss()  # Do we need to ignore padding?

    for epoch in range(args.epochs):
        logger.info(f'Epoch: {epoch}')

        logger.info('Training...')
        model.train()
        if args.local_rank == 0:
            iterable = tqdm(train_dataloader)
        else:
            iterable = train_dataloader
        for i, (claims, rationales, labels) in enumerate(iterable):
            if not i % args.accumulation_steps:
                optimizer.step()
                optimizer.zero_grad()

            logits = model(claims, rationales)
            _, preds = logits.max(dim=-1)
            labels = torch.tensor(labels).cuda()
            acc = (preds == labels).float().mean()
            loss = loss_fn(logits, labels)
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            if args.local_rank == 0:
                iterable.set_description(f'Loss: {loss : 0.4f} - Acc: {acc : 0.4f}')

        logger.info('Evaluating...')
        model.eval()
        correct = 0.
        total = 0.
        if args.local_rank == 0:
            iterable = tqdm(dev_dataloader)
        else:
            iterable = dev_dataloader
        for claims, rationales, labels in iterable:
            with torch.no_grad():
                logits = model(claims, rationales)
            _, preds = logits.max(dim=-1)
            labels = torch.tensor(labels).cuda()
            correct += (preds == labels).float().sum()
            total += labels.size(0)
            if args.local_rank == 0:
                acc = correct / total
                iterable.set_description(f'Accuracy: {acc.item() : 0.4f}')

        logger.info('Saving...')
        if args.local_rank == 0:
            torch.save(model.state_dict(), f'{args.ckpt}-{epoch}.pt')
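get_sampler is used above but not defined in this snippet. A plausible sketch, under the assumption that it shards the dataset across processes when WORLD_SIZE > 1 and falls back to local random sampling otherwise (an assumption, not the author's code):

from torch.utils.data import RandomSampler
from torch.utils.data.distributed import DistributedSampler

def get_sampler(dataset, world_size, local_rank):
    # Assumption: one process per GPU on a single node, so the local rank
    # doubles as the global rank for sharding purposes.
    if world_size > 1:
        return DistributedSampler(dataset,
                                  num_replicas=world_size,
                                  rank=local_rank)
    return RandomSampler(dataset)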
Example #25
    def __init__(
        self,
        model: Model,
        optimizer: torch.optim.Optimizer,
        iterator: DataIterator,
        train_dataset: Iterable[Instance],
        validation_dataset: Optional[Iterable[Instance]] = None,
        patience: Optional[int] = None,
        validation_metric: str = "-loss",
        validation_iterator: DataIterator = None,
        shuffle: bool = True,
        num_epochs: int = 20,
        serialization_dir: Optional[str] = None,
        num_serialized_models_to_keep: int = 20,
        keep_serialized_model_every_num_seconds: int = None,
        checkpointer: Checkpointer = None,
        model_save_interval: float = None,
        cuda_device: int = -1,
        grad_norm: Optional[float] = None,
        grad_clipping: Optional[float] = None,
        learning_rate_scheduler: Optional[LearningRateScheduler] = None,
        momentum_scheduler: Optional[MomentumScheduler] = None,
        summary_interval: int = 100,
        histogram_interval: int = None,
        should_log_parameter_statistics: bool = True,
        should_log_learning_rate: bool = False,
        log_batch_size_period: Optional[int] = None,
        moving_average: Optional[MovingAverage] = None,
        distributed: bool = False,
        rank: int = 0,
        world_size: int = 1,
        num_gradient_accumulation_steps: int = 1,
    ) -> None:
        """
        A trainer for doing supervised learning. It just takes a labeled dataset
        and a ``DataIterator``, and uses the supplied ``Optimizer`` to learn the weights
        for your model over some fixed number of epochs. You can also pass in a validation
        dataset and enable early stopping. There are many other bells and whistles as well.

        Parameters
        ----------
        model : ``Model``, required.
            An AllenNLP model to be optimized. Pytorch Modules can also be optimized if
            their ``forward`` method returns a dictionary with a "loss" key, containing a
            scalar tensor representing the loss function to be optimized.

            If you are training your model using GPUs, your model should already be
            on the correct device. (If you use `Trainer.from_params` this will be
            handled for you.)
        optimizer : ``torch.optim.Optimizer``, required.
            An instance of a Pytorch Optimizer, instantiated with the parameters of the
            model to be optimized.
        iterator : ``DataIterator``, required.
            A method for iterating over a ``Dataset``, yielding padded indexed batches.
        train_dataset : ``Dataset``, required.
            A ``Dataset`` to train on. The dataset should have already been indexed.
        validation_dataset : ``Dataset``, optional, (default = None).
            A ``Dataset`` to evaluate on. The dataset should have already been indexed.
        patience : Optional[int] > 0, optional (default=None)
            Number of epochs to be patient before early stopping: the training is stopped
            after ``patience`` epochs with no improvement. If given, it must be ``> 0``.
            If None, early stopping is disabled.
        validation_metric : str, optional (default="-loss")
            Validation metric to measure for whether to stop training using patience
            and whether to serialize an ``is_best`` model each epoch. The metric name
            must be prepended with either "+" or "-", which specifies whether the metric
            is an increasing or decreasing function.
        validation_iterator : ``DataIterator``, optional (default=None)
            An iterator to use for the validation set.  If ``None``, then
            use the training `iterator`.
        shuffle: ``bool``, optional (default=True)
            Whether to shuffle the instances in the iterator or not.
        num_epochs : int, optional (default = 20)
            Number of training epochs.
        serialization_dir : str, optional (default=None)
            Path to directory for saving and loading model files. Models will not be saved if
            this parameter is not passed.
        num_serialized_models_to_keep : ``int``, optional (default=20)
            Number of previous model checkpoints to retain.  Default is to keep 20 checkpoints.
            A value of None or -1 means all checkpoints will be kept.
        keep_serialized_model_every_num_seconds : ``int``, optional (default=None)
            If num_serialized_models_to_keep is not None, then occasionally it's useful to
            save models at a given interval in addition to the last num_serialized_models_to_keep.
            To do so, specify keep_serialized_model_every_num_seconds as the number of seconds
            between permanently saved checkpoints.  Note that this option is only used if
            num_serialized_models_to_keep is not None, otherwise all checkpoints are kept.
        checkpointer : ``Checkpointer``, optional (default=None)
            An instance of class Checkpointer to use instead of the default. If a checkpointer is specified,
            the arguments num_serialized_models_to_keep and keep_serialized_model_every_num_seconds should
            not be specified. The caller is responsible for initializing the checkpointer so that it is
            consistent with serialization_dir.
        model_save_interval : ``float``, optional (default=None)
            If provided, then serialize models every ``model_save_interval``
            seconds within single epochs.  In all cases, models are also saved
            at the end of every epoch if ``serialization_dir`` is provided.
        cuda_device : ``int``, optional (default = -1)
            An integer specifying the CUDA device(s) to use for this process. If -1, the CPU is used.
            Data parallelism is controlled at the allennlp train level, so each trainer will have a single
            GPU.
        grad_norm : ``float``, optional, (default = None).
            If provided, gradient norms will be rescaled to have a maximum of this value.
        grad_clipping : ``float``, optional (default = ``None``).
            If provided, gradients will be clipped `during the backward pass` to have an (absolute)
            maximum of this value.  If you are getting ``NaNs`` in your gradients during training
            that are not solved by using ``grad_norm``, you may need this.
        learning_rate_scheduler : ``LearningRateScheduler``, optional (default = None)
            If specified, the learning rate will be decayed with respect to
            this schedule at the end of each epoch (or batch, if the scheduler implements
            the ``step_batch`` method). If you use :class:`torch.optim.lr_scheduler.ReduceLROnPlateau`,
            this will use the ``validation_metric`` provided to determine if learning has plateaued.
            To support updating the learning rate on every batch, this can optionally implement
            ``step_batch(batch_num_total)`` which updates the learning rate given the batch number.
        momentum_scheduler : ``MomentumScheduler``, optional (default = None)
            If specified, the momentum will be updated at the end of each batch or epoch
            according to the schedule.
        summary_interval: ``int``, optional, (default = 100)
            Number of batches between logging scalars to tensorboard
        histogram_interval : ``int``, optional, (default = ``None``)
            If not None, then log histograms to tensorboard every ``histogram_interval`` batches.
            When this parameter is specified, the following additional logging is enabled:
                * Histograms of model parameters
                * The ratio of parameter update norm to parameter norm
                * Histogram of layer activations
            We log histograms of the parameters returned by
            ``model.get_parameters_for_histogram_tensorboard_logging``.
            The layer activations are logged for any modules in the ``Model`` that have
            the attribute ``should_log_activations`` set to ``True``.  Logging
            histograms requires a number of GPU-CPU copies during training and is typically
            slow, so we recommend logging histograms relatively infrequently.
            Note: only Modules that return tensors, tuples of tensors or dicts
            with tensors as values currently support activation logging.
        should_log_parameter_statistics : ``bool``, optional, (default = True)
            Whether to send parameter statistics (mean and standard deviation
            of parameters and gradients) to tensorboard.
        should_log_learning_rate : ``bool``, optional, (default = False)
            Whether to send parameter specific learning rate to tensorboard.
        log_batch_size_period : ``int``, optional, (default = ``None``)
            If defined, how often to log the average batch size.
        moving_average: ``MovingAverage``, optional, (default = None)
            If provided, we will maintain moving averages for all parameters. During training, we
            employ a shadow variable for each parameter, which maintains the moving average. During
            evaluation, we backup the original parameters and assign the moving averages to corresponding
            parameters. Be careful that when saving the checkpoint, we will save the moving averages of
            parameters. This is necessary because we want the saved model to perform as well as the validated
            model if we load it later. But this may cause problems if you restart the training from checkpoint.
        distributed: ``bool``, optional, (default = False)
            If set, PyTorch's `DistributedDataParallel` is used to train the model in multiple GPUs. This also
            requires `world_size` to be greater than 1.
        rank: ``int``, optional, (default = 0)
            This is the unique identifier of the `Trainer` in a distributed process group. The GPU device id is
            used as the rank.
        world_size: ``int``, (default = 1)
            The number of `Trainer` workers participating in the distributed training.
        num_gradient_accumulation_steps: ``int``, optional, (default = 1)
            Gradients are accumulated for the given number of steps before doing an optimizer step. This can
            be useful to accommodate batches that are larger than the available GPU memory. Refer to Thomas
            Wolf's [post](https://tinyurl.com/y5mv44fw) for details on gradient accumulation.
        """
        super().__init__(serialization_dir, cuda_device, distributed, rank,
                         world_size)

        # I am not calling move_to_gpu here, because if the model is
        # not already on the GPU then the optimizer is going to be wrong.
        self.model = model

        self.iterator = iterator
        self._validation_iterator = validation_iterator
        self.shuffle = shuffle
        self.optimizer = optimizer
        self.train_data = train_dataset
        self._validation_data = validation_dataset

        if patience is None:  # no early stopping
            if validation_dataset:
                logger.warning(
                    "You provided a validation dataset but patience was set to None, "
                    "meaning that early stopping is disabled")
        elif (not isinstance(patience, int)) or patience <= 0:
            raise ConfigurationError(
                '{} is an invalid value for "patience": it must be a positive integer '
                "or None (if you want to disable early stopping)".format(
                    patience))

        # For tracking is_best_so_far and should_stop_early
        self._metric_tracker = MetricTracker(patience, validation_metric)
        # Get rid of + or -
        self._validation_metric = validation_metric[1:]

        self._num_epochs = num_epochs

        if checkpointer is not None:
            # We can't easily check if these parameters were passed in, so check against their default values.
            # We don't check against serialization_dir since it is also used by the parent class.
            if (num_serialized_models_to_keep != 20
                    or keep_serialized_model_every_num_seconds is not None):
                raise ConfigurationError(
                    "When passing a custom Checkpointer, you may not also pass in separate checkpointer "
                    "args 'num_serialized_models_to_keep' or 'keep_serialized_model_every_num_seconds'."
                )
            self._checkpointer = checkpointer
        else:
            self._checkpointer = Checkpointer(
                serialization_dir,
                keep_serialized_model_every_num_seconds,
                num_serialized_models_to_keep,
            )

        self._model_save_interval = model_save_interval

        self._grad_norm = grad_norm
        self._grad_clipping = grad_clipping

        self._learning_rate_scheduler = learning_rate_scheduler
        self._momentum_scheduler = momentum_scheduler
        self._moving_average = moving_average

        # We keep the total batch number as an instance variable because it
        # is used inside a closure for the hook which logs activations in
        # ``_enable_activation_logging``.
        self._batch_num_total = 0

        self._tensorboard = TensorboardWriter(
            get_batch_num_total=lambda: self._batch_num_total,
            serialization_dir=serialization_dir,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval,
            should_log_parameter_statistics=should_log_parameter_statistics,
            should_log_learning_rate=should_log_learning_rate,
        )

        self._log_batch_size_period = log_batch_size_period

        self._last_log = 0.0  # time of last logging

        self._num_gradient_accumulation_steps = num_gradient_accumulation_steps

        # Enable activation logging.
        if histogram_interval is not None:
            self._tensorboard.enable_activation_logging(self.model)

        # Using `DistributedDataParallel`(ddp) brings in a quirk wrt AllenNLP's `Model` interface and its
        # usage. A `Model` object is wrapped by `ddp`, but assigning the wrapped model to `self.model`
        # will break the usages such as `Model.get_regularization_penalty`, `Model.get_metrics`, etc.
        #
        # Hence a reference to Pytorch's object is maintained in the case of distributed training and in the
        # normal case, reference to `Model` is retained. This reference is only used in
        # these places: `model.__call__`, `model.train` and `model.eval`.
        if self._distributed:
            self._pytorch_model = DistributedDataParallel(
                self.model, device_ids=[self.cuda_device])
        else:
            self._pytorch_model = self.model
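The num_gradient_accumulation_steps argument documented above accumulates gradients over several micro-batches before each optimizer step. A generic, self-contained sketch of the pattern (not AllenNLP's actual training loop):

import torch.nn as nn

def accumulate_gradients(model, optimizer, batches, accumulation_steps=4):
    # Scale each micro-batch loss so the accumulated update matches one large
    # batch of accumulation_steps times the size.
    optimizer.zero_grad()
    for i, (inputs, targets) in enumerate(batches, start=1):
        loss = nn.functional.mse_loss(model(inputs), targets)
        (loss / accumulation_steps).backward()
        if i % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()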
Example #26
def train_func(config):
    is_distributed = config.get("is_distributed", False)
    use_gpu = config["use_gpu"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    num_layers = config["num_layers"]
    num_hidden = config["num_hidden"]
    dropout_every = config["dropout_every"]
    dropout_prob = config["dropout_prob"]
    num_features = config["num_features"]

    print("Defining model, loss, and optimizer...")

    # Setup device.
    if is_distributed:
        device = torch.device(f"cuda:{train.local_rank()}" if use_gpu
                              and torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(
            "cuda:0" if use_gpu and torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Setup data.
    if is_distributed:
        train_dataset_pipeline = train.get_dataset_shard("train_dataset")
        train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs()
        test_dataset = train.get_dataset_shard("test_dataset")
    else:
        train_dataset_epoch_iterator = config["train_dataset"].iter_epochs()
        test_dataset = config["test_dataset"]
    test_torch_dataset = test_dataset.to_torch(label_column="label",
                                               batch_size=batch_size)

    # Setup Tensorboard and MLflow.
    if is_distributed:
        # Setup is done through Callback.
        pass
    else:
        writer = SummaryWriter()
        mlflow.start_run()
        mlflow_config = config.copy()
        mlflow_config.pop("test_dataset")
        mlflow_config.pop("train_dataset")
        mlflow.log_params(mlflow_config)

    net = Net(
        n_layers=num_layers,
        n_features=num_features,
        num_hidden=num_hidden,
        dropout_every=dropout_every,
        drop_prob=dropout_prob,
    ).to(device)
    print(net.parameters)

    if is_distributed:
        net = DistributedDataParallel(net)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(net.parameters(), weight_decay=0.0001)

    print("Starting training...")
    for epoch in range(num_epochs):
        train_dataset = next(train_dataset_epoch_iterator)

        train_torch_dataset = train_dataset.to_torch(label_column="label",
                                                     batch_size=batch_size)

        train_running_loss, train_num_correct, train_num_total = train_epoch(
            train_torch_dataset, net, device, criterion, optimizer)
        train_acc = train_num_correct / train_num_total
        print(
            f"epoch [{epoch + 1}]: training accuracy: {train_num_correct} / {train_num_total} = {train_acc:.4f}"
        )

        test_running_loss, test_num_correct, test_num_total = test_epoch(
            test_torch_dataset, net, device, criterion)
        test_acc = test_num_correct / test_num_total
        print(
            f"epoch [{epoch + 1}]: testing accuracy: {test_num_correct} / {test_num_total} = {test_acc:.4f}"
        )

        # Record and log stats.
        if is_distributed:
            train.report(train_acc=train_acc,
                         train_loss=train_running_loss,
                         test_acc=test_acc,
                         test_loss=test_running_loss)
        else:
            writer.add_scalar("Accuracy/train", train_acc, epoch)
            writer.add_scalar("Loss/train", train_running_loss, epoch)
            writer.add_scalar("Accuracy/test", test_acc, epoch)
            writer.add_scalar("Loss/test", test_running_loss, epoch)
            writer.flush()

            mlflow.log_metrics({
                "train_acc": train_acc,
                "train_loss": train_running_loss,
                "test_acc": test_acc,
                "test_loss": test_running_loss
            })

        # Checkpoint model.
        if is_distributed:
            import copy
            model_copy = copy.deepcopy(net.module)
            train.save_checkpoint(
                model_state_dict=model_copy.cpu().state_dict())
        else:
            torch.save(net.state_dict(), f"models/model-epoch-{epoch}.torch")

    # Shutdown Tensorboard and MLflow.
    if is_distributed:
        pass
    else:
        writer.close()
        # mlflow.end_run()

    if is_distributed:
        if train.world_rank() == 0:
            return net.module.cpu()
        else:
            return None
    else:
        return net
Example #27
def run(proc_id, n_gpus, args, devices, data):
    dropout = 0.2

    dev_id = devices[proc_id]
    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        world_size = n_gpus
        th.distributed.init_process_group(backend="nccl",
                                          init_method=dist_init_method,
                                          world_size=world_size,
                                          rank=proc_id)
    th.cuda.set_device(dev_id)

    # Unpack data
    train_mask, val_mask, in_feats, labels, n_classes, g = data
    train_nid = train_mask.nonzero().squeeze()
    val_nid = val_mask.nonzero().squeeze()

    # Split train_nid
    train_nid = th.split(train_nid,
                         math.ceil(len(train_nid) / n_gpus))[proc_id]

    # Create sampler
    sampler = NeighborSampler(g, [int(_) for _ in args.fan_out.split(',')])

    # Create PyTorch DataLoader for constructing blocks
    dataloader = DataLoader(dataset=train_nid.numpy(),
                            batch_size=args.batch_size,
                            collate_fn=sampler.sample_blocks,
                            shuffle=True,
                            drop_last=False,
                            num_workers=args.num_workers_per_gpu)

    # Define model
    model = SAGE(in_feats, args.num_hidden, n_classes, args.num_layers, F.relu)

    # Move the model to GPU and define optimizer
    model = model.to(dev_id)
    if n_gpus > 1:
        model = DistributedDataParallel(model,
                                        device_ids=[dev_id],
                                        output_device=dev_id)
    loss_fcn = nn.CrossEntropyLoss()
    loss_fcn = loss_fcn.to(dev_id)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Compute history tensor and their aggregation before training on CPU
    model.eval()
    if n_gpus > 1:
        if proc_id == 0:
            init_history(g, model.module, dev_id, args.val_batch_size)
        th.distributed.barrier()
    else:
        init_history(g, model, dev_id, args.val_batch_size)
    model.train()

    # Training loop
    avg = 0
    iter_tput = []
    for epoch in range(args.num_epochs):
        tic = time.time()
        model.train()
        for step, (blocks, hist_blocks) in enumerate(dataloader):
            if proc_id == 0:
                tic_step = time.time()

            # The input nodes lie on the LHS of the first block;
            # the output nodes lie on the RHS of the last block.
            seeds = blocks[-1].dstdata[dgl.NID]

            blocks, hist_blocks = load_subtensor(g, labels, blocks,
                                                 hist_blocks, dev_id, True)

            # forward
            batch_pred = model(blocks)
            # update history
            update_history(g, blocks)
            # compute loss
            batch_labels = blocks[-1].dstdata['label']
            loss = loss_fcn(batch_pred, batch_labels)
            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if proc_id == 0:
                iter_tput.append(
                    len(seeds) * n_gpus / (time.time() - tic_step))
            if step % args.log_every == 0 and proc_id == 0:
                acc = compute_acc(batch_pred, batch_labels)
                print(
                    'Epoch {:05d} | Step {:05d} | Loss {:.4f} | Train Acc {:.4f} | Speed (samples/sec) {:.4f}'
                    .format(epoch, step, loss.item(), acc.item(),
                            np.mean(iter_tput[3:])))

        if n_gpus > 1:
            th.distributed.barrier()

        toc = time.time()
        if proc_id == 0:
            print('Epoch Time(s): {:.4f}'.format(toc - tic))
            if epoch >= 5:
                avg += toc - tic
            if epoch % args.eval_every == 0 and epoch != 0:
                model.eval()
                eval_acc = evaluate(model if n_gpus == 1 else model.module, g,
                                    labels, val_nid, args.val_batch_size,
                                    dev_id)
                print('Eval Acc {:.4f}'.format(eval_acc))

    if n_gpus > 1:
        th.distributed.barrier()
    if proc_id == 0:
        print('Avg epoch time: {}'.format(avg / (epoch - 4)))
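The th.split call above hands each process a contiguous chunk of the training node IDs; only the last chunk may be smaller. A tiny standalone illustration with made-up sizes:

import math
import torch as th

train_nid = th.arange(10)      # 10 training node IDs
n_gpus = 3
chunks = th.split(train_nid, math.ceil(len(train_nid) / n_gpus))
for proc_id, chunk in enumerate(chunks):
    print(proc_id, chunk.tolist())
# 0 [0, 1, 2, 3]
# 1 [4, 5, 6, 7]
# 2 [8, 9]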
Example #28
    ARGS_RESET_EVERY = 100
    print("Loading {} for {}".format(params['model'].get('type', 'WTF?'),
                                     args.mode),
          flush=True)

    model = Model.from_params(params=params['model'])
    for submodule in model.trunk.detector.backbone.modules():
        if isinstance(submodule, BatchNorm2d):
            submodule.track_running_stats = False
        for p in submodule.parameters():
            p.requires_grad = False

    if distributed:
        model.cuda()
        model = DistributedDataParallel(model)
    elif NUM_GPUS > 1:
        model = DataParallel(model).cuda()
    else:
        model.cuda()
    optimizer = Optimizer.from_params(
        [x for x in model.named_parameters() if x[1].requires_grad],
        params['trainer']['optimizer'])

    lr_scheduler_params = params['trainer'].pop("learning_rate_scheduler",
                                                None)
    scheduler = LearningRateScheduler.from_params(
        optimizer, lr_scheduler_params) if lr_scheduler_params else None

    if os.path.exists(args.folder):
        print("Found folder! restoring", flush=True)
Example #29
def train():
    parser = ArgumentParser()
    parser.add_argument('--gpt2', action='store_true', help="use gpt2")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="config/cgpt/",
                        help="Path or URL of the model")
    parser.add_argument("--from_step",
                        type=int,
                        default=-1,
                        help="Init learning rate from this step")
    parser.add_argument('--pretrained',
                        action='store_true',
                        help="If False train from scratch")
    parser.add_argument("--data_path",
                        type=str,
                        default="",
                        help="Path or url of the dataset. ")
    parser.add_argument("--train_path",
                        type=str,
                        default="data/toy_train.txt",
                        help="Path of the train dataset for dist dataset. ")
    parser.add_argument("--valid_path",
                        type=str,
                        default="data/toy_valid.txt",
                        help="Path of the valid dataset for dist dataset. ")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default="dataset_cache",
                        help="Path or url of the dataset cache")
    parser.add_argument('--log_file',
                        '-log_file',
                        type=str,
                        default="",
                        help="Output logs to a file under this path")
    parser.add_argument("--num_workers",
                        type=int,
                        default=8,
                        help="Number of subprocesses for data loading")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=70,
                        help="Number of training epochs")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=2,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=2,
                        help="Batch size for validation")
    parser.add_argument("--max_history",
                        type=int,
                        default=15,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--scheduler",
                        type=str,
                        default="noam",
                        choices=['noam', 'linear'],
                        help="method of optim")
    parser.add_argument("--n_emd",
                        type=int,
                        default=768,
                        help="Number of n_emd in config file (for noam)")
    parser.add_argument("--lr", type=float, default=5e-5, help="Learning rate")
    parser.add_argument(
        "--eval_before_start",
        action='store_true',
        help="If true start with a first evaluation before training")
    parser.add_argument("--warmup_steps",
                        type=int,
                        default=5000,
                        help="Warm up steps")
    parser.add_argument("--valid_steps",
                        type=int,
                        default=5000,
                        help="Perfom validation every X steps")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=64,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--fp16",
        type=str,
        default="",
        help=
        "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning"
    )
    model_class = OpenAIGPTLMHeadModel if not args.gpt2 else GPT2LMHeadModel
    config_class = OpenAIGPTConfig if not args.gpt2 else GPT2Config
    tokenizer_class = BertTokenizer
    if args.pretrained:
        tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint,
                                                    do_lower_case=True)
        model = model_class.from_pretrained(args.model_checkpoint)
    else:
        tokenizer = tokenizer_class(os.path.join(args.model_checkpoint,
                                                 "vocab.txt"),
                                    do_lower_case=True)
        config = config_class.from_json_file(
            os.path.join(args.model_checkpoint, CONFIG_NAME))
        model = model_class(config)
    model.to(args.device)

    optimizer = AdamW([{
        'params': model.parameters(),
        'initial_lr': args.lr
    }],
                      lr=args.lr,
                      correct_bias=True)

    logger.info("Prepare datasets")
    loader_class = build_dist_loaders if not args.data_path else build_dataloaders
    train_loader, val_loader, train_sampler, valid_sampler = loader_class(
        args, tokenizer, logger)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    # Training function and trainer
    def update(engine, batch):
        input_ids, token_type_ids, lm_labels = tuple(
            input_tensor.to(args.device) for input_tensor in batch)
        model.train()
        (lm_loss), *_ = model(input_ids,
                              labels=lm_labels,
                              token_type_ids=token_type_ids)
        loss = lm_loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item(), optimizer.param_groups[0]['lr']

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            input_ids, token_type_ids, lm_labels = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            lm_logits, *_ = model(input_ids, token_type_ids=token_type_ids)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return lm_logits_flat_shifted, lm_labels_flat_shifted

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Evaluation during training
    @trainer.on(Events.ITERATION_STARTED)
    def log_iterations(engine):
        # if engine.state.iteration % max(int(0.1 * len(train_loader)), 1) == 0:
        if engine.state.iteration % args.valid_steps == 0:
            evaluator.run(val_loader)

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Noam schedule: linear warmup over warmup_steps, then inverse square-root decay
    # model_size = model.config.n_embd
    model_size = args.n_emd
    noam_lambda = lambda step: (model_size**(-0.5) * min(
        (step + 1)**(-0.5), (step + 1) * args.warmup_steps**(-1.5)))
    noam_scheduler = LambdaLR(optimizer,
                              lr_lambda=noam_lambda,
                              last_epoch=args.from_step)
    scheduler = LRScheduler(noam_scheduler)
    if args.scheduler == "linear":
        scheduler = PiecewiseLinear(optimizer, "lr",
                                    [(0, args.lr),
                                     (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "loss")
    RunningAverage(output_transform=lambda x: x[1]).attach(trainer, "lr")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
             output_transform=lambda x: (x[0], x[1]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints
    # And save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True, mininterval=2)
        pbar.attach(trainer, metric_names=["loss", "lr"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=None)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(
                                                       metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(tb_logger.writer.logdir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3)
        # save model after evaluation
        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)})
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" take care of distributed encapsulation

        torch.save(args, tb_logger.writer.logdir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(tb_logger.writer.logdir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.logdir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            checkpoint_handler._saved[-1][1][-1],
            os.path.join(tb_logger.writer.logdir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
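# In Example #29 the `noam_lambda` passed to LambdaLR scales the base learning rate
# by model_size**-0.5 * min((step + 1)**-0.5, (step + 1) * warmup_steps**-1.5),
# i.e. a linear warmup over `warmup_steps` followed by inverse-square-root decay.
# A small standalone sketch of that factor; the default values below simply echo
# the argparse defaults above and are not tuned recommendations:
def noam_factor(step, model_size=768, warmup_steps=5000):
    s = step + 1
    return model_size ** -0.5 * min(s ** -0.5, s * warmup_steps ** -1.5)

for step in (0, 2500, 5000, 50000):
    print(step, noam_factor(step))
# The factor grows linearly until step == warmup_steps and then decays as 1/sqrt(step).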
Example #30
def main(args):
    print('_' * 60 + f'\nmain <- {args}')
    if 'setup(args)':
        cfg = get_cfg()
        cfg.merge_from_file(args.config_file)
        cfg.merge_from_list(args.opts)
        cfg.freeze()
        default_setup(
            cfg, args
        )  # if you don't like any of the default setup, write your own setup code
        global CONFIG
        CONFIG = cfg

    if True:  # N_GPU > 0:
        # __________________ For Debug _____________________________
        # mem_stats_df.record('Before-Build-Model')
        if 'build_model(cfg)':
            meta_arch = cfg.MODEL.META_ARCHITECTURE
            model = META_ARCH_REGISTRY.get(meta_arch)(cfg)
            # for param in model.backbone.parameters():
            #     param.requires_grad = False
            model.to(torch.device(cfg.MODEL.DEVICE))
        # __________________ For Debug _____________________________
        # mem_stats_df.record('After-Build-Model')

    if args.eval_only:
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume
        )
        return do_test(cfg, model)

    distributed = comm.get_world_size() > 1
    if distributed:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )

    if 'do-train':
        dataloader = build_train_dataloader(cfg)

        if N_GPUS > 0:
            cfg, model, resume = cfg, model, args.resume

            model.train()
            optimizer = build_optimizer(cfg, model)
            scheduler = build_lr_scheduler(cfg, optimizer)

            checkpointer = DetectionCheckpointer(
                model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler,
            )
            # "iteration" always be loaded whether resume or not.
            # "model" state_dict will always be loaded whether resume or not.
            start_iter = (
                    checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
            )
            max_iter = cfg.SOLVER.MAX_ITER
            # optimizer and scheduler are restored from checkpointer.checkpointables[*] if resume is True
            if resume:
                optimizer = checkpointer.checkpointables['optimizer']
                scheduler = checkpointer.checkpointables['scheduler']
            else:
                start_iter = 0

            periodic_checkpointer = PeriodicCheckpointer(
                checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
            )

            writers = (
                [
                    CommonMetricPrinter(max_iter),
                    JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
                    TensorboardXWriter(cfg.OUTPUT_DIR),
                ]
                if comm.is_main_process()
                else []
            )
            logger.info("Starting training from iteration {}".format(start_iter))

            with EventStorage(start_iter) as storage:
                for data, itr in zip(dataloader, range(start_iter, max_iter)):
                    iteration = itr + 1
                    storage.step()

                    loss_dict = model(data)
                    losses = sum(loss_dict.values())
                    assert torch.isfinite(losses).all(), loss_dict

                    loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
                    losses_reduced = sum(loss for loss in loss_dict_reduced.values())
                    if comm.is_main_process():
                        storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

                    optimizer.zero_grad()
                    losses.backward()
                    optimizer.step()
                    storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
                    scheduler.step()

                    # __________________ Checkpoint / Test / Metrics ___________
                    periodic_checkpointer.step(iteration)

                    if (
                        cfg.TEST.EVAL_PERIOD > 0
                        and iteration % cfg.TEST.EVAL_PERIOD == 0
                        and iteration != max_iter
                    ):
                        do_test(cfg, model)
                        # Compared to "train_net.py", the test results are not dumped to EventStorage
                        comm.synchronize()

                    if iteration - start_iter > 5 and (iteration % 100 == 0 or iteration == max_iter):
                        for writer in writers:
                            writer.write()
                        # __________________ For Debug _____________________________
                        # mem_summary = torch.cuda.memory_summary()
                        # tcp_sock.send(mem_summary.encode('utf-8'))
                        global TIC
                        if TIC is None:
                            TIC = datetime.datetime.now()
                        else:
                            toc = datetime.datetime.now()
                            logger.info('_' * 35 + f'Time Elapsed: {(toc - TIC).total_seconds()} s')
                            TIC = toc
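# Example #30 logs losses through `comm.reduce_dict`, which averages the per-process
# loss values before they are written to EventStorage on the main process. A minimal
# sketch of that kind of reduction with plain torch.distributed is shown below; it is
# not detectron2's actual implementation (which reduces to rank 0 only), just an
# illustration of the all-reduce-then-average idea:
import torch
import torch.distributed as dist

def reduce_loss_dict(loss_dict):
    world_size = dist.get_world_size() if dist.is_initialized() else 1
    if world_size == 1:
        return {k: v.detach() for k, v in loss_dict.items()}
    keys = sorted(loss_dict.keys())                      # fixed key order on every rank
    values = torch.stack([loss_dict[k].detach() for k in keys])
    dist.all_reduce(values)                              # sum across processes
    values = values / world_size                         # turn the sum into a mean
    return {k: v for k, v in zip(keys, values)}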