def configure_optimizers(self):
        if FLAGS.optim == 'adam':
            self.optimizer = AdamW(self.model.parameters(),
                                   lr=FLAGS.lr,
                                   weight_decay=1e-5)
        elif FLAGS.optim == 'sm3':
            self.optimizer = SM3(self.model.parameters(),
                                 lr=FLAGS.lr,
                                 momentum=0.0)
        else:
            self.optimizer = Novograd(self.model.parameters(),
                                      lr=FLAGS.lr,
                                      weight_decay=1e-3)
        scheduler = []
        if FLAGS.sched:
            self.plateau_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                self.optimizer,
                patience=FLAGS.sched_patience,
                factor=FLAGS.sched_factor,
                min_lr=FLAGS.sched_min_lr,
                verbose=True)
            scheduler = [self.plateau_scheduler]

        self.warmup_optimizer_step(0)
        return [self.optimizer]
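Note that the snippet above builds a ReduceLROnPlateau scheduler but returns only the optimizer, so the plateau scheduler is presumably stepped manually elsewhere. A minimal sketch, assuming PyTorch Lightning and a metric logged as 'val_loss' (both assumptions, not taken from the snippet), of returning the scheduler so the trainer steps it automatically:

def configure_optimizers(self):
    # Assumed variant: let the Lightning trainer drive the plateau scheduler
    # against a logged validation metric ('val_loss' is a hypothetical name).
    optimizer = AdamW(self.model.parameters(), lr=FLAGS.lr, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, patience=FLAGS.sched_patience, factor=FLAGS.sched_factor,
        min_lr=FLAGS.sched_min_lr)
    return {
        'optimizer': optimizer,
        'lr_scheduler': {'scheduler': scheduler, 'monitor': 'val_loss'},
    }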
Example #2
def get_optimizer(opt, params):
    # large_lr_layers = list(map(id, model.module._fc.parameters()))
    # small_lr_layers = filter(lambda p:id(p) not in large_lr_layers, model.module.parameters())

    if opt.optimizer == 'sgd':
        optimizer = optim.SGD(params, lr=opt.lr, momentum=0.9, nesterov=True)
        # optimizer = torch.optim.SGD([
        #             {"params":model.module._fc.parameters()},
        #             {"params":small_lr_layers,"lr":opt.lr/10}
        #             ],lr = opt.lr, momentum=0.9, weight_decay=1e-4)
    elif opt.optimizer == 'adam':
        optimizer = optim.Adam(params, lr=opt.lr)
    elif opt.optimizer == 'radam':
        optimizer = RAdam(params, lr=opt.lr)
    elif opt.optimizer == 'adamw':
        optimizer = AdamW(params, lr=opt.lr)
        # optimizer = torch.optim.AdamW([
        #             {"params":model.module._fc.parameters()},
        #             {"params":small_lr_layers,"lr":opt.lr/10}
        #             ],lr = opt.lr, weight_decay=5e-4)
    elif opt.optimizer == 'rms':
        # optimizer = optim.RMSprop([
        #                 {"params":model.module._fc.parameters()},
        #                 {"params":small_lr_layers, "lr": opt.lr/10}
        #                 ], lr=opt.lr, momentum=0.9, weight_decay=1e-4)
        optimizer = optim.RMSprop(params, lr=opt.lr, momentum=0.9)
    elif opt.optimizer == 'novograd':
        optimizer = NovoGrad(params, lr=opt.lr, grad_averaging=True)

    if opt.lookahead:
        optimizer = Lookahead(optimizer, k=6, alpha=0.6)
    return optimizer
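The commented-out blocks above hint at giving the final classifier layer and the rest of the network different learning rates. A hypothetical usage sketch of passing such parameter groups in through params (model, opt, and the _fc attribute are assumptions carried over from those comments):

# Two parameter groups: full learning rate for the head, lr/10 for the backbone.
head_params = list(model.module._fc.parameters())
head_ids = set(map(id, head_params))
backbone_params = [p for p in model.module.parameters() if id(p) not in head_ids]
params = [
    {'params': head_params},
    {'params': backbone_params, 'lr': opt.lr / 10},
]
optimizer = get_optimizer(opt, params)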
Example #3
def worker(proc_id, gpu_ranks, args, model):
    if args.dist_train:  # multiple GPU mode
        rank = gpu_ranks[proc_id] % args.world_size
        gpu_id = gpu_ranks[proc_id] % args.device_count
    elif args.single_gpu:  # single GPU mode
        rank = None
        gpu_id = proc_id
    else:  # CPU mode
        rank = None
        gpu_id = None

    if args.dist_train:
        train_loader = LOADERS[args.target](args, args.dataset_path,
                                            args.batch_size, rank,
                                            args.world_size, True)
    else:
        train_loader = LOADERS[args.target](args, args.dataset_path,
                                            args.batch_size, 0, 1, True)

    if gpu_id is not None:
        torch.cuda.set_device(gpu_id)
        model.cuda(gpu_id)

    # build optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
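    # Weight decay is applied only to parameters whose names do not contain
    # 'bias', 'gamma', or 'beta' (i.e., biases and LayerNorm weights are
    # excluded); the names in no_decay are matched as substrings.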
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      correct_bias=False)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.total_steps *
                                     args.warmup,
                                     t_total=args.total_steps)

    if args.dist_train:
        # initialize multiprocessing distributed training environment
        dist.init_process_group(backend=args.backend,
                                init_method=args.master_ip,
                                world_size=args.world_size,
                                rank=rank)
        model = DistributedDataParallel(model, device_ids=[gpu_id])
        print("Worker {} is training ... ".format(rank))
    else:
        print("Worker is training ...")
    TRAINERS[args.target](args, gpu_id, rank, train_loader, model, optimizer,
                          scheduler)
Example #4
def train(dataset, embedding, tokenizer, entity_linker, min_count,
          max_word_length, max_entity_length, batch_size, patience,
          learning_rate, weight_decay, warmup_epochs, dropout_prob, use_gpu,
          use_word):
    if use_gpu:
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    data = generate_features(dataset, tokenizer, entity_linker, min_count,
                             max_word_length, max_entity_length)
    word_vocab = data['word_vocab']
    entity_vocab = data['entity_vocab']

    train_data_loader = DataLoader(data['train'],
                                   shuffle=True,
                                   batch_size=batch_size)
    dev_data_loader = DataLoader(data['dev'],
                                 shuffle=False,
                                 batch_size=batch_size)

    dim_size = embedding.syn0.shape[1]
    word_embedding = np.random.uniform(low=-0.05,
                                       high=0.05,
                                       size=(len(word_vocab), dim_size))
    word_embedding[0] = np.zeros(dim_size)
    for word, index in word_vocab.items():
        try:
            word_embedding[index] = embedding.get_word_vector(word)
        except KeyError:
            continue
    entity_embedding = np.random.uniform(low=-0.05,
                                         high=0.05,
                                         size=(len(entity_vocab), dim_size))
    entity_embedding[0] = np.zeros(dim_size)
    for entity, index in entity_vocab.items():
        try:
            entity_embedding[index] = embedding.get_entity_vector(entity)
        except KeyError:
            continue

    model = NABoE(word_embedding, entity_embedding, len(dataset.label_names),
                  dropout_prob, use_word)
    optimizer = AdamW(model.parameters(),
                      lr=learning_rate,
                      weight_decay=weight_decay,
                      warmup=warmup_epochs * len(train_data_loader))
    model.to(device)

    epoch = 0
    best_val_acc = 0.0
    best_weights = None
    num_epochs_without_improvement = 0
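    # Simple early stopping: train until dev accuracy fails to improve for
    # `patience` consecutive epochs, then restore the best weights.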
    while True:
        with tqdm(train_data_loader) as pbar:
            model.train()
            for batch in pbar:
                args = {
                    k: v.to(device)
                    for k, v in batch.items() if k != 'label'
                }
                logits = model(**args)
                loss = F.cross_entropy(logits, batch['label'].to(device))
                loss.backward()
                optimizer.step()
                model.zero_grad()
                pbar.set_description(f'epoch: {epoch} loss: {loss.item():.8f}')

        epoch += 1
        val_acc = evaluate(model, dev_data_loader, device, 'dev')[0]
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_weights = {
                k: v.to('cpu').clone()
                for k, v in model.state_dict().items()
            }
            num_epochs_without_improvement = 0
        else:
            num_epochs_without_improvement += 1

        if num_epochs_without_improvement >= patience:
            model.load_state_dict(best_weights)
            break

    test_data_loader = DataLoader(data['test'],
                                  shuffle=False,
                                  batch_size=batch_size)
    return evaluate(model, test_data_loader, device, 'test')
Example #5
dis_loss = nn.BCEWithLogitsLoss(reduction='mean')

param_optimizer = list(model.named_parameters())

no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [{
    'params':
    [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay_rate': 0.01
}, {
    'params':
    [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay_rate': 0.0
}]

optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr)
# optimizer = apex.optimizers.FusedLAMB(optimizer_grouped_parameters, lr=args.lr)

model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
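# Per apex's documented usage, amp.initialize is applied to the model and
# optimizer before the model is wrapped in DistributedDataParallel; opt_level
# "O1" enables mixed precision by patching PyTorch functions.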

model = DistributedDataParallel(model)

log_dir = os.path.join(args.save_path, 'logs')
model_dir = os.path.join(args.save_path, 'models')

if not os.path.exists(log_dir):
    mkdir_p(log_dir)

if not os.path.exists(model_dir):
    mkdir_p(model_dir)
Example #6
def train(args):
    device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
    #### Load data
    # create the data and its corresponding datasets and dataloader
    train_data, num_labels = create_data(args.train, 'train')
    dev_data = create_data(args.dev, 'valid')

    train_dataset = BertDataset(train_data, args)
    dev_dataset = BertDataset(dev_data, args)

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=args.batch_size,
                                  collate_fn=train_dataset.collate_fn)
    dev_dataloader = DataLoader(dev_dataset,
                                shuffle=False,
                                batch_size=args.batch_size,
                                collate_fn=dev_dataset.collate_fn)

    #### Init model
    config = {
        'hidden_dropout_prob': args.hidden_dropout_prob,
        'num_labels': num_labels,
        'hidden_size': 768,
        'data_dir': '.',
        'option': args.option
    }

    config = SimpleNamespace(**config)

    # initialize the Sentence Classification Model
    model = BertSentClassifier(config)
    model = model.to(device)

    lr = args.lr
    ## specify the optimizer
    optimizer = AdamW(model.parameters(), lr=lr)
    best_dev_acc = 0

    ## run for the specified number of epochs
    for epoch in range(args.epochs):
        model.train()
        train_loss = 0
        num_batches = 0
        for step, batch in enumerate(
                tqdm(train_dataloader,
                     desc=f'train-{epoch}',
                     disable=TQDM_DISABLE)):
            b_ids, b_type_ids, b_mask, b_labels, b_sents = batch[0][
                'token_ids'], batch[0]['token_type_ids'], batch[0][
                    'attention_mask'], batch[0]['labels'], batch[0]['sents']

            b_ids = b_ids.to(device)
            b_mask = b_mask.to(device)
            b_labels = b_labels.to(device)

            optimizer.zero_grad()
            logits = model(b_ids, b_mask)
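            # F.nll_loss expects log-probabilities, so the classifier is
            # assumed to return log-softmax outputs.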
            loss = F.nll_loss(logits, b_labels.view(-1),
                              reduction='sum') / args.batch_size

            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            num_batches += 1

        train_loss = train_loss / (num_batches)

        train_acc, train_f1, *_ = model_eval(train_dataloader, model, device)
        dev_acc, dev_f1, *_ = model_eval(dev_dataloader, model, device)

        if dev_acc > best_dev_acc:
            best_dev_acc = dev_acc
            save_model(model, optimizer, args, config, args.filepath)

        print(
            f"epoch {epoch}: train loss :: {train_loss :.3f}, train acc :: {train_acc :.3f}, dev acc :: {dev_acc :.3f}"
        )
Example #7
    if args.model == 'resnet':
        model = ResNet18(num_classes=10)

    args.model += f"_{args.optimizer}"
    if args.do_scheduler:
        args.model += "_cosine"

    model.to(device)
    if args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=0.001,
                               betas=(0.9, 0.999))

    if args.optimizer == 'adamw':
        optimizer = AdamW(model.parameters(),
                          lr=0.001,
                          betas=(0.9, 0.999),
                          weight_decay=0.001)

    if args.optimizer == 'novograd':
        optimizer = NovoGrad(model.parameters(),
                             lr=0.01,
                             betas=(0.95, 0.98),
                             weight_decay=0.001)

    train_monitor = TrainingMonitor(file_dir='./png', arch=args.model)
    if args.do_scheduler:
        lr_scheduler = CosineAnnealingLR(optimizer,
                                         epochs * len(loaders['train']), 1e-4)

    for epoch in range(1, epochs + 1):
        if args.do_scheduler:
Example #8
def train(args, network, train_itr, dev_itr):
    logger.info("Start training.")

    num_train_steps = int(args.max_epoch * len(train_itr) /
                          args.gradient_accumulation_steps)
    logger.info("Num update steps {}!".format(num_train_steps))

    start_epoch, best_result = 1, 0.0

    metrics = {key: AverageMeter() for key in ['loss', 'f1', 'em']}

    def reset_metrics():
        for metric in metrics.values():
            metric.reset()

    def update_metrics(result):
        for key in metrics.keys():
            metrics[key].update(result[key])

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in network.bert.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.bert_weight_decay,
        'lr':
        args.bert_learning_rate
    }, {
        'params': [
            p for n, p in network.bert.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0,
        'lr':
        args.bert_learning_rate
    }, {
        'params': [
            p for n, p in network.named_parameters()
            if not n.startswith("bert.")
        ],
        "weight_decay":
        args.weight_decay,
        "lr":
        args.learning_rate
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      warmup=args.warmup,
                      t_total=num_train_steps,
                      max_grad_norm=args.grad_clipping,
                      schedule=args.warmup_schedule)

    update_cnt, step = 0, 0
    train_start = datetime.now()
    save_prefix = os.path.join(args.save_dir, "checkpoint_best")

    for epoch in range(start_epoch, args.max_epoch + 1):
        logger.info('Start epoch {}'.format(epoch))

        reset_metrics()

        for batch in train_itr:
            step += 1
            network.train()
            output_dict = network(**batch)
            loss = output_dict["loss"]
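            # Scale the loss so gradients accumulated over several
            # micro-batches average to a full-batch gradient.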
            if args.gradient_accumulation_steps > 1:
                loss /= args.gradient_accumulation_steps
            loss.backward()

            current_metrics = network.get_metrics(True)
            current_metrics['loss'] = output_dict["loss"]
            update_metrics(current_metrics)

            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                update_cnt += 1

            if update_cnt % (
                    args.log_per_updates *
                    args.gradient_accumulation_steps) == 0 or update_cnt == 1:
                logger.info(
                    "QDGAT train: step:{0:6} loss:{1:.5f} f1:{2:.5f} em:{3:.5f} left:{4}"
                    .format(
                        update_cnt, metrics['loss'].avg, metrics['f1'].avg,
                        metrics['em'].avg,
                        str((datetime.now() - train_start) / (update_cnt + 1) *
                            (num_train_steps - update_cnt - 1)).split('.')[0]))
                reset_metrics()

        if args.do_eval:
            eval_loss, eval_f1, eval_em = evaluate(args, network, dev_itr)
            logger.info("Epoch {} eval result, loss {}, f1 {}, em {}.".format(
                epoch, eval_loss, eval_f1, eval_em))

        if args.do_eval and eval_f1 > best_result:

            save(args, network, optimizer, save_prefix, epoch, best_result)
            best_result = eval_f1
            logger.info("Best eval F1 {} at epoch {}".format(
                best_result, epoch))

    logger.info("Train cost {}s.".format(
        (datetime.now() - train_start).seconds))