Example No. 1
def get_param_weight_decay_dict(param_group_name_list):
    return [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in param_group_name_list)
        ],
        "weight_decay":
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in param_group_name_list)
        ],
        'weight_decay':
        0.0
    }]
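The parameter groups built above are typically handed straight to an optimizer. A minimal, self-contained sketch of that usage (the TinyModel class, the no_decay names, and the 0.01 decay value are illustrative stand-ins for the surrounding script's model and args):

import torch
import torch.nn as nn

class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(16, 32)
        self.LayerNorm = nn.LayerNorm(32)
        self.out = nn.Linear(32, 4)

    def forward(self, x):
        return self.out(self.LayerNorm(self.linear(x)))

model = TinyModel()
no_decay = ['bias', 'LayerNorm.weight']  # names commonly exempted from weight decay
param_groups = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = torch.optim.AdamW(param_groups, lr=3e-5)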
Example No. 2
def save_layer_fig(model, exam_id, org_input, org_target, prediction,
                   slice_level_performance, result_dir):
    result_exam_dir = os.path.join(result_dir, exam_id)
    if not os.path.exists(result_exam_dir):
        os.makedirs(result_exam_dir)

    for name, param in model.named_parameters():
        print(name, '\t\t', param.shape)
Example No. 3
def model_save(fn, all_model=1, model_para=0):
    if all_model:
        with open(fn, 'wb') as f:
            torch.save([model, optimizer], f)
    if model_para:
        para_dic = {}
        for name, para in model.named_parameters():
            para_dic[name] = para.clone().cpu().data.numpy()
        with open(fn, 'wb') as f:
            pickle.dump(para_dic, f)
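A loading counterpart is not shown in this example; a minimal sketch, assuming the pickle file holds the {name: numpy array} dict saved above (the function name model_load_params is hypothetical):

import pickle
import torch

def model_load_params(fn, model):
    # Restore parameters that were pickled as a {name: numpy array} dict.
    with open(fn, 'rb') as f:
        para_dic = pickle.load(f)
    with torch.no_grad():
        for name, para in model.named_parameters():
            if name in para_dic:
                para.copy_(torch.from_numpy(para_dic[name]))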
Example No. 4
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    criterion = nn.CrossEntropyLoss()
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        logits, hidden = model(data, hidden)
        loss_values = model.calculate_loss_values(logits, targets)
        loss_values_data = tuple(loss_value.item() for loss_value in loss_values)
        cross_entropy_val, center_loss_val = loss_values_data
        cross_entropy_loss = loss_values[0]
        center_loss = loss_values[1]
        if center_loss_factor > 0:
            train_loss = cross_entropy_loss + center_loss_factor*center_loss
        else:
            train_loss = cross_entropy_loss
        train_loss.backward()

        train_loss_val = train_loss.item()
        perplexity_val = math.exp(cross_entropy_val)

        train_metrics['train_loss'].append(train_loss_val)
        train_metrics['center_loss'].append(center_loss_val)
        train_metrics['cross_entropy'].append(cross_entropy_val)
        train_metrics['perplexity'].append(perplexity_val)

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for name, p in model.named_parameters():
            if p.requires_grad:
                p.data.add_(p.grad, alpha=-lr)  # manual SGD step: p -= lr * grad

        total_loss += cross_entropy_val

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'ce loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            print('| train loss: {:.3f} | center loss: {:.3f} | cross entropy: {:.3f} | '
                  'perplexity: {:.3f}'.format(train_loss_val, center_loss_val, 
                  cross_entropy_val, perplexity_val))

            total_loss = 0
            start_time = time.time()
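repackage_hidden is assumed by this training loop; the usual implementation (as in the PyTorch word-language-model example, and shown truncated in Example No. 23 below) detaches the hidden state tensors recursively:

import torch

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)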
Example No. 5
def print_model_param_info(model):
    print('[Model parameters to train]')
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name, param.data)  # param.data

    # for param in model.parameters():
    # 	if param.requires_grad:
    # 		print(param.name, param.size())
    # 		print(param.data)

    return
Example No. 6
def trainable_params(model, feature_extract):
    """
    Prints and returns all the trainable parameters in model.

    :param model: the model instance
    :param feature_extract: if True, only params with *requires_grad* will be returned.
    :return: a list containing the model's trainable params.
    """
    params_to_update = model.parameters()
    print("Params to learn:")
    if feature_extract:
        params_to_update = []
        for name, param in model.named_parameters():
            if param.requires_grad:
                params_to_update.append(param)
                print("\t", name)
    else:
        for name, param in model.named_parameters():
            if param.requires_grad:
                print("\t", name)
    return params_to_update
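When inspecting trainable parameters it is often more useful to report counts than raw tensors; a small self-contained sketch (the helper name count_parameters is illustrative, not part of the original script):

def count_parameters(model):
    # Returns (trainable, total) parameter counts for a torch.nn.Module.
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    return trainable, total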
Example No. 7
def finetune(cur_iter, cur_learning_rate):
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)

    print("cur learning rate = ", cur_learning_rate)
    print("iteration", cur_iter)

    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):

        data, targets = get_batch(train_data, i)
        # print(data.size())
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset
        # (previous batches).

        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for name, p in model.named_parameters():
            if p.requires_grad:
                # discriminative fine-tuning: each parameter gets its own learning rate
                p.data.add_(p.grad, alpha=-get_discriminative_lr(name, cur_learning_rate))

        total_loss += loss.item()

        cur_learning_rate = scheduled_slanted_lr(cur_iter, T, args.cut_frac,
                                                 args.ratio, args.lr)
        cur_iter += 1
        #print(batch)
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:04.4f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    len(train_data) // args.bptt, cur_learning_rate,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
    return cur_iter, cur_learning_rate
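scheduled_slanted_lr and get_discriminative_lr are not shown in this snippet; a sketch of the slanted triangular schedule in the spirit of ULMFiT (Howard & Ruder), assuming t is the current update, T the total number of updates, cut_frac the warm-up fraction, ratio the max/min learning-rate ratio, and lr_max the peak learning rate. The project's own helper may differ:

def scheduled_slanted_lr(t, T, cut_frac, ratio, lr_max):
    # Slanted triangular learning rate: linear warm-up for cut_frac * T updates,
    # then linear decay down to roughly lr_max / ratio.
    cut = max(1, int(T * cut_frac))
    if t < cut:
        p = t / cut
    else:
        p = 1 - (t - cut) / (cut * (1 / cut_frac - 1))
    return lr_max * (1 + p * (ratio - 1)) / ratio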
Example No. 8
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)

        orth_reg = 0.0

        for name, param in model.named_parameters():
            if param.requires_grad and 'U' in name:
                d = min(list(param.shape))
                diff = torch.matmul(param.t(), param) - torch.eye(d).cuda()
                orth_reg += torch.sum(diff**2) / (d * d)
            elif param.requires_grad and 'V' in name:
                d = min(list(param.shape))
                diff = torch.matmul(param, param.t()) - torch.eye(d).cuda()
                orth_reg += torch.sum(diff**2) / (d * d)

        total = loss + args.od * orth_reg
        total.backward()
        model.rnn.svd_grad()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.4f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
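The orthogonality penalty computed inline above can be factored into a small device-agnostic helper; a sketch (orthogonal_penalty is an illustrative name, not part of the original script):

import torch

def orthogonal_penalty(weight):
    # Penalizes ||W^T W - I||^2 / d^2 (or W W^T for wide matrices),
    # encouraging the smaller dimension of W to stay orthonormal.
    d = min(weight.shape)
    if weight.shape[0] >= weight.shape[1]:
        gram = weight.t() @ weight
    else:
        gram = weight @ weight.t()
    diff = gram - torch.eye(d, device=weight.device, dtype=weight.dtype)
    return torch.sum(diff ** 2) / (d * d)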
Example No. 9
    def unfreeze_rln(self):
        """
        Method to unfreeze RLN parameters
        """
        if isinstance(self.net, nn.DataParallel):
            model = self.net.module.model
        else:
            model = self.net.model

        for name, param in model.named_parameters():
            param.learn = True

        for name, param in model.bert.named_parameters():
            param.learn = True
Example No. 10
def test(epoch):
    global best_acc
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_index, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            # Forward
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            # Results
            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
        print('Loss: %.3f | Accuracy: %.3f' %
              (test_loss / 100, 100. * correct / total))

    # Save the model
    acc = 100. * correct / total
    if acc > best_acc:
        print('Saving..')
        state = {
            'net': model.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        torch.save(state, './checkpoint/Baseline.ckpt')
        best_acc = acc

    # Plot the model
    info = {'loss': test_loss, 'accuracy': acc}
    for tag, value in info.items():
        logger.scalar_summary(tag, value, epoch + 1)
    for tag, value in model.named_parameters():
        tag = tag.replace('.', '/')
        logger.histo_summary(tag, value.data.cpu().numpy(), epoch + 1)
        logger.histo_summary(tag + '/grad',
                             value.grad.data.cpu().numpy(), epoch + 1)
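The custom logger used above exposes scalar_summary and histo_summary; with torch.utils.tensorboard the equivalent calls would be SummaryWriter.add_scalar and add_histogram. A minimal sketch of that swap (not the original logger):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('runs/baseline')

def log_epoch(model, info, epoch):
    for tag, value in info.items():
        writer.add_scalar(tag, value, epoch + 1)
    for name, param in model.named_parameters():
        tag = name.replace('.', '/')
        writer.add_histogram(tag, param.data.cpu().numpy(), epoch + 1)
        if param.grad is not None:  # gradients may be absent right after evaluation
            writer.add_histogram(tag + '/grad', param.grad.data.cpu().numpy(), epoch + 1)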
Example No. 11
def train(lr, epoch = 0):
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        # if args.admm:
        if stage == 'admm':
            ce_loss = loss
            admm.admm_update(args, ADMM, model, None, None, None, epoch, None, batch)  # update Z and U
            ce_loss, admm_loss, mixed_loss = admm.append_admm_loss(args, ADMM, model, ce_loss)  # append ADMM loss
            loss = mixed_loss
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        if stage == 'masked_retrain':
            for name,W in model.named_parameters():
                if name in config.masks:
                    W.grad.data *= config.masks[name]
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
Example No. 12
def eval_smooth(prev_model, model, num_pts=1):
    alphas = np.arange(1, num_pts + 1) / (num_pts + 1)
    gnorm = eval_grad(prev_model)
    update_size = utils.norm_diff(utils.get_model_params(model), \
                                  utils.get_model_params(prev_model))
    max_smooth = -1
    for alpha in alphas:
        new_model = copy.deepcopy(prev_model)

        target_params = dict(model.named_parameters())
        for n, p in new_model.named_parameters():
            p.data = alpha * p.data + (1 - alpha) * target_params[n].data

        eval_grad(new_model)
        smooth = utils.norm_diff(
            utils.get_model_grads(new_model),
            utils.get_model_grads(prev_model)) / (update_size * (1 - alpha))
        max_smooth = max(smooth, max_smooth)

    return max_smooth, gnorm
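The helpers referenced above (utils.get_model_params, utils.get_model_grads, utils.norm_diff) and eval_grad, which is assumed to run a backward pass and return the gradient norm, are not shown; a minimal sketch consistent with how they are used here (names and behavior are inferred, so the project's own versions may differ):

import torch

def get_model_params(model):
    return [p.detach().clone() for p in model.parameters()]

def get_model_grads(model):
    return [p.grad.detach().clone() if p.grad is not None else torch.zeros_like(p)
            for p in model.parameters()]

def norm_diff(list_a, list_b):
    # L2 norm of the elementwise difference between two parameter/gradient lists.
    return torch.sqrt(sum(torch.sum((a - b) ** 2) for a, b in zip(list_a, list_b))).item()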
Example No. 13
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        #print(output.size())
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        #print(model.state_dict())
        #print("Done***")

        for n, p in model.named_parameters():
            p.data.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | perplexity {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
Example No. 14
def train(epoch, optimizer, compression_scheduler=None):
    # Turn on training mode which enables dropout.
    model.train()

    total_samples = train_data.size(0)
    steps_per_epoch = math.ceil(total_samples / args.bptt)

    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    # The line below was fixed as per: https://github.com/pytorch/examples/issues/214
    for batch, i in enumerate(range(0, train_data.size(0), args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)

        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(
                epoch,
                minibatch_id=batch,
                minibatches_per_epoch=steps_per_epoch)
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)

        if compression_scheduler:
            # Before running the backward phase, we add any regularization loss computed by the scheduler
            regularizer_loss = compression_scheduler.before_backward_pass(
                epoch,
                minibatch_id=batch,
                minibatches_per_epoch=steps_per_epoch,
                loss=loss)
            loss += regularizer_loss

        optimizer.zero_grad()
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        total_loss += loss.item()

        if compression_scheduler:
            compression_scheduler.on_minibatch_end(
                epoch,
                minibatch_id=batch,
                minibatches_per_epoch=steps_per_epoch)

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            lr = optimizer.param_groups[0]['lr']
            msglogger.info(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.4f} | ms/batch {:5.2f} '
                '| loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
            stats = ('Performance/Training/',
                     OrderedDict([('Loss', cur_loss),
                                  ('Perplexity', math.exp(cur_loss)),
                                  ('LR', lr), ('Batch Time', elapsed * 1000)]))
            steps_completed = batch + 1
            distiller.log_training_progress(stats, model.named_parameters(),
                                            epoch, steps_completed,
                                            steps_per_epoch, args.log_interval,
                                            [tflogger])
Example No. 15
    }

    training_loader = DataLoader(training_set, **train_params)
    testing_loader = DataLoader(testing_set, **test_params)

    # GPU check and setting the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print(torch.cuda.get_device_name(0))

    # Object of RobertaBaseClass and setting to device
    model = model.RobertaBaseClass()
    model.to(device)

    # Model parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.001,
        },
        {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0,
Example No. 16
def main():

    dataframe = pd.read_csv('../input/imdb.csv')  # load dataframe
    dataframe.sentiment = dataframe.sentiment.apply(lambda x: 1
                                                    if x == 'positive' else 0)
    # sentiment is a categorical target variable, so we label encode it; this can be done by hand as above, or with sklearn.preprocessing.LabelEncoder

    # now split data into validation and training

    df_train, df_valid = model_selection.train_test_split(
        dataframe,
        test_size=0.1,  # 10 percent of the dataframe is held out for validation
        random_state=42,  # fixed random state so repeated runs produce the same split
        shuffle=True,  # shuffle indices
        stratify=dataframe.sentiment.values,  # keep the same class distribution in train and valid
    )

    df_train = df_train.reset_index(
        drop=True)  # we reset indices from 0 to len(df_train)
    df_valid = df_valid.reset_index(
        drop=True)  # we reset indices from 0 to len(df_valid)

    # make datasets with our class in order to make data loaders
    training_dataset = dataset.BERTdataset(review=df_train.review.values,
                                           sentiment=df_train.sentiment.values)
    # from dataset to dataloader
    training_data_loader = torch.utils.data.DataLoader(
        dataset=training_dataset,
        batch_size=config.TRAINING_BATCH_SIZE,
        shuffle=True,
        num_workers=4)

    validation_dataset = dataset.BERTdataset(
        review=df_valid.review.values,
        sentiment=df_valid.sentiment.values,
    )
    # from dataset to dataloader
    validation_data_loader = torch.utils.data.DataLoader(
        dataset=validation_dataset,
        batch_size=config.VALIDATION_BATCH_SIZE,
        shuffle=False,
        num_workers=4)

    device = torch.device('cuda')
    model = model.BERTsentiment()
    model.to(device)  # move model to cuda device
    # params to optimize
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.001
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.00
    }]

    number_of_training_steps = int(
        len(df_train) / config.TRAINING_BATCH_SIZE * config.EPOCHS)
    # AdamW applies decoupled weight decay, which often improves generalization
    optimizer = AdamW(params=optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=number_of_training_steps,
    )

    best_accuracy = 0

    for epoch in range(config.EPOCHS):
        print('EPOCH:', epoch + 1)
        engine.training_loop(training_data_loader, model, optimizer, scheduler,
                             device)
        outputs, sentiments = engine.validation_loop(validation_data_loader,
                                                     model, device)
        # distribution is 50 50 so we can use acc score
        outputs = np.array(outputs) >= 0.5  # positive class
        accuracy = metrics.accuracy_score(sentiments, outputs)
        print('ACCURACY SCORE', accuracy)

        if accuracy > best_accuracy:
            torch.save(model.state_dict(),
                       config.MODEL_PATH)  # save model in working dir
            best_accuracy = accuracy
Example No. 17
def main():
    """
    Training and validation.
    """
    global epochs_since_improvement, start_epoch, label_map, best_loss, epoch, checkpoint

    # Initialize model or load checkpoint
    if checkpoint is None:
        model = SSD300(n_classes=n_classes)
        # Initialize the optimizer, with twice the default learning rate for biases, as in the original Caffe repo
        biases = list()
        not_biases = list()
        for param_name, param in model.named_parameters():
            if param.requires_grad:
                if param_name.endswith('.bias'):
                    biases.append(param)
                else:
                    not_biases.append(param)
        optimizer = torch.optim.SGD(params=[{
            'params': biases,
            'lr': 2 * lr
        }, {
            'params': not_biases
        }],
                                    lr=lr,
                                    momentum=momentum,
                                    weight_decay=weight_decay)

    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        best_loss = checkpoint['best_loss']
        print(
            '\nLoaded checkpoint from epoch %d. Best loss so far is %.3f.\n' %
            (start_epoch, best_loss))
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']

    # Move to default device
    model = model.to(device)
    criterion = MultiBoxLoss(priors_cxcy=model.priors_cxcy).to(device)

    # Custom dataloaders
    train_dataset = PascalVOCDataset(data_folder, split='train')
    val_dataset = PascalVOCDataset(data_folder, split='test')

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=train_dataset.collate_fn,
        num_workers=workers,
        pin_memory=True)  # note that we're passing the collate function here
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             collate_fn=val_dataset.collate_fn,
                                             num_workers=workers,
                                             pin_memory=True)
    # Epochs
    for epoch in range(start_epoch, epochs):
        # Paper describes decaying the learning rate at the 80000th, 100000th, 120000th 'iteration', i.e. model update or batch
        # The paper uses a batch size of 32, which means there were about 517 iterations in an epoch
        # Therefore, to find the epochs to decay at, you could do,
        # if epoch in {80000 // 517, 100000 // 517, 120000 // 517}:
        #     adjust_learning_rate(optimizer, 0.1)

        # In practice, I just decayed the learning rate when loss stopped improving for long periods,
        # and I would resume from the last best checkpoint with the new learning rate,
        # since there's no point in resuming at the most recent and significantly worse checkpoint.
        # So, when you're ready to decay the learning rate, just set checkpoint = 'BEST_checkpoint_ssd300.pth.tar' above
        # and have adjust_learning_rate(optimizer, 0.1) BEFORE this 'for' loop

        # One epoch's training
        train(train_loader=train_loader,
              model=model,
              criterion=criterion,
              optimizer=optimizer,
              epoch=epoch)

        # One epoch's validation
        val_loss = validate(val_loader=val_loader,
                            model=model,
                            criterion=criterion)

        # Did validation loss improve?
        is_best = val_loss < best_loss
        best_loss = min(val_loss, best_loss)

        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" %
                  (epochs_since_improvement, ))

        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, optimizer,
                        val_loss, best_loss, is_best)
Example No. 18
def run():
    df_train1 = pd.read_csv(config.TRAIN_PATH_1, usecols=['comment_text', 'toxic']).fillna('none')
    df_train2 = pd.read_csv(config.TRAIN_PATH_2, usecols=['comment_text', 'toxic']).fillna('none')
    
    df_train_full = pd.concat([df_train1, df_train2], axis=0).reset_index(drop=True)

    df_train_full = df_train_full.sample(frac=1).reset_index(drop=True).head(400000)
    
    df_valid = pd.read_csv(config.VALID_PATH)

    tokenizer = config.tokenizer
    
    train_targets = df_train_full.toxic.values
    valid_targets = df_valid.toxic.values

    train_dataset = dataset.BERTDatasetTraining(
        comment_text=df_train_full.comment_text.values,
        targets=train_targets,
        tokenizer=tokenizer, 
        max_length=config.MAX_LEN
    )

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, 
        num_replicas=xm.xrt_world_size(), 
        rank=xm.get_ordinal(),
        shuffle=True
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, 
        batch_size=config.TRAIN_BATCH_SIZE, 
        sampler=train_sampler, 
        drop_last=True, 
        num_workers=4
    )

    valid_dataset = dataset.BERTDatasetTraining(
        comment_text=df_valid.comment_text.values,
        targets=valid_targets,
        tokenizer=config.tokenizer,
        max_length=config.MAX_LEN
    )
    
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_dataset, 
        num_replicas=xm.xrt_world_size(), 
        rank=xm.get_ordinal(), 
        shuffle=False
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, 
        batch_size=config.VALID_BATCH_SIZE, 
        sampler=valid_sampler, 
        drop_last=True, 
        num_workers=4 
    )

    device = xm.xla_device()
    model = BERTBaseUncased(bert_path=config.MODEL_PATH).to(device)
    
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    
    optimizer_grouped_parameters = [
        {'params':[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay':0.01}, 
        {'params':[p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':0.0}
    ]

    lr = 3e-5 * xm.xrt_world_size()
    num_train_steps = int(len(train_dataset) / config.TRAIN_BATCH_SIZE / xm.xrt_world_size() * config.EPOCHS)
    xm.master_print(f'num_train_steps={num_train_steps}, world_size={xm.xrt_world_size()}')
    
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    # get_constant_schedule_with_warmup only takes num_warmup_steps (no total-step argument)
    scheduler = get_constant_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0
    )

    for epoch in range(config.EPOCHS):
        para_loader = pl.ParallelLoader(train_data_loader, [device])
        engine.train_fn(para_loader.per_device_loader(device), model,
                        optimizer, device, scheduler=scheduler)
        
        para_loader = pl.ParallelLoader(valid_data_loader, [device])
        o, t = engine.eval_fn(para_loader.per_device_loader(device), model, device)
        xm.save(model.state_dict(), 'model.bin')
        auc = metrics.roc_auc_score(np.array(t) >= 0.5, o)
        xm.master_print(f'AUC ={auc}')
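run() above targets a single XLA device; on a multi-core TPU it is usually launched through torch_xla's multiprocessing spawner. A minimal launcher sketch, assuming torch_xla is installed (the wrapper name _mp_fn is conventional and not part of the original script):

import torch_xla.distributed.xla_multiprocessing as xmp

def _mp_fn(rank, flags):
    # Each TPU core runs its own copy of run(); xm.xla_device() inside run()
    # picks up the per-process device.
    run()

if __name__ == '__main__':
    xmp.spawn(_mp_fn, args=({},), nprocs=8, start_method='fork')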
Example No. 19
            elapsed = time.time() - start_time
            logging.info(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    len(train_data) // args.bptt,
                    optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        batch += 1
        i += seq_len


for k, v in model.named_parameters():
    print(k)
    print(v)
# Loop over epochs.
lr = args.lr
best_val_loss = []
stored_loss = 100000000

# At any point you can hit Ctrl + C to break out of training early.
try:
    if args.continue_train:
        optimizer_state = torch.load(os.path.join(args.save, 'optimizer.pt'))
        if 't0' in optimizer_state['param_groups'][0]:
            optimizer = torch.optim.ASGD(model.parameters(),
                                         lr=args.lr,
                                         t0=0,
Example No. 20
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()

        noises = {}
        if args.sharpness_smoothing:
            for key, p in model.named_parameters():
                if hasattr(model, 'quiet_parameters') and key in model.quiet_parameters:
                    continue

                if args.adapt_type == 'weight':
                    noise = (torch.empty_like(p).uniform_() * 2. -
                             1.) * args.sharpness_smoothing * torch.abs(p.data)
                elif args.adapt_type == 'none':
                    noise = (torch.empty_like(p).uniform_() * 2. -
                             1.) * args.sharpness_smoothing
                else:
                    raise ValueError('Unknown --adapt-type')
                noises[key] = noise
                p.data.add_(noise)

        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # denoise @ each mini-mini-batch.
        if args.sharpness_smoothing:
            for key, p in model.named_parameters():
                if key in noises:
                    p.data.sub_(noises[key])

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
Example No. 21
        retraction=opt.retraction,
        lr=opt.lr * hvd.size(),
    )

    print(f'Size of hvd process : {hvd.size()}')

    # optimizer = Adam(
    #     model.parameters(),
    #     lr=opt.lr * hvd.size(),
    # )

    lr = opt.lr

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())

    _lr_multiplier = 0.1
    NoDisplayObj = 66  # Number of entities to display on the graph

    # scheduler = MultiStepLR(optimizer, milestones=[opt.burnin]+list(range(int(opt.epochs/10), opt.epochs, int(opt.epochs/10))), gamma=_lr_multiplier)
    # scheduler = StepLR(optimizer, step_size=10, gamma=0.9)

    # Broadcast parameters from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    loader = th.utils.data.DataLoader(
        data,
        batch_size=opt.batchsize,
        #shuffle=True,
        num_workers=opt.ndproc,
Example No. 22
    val_loss2 = evaluate(val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
              epoch, (time.time() - epoch_start_time), val_loss2,
              math.exp(val_loss2), val_loss2 / math.log(2)))
    print('-' * 89)

    print("MAX EPOCH = ", args.epochs + 1)
    for epoch in range(args.start, args.epochs + 1):
        epoch_start_time = time.time()
        train(epoch)
        print("TRAIN FINISHED")
        if 't0' in optimizer.param_groups[0]:
            tmp = {}
            for param_name, prm in model.named_parameters():
                tmp[param_name] = prm.data.clone()
                try:
                    prm.data = optimizer.state[prm]['ax'].clone()
                except KeyError:
                    pass
            val_loss2 = evaluate(val_data)
            print('-' * 89)
            print(
                '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                    epoch, (time.time() - epoch_start_time), val_loss2,
                    math.exp(val_loss2), val_loss2 / math.log(2)))
            print('-' * 89)

            if epoch % 30 == 0:
Example No. 23
# Make results dir
out_dir = os.path.join(args.result_dir, args.name)
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = model.RNNModel(args.class_type, args.r, args.model, ntokens,
                       args.emsize, args.nhid, args.nlayers, args.dropout,
                       args.tied).to(device)

# Print params
for name, param in model.named_parameters():
    if param.requires_grad:
        print(('Parameter name, shape: ', name, param.data.shape))

criterion = nn.CrossEntropyLoss()

###############################################################################
# Training code
###############################################################################


def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
Example No. 24
def train_model(model, trainds, testds, config, device, writer=None):
    batch_size = config['data']['batch_size']
    status = config['training']['status']
    epochs = config['training']['epochs']
    balanced_loss = config['loss']['balanced']
    # nval = config['nval']
    nval_tests = config['nval_tests']
    nsave = config['nsave']
    model_save = config['model_save']
    rank = config['rank']
    nranks = config['nranks']
    hvd = config['hvd']
    num_classes = config['data']['num_classes']

    ## create samplers for these datasets
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        trainds, nranks, rank, shuffle=True, drop_last=True)
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        testds, nranks, rank, shuffle=True, drop_last=True)

    ## create data loaders
    train_loader = torch.utils.data.DataLoader(
        trainds,
        shuffle=False,
        sampler=train_sampler,
        num_workers=config['data']['num_parallel_readers'],
        batch_size=batch_size,
        persistent_workers=True)
    test_loader = torch.utils.data.DataLoader(
        testds,
        shuffle=False,
        sampler=test_sampler,
        num_workers=config['data']['num_parallel_readers'],
        batch_size=batch_size,
        persistent_workers=True)

    loss_func = loss.get_loss(config)
    ave_loss = CalcMean.CalcMean()
    acc_func = accuracy.get_accuracy(config)
    ave_acc = CalcMean.CalcMean()

    opt_func = optimizer.get_optimizer(config)
    opt = opt_func(model.parameters(), **config['optimizer']['args'])

    lrsched_func = optimizer.get_learning_rate_scheduler(config)
    lrsched = lrsched_func(opt, **config['lr_schedule']['args'])

    # Add Horovod Distributed Optimizer
    if hvd:
        opt = hvd.DistributedOptimizer(
            opt, named_parameters=model.named_parameters())

        # Broadcast parameters from rank 0 to all other processes.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    model.to(device)

    for epoch in range(epochs):
        logger.info(' epoch %s of %s', epoch, epochs)

        train_sampler.set_epoch(epoch)
        test_sampler.set_epoch(epoch)
        model.to(device)
        for batch_counter, (inputs, targets, class_weights,
                            nonzero_mask) in enumerate(train_loader):

            # move data to device
            inputs = inputs.to(device)
            targets = targets.to(device)
            class_weights = class_weights.to(device)
            nonzero_mask = nonzero_mask.to(device)

            # zero grads
            opt.zero_grad()
            outputs, endpoints = model(inputs)

            # set the weights
            if balanced_loss:
                weights = class_weights
                nonzero_to_class_scaler = torch.sum(
                    nonzero_mask.type(torch.float32)) / torch.sum(
                        class_weights.type(torch.float32))
            else:
                weights = nonzero_mask
                nonzero_to_class_scaler = torch.ones(1, device=device)

            loss_value = loss_func(outputs, targets.long())
            loss_value = torch.mean(
                loss_value * weights) * nonzero_to_class_scaler

            # backward calc grads
            loss_value.backward()

            # apply grads
            opt.step()

            ave_loss.add_value(float(loss_value.to('cpu')))

            # calc acc
            ave_acc.add_value(
                float(acc_func(outputs, targets, weights).to('cpu')))

            # print statistics
            if batch_counter % status == 0:

                logger.info(
                    '<[%3d of %3d, %5d of %5d]> train loss: %6.4f acc: %6.4f',
                    epoch + 1, epochs, batch_counter,
                    len(trainds) / nranks / batch_size, ave_loss.mean(),
                    ave_acc.mean())

                if writer and rank == 0:
                    global_batch = epoch * len(
                        trainds) / nranks / batch_size + batch_counter
                    writer.add_scalars('loss', {'train': ave_loss.mean()},
                                       global_batch)
                    writer.add_scalars('accuracy', {'train': ave_acc.mean()},
                                       global_batch)
                    #writer.add_histogram('input_trans',endpoints['input_trans'].view(-1),global_batch)

                ave_loss = CalcMean.CalcMean()
                ave_acc = CalcMean.CalcMean()

            # release tensors for memory
            del inputs, targets, weights, endpoints, loss_value

            if config['batch_limiter'] and batch_counter > config[
                    'batch_limiter']:
                logger.info('batch limiter enabled, stop training early')
                break

        # save at end of epoch
        torch.save(model.state_dict(),
                   model_save + '_%05d.torch_model_state_dict' % epoch)

        if nval_tests == -1:
            nval_tests = len(testds) / nranks / batch_size
        logger.info('epoch %s complete, running validation on %s batches',
                    epoch, nval_tests)

        model.to(device)
        # every epoch, evaluate validation data set
        with torch.no_grad():

            vloss = CalcMean.CalcMean()
            vacc = CalcMean.CalcMean()

            vious = [CalcMean.CalcMean() for i in range(num_classes)]

            for valid_batch_counter, (inputs, targets, class_weights,
                                      nonzero_mask) in enumerate(test_loader):

                inputs = inputs.to(device)
                targets = targets.to(device)
                class_weights = class_weights.to(device)
                nonzero_mask = nonzero_mask.to(device)

                # set the weights
                if balanced_loss:
                    weights = class_weights
                    nonzero_to_class_scaler = torch.sum(
                        nonzero_mask.type(torch.float32)) / torch.sum(
                            class_weights.type(torch.float32))
                else:
                    weights = nonzero_mask
                    nonzero_to_class_scaler = torch.ones(1, device=device)

                outputs, endpoints = model(inputs)

                loss_value = loss_func(outputs, targets.long())
                loss_value = torch.mean(
                    loss_value * weights) * nonzero_to_class_scaler
                vloss.add_value(float(loss_value.to('cpu')))

                # calc acc
                vacc.add_value(
                    float(acc_func(outputs, targets, weights).to('cpu')))

                # calc ious
                ious = get_ious(outputs, targets, weights, num_classes)
                for i in range(num_classes):
                    vious[i].add_value(float(ious[i]))

                if valid_batch_counter > nval_tests:
                    break

            mean_acc = vacc.mean()
            mean_loss = vloss.mean()
            # if config['hvd'] is not None:
            #    mean_acc  = config['hvd'].allreduce(torch.tensor([mean_acc]))
            #    mean_loss = config['hvd'].allreduce(torch.tensor([mean_loss]))
            mious = float(
                torch.sum(torch.FloatTensor([x.mean()
                                             for x in vious]))) / num_classes
            ious_out = {
                'jet': vious[0].mean(),
                'electron': vious[1].mean(),
                'bkgd': vious[2].mean(),
                'all': mious
            }
            # add validation to tensorboard
            if writer and rank == 0:
                global_batch = epoch * len(
                    trainds) / nranks / batch_size + batch_counter
                writer.add_scalars('loss', {'valid': mean_loss}, global_batch)
                writer.add_scalars('accuracy', {'valid': mean_acc},
                                   global_batch)
                writer.add_scalars('IoU', ious_out, global_batch)

            logger.warning(
                '>[%3d of %3d, %5d of %5d]<<< ave valid loss: %6.4f ave valid acc: %6.4f on %s batches >>>',
                epoch + 1, epochs, batch_counter,
                len(trainds) / nranks / batch_size, mean_loss, mean_acc,
                valid_batch_counter + 1)
            logger.warning('      >> ious: %s', ious_out)

        # update learning rate
        lrsched.step()
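CalcMean.CalcMean() is an assumed running-mean helper; a minimal sketch consistent with the add_value()/mean() calls above (the project's own class may track more state):

class CalcMean:
    # Simple running mean: accumulate values with add_value(), read with mean().
    def __init__(self):
        self.total = 0.0
        self.count = 0

    def add_value(self, value):
        self.total += float(value)
        self.count += 1

    def mean(self):
        return self.total / self.count if self.count else 0.0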
Example No. 25
	T.RandomResizedCrop(base_model.input_side),
	T.RandomHorizontalFlip(),
	normalize
]), download = True)
dataset_eval = opts.dataset(opts.data, train = False, transform = transforms.Compose([
	T.Resize(256),
	T.CenterCrop(base_model.input_side),
	normalize
]), download = True)

adapt_sampler = lambda batch, dataset, sampler, **kwargs: type('', (torch.utils.data.Sampler, ), dict(__len__ = dataset.__len__, __iter__ = lambda _: itertools.chain.from_iterable(sampler(batch, dataset, **kwargs))))()
loader_train = torch.utils.data.DataLoader(dataset_train, sampler = adapt_sampler(opts.batch, dataset_train, opts.sampler), num_workers = opts.threads, batch_size = opts.batch, drop_last = True, pin_memory = True)
loader_eval = torch.utils.data.DataLoader(dataset_eval, shuffle = False, num_workers = opts.threads, batch_size = opts.batch, pin_memory = True)

model = opts.model(base_model, dataset_train.num_training_classes).cuda()
model_weights, model_biases, base_model_weights, base_model_biases = [[p for k, p in model.named_parameters() if p.requires_grad and ('bias' in k) == is_bias and ('base' in k) == is_base] for is_base in [False, True] for is_bias in [False, True]]

base_model_lr_mult = model.optimizer_params.pop('base_model_lr_mult', 1.0)
optimizer = model.optimizer([dict(params = base_model_weights, lr = base_model_lr_mult * model.optimizer_params['lr']), dict(params = base_model_biases, lr = base_model_lr_mult * model.optimizer_params['lr'], weight_decay = 0.0), dict(params = model_biases, weight_decay = 0.0)], **model.optimizer_params)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, **model.lr_scheduler_params)

log = open(opts.log, 'w', 1)  # line-buffered; unbuffered text mode is not allowed in Python 3
for epoch in range(opts.epochs):
	scheduler.step()
	model.train()
	loss_all, norm_all = [], []
	for batch_idx, batch in enumerate(loader_train if model.criterion is not None else []):
		tic = time.time()
		images, labels = [tensor.cuda() for tensor in batch]
		loss = model.criterion(model(images), labels)
		loss_all.append(float(loss))
Example No. 26
def run():
    df1 = pd.read_csv("../data/jigsaw-toxic-comment-train.csv",
                      usecols=["comment_text", "toxic"])
    df2 = pd.read_csv("../data/jigsaw-unintended-bias-train.csv",
                      usecols=["comment_text", "toxic"])

    df_train = pd.concat([df1, df2], axis=0).reset_index(drop=True)

    df_valid = pd.read_csv("../data/validation.csv")

    train_dataset = dataset.BERTDataset(
        comment_text=df_train.comment_text.values,
        target=df_train.toxic.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = dataset.BERTDataset(
        comment_text=df_valid.comment_text.values,
        target=df_valid.toxic.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model = nn.DataParallel(model)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        targets = np.array(targets) >= 0.5
        accuracy = metrics.roc_auc_score(targets, outputs)
        print(f"AUC Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
Example No. 27
                       0., 0., 0., 0., 0., 0., args.n_experts, args.emblocks, args.emdensity,
                       sparse_mode=args.sparse_mode,
                       sparse_fract=args.sparse_fract)

if args.cuda:
    if not args.multi_gpu:
        parallel_model = model.cuda()
    else:
        parallel_model = nn.DataParallel(model, dim=1).cuda()
else:
    parallel_model = model

logging('Args: {}'.format(args))

params_total, params_encoder, params_rnns = 0, 0, 0
for n, p in model.named_parameters():
    #print('param {}: {}'.format(n, p.nelement()))
    if 'encoder' in n:
        params_encoder += p.nelement()
    elif 'rnns' in n:
        params_rnns += p.nelement()
    params_total += p.nelement()
logging('params encoder: {}M'.format(params_encoder / 1.e6))
logging('params rnns: {}M'.format(params_rnns / 1.e6))
logging('params total: {}M'.format(params_total / 1.e6))

log_value('params rnn', params_rnns, 0)
log_value('params encoder', params_encoder, 0)
log_value('params total', params_total, 0)

#write out model
Example No. 28
def train(model, trainloader):
    if opt.use_cuda:
        model = model.cuda()
    model.train()
    criterion = torch.nn.CrossEntropyLoss()
    lr = opt.lr
    # optimizer = torch.optim.Adam(model.parameters(), lr, weight_decay=opt.weight_decay)
    weight_p = []
    bias_p = []
    for name, para in model.named_parameters():
        if 'bias' in name:
            bias_p += [para]
        else:
            weight_p += [para]
    optimizer = torch.optim.Adam([{
        'params': weight_p,
        'weight_decay': opt.weight_decay
    }, {
        'params': bias_p,
        'weight_decay': 0
    }],
                                 lr=lr)
    previous_loss = 1e10
    pEpoch = []
    pLoss = []

    for epoch in range(opt.epoch):
        loss_all = 0
        total_accuracy = 0
        for i, (input, target) in enumerate(trainloader):
            if opt.use_cuda:
                input = input.cuda()
                target = target.cuda()
            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()

            pred = torch.max(score, 1)[1]
            accuracy = float((pred == target).sum())
            accuracy = accuracy * 100 / input.size(0)
            # print((pred == target).sum(dim=0,keepdim=False))

            total_accuracy += accuracy
            loss_all += float(loss)

            if i % opt.printinter == 0:
                print("Epoch: ", epoch, "| Iter:", i, "| Loss:", float(loss),
                      "| Accuracy:", accuracy, "%")

        avgloss = loss_all / len(trainloader)
        avgaccuracy = total_accuracy / len(trainloader)
        print("the end of Epoch: ", epoch, "| AVGLoss:", avgloss,
              "| Accuracy:", avgaccuracy, "%")
        save(model, epoch)

        # plot
        pEpoch.append(epoch)
        pLoss.append(avgloss)
        plotlc(pEpoch, pLoss)
Example No. 29
def train():
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(args.batch_size)
    batch, i = 0, 0
    while i < train_data.size(1) - 1 - 1:

        #        seq_len=args.bptt

        lr2 = optimizer.param_groups[0]['lr']
        #        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()

        data, targets = get_batchlz(train_data, train_label, i,
                                    args.batch_size)

        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()
        '''Training step'''
        output, hidden, rnn_hs, dropped_rnn_hs = model(data,
                                                       None,
                                                       return_h=True)
        raw_loss = criterion(output, targets)

        wri.add_scalar('raw_loss', raw_loss, epoch * lzarg + batch)

        loss = raw_loss
        # Activiation Regularization
        if args.alpha:
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                              for dropped_rnn_h in dropped_rnn_hs[-1:])
        wri.add_scalar('alpha_loss', loss - raw_loss, epoch * lzarg + batch)
        # Temporal Activation Regularization (slowness)
        if args.beta:
            loss = loss + sum(args.beta *
                              (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                              for rnn_h in rnn_hs[-1:])
        wri.add_scalar('alphabeta_loss', loss - raw_loss,
                       epoch * lzarg + batch)

        loss.backward()

        #        wri.add_scalar('loss',loss,epoch*179+batch)

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        if args.clip: torch.nn.utils.clip_grad_norm_(params, args.clip)

        optimizer.step()

        total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss.item() / args.log_interval  # average loss over log_interval
            wri.add_scalar(
                'bat_curraw_loss', cur_loss,
                epoch * (lzarg // args.log_interval) +
                batch / args.log_interval)
            elapsed = time.time() - start_time
            pred_y = torch.max(output, 1)[1].data
            accuracy = (pred_y == targets).float().sum() / len(targets)
            wri.add_scalar(
                'train_accuracy', accuracy,
                epoch * (lzarg // args.log_interval) +
                batch / args.log_interval)
            print(
                'ACC={:3.2f} | epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                'avg loss {:5.2f} | ppl {:5.2f} | bpc {:5.3f}'.format(
                    accuracy, epoch, batch, lzarg,
                    optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss), cur_loss / math.log(2)), '\n')
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += args.batch_size
    '''Save parameter histograms for visualization'''
    if epoch % 10 == 0 and epoch != 0:
        #        targets_image=image[targets]
        #        wri.add_embedding(rnn_hs[0][-1,:,:].clone().cpu().data.numpy(),metadata=targets,label_img=targets_image.data,global_step=epoch/10)
        for name, param in model.named_parameters():
            wri.add_histogram(name,
                              param.clone().cpu().data.numpy(), epoch / 10)
Example No. 30
test_batch_size = 1
train_data = batchify(corpus.train, args.batch_size, args)
val_data = batchify(corpus.valid, eval_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)

###############################################################################
# Build the model
###############################################################################

criterion = None

ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.chunk_size, args.nlayers,
                       args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)

for name, weight in model.named_parameters():
    print(name, weight.numel())
###
if args.resume:
    print('Resuming model ...')
    model_load(args.resume)
    optimizer.param_groups[0]['lr'] = args.lr
    model.dropouti, model.dropouth, model.dropout, args.dropoute = args.dropouti, args.dropouth, args.dropout, args.dropoute
    if args.wdrop:
        for rnn in model.rnn.cells:
            rnn.hh.dropout = args.wdrop
###
if not criterion:
    splits = []
    if ntokens > 500000:
        # One Billion