Example #1
def model_summary(model, what, dataset=None):
    if what == 'sparsity':
        # ctsun 181125
        t, total = distiller.weights_sparsity_tbl_summary(
            model, return_total_sparsity=True)
        print("\nParameters:\n", str(t))
        print('Total sparsity: {:0.2f}\n'.format(total))

        pylogger = PythonLogger(msglogger)
        csvlogger = CsvLogger('weights.csv')
        distiller.log_weights_sparsity(model,
                                       -1,
                                       loggers=[pylogger, csvlogger])

    elif what == 'compute':
        if dataset == 'imagenet':
            dummy_input = Variable(torch.randn(1, 3, 224, 224))
        # ctsun 181125
        elif dataset in ('cifar10', 'cinic10'):
            dummy_input = Variable(torch.randn(1, 3, 32, 32))
        else:
            print("Unsupported dataset (%s) - aborting compute operation" %
                  dataset)
            return
        df = model_performance_summary(model, dummy_input, 1)
        t = tabulate(df, headers='keys', tablefmt='psql', floatfmt=".5f")
        total_macs = df['MACs'].sum()
        # ctsun 181125
        total_weights = df['Weights volume'].sum()
        print(t)
        print("Total MACs   : " + "{:,}".format(total_macs))
        # ctsun 181125
        print("Total Weights: " + "{:,}".format(total_weights))
    elif what == 'model':
        # print the simple form of the model
        print(model)
    elif what == 'modules':
        # Print the names of non-leaf modules
        # Remember that in PyTorch not every node is a module (e.g. F.relu).
        # Also remember that parameterless modules, like nn.MaxPool2d, can be used multiple
        # times in the same model, but they will only appear once in the modules list.
        nodes = []
        for name, module in model.named_modules():
            # Only print leaf modules
            if len(module._modules) == 0:
                nodes.append([name, module.__class__.__name__])
        print(tabulate(nodes, headers=['Name', 'Type']))
    else:
        raise ValueError("%s is not a supported summary type" % what)
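All of the snippets on this page revolve around distiller.weights_sparsity_tbl_summary, which, when called with return_total_sparsity=True, returns a formatted per-parameter sparsity table and the model's total sparsity as a percentage. A minimal, self-contained sketch of just that call (the torchvision ResNet-18 is only a stand-in model for illustration, not part of the example above):

import distiller
import torchvision

net = torchvision.models.resnet18()

# tbl is a pretty-printed table of per-parameter sparsity statistics;
# total is the overall model sparsity, expressed as a percentage.
tbl, total = distiller.weights_sparsity_tbl_summary(net, return_total_sparsity=True)
print(tbl)
print('Total sparsity: {:0.2f}%'.format(total))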
Example #2
    def train(self,
              dataset,
              k=20,
              n_epochs=20,
              save_dir='./models',
              save=True,
              model_name='GRU4REC'):
        """
        Train the GRU4REC model on a pandas dataframe for several training epochs,
        and store the intermediate models to the user-specified directory.

        Args:
            dataset: the pandas dataframe to train on
            k (int): the cutoff used for the ranking metrics reported by run_epoch (e.g. recall@k)
            n_epochs (int): the number of training epochs to run
            save_dir (str): the path to save the intermediate trained models
            save (bool): whether to store the intermediate models
            model_name (str): name of the model
        """
        print(f'Training {model_name}...')
        for epoch in range(n_epochs):
            if self.compression_scheduler:
                self.compression_scheduler.on_epoch_begin(epoch)
            results = self.run_epoch(
                dataset,
                k=k,
                training=True,
                compression_scheduler=self.compression_scheduler,
                epoch=epoch)
            results = [f'{k}:{v:.3f}' for k, v in results.items()]
            print(f'epoch:{epoch+1:2d}/{"/".join(results)}')

            t, total = distiller.weights_sparsity_tbl_summary(
                self.gru, return_total_sparsity=True)
            print("\nParameters:\n" + str(t))
            print('Total sparsity: {:0.2f}\n'.format(total))

            if self.compression_scheduler:
                self.compression_scheduler.on_epoch_end(epoch)

            # Store the intermediate model
            if save:
                save_dir = Path(save_dir)
                if not save_dir.exists(): save_dir.mkdir()
                model_fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{epoch+1:d}'
                torch.save(self.gru.state_dict(), save_dir / model_fname)
Example #3
def objective(space):
    global model
    global count
    global global_min_score
    
    #Explore new model
    model = create_model(False, args.dataset, args.arch, device_ids=args.gpus)
    count += 1
    # Objective function: F(Acc, Sparsity) = (1 - Acc) + alpha * (1 - Sparsity)
    accuracy = 0
    alpha = 0.3 # Hyperparameter: the relative weight of the sparsity term in the objective
    latency = 0.0
    sparsity = 0.0
    # Training hyperparameter

    if args.resume:
        model, compression_scheduler, start_epoch = apputils.load_checkpoint(
            model, chkpt_file=args.resume)
        print('resume mode: {}'.format(args.resume))

    print(global_min_score)
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    """
    distiller/distiller/config.py
        # Element-wise sparsity
        sparsity_levels = {net_param: sparsity_level}
        pruner = distiller.pruning.SparsityLevelParameterPruner(name='sensitivity', levels=sparsity_levels)
        policy = distiller.PruningPolicy(pruner, pruner_args=None)
        scheduler = distiller.CompressionScheduler(model)
        scheduler.add_policy(policy, epochs=[0, 2, 4])
        # Local search:
        # add multiple pruners, one per layer
    """
    sparsity_levels = {}
    for key, value in space.items():
        sparsity_levels[key] = value
    #print(sparsity_levels)

    pruner = distiller.pruning.SparsityLevelParameterPruner(name='sensitivity', levels=sparsity_levels) # for SparsityLevelParameterPruner
    # pruner = distiller.pruning.SensitivityPruner(name='sensitivity', sensitivities=sparsity_levels) # for SensitivityPruner
    policy = distiller.PruningPolicy(pruner, pruner_args=None)
    lrpolicy = distiller.LRPolicy(torch.optim.lr_scheduler.StepLR(optimizer, step_size=6, gamma=0.1))
    compression_scheduler = distiller.CompressionScheduler(model)
    compression_scheduler.add_policy(policy, epochs=[PrunerEpoch])
    # compression_scheduler.add_policy(policy, starting_epoch=0, ending_epoch=38, frequency=2)
    compression_scheduler.add_policy(lrpolicy, starting_epoch=0, ending_epoch=50, frequency=1)
    """
    distiller/example/classifier_compression/compress_classifier.py
    For each epoch:
        compression_scheduler.on_epoch_begin(epoch)
        train()
        save_checkpoint()
        compression_scheduler.on_epoch_end(epoch)

    train():
        For each training step:
            compression_scheduler.on_minibatch_begin(epoch)
            output = model(input)
            loss = criterion(output, target)
            compression_scheduler.before_backward_pass(epoch)
            loss.backward()
            optimizer.step()
            compression_scheduler.on_minibatch_end(epoch)
    """
    
    local_min_score = 2.
    for i in range(args.epochs):
        compression_scheduler.on_epoch_begin(i)
        train_accuracy = train(i, criterion, optimizer, compression_scheduler)
        val_accuracy = validate() # Validate hyperparameter setting
        t, sparsity = distiller.weights_sparsity_tbl_summary(model, return_total_sparsity=True)
        compression_scheduler.on_epoch_end(i, optimizer)
        apputils.save_checkpoint(i, args.arch, model, optimizer, compression_scheduler, train_accuracy, False,
                                         'hyperopt', './')
        print('Epoch: {}, train_acc: {:.4f}, val_acc: {:.4f}, sparsity: {:.4f}'.format(i, train_accuracy, val_accuracy, sparsity))
        
        score = (1-(val_accuracy/100.)) + (alpha * (1-sparsity/100.)) # objective function here
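        # Worked example (assumed numbers, not from a real run): with alpha = 0.3,
        # val_accuracy = 90.0 and sparsity = 50.0 give score = 0.10 + 0.3 * 0.50 = 0.25;
        # lower scores mean a better accuracy/sparsity trade-off.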
        if score < global_min_score:
            global_min_score = score
            apputils.save_checkpoint(i, args.arch, model, optimizer, compression_scheduler, train_accuracy, True, 'best', './')

        if score < local_min_score:
            local_min_score = score

        if PrunerConstraint and i >= PrunerEpoch and (sparsity < Expected_Sparsity_Level_Low or sparsity > Expected_Sparsity_Level_High):
            break

    test_accuracy = test() # Validate hyperparameter setting

    print('{} trials: score: {:.4f}, train_acc:{:.4f}, val_acc:{:.4f}, test_acc:{:.4f}, sparsity:{:.4f}'.format(count, 
                                      local_min_score, 
                                      train_accuracy, 
                                      val_accuracy, 
                                      test_accuracy,
                                      sparsity))

    return local_min_score
Example #4
    def log_weights_sparsity(self, model, epoch):
        t, total = distiller.weights_sparsity_tbl_summary(
            model, return_total_sparsity=True)
        self.pylogger.info("\nParameters:\n" + str(t))
        self.pylogger.info('Total sparsity: {:0.2f}\n'.format(total))
def train(c):
    net = get_net(c)

    opt = get_opt(c, net)
    net, opt, step = c.init_model(net, opt=opt, step='max', train=True)

    step_lr = scheduler(c, opt, step)
    data_tr = SampleIterator(c, c.train_batch, split='valid' if c.debug else 'train')
    iter_tr = iter(data_tr)
    data_val = SequentialIterator(c, c.eval_batch, split='valid')
    data_test = SequentialIterator(c, c.eval_batch, split='test')

    print('Before quantization')
    tbl, sparsity = distiller.weights_sparsity_tbl_summary(net, return_total_sparsity=True)
    step_result = pd.Series(evaluate(c, data_val, net)).add_prefix('val_')
    step_result = step_result.append(
        pd.Series(evaluate(c, data_test, net)).add_prefix('test_')
    )
    step_result['sparsity'] = sparsity
    print(step_result)

    compression_scheduler = distiller.config.file_config(net, opt, c.compress)

    print('After initial quantization')
    s = Namespace(net=net, opt=opt, step=step)
    c.on_train_start(s)

    tbl, sparsity = distiller.weights_sparsity_tbl_summary(net, return_total_sparsity=True)
    step_result = pd.Series(evaluate(c, data_val, net)).add_prefix('val_')
    step_result = step_result.append(
        pd.Series(evaluate(c, data_test, net)).add_prefix('test_')
    )
    step_result['sparsity'] = sparsity
    print(step_result)

    # Record a zero-mask for every conv/linear weight and bias so that already-pruned
    # entries can be forced back to zero after each optimizer step (see the loop below).
    npm = []
    for name, param in net.named_parameters():
        if param.dim() in [2, 4] and any(t in name for t in ['weight', 'bias']):
            npm.append((name, param, param.abs() == 0))

    best_val_loss = np.inf
    if s.results is not None and 'val_loss' in s.results.columns:
        best_val_loss = s.results['val_loss'].dropna().max()
    try:
        steps_per_epoch = c.step_eval
        while step < s.step_max:
            epoch = step // steps_per_epoch
            batch = step % steps_per_epoch

            if batch == 0:
                compression_scheduler.on_epoch_begin(epoch)
            compression_scheduler.on_minibatch_begin(epoch, batch, steps_per_epoch)
            
            step_lr(step)

            x = to_torch(next(iter_tr), c.device).t()

            t_s = time()
            inputs, labels = x[:-1], x[1:]
            loss, _, lam, theta = net(inputs, labels)
            
            compression_scheduler.before_backward_pass(epoch, batch, steps_per_epoch, loss, False)

            opt.zero_grad()
            if torch.isnan(loss):
                import q; q.d()
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), c.get('clip_grad', 0.5))

            compression_scheduler.before_parameter_optimization(epoch, batch, steps_per_epoch, opt)
            opt.step()
            for name, param, mask in npm:
                param.data[mask] = 0
            compression_scheduler.on_minibatch_end(epoch, batch, steps_per_epoch)
            
            if (batch + 1) == steps_per_epoch:
                compression_scheduler.on_epoch_end(epoch)

            time_model = np.round(time() - t_s, 5)

            loss = from_torch(loss)
            perplexity = np.nan if loss > 5 else np.e ** loss
            step_result = pd.Series(dict(
                loss=loss,
                perplexity=perplexity,
                time=time_model,
            )).add_prefix('train_')
            step_result['lr'] = next(iter(opt.param_groups))['lr']
            step_result['theta'] = from_torch(theta)
            step_result['lambda'] = from_torch(lam)

            s.step = step = step + 1
            if step % c.step_eval == 0:
                tbl, sparsity = distiller.weights_sparsity_tbl_summary(net, return_total_sparsity=True)
                step_result = step_result.append(
                    pd.Series(evaluate(c, data_val, net)).add_prefix('val_')
                )
                step_result = step_result.append(
                    pd.Series(evaluate(c, data_test, net)).add_prefix('test_')
                )
                step_result['sparsity'] = sparsity
                s.record_step = step_result['val_loss'] < best_val_loss
                clear_gpu_memory()
            s.step_result = step_result
            c.on_step_end(s)
    except Exception as e:
        import traceback
        err = traceback.format_exc()
        if c.main:
            c.log(err)
        else:
            print(err)
    finally:
        c.on_train_end(s)
Example #6
    def get_step_logs(self):
        model = self.compression_scheduler.model
        t, total = distiller.weights_sparsity_tbl_summary(
            model, return_total_sparsity=True)
        # weights_sparsity_tbl_summary reports sparsity as a percentage; convert to a 0-1 fraction
        return {"sparsity": total / 100}
def main():
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    processor = NLUDataProcessor(args.data_dir, args.max_seq_length, tokenizer)
    label_list = processor.get_labels()
    num_labels = len(label_list)
    num_intents = len(processor.intents)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        restrict = {}
        if args.limit_data:
            for i in range(len(args.limit_data) // 2):
                intent = args.limit_data[i * 2]
                size = args.limit_data[i * 2 + 1]
                assert intent in processor.intents
                restrict[intent] = tuple(int(x) for x in size.split('*'))
        train_examples = processor.get_train_examples(restrict)
        num_train_optimization_steps = \
            int(len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * \
            args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = \
                num_train_optimization_steps // torch.distributed.get_world_size()

    if args.eval_on_test:
        eval_examples = processor.get_test_examples()
    else:
        eval_examples = processor.get_dev_examples()

    # Prepare model
    cache_dir = args.cache_dir or os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
            args.local_rank))
    model = BertForNLU.from_pretrained(args.bert_model,
                                       cache_dir=cache_dir,
                                       num_labels=num_labels,
                                       num_intents=num_intents,
                                       from_tf=args.from_tf,
                                       layers=args.layers,
                                       prune=args.prune,
                                       dropout=args.dropout)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        from apex.parallel import DistributedDataParallel as DDP
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # distiller
    compression_scheduler = None
    if args.distiller is not None:
        import distiller
        distiller_pylogger = distiller.data_loggers.PythonLogger(logger)
        compression_scheduler = distiller.config.file_config(
            model, None, args.distiller)

    # Tensorboard
    swriter = SummaryWriter(args.output_dir)
    logger.info("Writing summary to %s", args.output_dir)

    # Prepare optimizer
    if args.do_train:
        if args.train_layers_from is not None:
            param_optimizer = []
            for n, p in model.named_parameters():
                if "classifier" in n or "pooler" in n:
                    param_optimizer.append((n, p))
                elif any(
                        int(s) >= args.train_layers_from
                        for s in re.findall(r'layer\.(\d+)\.', n)):
                    param_optimizer.append((n, p))
                else:
                    print("Not considered trainable:", n)
                    p.requires_grad_(False)
        else:
            param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        if args.fp16:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)

        else:
            optimizer = BertAdam(
                optimizer_grouped_parameters,
                lr=args.learning_rate,
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps,
                schedule=None if args.const_lr else 'warmup_linear')

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_tokenids = torch.tensor(
            [f.input_tokenids for f in train_examples], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_examples],
                                      dtype=torch.long)
        all_input_segmentids = torch.tensor(
            [f.input_segmentids for f in train_examples], dtype=torch.long)
        all_input_labelids = torch.tensor(
            [f.input_labelids for f in train_examples], dtype=torch.long)

        train_data = TensorDataset(all_input_tokenids, all_input_mask,
                                   all_input_segmentids, all_input_labelids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        batches_per_epoch = int(len(train_examples) / args.train_batch_size)

        model.train()
        for epoch_id in trange(int(args.num_train_epochs), desc="Epoch"):
            if compression_scheduler:
                compression_scheduler.on_epoch_begin(epoch_id)
            nb_tr_examples = 0
            global_step_tr_loss = 0.0
            tqdm_bar = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(tqdm_bar):
                if compression_scheduler and step % args.gradient_accumulation_steps == 0:
                    compression_scheduler.on_minibatch_begin(
                        epoch_id,
                        minibatch_id=step / args.gradient_accumulation_steps,
                        minibatches_per_epoch=batches_per_epoch)

                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                loss, _, _ = model(input_ids,
                                   segment_ids,
                                   input_mask,
                                   labels=label_ids)

                if compression_scheduler:
                    # Before running the backward phase, we allow the scheduler to modify the loss
                    # (e.g. add regularization loss)
                    loss = compression_scheduler.before_backward_pass(
                        epoch_id,
                        minibatch_id=step / args.gradient_accumulation_steps,
                        minibatches_per_epoch=batches_per_epoch,
                        loss=loss,
                        return_loss_components=False)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                global_step_tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    tqdm_bar.set_postfix(train_loss=global_step_tr_loss)
                    swriter.add_scalar('train_loss',
                                       global_step_tr_loss,
                                       global_step=global_step)
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = \
                            args.learning_rate * warmup_linear.get_lr(global_step,
                                                                      args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step

                    if args.fp16:
                        for _, p in param_optimizer:
                            if p.grad is None:
                                p.grad = torch.zeros(p.size(),
                                                     dtype=p.dtype,
                                                     device=p.device)
                    optimizer.step()
                    global_step += 1
                    global_step_tr_loss = 0.0

                    if compression_scheduler:
                        compression_scheduler.on_minibatch_end(
                            epoch_id,
                            minibatch_id=step /
                            args.gradient_accumulation_steps,
                            minibatches_per_epoch=batches_per_epoch)

                    optimizer.zero_grad()

                    if not args.fp16 and optimizer.get_lr():
                        # get_lr returns a list, however all the elements of the list should be the
                        # same
                        swriter.add_scalar('learning_rate',
                                           np.random.choice(
                                               optimizer.get_lr()),
                                           global_step=global_step)

                    if global_step % args.eval_steps == 0:
                        perform_evaluation(eval_examples=eval_examples,
                                           model=model,
                                           processor=processor,
                                           swriter=swriter,
                                           device=device,
                                           global_step=global_step)
                        model.train()

                    if global_step % args.save_checkpoints_steps == 0:
                        save_model(model=model,
                                   tokenizer=tokenizer,
                                   global_step=global_step)

                    if args.prune and global_step % args.eval_steps == 1:
                        prune_model(model=model,
                                    swriter=swriter,
                                    global_step=global_step,
                                    count=args.prune_count)

            if compression_scheduler:
                sparsity_table, total_sparsity = \
                    distiller.weights_sparsity_tbl_summary(model, return_total_sparsity=True)
                logger.info("\nParameters:\n" + str(sparsity_table))
                logger.info('Total sparsity: {:0.2f}\n'.format(total_sparsity))
                swriter.add_scalar('sparsity',
                                   total_sparsity,
                                   global_step=global_step)
                compression_scheduler.on_epoch_end(epoch_id)

        save_model(model=model,
                   tokenizer=tokenizer,
                   global_step=global_step,
                   tag='final')

    perform_evaluation(eval_examples=eval_examples,
                       model=model,
                       processor=processor,
                       swriter=swriter,
                       device=device,
                       global_step=global_step)
    swriter.close()
Example #8
def train(c, net, compression_scheduler=None):
    import distiller.apputils as apputils
    from distiller.data_loggers import TensorBoardLogger, PythonLogger
    msglogger = apputils.config_pylogger('logging.conf', None)
    tflogger = TensorBoardLogger(msglogger.logdir)
    tflogger.log_gradients = True
    pylogger = PythonLogger(msglogger)
    c.setdefault(hebbian=False)

    emb_params = count_params(net.embed) + count_params(net.loss.projections) + count_params(net.loss.clusters)
    opt = get_opt(c, net)
    net, opt, step = c.init_model(net, opt=opt, step='max', train=True)
    step_lr = scheduler(c, opt, step)
    data_tr = SampleIterator(c, c.train_batch, split='valid' if c.debug else 'train')
    iter_tr = iter(data_tr)
    data_val = SequentialIterator(c, c.eval_batch, split='valid')

    s = Namespace(net=net, opt=opt, step=step)
    c.on_train_start(s)

    c.log('Embedding has %s parameters' % emb_params)

    if c.get("steps_per_epoch"):
        steps_per_epoch = c.steps_per_epoch
    else:
        steps_per_epoch = len(data_tr.tokens) // data_tr.bs // c.train_chunk
    print("#### steps per epoch %d ####" % steps_per_epoch)

    if c.hebbian:
        counters = [torch.ones(end - start, dtype=torch.long, device=c.device) for start, end in zip([0] + c.cutoffs, c.cutoffs + [c.n_vocab])]
        temp_counters = [torch.zeros_like(x) for x in counters]

    best_val_loss = np.inf
    if s.results is not None and 'val_loss' in s.results.columns:
        best_val_loss = s.results['val_loss'].dropna().max()
    try:
        while step < s.step_max:
            batch = step % steps_per_epoch
            epoch = step // steps_per_epoch
            if step % steps_per_epoch == 0:
                c.log("====> batch=%d, epoch=%d, step=%d" % (batch, epoch, step))
                if compression_scheduler:
                    compression_scheduler.on_epoch_begin(epoch)

            if compression_scheduler:
                compression_scheduler.on_minibatch_begin(epoch, minibatch_id=batch, minibatches_per_epoch=steps_per_epoch)

            step_lr(step)

            x = to_torch(next(iter_tr), c.device).t()

            t_s = time()
            inputs, labels = x[:-1], x[1:]
            preds = net(inputs, labels)
            loss = preds['loss']

            if compression_scheduler:
                _  = compression_scheduler.before_backward_pass(epoch, minibatch_id=batch,
                                                           minibatches_per_epoch=steps_per_epoch,
                                                           loss=loss, return_loss_components=False)

            opt.zero_grad()
            if torch.isnan(loss):
                raise RuntimeError('Encountered nan loss during training')
            loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), c.get('clip_grad', 0.5))
            opt.step()

            if c.hebbian:
                hebbian_weight_update(c, net, preds['hiddens'], counters, temp_counters)

            time_model = np.round(time() - t_s, 5)

            loss = from_torch(loss)
            perplexity = np.nan if loss > 5 else np.e ** loss
            step_result = pd.Series(dict(
                loss=loss,
                perplexity=perplexity,
                time=time_model
            )).add_prefix('train_')
            step_result['lr'] = next(iter(opt.param_groups))['lr']
            if c.use_cache:
                step_result['theta'] = preds['theta']
                step_result['lambda'] = preds['lambda'].item()

            if compression_scheduler:
                compression_scheduler.on_minibatch_end(epoch, minibatch_id=batch, minibatches_per_epoch=steps_per_epoch)

            if step % steps_per_epoch == 0:
                if compression_scheduler:
                    compression_scheduler.on_epoch_end(epoch)

            s.step = step = step + 1
            if step % c.step_eval == 0:
                distiller.log_weights_sparsity(net, epoch, loggers=[tflogger, pylogger])
                t, total = distiller.weights_sparsity_tbl_summary(net, return_total_sparsity=True)
                c.log("total sparsity: %.3lf" % total)

                step_result = step_result.append(
                    pd.Series(evaluate(c, data_val, net)).add_prefix('val_')
                )
                s.record_step = step_result['val_loss'] < best_val_loss
                clear_gpu_memory()
            s.step_result = step_result
            c.on_step_end(s)
    except Exception as e:
        import traceback
        err = traceback.format_exc()
        if c.main:
            c.log(err)
        else:
            print(err)
    finally:
        c.on_train_end(s)
Example #9
def train(args):

    SRC = data.Field(tokenize=tokenize_de,
                     pad_token=BLANK_WORD,
                     lower=args.lower)
    TGT = data.Field(tokenize=tokenize_en,
                     init_token=BOS_WORD,
                     eos_token=EOS_WORD,
                     pad_token=BLANK_WORD,
                     lower=args.lower)

    # Load IWSLT Data ---> German to English Translation
    if args.dataset == 'IWSLT':
        train, val, test = datasets.IWSLT.splits(
            exts=('.de', '.en'),
            fields=(SRC, TGT),
            filter_pred=lambda x: len(vars(x)['src']) <= args.max_length and
            len(vars(x)['trg']) <= args.max_length)
    else:
        train, val, test = datasets.Multi30k.splits(
            exts=('.de', '.en'),
            fields=(SRC, TGT),
            filter_pred=lambda x: len(vars(x)['src']) <= args.max_length and
            len(vars(x)['trg']) <= args.max_length)

    # Frequency of words in the vocabulary
    SRC.build_vocab(train.src, min_freq=args.min_freq)
    TGT.build_vocab(train.trg, min_freq=args.min_freq)

    print("Size of source vocabulary:", len(SRC.vocab))
    print("Size of target vocabulary:", len(TGT.vocab))

    pad_idx = TGT.vocab.stoi[BLANK_WORD]
    model = make_model(len(SRC.vocab),
                       len(TGT.vocab),
                       n=args.num_blocks,
                       d_model=args.hidden_dim,
                       d_ff=args.ff_dim,
                       h=args.num_heads,
                       dropout=args.dropout)
    print("Model made with n:", args.num_blocks, "hidden_dim:",
          args.hidden_dim, "feed forward dim:", args.ff_dim, "heads:",
          args.num_heads, "dropout:", args.dropout)

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print("Number of parameters: ", params)

    if args.load_model:
        print("Loading model from [%s]" % args.load_model)
        model.load_state_dict(torch.load(args.load_model))

    # UNCOMMENT WHEN RUNNING ON RESEARCH MACHINES - run on GPU
    # model.cuda()

    # Used by original authors, hurts perplexity but improves BLEU score
    criterion = LabelSmoothing(size=len(TGT.vocab),
                               padding_idx=pad_idx,
                               smoothing=0.1)

    # UNCOMMENT WHEN RUNNING ON RESEARCH MACHINES - run on GPU
    # criterion.cuda()

    train_iter = MyIterator(train,
                            batch_size=args.batch_size,
                            device=0,
                            repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn,
                            train=True)
    valid_iter = MyIterator(val,
                            batch_size=args.batch_size,
                            device=0,
                            repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn,
                            train=False,
                            sort=False)
    model_par = nn.DataParallel(model, device_ids=devices)

    # model_opt = NoamOpt(model.src_embed[0].d_model, 1, 2000,
    #                     torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

    # Use standard optimizer -- As used in the paper
    model_opt = get_std_opt(model)

    # PRUNING CODE
    if args.summary:
        df = distiller.weights_sparsity_tbl_summary(model, False)
        print(df)
        exit(0)

    msglogger = apputils.config_pylogger('logging.conf', None)
    tflogger = TensorBoardLogger(msglogger.logdir)
    tflogger.log_gradients = True
    pylogger = PythonLogger(msglogger)

    compression_scheduler = None
    if args.compress:
        compression_scheduler = distiller.config.file_config(
            model_par.module, None, args.compress)

    print(model_par.module)

    best_bleu = 0
    best_epoch = 0

    steps_per_epoch = math.ceil(len(train_iter.data()) / 60)

    for epoch in range(args.epoch):
        print("=" * 80)
        print("Epoch ", epoch + 1)
        print("=" * 80)
        print("Training...")
        model_par.train()

        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        # IF PRUNING
        run_epoch((rebatch(pad_idx, b) for b in train_iter),
                  model_par,
                  MultiGPULossCompute(model.generator,
                                      criterion,
                                      devices=devices,
                                      opt=model_opt),
                  args,
                  epoch,
                  steps_per_epoch,
                  compression_scheduler,
                  SRC,
                  TGT,
                  valid_iter,
                  is_valid=False)

        # run_epoch((rebatch(pad_idx, b) for b in train_iter), model_par,
        #           MultiGPULossCompute(model.generator, criterion, devices=devices, opt=model_opt), args,
        #           SRC, TGT, valid_iter, is_valid=False)

        print("Validation...")
        model_par.eval()

        # IF PRUNING
        loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter),
                         model_par,
                         MultiGPULossCompute(model.generator,
                                             criterion,
                                             devices=devices,
                                             opt=None),
                         args,
                         epoch,
                         steps_per_epoch,
                         compression_scheduler,
                         SRC,
                         TGT,
                         valid_iter,
                         is_valid=True)

        # loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter), model_par,
        #                  MultiGPULossCompute(model.generator, criterion, devices=devices, opt=None), args,
        #                  SRC, TGT, valid_iter, is_valid=True)

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch)

        print('Validation loss:', loss)
        print('Validation perplexity: ', np.exp(loss))
        bleu_score = run_validation_bleu_score(model, SRC, TGT, valid_iter)

        if best_bleu < bleu_score:
            best_bleu = bleu_score
            model_file = args.save_to + args.exp_name + 'validation.bin'
            print('Saving model without optimizer [%s]' % model_file)
            torch.save(model_par.module.state_dict(), model_file)
            best_epoch = epoch

        model_file = args.save_to + args.exp_name + 'latest.bin'
        print('Saving latest model without optimizer [%s]' % model_file)
        torch.save(model_par.module.state_dict(), model_file)

    print('The best epoch was:', best_epoch)
def objective(space):
    global model
    global count
    global best_dict
    
    #Explore new model
    model = create_model(False, args.dataset, args.arch, device_ids=args.gpus)
    if args.resume:
        model, _, _ = apputils.load_checkpoint(
            model, chkpt_file=args.resume)
    
    count += 1
    print('{} trial starting...'.format(count))
    # Objective function: F(Acc, Sparsity), rewarding both high accuracy and high sparsity (see the score computation below)
    accuracy = 0
    #alpha = 0.2 # Hyperparameter: the relative weight of the sparsity term
    alpha = 1.0 # Hyperparameter: the relative weight of the sparsity term
    sparsity = 0.0
    # Training hyperparameter
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    """
    distiller/distiller/config.py
        # Element-wise sparsity
        sparsity_levels = {net_param: sparsity_level}
        pruner = distiller.pruning.SparsityLevelParameterPruner(name='sensitivity', levels=sparsity_levels)
        policy = distiller.PruningPolicy(pruner, pruner_args=None)
        scheduler = distiller.CompressionScheduler(model)
        scheduler.add_policy(policy, epochs=[0, 2, 4])
        # Local search:
        # add multiple pruners, one per layer
    """
    sparsity_levels = {}
    for key, value in space.items():
        sparsity_levels[key] = value
    pruner = distiller.pruning.SparsityLevelParameterPruner(name='sensitivity', levels=sparsity_levels)
    policy = distiller.PruningPolicy(pruner, pruner_args=None)
    lrpolicy = distiller.LRPolicy(torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1))
    compression_scheduler = distiller.CompressionScheduler(model)
    #compression_scheduler.add_policy(policy, epochs=[90])
    compression_scheduler.add_policy(policy, epochs=[0])
    compression_scheduler.add_policy(lrpolicy, starting_epoch=0, ending_epoch=90, frequency=1)
    """
    distiller/example/classifier_compression/compress_classifier.py
    For each epoch:
        compression_scheduler.on_epoch_begin(epoch)
        train()
        save_checkpoint()
        compression_scheduler.on_epoch_end(epoch)

    train():
        For each training step:
            compression_scheduler.on_minibatch_begin(epoch)
            output = model(input)
            loss = criterion(output, target)
            compression_scheduler.before_backward_pass(epoch)
            loss.backward()
            optimizer.step()
            compression_scheduler.on_minibatch_end(epoch)
    """
    for i in range(args.epochs):
        compression_scheduler.on_epoch_begin(i)
        train_accuracy = train(i, criterion, optimizer, compression_scheduler)
        val_accuracy = validate() # Validate hyperparameter setting
        t, sparsity = distiller.weights_sparsity_tbl_summary(model, return_total_sparsity=True)
        compression_scheduler.on_epoch_end(i, optimizer)
        apputils.save_checkpoint(i, args.arch, model, optimizer, compression_scheduler, train_accuracy, False,
                                         'hyperopt', './')
        print('{} epochs => train acc:{:.2f}%,  val acc:{:.2f}%'.format(i, train_accuracy, val_accuracy))
        
    test_accuracy = validate(test_loader) # Validate hyperparameter setting
    #score = (1-(val_accuracy/100.)) + (alpha * (1-sparsity/100.)) # objective function here
    
    # objective function here
    # target operating ranges: accuracy 90~98%, sparsity 50~80% (hence the 0.9 and 0.5 anchors below)
    score = -((val_accuracy/100.)**2-0.9**2 + alpha * ((sparsity/100.)**2-0.5**2)) 
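    # Worked example (assumed numbers, not from a real run): with alpha = 1.0,
    # val_accuracy = 95.0 and sparsity = 70.0 give
    # score = -((0.95**2 - 0.81) + (0.70**2 - 0.25)) = -(0.0925 + 0.24) = -0.3325;
    # more negative is better, matching the best_dict update below, which keeps the minimum score.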
    print('{} trials: score: {:.2f}\ttrain acc:{:.2f}%\tval acc:{:.2f}%\ttest acc:{:.2f}%\tsparsity:{:.2f}%'.format(count, 
                                      score, 
                                      train_accuracy, 
                                      val_accuracy, 
                                      test_accuracy,
                                      sparsity))
    if score < best_dict['score']:
        best_dict['trial'] = count
        best_dict['score'] = score
        best_dict['tr_acc'] = train_accuracy        
        best_dict['v_acc'] = val_accuracy
        best_dict['te_acc'] = test_accuracy
        best_dict['sparsity'] = sparsity
        best_dict['model_best'] = copy.deepcopy(model)

    return score