Example #1
import os

import time
import logging
import numpy as np
import sys
import copy
import torch as th
import tensorboard
import tqdm

from torch.utils.tensorboard import SummaryWriter

# SummaryWriter defaults to ./runs/; passing "vis" writes the logs to ./vis instead
writer = SummaryWriter("vis")
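# Illustrative addition (not part of the original example): the writer created
# above can log scalars right away; flush() pushes pending events to disk so
# TensorBoard picks them up without waiting for the internal queue.
for demo_step in range(3):
    writer.add_scalar("debug/example_scalar", demo_step * 0.5, global_step=demo_step)
writer.flush()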

from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.nn.utils import clip_grad_norm_
from utils import compute_sdr, MAX_INT16, center_trim
from preprocess import Prep
from conv_tasnet import TasNet


from torch.nn import MSELoss


n_spks = 3

def load_obj(obj, device):
    """
Example #2
        dist.init_process_group(backend='nccl',
                                init_method='env://')  # distributed backend
        opt.world_size = dist.get_world_size()
        assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count'
        opt.batch_size = opt.total_batch_size // opt.world_size

    print(opt)

    # Train
    if not opt.evolve:
        tb_writer = None
        if opt.local_rank in [-1, 0]:
            print(
                'Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/'
            )
            tb_writer = SummaryWriter(
                log_dir=increment_dir('runs/exp', opt.name))

        train(hyp, opt, device, tb_writer)

    # Evolve hyperparameters (optional)
    else:
        # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
        meta = {
            'lr0':
            (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
            'momentum': (0.1, 0.6, 0.98),  # SGD momentum/Adam beta1
            'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
            'giou': (1, 0.02, 0.2),  # GIoU loss gain
            'cls': (1, 0.2, 4.0),  # cls loss gain
            'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
            'obj': (1, 0.2, 4.0),  # obj loss gain (scale with pixels)
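
# Illustrative sketch (separate from the truncated example above): one common
# way bounds of the form (mutation_scale, lower_limit, upper_limit) are applied
# when evolving hyperparameters. This is not the original evolve code; `hyp`
# and `meta` are assumed to be dicts keyed by hyperparameter name.
import numpy as np

def mutate_hyperparameters(hyp, meta, sigma=0.2, seed=None):
    rng = np.random.default_rng(seed)
    mutated = dict(hyp)
    for key, (scale, low, high) in meta.items():
        if scale == 0 or key not in hyp:
            continue  # frozen or unknown hyperparameter
        factor = 1.0 + rng.normal(0.0, sigma) * scale  # multiplicative noise
        mutated[key] = float(np.clip(hyp[key] * factor, low, high))
    return mutated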
Example #3
def main(args):
    np.set_printoptions(precision=3)
    save_dir = os.getcwd()

    log = os.path.join(save_dir, "log.txt")

    # Setup SummaryWriter
    summary_dir = os.path.join(save_dir, "summary")
    if not os.path.exists(summary_dir):
        os.mkdir(summary_dir)
    writer = SummaryWriter(summary_dir)

    if args.run.s3_bucket is not None:
        aws_utils.download_from_s3(log, args.run.s3_bucket, log)

        train_utils.copy_code_to_experiment_dir("/code/nas-theory/cnn",
                                                save_dir)
        aws_utils.upload_directory(os.path.join(save_dir, "scripts"),
                                   args.run.s3_bucket)

    train_utils.set_up_logging(log)

    if not torch.cuda.is_available():
        logging.info("no gpu device available")
        sys.exit(1)

    torch.cuda.set_device(args.run.gpu)
    logging.info("gpu device = %d" % args.run.gpu)
    logging.info("args = %s", args.pretty())

    rng_seed = train_utils.RNGSeed(args.run.seed)

    if args.search.method in ["edarts", "gdarts", "eedarts"]:
        if args.search.fix_alphas:
            from architect.architect_edarts_edge_only import (
                ArchitectEDARTS as Architect, )
        else:
            from architect.architect_edarts import ArchitectEDARTS as Architect
    elif args.search.method in ["darts", "fdarts"]:
        from architect.architect_darts import ArchitectDARTS as Architect
    elif args.search.method == "egdas":
        from architect.architect_egdas import ArchitectEGDAS as Architect
    else:
        raise NotImplementedError

    if args.search.search_space in ["darts", "darts_small"]:
        from search_spaces.darts.model_search import DARTSNetwork as Network
    elif "nas-bench-201" in args.search.search_space:
        from search_spaces.nasbench_201.model_search import (
            NASBENCH201Network as Network, )
    elif args.search.search_space == "pcdarts":
        from search_spaces.pc_darts.model_search import PCDARTSNetwork as Network
    else:
        raise NotImplementedError

    if args.train.smooth_cross_entropy:
        criterion = train_utils.cross_entropy_with_label_smoothing
    else:
        criterion = nn.CrossEntropyLoss()

    num_train, num_classes, train_queue, valid_queue = train_utils.create_data_queues(
        args)

    print("dataset: {}, num_classes: {}".format(args.run.dataset, num_classes))

    model = Network(
        args.train.init_channels, num_classes, args.search.nodes,
        args.train.layers, criterion, **{
            "auxiliary": args.train.auxiliary,
            "search_space_name": args.search.search_space,
            "exclude_zero": args.search.exclude_zero,
            "track_running_stats": args.search.track_running_stats,
        })
    model = model.cuda()
    logging.info("param size = %fMB",
                 train_utils.count_parameters_in_MB(model))

    optimizer, scheduler = train_utils.setup_optimizer(model, args)

    # TODO: separate args by model, architect, etc
    # TODO: look into using hydra for config files
    architect = Architect(model, args, writer)

    # Try to load a previous checkpoint
    try:
        start_epochs, history = train_utils.load(save_dir, rng_seed, model,
                                                 optimizer, architect,
                                                 args.run.s3_bucket)
        scheduler.last_epoch = start_epochs - 1
        (
            num_train,
            num_classes,
            train_queue,
            valid_queue,
        ) = train_utils.create_data_queues(args)
    except Exception as e:
        logging.info(e)
        start_epochs = 0

    best_valid = 0
    for epoch in range(start_epochs, args.run.epochs):
        lr = scheduler.get_lr()[0]
        logging.info("epoch %d lr %e", epoch, lr)

        model.drop_path_prob = args.train.drop_path_prob * epoch / args.run.epochs

        # training
        train_acc, train_obj = train(
            args,
            train_queue,
            valid_queue,
            model,
            architect,
            criterion,
            optimizer,
            lr,
        )
        architect.baseline = train_obj
        architect.update_history()
        architect.log_vars(epoch, writer)

        if "update_lr_state" in dir(scheduler):
            scheduler.update_lr_state(train_obj)

        logging.info("train_acc %f", train_acc)

        # History tracking
        for vs in [("alphas", architect.alphas), ("edges", architect.edges)]:
            for ct in vs[1]:
                v = vs[1][ct]
                logging.info("{}-{}".format(vs[0], ct))
                logging.info(v)
        # Calling genotypes sets alphas to best arch for EGDAS and MEGDAS
        # so calling here before infer.
        genotype = architect.genotype()
        logging.info("genotype = %s", genotype)

        if not args.search.single_level:
            valid_acc, valid_obj = train_utils.infer(
                valid_queue,
                model,
                criterion,
                report_freq=args.run.report_freq,
                discrete=args.search.discrete,
            )
            if valid_acc > best_valid:
                best_valid = valid_acc
                best_genotype = architect.genotype()
            logging.info("valid_acc %f", valid_acc)

        train_utils.save(
            save_dir,
            epoch + 1,
            rng_seed,
            model,
            optimizer,
            architect,
            save_history=True,
            s3_bucket=args.run.s3_bucket,
        )

        scheduler.step()

    valid_acc, valid_obj = train_utils.infer(
        valid_queue,
        model,
        criterion,
        report_freq=args.run.report_freq,
        discrete=args.search.discrete,
    )
    if valid_acc > best_valid:
        best_valid = valid_acc
        best_genotype = architect.genotype()
    logging.info("valid_acc %f", valid_acc)

    if args.run.s3_bucket is not None:
        filename = "cnn_genotypes.txt"
        aws_utils.download_from_s3(filename, args.run.s3_bucket, filename)

        with open(filename, "a+") as f:
            f.write("\n")
            f.write("{}{}{}{} = {}".format(
                args.search.search_space,
                args.search.method,
                args.run.dataset.replace("-", ""),
                args.run.seed,
                best_genotype,
            ))
        aws_utils.upload_to_s3(filename, args.run.s3_bucket, filename)
        aws_utils.upload_to_s3(log, args.run.s3_bucket, log)
Example #4
def train(model, training_data, validation_data, optimizer, device, opt):
    ''' Start training '''

    # Use tensorboard to plot curves, e.g. perplexity, accuracy, learning rate
    if opt.use_tb:
        from torch.utils.tensorboard import SummaryWriter
        tb_writer = SummaryWriter(
            log_dir=os.path.join(opt.output_dir, 'tensorboard'))

    log_train_file = os.path.join(opt.output_dir, 'train.log')
    log_valid_file = os.path.join(opt.output_dir, 'valid.log')

    print('[Info] Training performance will be written to file: {} and {}'.
          format(log_train_file, log_valid_file))

    with open(log_train_file, 'w') as log_tf, open(log_valid_file,
                                                   'w') as log_vf:
        log_tf.write('epoch,loss,ppl,accuracy\n')
        log_vf.write('epoch,loss,ppl,accuracy\n')

    def print_performances(header, ppl, accu, start_time, lr):
        print('  - {header:12} ppl: {ppl: 8.5f}, accuracy: {accu:3.3f} %, lr: {lr:8.5f}, '\
              'elapse: {elapse:3.3f} min'.format(
                  header=f"({header})", ppl=ppl,
                  accu=100*accu, elapse=(time.time()-start_time)/60, lr=lr))

    #valid_accus = []
    valid_losses = []
    for epoch_i in range(opt.epoch):
        print('[ Epoch', epoch_i, ']')

        start = time.time()
        train_loss, train_accu = train_epoch(model,
                                             training_data,
                                             optimizer,
                                             opt,
                                             device,
                                             smoothing=opt.label_smoothing)
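        # Perplexity is exp(mean cross-entropy per token); the min(..., 100)
        # guard only prevents math.exp from overflowing on very large losses.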
        train_ppl = math.exp(min(train_loss, 100))
        # Current learning rate
        lr = optimizer._optimizer.param_groups[0]['lr']
        print_performances('Training', train_ppl, train_accu, start, lr)

        start = time.time()
        valid_loss, valid_accu = eval_epoch(model, validation_data, device,
                                            opt)
        valid_ppl = math.exp(min(valid_loss, 100))
        print_performances('Validation', valid_ppl, valid_accu, start, lr)

        valid_losses += [valid_loss]

        checkpoint = {
            'epoch': epoch_i,
            'settings': opt,
            'model': model.state_dict()
        }

        if opt.save_mode == 'all':
            model_name = 'model_accu_{accu:3.3f}.chkpt'.format(accu=100 *
                                                               valid_accu)
            torch.save(checkpoint, model_name)
        elif opt.save_mode == 'best':
            model_name = 'model.chkpt'
            if valid_loss <= min(valid_losses):
                torch.save(checkpoint, os.path.join(opt.output_dir,
                                                    model_name))
                print('    - [Info] The checkpoint file has been updated.')

        with open(log_train_file, 'a') as log_tf, open(log_valid_file,
                                                       'a') as log_vf:
            log_tf.write(
                '{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                    epoch=epoch_i,
                    loss=train_loss,
                    ppl=train_ppl,
                    accu=100 * train_accu))
            log_vf.write(
                '{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                    epoch=epoch_i,
                    loss=valid_loss,
                    ppl=valid_ppl,
                    accu=100 * valid_accu))

        if opt.use_tb:
            tb_writer.add_scalars('ppl', {
                'train': train_ppl,
                'val': valid_ppl
            }, epoch_i)
            tb_writer.add_scalars('accuracy', {
                'train': train_accu * 100,
                'val': valid_accu * 100
            }, epoch_i)
            tb_writer.add_scalar('learning_rate', lr, epoch_i)
Example #5
class PIRL(Task):
    def __init__(self,
                 backbone: BackboneBase,
                 projector: HeadBase,
                 memory: MemoryBank,
                 optimizer: torch.optim.Optimizer,
                 scheduler: torch.optim.lr_scheduler._LRScheduler,
                 loss_function: PIRLLoss,
                 loss_weight: float,
                 num_negatives: int,
                 metrics: dict,
                 checkpoint_dir: str,
                 write_summary: bool
                 ):
        super(PIRL, self).__init__()

        assert isinstance(memory, MemoryBank)
        assert isinstance(loss_function, PIRLLoss)

        self.backbone = backbone
        self.projector = projector
        self.memory = memory
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.loss_function = loss_function
        self.loss_weight = loss_weight
        self.num_negatives = num_negatives
        self.metrics = metrics if isinstance(metrics, dict) else None

        self.checkpoint_dir = checkpoint_dir
        os.makedirs(self.checkpoint_dir, exist_ok=True)

        self.writer = SummaryWriter(log_dir=self.checkpoint_dir) if write_summary else None

    def run(self, train_set, valid_set, epochs, batch_size, num_workers=0, device='cuda', **kwargs):  # pylint: disable=unused-argument

        assert isinstance(train_set, torch.utils.data.Dataset)
        assert isinstance(valid_set, torch.utils.data.Dataset)
        assert isinstance(epochs, int)
        assert isinstance(batch_size, int)
        assert isinstance(num_workers, int)
        assert device.startswith('cuda') or device == 'cpu'

        logger = kwargs.get('logger', None)

        self.backbone = self.backbone.to(device)
        self.projector = self.projector.to(device)

        train_loader = get_dataloader(train_set, batch_size, num_workers=num_workers)
        valid_loader = get_dataloader(valid_set, batch_size, num_workers=num_workers)

        # Initialize training memory
        if not self.memory.initialized:
            self.memory.initialize(self.backbone, self.projector, train_loader)

        with tqdm.tqdm(**get_tqdm_config(total=epochs, leave=True, color='blue')) as pbar:

            best_valid_loss = float('inf')
            best_epoch = 0

            for epoch in range(1, epochs + 1):

                # 0. Train & evaluate
                train_history = self.train(train_loader, device=device)
                valid_history = self.evaluate(valid_loader, device=device)

                # 1. Epoch history (loss)
                epoch_history = {
                    'loss': {
                        'train': train_history.get('loss'),
                        'valid': valid_history.get('loss')
                    },
                }

                # 2. Epoch history (other metrics if provided)
                if self.metrics is not None:
                    assert isinstance(self.metrics, dict)
                    for metric_name, _ in self.metrics.items():
                        epoch_history[metric_name] = {
                            'train': train_history.get(metric_name),
                            'valid': valid_history.get(metric_name),
                        }

                # 3. Tensorboard
                if self.writer is not None:
                    for metric_name, metric_dict in epoch_history.items():
                        self.writer.add_scalars(
                            main_tag=metric_name,
                            tag_scalar_dict=metric_dict,
                            global_step=epoch
                        )
                    if self.scheduler is not None:
                        self.writer.add_scalar(
                            tag='lr',
                            scalar_value=self.scheduler.get_last_lr()[0],
                            global_step=epoch
                        )

                # 4-1. Save model if it is the current best
                valid_loss = epoch_history['loss']['valid']
                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    best_epoch = epoch
                    self.save_checkpoint(self.best_ckpt, epoch=epoch, **epoch_history)
                    self.memory.save(os.path.join(os.path.dirname(self.best_ckpt), 'best_memory.pt'), epoch=epoch)

                # 4-2. Save intermediate models
                if isinstance(kwargs.get('save_every'), int):
                    if epoch % kwargs.get('save_every') == 0:
                        new_ckpt = os.path.join(self.checkpoint_dir, f'epoch_{epoch:04d}.loss_{valid_loss:.4f}.pt')  # No need to save memory
                        self.save_checkpoint(new_ckpt, epoch=epoch, **epoch_history)

                # 5. Update learning rate scheduler
                if self.scheduler is not None:
                    self.scheduler.step()

                # 6. Logging
                desc = make_epoch_description(
                    history=epoch_history,
                    current=epoch,
                    total=epochs,
                    best=best_epoch
                )
                pbar.set_description_str(desc)
                pbar.update(1)
                if logger is not None:
                    logger.info(desc)

        # 7. Save last model
        self.save_checkpoint(self.last_ckpt, epoch=epoch, **epoch_history)
        self.memory.save(os.path.join(os.path.dirname(self.last_ckpt), 'last_memory.pt'), epoch=epoch)

        # 8. Test model (optional)
        if 'test_set' in kwargs.keys():
            test_loader = get_dataloader(kwargs.get('test_set'), batch_size, num_workers=num_workers)
            self.test(test_loader, device=device, logger=logger)

    def train(self, data_loader: torch.utils.data.DataLoader, device: str, **kwargs):  # pylint: disable=unused-argument
        """Train function defined for a single epoch."""

        preds = []
        train_loss = 0.
        steps_per_epoch = len(data_loader)
        self._set_learning_phase(train=True)

        with tqdm.tqdm(**get_tqdm_config(steps_per_epoch, leave=False, color='green')) as pbar:
            for i, batch in enumerate(data_loader):

                j  = batch['idx']
                x  = batch['x'].to(device)
                x_t = batch['x_t'].to(device)
                z = self.predict(x)
                z_t = self.predict(x_t)

                m = self.memory.get_representations(j).to(device)
                negatives = self.memory.get_negatives(self.num_negatives, exclude=j)

                # Calculate loss
                loss_z, _ = self.loss_function(
                    anchors=m,
                    positives=z,
                    negatives=negatives,
                )
                loss_z_t, logits = self.loss_function(
                    anchors=m,
                    positives=z_t,
                    negatives=negatives,
                )
                loss = (1 - self.loss_weight)  * loss_z + self.loss_weight * loss_z_t

                # Backpropagation & update
                loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()
                self.memory.update(j, values=z.detach())

                train_loss += loss.detach().item()
                preds += [logits.detach().cpu()]

                desc = f" Batch: [{i+1:>4}/{steps_per_epoch:>4}]"
                desc += f" Loss: {train_loss/(i+1):.4f} "
                pbar.set_description_str(desc)
                pbar.update(1)

        out = {'loss': train_loss / steps_per_epoch}
        if self.metrics is not None:
            assert isinstance(self.metrics, dict)
            with torch.no_grad():
                preds = torch.cat(preds, dim=0)                          # (N, 1+ num_negatives)
                trues = torch.zeros(preds.size(0), device=preds.device)  # (N, )
                for metric_name, metric_function in self.metrics.items():
                    out[metric_name] = metric_function(preds, trues).item()

        return out

    def evaluate(self, data_loader: torch.utils.data.DataLoader, device: str, **kwargs):  # pylint: disable=unused-argument
        """Evaluate current model. A single pass through the given dataset."""

        preds = []
        valid_loss = 0.
        steps_per_epoch = len(data_loader)
        self._set_learning_phase(train=False)

        with torch.no_grad():
            for _, batch in enumerate(data_loader):

                j   = batch['idx']
                x   = batch['x'].to(device)
                x_t = batch['x_t'].to(device)
                z   = self.predict(x)
                z_t = self.predict(x_t)

                negatives = self.memory.get_negatives(self.num_negatives, exclude=j)

                # Note that no memory representation (m) exists for the validation data.
                loss, logits = self.loss_function(
                    anchors=z,
                    positives=z_t,
                    negatives=negatives,
                )

                valid_loss += loss.item()
                preds += [logits.detach().cpu()]

            out = {'loss': valid_loss / steps_per_epoch}
            if self.metrics is not None:
                assert isinstance(self.metrics, dict)
                preds = torch.cat(preds, dim=0)                          # (N, 1+ num_negatives)
                trues = torch.zeros(preds.size(0), device=preds.device)  # (N, )
                for metric_name, metric_function in self.metrics.items():
                    out[metric_name] = metric_function(preds, trues).item()

            return out

    def predict(self, x: torch.Tensor):
        return self.projector(self.backbone(x))

    def test(self, data_loader: torch.utils.data.DataLoader, device: str, logger = None):
        """Evaluate best model on test data."""

        def test_on_ckpt(ckpt: str):
            """Load checkpoint history and add test metric values."""
            self.load_model_from_checkpoint(ckpt)
            ckpt_history = self.load_history_from_checkpoint(ckpt)
            test_history = self.evaluate(data_loader, device)
            for metric_name, metric_val in test_history.items():
                ckpt_history[metric_name]['test'] = metric_val
            return ckpt_history

        def make_description(history: dict, prefix: str = ''):
            desc = f" {prefix} ({history['epoch']:>4d}): "
            for metric_name, metric_dict in history.items():
                if metric_name == 'epoch':
                    continue
                for k, v in metric_dict.items():
                    desc += f" {k}_{metric_name}: {v:.4f} |"
            return desc

        # 1. Best model
        best_history = test_on_ckpt(self.best_ckpt)
        desc = make_description(best_history, prefix='Best model')
        print(desc)
        if logger is not None:
            logger.info(desc)

        with open(os.path.join(self.checkpoint_dir, 'best_history.json'), 'w') as fp:
            json.dump(best_history, fp, indent=2)

        # 2. Last model
        last_history = test_on_ckpt(self.last_ckpt)
        desc = make_description(last_history, prefix='Last model')
        print(desc)
        if logger is not None:
            logger.info(desc)

        with open(os.path.join(self.checkpoint_dir, 'last_history.json'), 'w') as fp:
            json.dump(last_history, fp, indent=2)

    def _set_learning_phase(self, train=False):
        if train:
            self.backbone.train()
            self.projector.train()
        else:
            self.backbone.eval()
            self.projector.eval()

    def save_checkpoint(self, path: str, **kwargs):
        ckpt = {
            'backbone': self.backbone.state_dict(),
            'projector': self.projector.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'scheduler': self.scheduler.state_dict() if \
                self.scheduler is not None else None
        }
        if kwargs:
            ckpt.update(kwargs)
        torch.save(ckpt, path)

    def load_model_from_checkpoint(self, path: str):
        ckpt = torch.load(path)
        self.backbone.load_state_dict(ckpt['backbone'])
        self.projector.load_state_dict(ckpt['projector'])
        self.optimizer.load_state_dict(ckpt['optimizer'])
        if self.scheduler is not None:
            self.scheduler.load_state_dict(ckpt['scheduler'])

    def load_history_from_checkpoint(self, path: str):
        ckpt = torch.load(path)
        del ckpt['backbone']
        del ckpt['projector']
        del ckpt['optimizer']
        del ckpt['scheduler']
        return ckpt
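
# ---------------------------------------------------------------------------
# Illustrative sketch (not the original implementation): the PIRL task above
# assumes a MemoryBank with `initialized`, `initialize`, `get_representations`,
# `get_negatives`, `update` and `save`. A minimal momentum-updated feature
# bank with that interface could look roughly like this.
# ---------------------------------------------------------------------------
import torch
import torch.nn.functional as F


class SimpleMemoryBank:
    """Minimal feature memory bank (sketch for illustration only)."""

    def __init__(self, size, dim, momentum=0.5, device='cpu'):
        self.bank = torch.zeros(size, dim, device=device)
        self.momentum = momentum
        self.initialized = False

    @torch.no_grad()
    def initialize(self, backbone, projector, data_loader):
        # Fill the bank with one pass over the training data; assumes the
        # networks live on the same device as the bank.
        for batch in data_loader:
            z = projector(backbone(batch['x'].to(self.bank.device)))
            self.bank[batch['idx']] = F.normalize(z, dim=1)
        self.initialized = True

    def get_representations(self, idx):
        return self.bank[idx]

    def get_negatives(self, num_negatives, exclude):
        # Sample stored representations uniformly, excluding the batch indices.
        mask = torch.ones(self.bank.size(0), dtype=torch.bool)
        mask[exclude] = False
        candidates = mask.nonzero(as_tuple=False).squeeze(1)
        choice = candidates[torch.randint(len(candidates), (num_negatives,))]
        return self.bank[choice]

    @torch.no_grad()
    def update(self, idx, values):
        # Exponential moving average update with the fresh representations.
        values = F.normalize(values, dim=1).to(self.bank.device)
        self.bank[idx] = self.momentum * self.bank[idx] + (1.0 - self.momentum) * values

    def save(self, path, **meta):
        torch.save({'bank': self.bank.cpu(), **meta}, path)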
Example #6
def train_engine(__C):
    # define network
    net = get_network(__C)
    net = net.cuda()

    __C.batch_size = __C.batch_size // __C.gradient_accumulation_steps

    # define dataloader
    train_loader = get_train_loader(__C)
    test_loader = get_test_loader(__C)

    # define optimizer and loss function
    if __C.label_smoothing:
        loss_function = LabelSmoothingCrossEntropy(__C.smoothing)
    else:
        loss_function = nn.CrossEntropyLoss()

    # define optimizer and training parameters
    if __C.no_bias_decay:
        params = split_weights(net)
    else:
        params = net.parameters()
    optimizer = optim.SGD(params, lr=__C.lr, momentum=0.9, weight_decay=5e-4)

    # define optimizer scheduler
    # len(train_loader) is the number of steps in one epoch
    warmup_steps = __C.warmup_steps
    total_steps = __C.num_steps
    # change epoch-based milestones into step-based milestones
    __C.milestones = [m * len(train_loader) for m in __C.milestones]
    if __C.decay_type == 'multi_step':
        train_scheduler = WarmupMultiStepSchedule(__C,
                                                  optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    elif __C.decay_type == 'cosine':
        train_scheduler = WarmupCosineSchedule(optimizer,
                                               warmup_steps=warmup_steps,
                                               t_total=total_steps)
    elif __C.decay_type == 'linear':
        train_scheduler = WarmupLinearSchedule(optimizer,
                                               warmup_steps=warmup_steps,
                                               t_total=total_steps)

    # define tensorboard writer
    writer = SummaryWriter(
        log_dir=os.path.join(__C.tensorboard_log_dir, __C.model, __C.version))

    # define model save dir
    checkpoint_path = os.path.join(__C.ckpts_dir, __C.model, __C.version)
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
    checkpoint_path = os.path.join(checkpoint_path,
                                   '{net}-{global_step}-{type}.pth')

    # define log save dir
    log_path = os.path.join(__C.result_log_dir, __C.model)
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    log_path = os.path.join(log_path, __C.version + '.txt')

    # write the hyper parameters to log
    logfile = open(log_path, 'a+')
    logfile.write(str(__C))
    logfile.close()

    # Train!
    logger.info("  ***** Running training *****")
    logger.info("  Total optimization steps = %d", __C.num_steps)
    logger.info("  Instantaneous batch size per GPU = %d", __C.batch_size)
    logger.info("  Gradient Accumulation steps = %d",
                __C.gradient_accumulation_steps)

    net.zero_grad()
    losses = AverageMeter()
    global_step, best_acc = 0, 0
    while True:
        net.train()
        epoch_iterator = tqdm(train_loader,
                              desc="Training (X / X Steps) (loss=X.X)",
                              bar_format="{l_bar}{r_bar}",
                              dynamic_ncols=True)
        for step, (images, labels) in enumerate(epoch_iterator):
            images = images.cuda()
            labels = labels.cuda()
            train_outputs = net(images)
            loss = loss_function(train_outputs, labels)

            if __C.gradient_accumulation_steps > 1:
                loss = loss / __C.gradient_accumulation_steps
            loss.backward()

            if (step + 1) % __C.gradient_accumulation_steps == 0:
                losses.update(loss.item() * __C.gradient_accumulation_steps)
                torch.nn.utils.clip_grad_norm_(net.parameters(),
                                               __C.max_grad_norm)
                optimizer.step()
                train_scheduler.step()
                optimizer.zero_grad()
                global_step += 1

                epoch_iterator.set_description(
                    "Training (%d / %d Steps) (loss=%2.5f)" %
                    (global_step, total_steps, losses.val))

                writer.add_scalar("[Step] Train/loss",
                                  scalar_value=losses.val,
                                  global_step=global_step)
                writer.add_scalar("[Step] Train/lr",
                                  scalar_value=train_scheduler.get_lr()[0],
                                  global_step=global_step)

                if global_step % __C.eval_every == 0:
                    accuracy = valid(__C,
                                     model=net,
                                     writer=writer,
                                     test_loader=test_loader,
                                     global_step=global_step,
                                     loss_function=loss_function)
                    if best_acc < accuracy:
                        torch.save(
                            net.state_dict(),
                            checkpoint_path.format(net=__C.model,
                                                   global_step=global_step,
                                                   type='best'))
                        best_acc = accuracy
                    net.train()

                if global_step % total_steps == 0:
                    break
        losses.reset()
        if global_step % total_steps == 0:
            break

    writer.close()
    logger.info("Best Accuracy: \t%f" % best_acc)
    logger.info("End Training!")
Example #7
def main(argv):
    writer = SummaryWriter()

    torch.manual_seed(FLAGS.random_seed)

    np.random.seed(FLAGS.random_seed)
    if hasattr(torch, "cuda_is_available"):
        if torch.cuda_is_available():
            torch.cuda.manual_seed(FLAGS.random_seed)
            torch.backends.cudnn.enabled = True
            torch.backends.cudnn.benchmark = True

    device = torch.device(FLAGS.device)

    kwargs = {
        "num_workers": 1,
        "pin_memory": True
    } if FLAGS.device == "cuda" else {}
    train_loader = torch.utils.data.DataLoader(
        torchvision.datasets.MNIST(
            root=".",
            train=True,
            download=True,
            transform=torchvision.transforms.Compose([
                #                    torchvision.transforms.RandomCrop(size=[28,28], padding=4),
                torchvision.transforms.ToTensor(),
                torchvision.transforms.Normalize((0.1307, ), (0.3081, )),
            ]),
        ),
        batch_size=FLAGS.batch_size,
        shuffle=True,
        **kwargs,
    )
    test_loader = torch.utils.data.DataLoader(
        torchvision.datasets.MNIST(
            root=".",
            train=False,
            transform=torchvision.transforms.Compose([
                torchvision.transforms.ToTensor(),
                torchvision.transforms.Normalize((0.1307, ), (0.3081, )),
            ]),
        ),
        batch_size=FLAGS.batch_size,
        **kwargs,
    )

    label = os.environ.get("SLURM_JOB_ID", str(uuid.uuid4()))
    if FLAGS.prefix:
        path = f"runs/mnist/{FLAGS.prefix}/{label}"
    else:
        path = f"runs/mnist/{label}"

    os.makedirs(path, exist_ok=True)
    os.chdir(path)
    FLAGS.append_flags_into_file(f"flags.txt")

    input_features = 28 * 28
    output_features = 10

    model = LIFConvNet(
        input_features,
        FLAGS.seq_length,
        model=FLAGS.model,
        device=device,
        refrac=FLAGS.refrac,
        only_first_spike=FLAGS.only_first_spike,
    ).to(device)

    if FLAGS.optimizer == "sgd":
        optimizer = torch.optim.SGD(model.parameters(), lr=FLAGS.learning_rate)
    elif FLAGS.optimizer == "adam":
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=FLAGS.learning_rate)

    if FLAGS.only_output:
        optimizer = torch.optim.Adam(model.out.parameters(),
                                     lr=FLAGS.learning_rate)

    training_losses = []
    mean_losses = []
    test_losses = []
    accuracies = []

    for epoch in range(FLAGS.epochs):
        training_loss, mean_loss = train(model,
                                         device,
                                         train_loader,
                                         optimizer,
                                         epoch,
                                         writer=writer)
        test_loss, accuracy = test(model,
                                   device,
                                   test_loader,
                                   epoch,
                                   writer=writer)

        training_losses += training_loss
        mean_losses.append(mean_loss)
        test_losses.append(test_loss)
        accuracies.append(accuracy)

        max_accuracy = np.max(np.array(accuracies))

        if (epoch % FLAGS.model_save_interval == 0) and FLAGS.save_model:
            model_path = f"mnist-{epoch}.pt"
            save(
                model_path,
                model=model,
                optimizer=optimizer,
                epoch=epoch,
                is_best=accuracy >= max_accuracy,
            )

    np.save("training_losses.npy", np.array(training_losses))
    np.save("mean_losses.npy", np.array(mean_losses))
    np.save("test_losses.npy", np.array(test_losses))
    np.save("accuracies.npy", np.array(accuracies))
    model_path = f"mnist-final.pt"
    save(
        model_path,
        epoch=epoch,
        model=model,
        optimizer=optimizer,
        is_best=accuracy >= max_accuracy,
    )
    writer.close()
Example #8
def run(dataset, model, runs, epochs, lr, weight_decay, early_stopping,
        permute_masks=None, logger=None):
    
    batch_size = 30
    losses, accs, losses_wo, accs_wo = [], [], [], []
    perm = torch.randperm(dataset[0].num_nodes)

    for k in range(runs):
        model.to(device).reset_parameters()
        optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        best_val_perf = test_perf = 0
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        writer = SummaryWriter('runs/{}_{}'.format(k, tt))

        data = dataset[0]
        data = data.to(device)
        num_nodes = data.num_nodes
        

        if os.path.isfile('{}_{}.pkl'.format(str(dataset)[:-2], k)):
            data = pickle.load(open('{}_{}.pkl'.format(str(dataset)[:-2], k), 'rb'))
            
        else:
            pivot = int(num_nodes * 0.1)

            cold_mask_node = perm[list(range(k * pivot, (k + 1) * pivot))]
            data.test_masked_nodes = cold_mask_node
            train_node = range(num_nodes)
            train_node = [e for e in train_node if e not in cold_mask_node]  # or unknown
            data = test_edges(data, cold_mask_node)
            val_mask_node = random.sample(train_node, int(pivot*0.5))
            data.val_masked_nodes = torch.tensor(val_mask_node)
            data = val_edges(data, val_mask_node)
            train_node = [e for e in train_node if e not in val_mask_node] #or unknown]
            data.train_nodes = train_node
            data.train_masked_nodes = torch.tensor(random.sample(train_node,int(num_nodes*0.1)))
            data = train_edges(data, data.train_masked_nodes)

            with open('{}_{}.pkl'.format(str(dataset)[:-2], k), 'wb') as f:
                pickle.dump(data, f)
        print("{}-fold Result".format(k))
        train_node = data.train_nodes

        loss_wo, acc_wo = run_(data, dataset, data.train_edge_index, train_node, writer)
        losses_wo.append(loss_wo)
        accs_wo.append(acc_wo) 
        scheduler = StepLR(optimizer, step_size=2000, gamma=0.5)
        for epoch in range(2000):
            with torch.autograd.set_detect_anomaly(True):
                train_gan(dataset, data, writer) 



        for epoch in range(5000):
            with torch.autograd.set_detect_anomaly(True):
                train_loss = train(model, optimizer, data, epoch)
                scheduler.step()
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        loss, acc = evaluate(model, data)
        losses.append(loss)
        accs.append(acc)
        print('Val Loss: {:.4f}, Test Accuracy: {:.3f}'.format(loss,acc))
    losses, accs, losses_wo, accs_wo = tensor(losses), tensor(accs), tensor(losses_wo), tensor(accs_wo)
    print('w/o Mean Val Loss: {:.4f}, Mean Test Accuracy: {:.3f} ± {:.3f}'.
        format(losses_wo.mean().item(),
                accs_wo.mean().item(),
                accs_wo.std().item()
                ))
    print('Mean Val Loss: {:.4f}, Mean Test Accuracy: {:.3f} ± {:.3f}'.
        format(losses.mean().item(),
                accs.mean().item(),
                accs.std().item()
                ))
Example #9
    depth = self.depth
    data = ocnn.octree_property(octree, 'feature', depth)
    assert data.size(1) == self.channel_in

    pool_idx = [None] * (depth + 1)
    for i, d in enumerate(range(depth, 2, -1)):
      data = self.convs[i](data, octree)
      data, pool_idx[d] = self.pools[i](data, octree)

    for i, d in enumerate(range(2, depth)):
      data = self.deconvs[i](data, octree)
      data = self.unpools[i](data, pool_idx[d+1], octree)
    
    data = self.deconv(data, octree)
    data = self.header(data)
    return data


if __name__ == '__main__':
  from torch.utils.tensorboard import SummaryWriter

  writer = SummaryWriter('logs/segnet')
  octree = ocnn.octree_batch(ocnn.octree_samples(['octree_1', 'octree_2']))
  model = SegNet(depth=5, channel_in=3, nout=4)
  print(model)

  octree = octree.cuda()
  model = model.cuda()
  writer.add_graph(model, octree)
  writer.flush()
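
# Illustrative, self-contained sketch (separate from the octree example above):
# add_graph traces a model with an example input and stores the graph for the
# TensorBoard "Graphs" tab.
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter

toy_model = nn.Sequential(nn.Linear(8, 4), nn.ReLU(), nn.Linear(4, 2))
demo_writer = SummaryWriter('logs/graph_demo')
demo_writer.add_graph(toy_model, torch.randn(1, 8))
demo_writer.close()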
Example #10
    def train(self, dataset):
        if self.torch_manual_seed:
            torch.random.manual_seed(self.torch_manual_seed)
        # create PyTorch datasets
        dataset_train = dataset.create_torch_dataset(
            part='train',
            reshape=((1, ) + dataset.space[0].shape,
                     (1, ) + dataset.space[1].shape))

        dataset_validation = dataset.create_torch_dataset(
            part='validation',
            reshape=((1, ) + dataset.space[0].shape,
                     (1, ) + dataset.space[1].shape))

        # reset model before training
        self.init_model()

        criterion = torch.nn.MSELoss()
        self.init_optimizer(dataset_train=dataset_train)

        # create PyTorch dataloaders
        data_loaders = {
            'train':
            DataLoader(dataset_train,
                       batch_size=self.batch_size,
                       num_workers=self.num_data_loader_workers,
                       shuffle=True,
                       pin_memory=True),
            'validation':
            DataLoader(dataset_validation,
                       batch_size=self.batch_size,
                       num_workers=self.num_data_loader_workers,
                       shuffle=True,
                       pin_memory=True)
        }

        dataset_sizes = {
            'train': len(dataset_train),
            'validation': len(dataset_validation)
        }

        self.init_scheduler(dataset_train=dataset_train)
        if self.scheduler is not None:
            schedule_every_batch = isinstance(self.scheduler,
                                              (CyclicLR, OneCycleLR))

        best_model_wts = deepcopy(self.model.state_dict())
        best_psnr = 0

        if self.log_dir is not None:
            writer = SummaryWriter(log_dir=self.log_dir, max_queue=0)
            validation_samples = dataset.get_data_pairs(
                'validation', self.log_num_validation_samples)

        self.model.to(self.device)
        self.model.train()

        for epoch in range(self.epochs):
            # Each epoch has a training and validation phase
            for phase in ['train', 'validation']:
                if phase == 'train':
                    self.model.train()  # Set model to training mode
                else:
                    self.model.eval()  # Set model to evaluate mode

                running_psnr = 0.0
                running_loss = 0.0
                running_size = 0
                with tqdm(data_loaders[phase],
                          desc='epoch {:d}'.format(epoch + 1),
                          disable=not self.show_pbar) as pbar:
                    for inputs, labels in pbar:
                        if self.normalize_by_opnorm:
                            inputs = (1. / self.opnorm) * inputs
                        inputs = inputs.to(self.device)
                        labels = labels.to(self.device)

                        # zero the parameter gradients
                        self.optimizer.zero_grad()

                        # forward
                        # track gradients only if in train phase
                        with torch.set_grad_enabled(phase == 'train'):
                            outputs = self.model(inputs)
                            loss = criterion(outputs, labels)

                            # backward + optimize only if in training phase
                            if phase == 'train':
                                loss.backward()
                                torch.nn.utils.clip_grad_norm_(
                                    self.model.parameters(), max_norm=1)
                                self.optimizer.step()
                                if (self.scheduler is not None
                                        and schedule_every_batch):
                                    self.scheduler.step()

                        for i in range(outputs.shape[0]):
                            labels_ = labels[i, 0].detach().cpu().numpy()
                            outputs_ = outputs[i, 0].detach().cpu().numpy()
                            running_psnr += PSNR(outputs_, labels_)

                        # statistics
                        running_loss += loss.item() * outputs.shape[0]
                        running_size += outputs.shape[0]

                        pbar.set_postfix({
                            'phase': phase,
                            'loss': running_loss / running_size,
                            'psnr': running_psnr / running_size
                        })
                        if self.log_dir is not None and phase == 'train':
                            step = (epoch * ceil(
                                dataset_sizes['train'] / self.batch_size) +
                                    ceil(running_size / self.batch_size))
                            writer.add_scalar(
                                'loss/{}'.format(phase),
                                torch.tensor(running_loss / running_size),
                                step)
                            writer.add_scalar(
                                'psnr/{}'.format(phase),
                                torch.tensor(running_psnr / running_size),
                                step)

                    if self.scheduler is not None and not schedule_every_batch:
                        self.scheduler.step()

                    epoch_loss = running_loss / dataset_sizes[phase]
                    epoch_psnr = running_psnr / dataset_sizes[phase]

                    if self.log_dir is not None and phase == 'validation':
                        step = (epoch + 1) * ceil(
                            dataset_sizes['train'] / self.batch_size)
                        writer.add_scalar('loss/{}'.format(phase), epoch_loss,
                                          step)
                        writer.add_scalar('psnr/{}'.format(phase), epoch_psnr,
                                          step)

                    # deep copy the model (if it is the best one seen so far)
                    if phase == 'validation' and epoch_psnr > best_psnr:
                        best_psnr = epoch_psnr
                        best_model_wts = deepcopy(self.model.state_dict())
                        if self.save_best_learned_params_path is not None:
                            self.save_learned_params(
                                self.save_best_learned_params_path)

                if (phase == 'validation' and self.log_dir is not None
                        and self.log_num_validation_samples > 0):
                    with torch.no_grad():
                        val_images = []
                        for (y, x) in validation_samples:
                            y = torch.from_numpy(np.asarray(y))[None, None].to(
                                self.device)
                            x = torch.from_numpy(np.asarray(x))[None, None].to(
                                self.device)
                            reco = self.model(y)
                            reco -= torch.min(reco)
                            reco /= torch.max(reco)
                            val_images += [reco, x]
                        writer.add_images(
                            'validation_samples',
                            torch.cat(val_images), (epoch + 1) *
                            (ceil(dataset_sizes['train'] / self.batch_size)),
                            dataformats='NCWH')

        print('Best val psnr: {:4f}'.format(best_psnr))
        self.model.load_state_dict(best_model_wts)
Example #11
def run(data_path, model_path, stump_type, gpu_name, batch_size, num_epochs, num_workers):
    """
    Main method to train, evaluate and test the multiple-instance-learning approach to classify the Paxos dataset into refer- and nonreferable retinopathy.
    :param data_path: Absolute path to the dataset. The folder should have folders for training (train), evaluation (val) and corresponding label files
    :param model_path: Absolute path to the pretrained model
    :param gpu_name: ID of the gpu (e.g. cuda0)
    :param batch_size: Batch size
    :param num_epochs: Maximum number of training epochs
    :param num_workers: Number of threads used for data loading
    :return: f1-score for the evaluation (or test) set
    """
    device = torch.device(gpu_name if torch.cuda.is_available() else "cpu")
    print(f'Using device {device}')
    hyperparameter = {
        'data': os.path.basename(os.path.normpath(data_path)),
        'learning_rate': 1e-4,
        'weight_decay': 1e-3,
        'num_epochs': num_epochs,
        'batch_size': batch_size,
        'optimizer': optim.Adam.__name__,
        'freeze': 0.0,
        'balance': 0.35,
        'image_size': 450,
        'crop_size': 399,
        'pretraining': True,
        'preprocessing': False,
        'stump': stump_type,
        'attention_neurons': 738,
        'bag_size': 75,
        'attention': 'normal',          # normal / gated
        'pooling': 'max'                # avg / max / none
    }
    os.mkdir(RES_PATH)
    with open(os.path.join(RES_PATH, 'hp.txt'), 'w') as f:
        print(hyperparameter, file=f)
    aug_pipeline_train = get_training_pipeline(hyperparameter['image_size'], hyperparameter['crop_size'])
    aug_pipeline_val = get_validation_pipeline(hyperparameter['image_size'], hyperparameter['crop_size'])

    hyperparameter_str = str(hyperparameter).replace(', \'', ',\n \'')[1:-1]
    print(f'Hyperparameter info:\n {hyperparameter_str}')

    loaders = prepare_dataset(data_path, hyperparameter, aug_pipeline_train, aug_pipeline_val, num_workers)
    net = prepare_network(model_path, hyperparameter, device)

    optimizer_ft = optim.Adam([{'params': net.feature_extractor_part1.parameters(), 'lr': 1e-5},
                               {'params': net.feature_extractor_part2.parameters()}, #, 'lr': 1e-5},
                               {'params': net.attention.parameters()},
                               {'params': net.att_v.parameters()},
                               {'params': net.att_u.parameters()},
                               {'params': net.att_weights.parameters()},
                               {'params': net.classifier.parameters()}], lr=hyperparameter['learning_rate'],
                              weight_decay=hyperparameter['weight_decay'])
    # optimizer_ft = optim.Adam(net.parameters(), lr=hyperparameter['learning_rate'], weight_decay=hyperparameter['weight_decay'])
    criterion = nn.CrossEntropyLoss()
    plateau_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer_ft, mode='min', factor=0.1, patience=15, verbose=True)

    desc = f'_paxos_mil_{str("_".join([k[0] + str(hp) for k, hp in hyperparameter.items()]))}'
    writer = SummaryWriter(comment=desc)

    best_model = train_model(net, criterion, optimizer_ft, plateau_scheduler, loaders, device, writer,
                                  hyperparameter, num_epochs=hyperparameter['num_epochs'], description=desc)
    _, f1 = validate(best_model, criterion, loaders[1], device, writer, hyperparameter, hyperparameter['num_epochs'], calc_roc=True)
    return f1
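
# Illustrative, self-contained sketch (separate from the example above): a
# hyperparameter dict like the one built in run() can also be logged with
# SummaryWriter.add_hparams, which records the hparams next to final metrics.
# Values must be int, float, str, bool or tensors.
from torch.utils.tensorboard import SummaryWriter

demo_hparams = {'learning_rate': 1e-4, 'weight_decay': 1e-3, 'optimizer': 'Adam'}
hparam_writer = SummaryWriter('logs/hparams_demo')
hparam_writer.add_hparams(demo_hparams, {'hparam/val_f1': 0.0})
hparam_writer.close()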
Example #12
def main():
    parser = argparse.ArgumentParser(
        description='Test',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('data', metavar='DATA', help='path to file')

    parser.add_argument('--tb-save-path',
                        dest='tb_save_path',
                        metavar='PATH',
                        default='../checkpoints/',
                        help='tensorboard checkpoints path')

    parser.add_argument('--lgd-weight',
                        dest='lgd_weight',
                        metavar='PATH',
                        default=None,
                        help='pretrained weight for LGD model')
    parser.add_argument('--sdf-weight',
                        dest='sdf_weight',
                        metavar='PATH',
                        default=None,
                        help='pretrained weight for SDF model')

    parser.add_argument('--batchsize',
                        dest='batchsize',
                        type=int,
                        metavar='BATCHSIZE',
                        default=1,
                        help='batch size')
    parser.add_argument('--epoch',
                        dest='epoch',
                        type=int,
                        metavar='EPOCH',
                        default=200,
                        help='epochs for adam and lgd')

    parser.add_argument('--width',
                        dest='width',
                        type=int,
                        metavar='WIDTH',
                        default=128,
                        help='width for rendered image')
    parser.add_argument('--height',
                        dest='height',
                        type=int,
                        metavar='HEIGHT',
                        default=128,
                        help='height for rendered image')

    parser.add_argument('--outfile',
                        dest='outfile',
                        metavar='OUTFILE',
                        help='output file')

    args = parser.parse_args()

    width = args.width
    height = args.height
    epoch = args.epoch

    writer = SummaryWriter(args.tb_save_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # create models
    model = Siren(in_features=3,
                  out_features=1,
                  hidden_features=256,
                  hidden_layers=5,
                  outermost_linear=True).to(device)

    if args.sdf_weight is not None:
        try:
            model.load_state_dict(torch.load(args.sdf_weight))
        except Exception:
            print("Couldn't load pretrained weight: " + args.sdf_weight)

    model.eval()
    for param in model.parameters():
        param.requires_grad = False

    # load
    mm = torch.tensor([-0.1, -0.1, 0.1], device=device, dtype=torch.float)
    mx = torch.tensor([0.1, 0.1, 0.1], device=device, dtype=torch.float)
    wh = torch.tensor([width, height, 1], device=device, dtype=torch.int)

    rot = torch.tensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
                       device=device,
                       dtype=torch.float)
    trans = torch.tensor([[0, 0, -0.8]], device=device, dtype=torch.float)

    p_distribution = GridDataset(mm, mx, wh)

    d = torch.zeros((width * height, 1), device=device,
                    dtype=torch.float).requires_grad_(True)

    sampler = nn.Sequential(UniformSample(width * height), PointTransform(rot))

    p = sampler(p_distribution)

    ds = ObjDataset(args.data)
    objsampler = ObjUniformSample(1000)
    x_preview = (objsampler(ds)['p']).to(device)

    d2_eval = lambda d: torch.pow(d, 2).mean()
    sdf_eval = lambda d: torch.pow(model(d * ray_n + p + trans)[0], 2).sum(
        dim=1).mean()
    d_eval = lambda d: (torch.tanh(d) - 1.).mean() * 0.5

    d2_eval_list = lambda d: d2_eval(d[0])
    sdf_eval_list = lambda d: sdf_eval(d[0])
    d_eval_list = lambda d: d_eval(d[0])

    writer.add_mesh("preview",
                    torch.cat([(p + trans), x_preview]).unsqueeze(0),
                    global_step=0)

    print("lgd")
    hidden = None

    lgd = LGD(1, 3, k=10).to(device)

    if args.lgd_weight is not None:
        try:
            lgd.load_state_dict(torch.load(args.lgd_weight))
        except Exception:
            print("Couldn't load pretrained weight: " + args.lgd_weight)

    ray_n = torch.tensor([[0, 0, 1]], device=device,
                         dtype=torch.float).repeat(width * height, 1)

    writer.add_mesh("raymarch_LGD",
                    torch.cat([(d * ray_n + trans + p),
                               x_preview]).unsqueeze(0),
                    global_step=0)
    # test LGD
    lgd.eval()
    for i in range(epoch):
        # evaluate losses
        #loss = sdf_eval(x).mean()
        # update x
        [d], hidden = lgd.step(d, [d2_eval_list, sdf_eval_list, d_eval_list],
                               hidden, width * height)
        d = detach_var(d)
        hidden = detach_var(hidden)

        if i % 5 == 0:
            writer.add_mesh("raymarch_LGD",
                            torch.cat([(d * ray_n + trans + p),
                                       x_preview]).unsqueeze(0),
                            global_step=i + 1)

    writer.close()
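
# Illustrative, self-contained sketch (separate from the example above):
# add_mesh, used for the ray-marching previews, expects vertices of shape
# (batch, num_points, 3); colors and faces are optional.
import torch
from torch.utils.tensorboard import SummaryWriter

points = torch.rand(1, 100, 3)  # one point cloud with 100 random vertices
mesh_writer = SummaryWriter('logs/mesh_demo')
mesh_writer.add_mesh('random_point_cloud', vertices=points, global_step=0)
mesh_writer.close()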
Example #13
class DefaultTrainer:
    """
    Implements general image classification with pruning
    """
    def __init__(self, model: GeneralModel, loss: GeneralModel,
                 optimizer: Optimizer, device, arguments: argparse.Namespace,
                 train_loader: DataLoader, test_loader: DataLoader,
                 metrics: Metrics, criterion: GeneralModel):

        self._test_loader = test_loader
        self._train_loader = train_loader
        self._loss_function = loss
        self._model = model
        self._arguments = arguments
        self._optimizer = optimizer
        self._device = device
        self._global_steps = 0
        self.out = metrics.log_line
        DATA_MANAGER.set_date_stamp(addition=arguments.run_name)
        self._writer = SummaryWriter(
            os.path.join(DATA_MANAGER.directory, RESULTS_DIR,
                         DATA_MANAGER.stamp, SUMMARY_DIR))
        self._metrics: Metrics = metrics
        self._metrics.init_training(self._writer)
        self._acc_buffer = []
        self._loss_buffer = []
        self._elapsed_buffer = []
        self._criterion = criterion

        self.ts = None

        batch = next(iter(self._test_loader))
        self.saliency = Saliency(model, device, batch[0][:8])
        self._metrics.write_arguments(arguments)
        self._flopcounter = FLOPCounter(model,
                                        batch[0][:8],
                                        self._arguments.batch_size,
                                        device=device)
        self._metrics.model_to_tensorboard(model, timestep=-1)

    def _batch_iteration(self,
                         x: torch.Tensor,
                         y: torch.Tensor,
                         train: bool = True):
        """ one iteration of forward-backward """

        # unpack
        x, y = x.to(self._device).float(), y.to(self._device)

        # update metrics
        self._metrics.update_batch(train)

        # record time
        if "cuda" in str(self._device):
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()

        # forward pass
        accuracy, loss, out = self._forward_pass(x, y, train=train)

        # backward pass
        if train:
            self._backward_pass(loss)

        # record time
        if "cuda" in str(self._device):
            end.record()
            torch.cuda.synchronize(self._device)
            time = start.elapsed_time(end)
        else:
            time = 0

        # detach tensors from the graph so intermediate buffers can be freed
        # (plain detach() returns a new tensor that is discarded; detach_() works in place)
        for tens in [out, y, x, loss]:
            tens.detach_()

        return accuracy, loss.item(), time

    def _forward_pass(self,
                      x: torch.Tensor,
                      y: torch.Tensor,
                      train: bool = True):
        """ implementation of a forward pass """

        if train:
            self._optimizer.zero_grad()
            if self._model.is_maskable:
                self._model.apply_weight_mask()

        out = self._model(x).squeeze()
        loss = self._loss_function(output=out,
                                   target=y,
                                   weight_generator=self._model.parameters(),
                                   model=self._model,
                                   criterion=self._criterion)
        accuracy = self._get_accuracy(out, y)
        return accuracy, loss, out

    def _backward_pass(self, loss):
        """ implementation of a backward pass """

        loss.backward()
        self._model.insert_noise_for_gradient(self._arguments.grad_noise)
        if self._arguments.grad_clip > 0:
            torch.nn.utils.clip_grad_norm_(self._model.parameters(),
                                           self._arguments.grad_clip)
        self._optimizer.step()
        if self._model.is_maskable:
            self._model.apply_weight_mask()

    def _epoch_iteration(self):
        """ implementation of an epoch """

        self.out("\n")

        self._acc_buffer, self._loss_buffer = self._metrics.update_epoch()

        for batch_num, batch in enumerate(self._train_loader):
            self.out(f"\rTraining... {batch_num}/{len(self._train_loader)}",
                     end='')

            if self._model.is_tracking_weights:
                self._model.save_prev_weights()

            acc, loss, elapsed = self._batch_iteration(*batch,
                                                       self._model.training)

            if self._model.is_tracking_weights:
                self._model.update_tracked_weights(self._metrics.batch_train)

            self._acc_buffer.append(acc)
            self._loss_buffer.append(loss)
            self._elapsed_buffer.append(elapsed)

            self._log(batch_num)

            self._check_exit_conditions_epoch_iteration()

        self.out("\n")

    def _log(self, batch_num: int):
        """ logs to terminal and tensorboard if the time is right"""

        if (batch_num % self._arguments.eval_freq) == 0:
            # validate on test and train set
            train_acc, train_loss = np.mean(self._acc_buffer), np.mean(
                self._loss_buffer)
            test_acc, test_loss, test_elapsed = self.validate()
            self._elapsed_buffer += test_elapsed

            # log metrics
            self._add_metrics(test_acc, test_loss, train_acc, train_loss)

            # reset for next log
            self._acc_buffer, self._loss_buffer, self._elapsed_buffer = [], [], []

            # print to terminal
            self.out(self._metrics.printable_last)

    def validate(self):
        """ validates the model on test set """

        self.out("\n")

        # init test mode
        self._model.eval()
        cum_acc, cum_loss, cum_elapsed = [], [], []

        with torch.no_grad():
            for batch_num, batch in enumerate(self._test_loader):
                acc, loss, elapsed = self._batch_iteration(
                    *batch, self._model.training)
                cum_acc.append(acc)
                cum_loss.append(loss)
                cum_elapsed.append(elapsed)
                self.out(
                    f"\rEvaluating... {batch_num}/{len(self._test_loader)}",
                    end='')
        self.out("\n")

        # put back into train mode
        self._model.train()

        return float(np.mean(cum_acc)), float(np.mean(cum_loss)), cum_elapsed

    def _add_metrics(self, test_acc, test_loss, train_acc, train_loss):
        """
        save metrics
        """

        sparsity = self._model.pruned_percentage
        sparsity_index = 2 * ((sparsity * test_acc) /
                              (1e-8 + sparsity + test_acc))

        flops_per_sample, total_seen = self._flopcounter.count_flops(
            self._metrics.batch_train)

        self._metrics.add(train_acc, key="acc/train")
        self._metrics.add(train_loss, key="loss/train")
        self._metrics.add(test_loss, key="loss/test")
        self._metrics.add(test_acc, key="acc/test")
        self._metrics.add(sparsity, key="sparse/weight")
        self._metrics.add(self._model.structural_sparsity, key="sparse/node")
        self._metrics.add(sparsity_index, key="sparse/hm")
        self._metrics.add(np.log(self._model.compressed_size),
                          key="sparse/log_disk_size")
        self._metrics.add(np.mean(self._elapsed_buffer), key="time/gpu_time")
        self._metrics.add(int(flops_per_sample), key="time/flops_per_sample")
        self._metrics.add(np.log10(total_seen), key="time/flops_log_cum")
        if torch.cuda.is_available():
            self._metrics.add(torch.cuda.memory_allocated(0),
                              key="cuda/ram_footprint")
        self._metrics.timeit()

    def train(self):
        """ main training function """

        # setup data output directories:
        setup_directories()
        save_codebase_of_run(self._arguments)
        DATA_MANAGER.write_to_file(
            os.path.join(RESULTS_DIR, DATA_MANAGER.stamp, OUTPUT_DIR,
                         "calling_command.txt"), str(" ".join(sys.argv)))

        # data gathering
        epoch = self._metrics._epoch

        self._model.train()

        try:

            self.out(f"{PRINTCOLOR_BOLD}Started training{PRINTCOLOR_END}")

            if self._arguments.skip_first_plot:
                self._metrics.handle_weight_plotting(0, trainer_ns=self)

            # if snip we prune before training
            if self._arguments.prune_criterion in SINGLE_SHOT:
                self._criterion.prune(self._arguments.pruning_limit,
                                      train_loader=self._train_loader,
                                      manager=DATA_MANAGER)
                if self._arguments.prune_criterion in STRUCTURED_SINGLE_SHOT:
                    self._optimizer = find_right_model(
                        OPTIMS,
                        self._arguments.optimizer,
                        params=self._model.parameters(),
                        lr=self._arguments.learning_rate,
                        weight_decay=self._arguments.l2_reg)
                    self._metrics.model_to_tensorboard(self._model,
                                                       timestep=epoch)

            # do training
            for epoch in range(epoch, self._arguments.epochs + epoch):
                self.out(
                    f"\n\n{PRINTCOLOR_BOLD}EPOCH {epoch} {PRINTCOLOR_END} \n\n"
                )

                # do epoch
                self._epoch_iteration()

                # plotting
                if (epoch % self._arguments.plot_weights_freq
                    ) == 0 and self._arguments.plot_weights_freq > 0:
                    self._metrics.handle_weight_plotting(epoch,
                                                         trainer_ns=self)

                # do all related to pruning
                self._handle_pruning(epoch)

                # save what needs to be saved
                self._handle_backing_up(epoch)

            if self._arguments.skip_first_plot:
                self._metrics.handle_weight_plotting(epoch + 1,
                                                     trainer_ns=self)

            # example last save
            save_models([self._model, self._metrics], "finished")

        except KeyboardInterrupt as e:
            self.out(f"Killed by user: {e} at {time.time()}")
            save_models([self._model, self._metrics],
                        f"KILLED_at_epoch_{epoch}")
            sys.stdout.flush()
            DATA_MANAGER.write_to_file(
                os.path.join(RESULTS_DIR, DATA_MANAGER.stamp, OUTPUT_DIR,
                             "log.txt"), self._metrics.log)
            self._writer.close()
            exit(69)
        except Exception as e:
            self._writer.close()
            report_error(e, self._model, epoch, self._metrics)

        # flush prints
        sys.stdout.flush()
        DATA_MANAGER.write_to_file(
            os.path.join(RESULTS_DIR, DATA_MANAGER.stamp, OUTPUT_DIR,
                         "log.txt"), self._metrics.log)
        self._writer.close()

    def _handle_backing_up(self, epoch):
        if (epoch % self._arguments.save_freq) == 0 and epoch > 0:
            self.out("\nSAVING...\n")
            save_models([self._model, self._metrics], f"save_at_epoch_{epoch}")
        sys.stdout.flush()
        DATA_MANAGER.write_to_file(
            os.path.join(RESULTS_DIR, DATA_MANAGER.stamp, OUTPUT_DIR,
                         "log.txt"), self._metrics.log)

    def _handle_pruning(self, epoch):
        if self._is_pruning_time(epoch):
            if self._is_not_finished_pruning():
                self.out("\nPRUNING...\n")
                self._criterion.prune(percentage=self._arguments.pruning_rate,
                                      train_loader=self._train_loader,
                                      manager=DATA_MANAGER)
                if self._arguments.prune_criterion in DURING_TRAINING:
                    self._optimizer = find_right_model(
                        OPTIMS,
                        self._arguments.optimizer,
                        params=self._model.parameters(),
                        lr=self._arguments.learning_rate,
                        weight_decay=self._arguments.l2_reg)
                    self._metrics.model_to_tensorboard(self._model,
                                                       timestep=epoch)
                if self._model.is_rewindable:
                    self.out("rewinding weights to checkpoint...\n")
                    self._model.do_rewind()
            if self._model.is_growable:
                self.out("growing too...\n")
                self._criterion.grow(self._arguments.growing_rate)

        if self._is_checkpoint_time(epoch):
            self.out(f"\nCreating weights checkpoint at epoch {epoch}\n")
            self._model.save_rewind_weights()

    def _is_not_finished_pruning(self):
        return self._arguments.pruning_limit > self._model.pruned_percentage \
               or \
               (
                       self._arguments.prune_criterion in DURING_TRAINING
                       and
                       self._arguments.pruning_limit > self._model.structural_sparsity
               )

    @staticmethod
    def _get_accuracy(output, y):
        predictions = output.argmax(dim=-1, keepdim=True).view_as(y)
        correct = y.eq(predictions).sum().item()
        return correct / output.shape[0]

    def _is_checkpoint_time(self, epoch: int):
        return epoch == self._arguments.rewind_to and self._model.is_rewindable

    def _is_pruning_time(self, epoch: int):
        if self._arguments.prune_criterion == "EmptyCrit":
            return False
        epoch -= self._arguments.prune_delay
        return (epoch % self._arguments.prune_freq) == 0 and \
               epoch > 0 and \
               self._model.is_maskable and \
               self._arguments.prune_criterion not in SINGLE_SHOT

    def _check_exit_conditions_epoch_iteration(self, patience=1):

        time_passed = datetime.now() - DATA_MANAGER.actual_date
        # check if runtime is expired
        if (time_passed.total_seconds() > (self._arguments.max_training_minutes * 60)) \
                and \
                self._arguments.max_training_minutes > 0:
            raise KeyboardInterrupt(
                f"Process killed because {self._arguments.max_training_minutes} minutes passed "
                f"since {DATA_MANAGER.actual_date}. Time now is {datetime.now()}"
            )
        if patience == 0:
            raise NotImplementedError(
                "feature to implement",
                KeyboardInterrupt("Process killed because patience is zero"))
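
The GeneralModel helpers used by _backward_pass (for example insert_noise_for_gradient) are defined outside this excerpt. A plausible sketch, assuming the method simply perturbs every existing gradient with zero-mean Gaussian noise scaled by the --grad_noise argument (written here as a free function for illustration):

import torch

def insert_noise_for_gradient(model, noise_scale: float):
    # assumed behaviour: add scaled Gaussian noise to each gradient in place
    if noise_scale <= 0:
        return
    for param in model.parameters():
        if param.grad is not None:
            param.grad.add_(torch.randn_like(param.grad) * noise_scale)
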
Beispiel #14
0
def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [{
        "params": [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        "weight_decay":
        args.weight_decay
    }, {
        "params": [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        "weight_decay":
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = batch[2] if args.model_type in [
                    "bert", "xlnet"
                ] else None  # XLM and RoBERTa don't use segment_ids

            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results, _ = evaluate(args,
                                              model,
                                              tokenizer,
                                              labels,
                                              pad_token_label_id,
                                              mode="dev")
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, "module"
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
def main():
    global best_prec1, args

    args = parse()
    print("opt_level = {}".format(args.opt_level))
    print("keep_batchnorm_fp32 = {}".format(args.keep_batchnorm_fp32),
          type(args.keep_batchnorm_fp32))
    print("loss_scale = {}".format(args.loss_scale), type(args.loss_scale))
    print("\nCUDNN VERSION: {}\n".format(torch.backends.cudnn.version()))

    cudnn.benchmark = True
    best_prec1 = 0

    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        # this will be 0-3 if you have 4 GPUs on curr node
        args.gpu = args.local_rank
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        # this is the total # of GPUs across all nodes
        # if using 2 nodes with 4 GPUs each, world size is 8
        args.world_size = torch.distributed.get_world_size()
    print("### global rank of curr node: {}".format(
        torch.distributed.get_rank()))

    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."

    if args.channels_last:
        memory_format = torch.channels_last
    else:
        memory_format = torch.contiguous_format

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.sync_bn:
        print("using apex synced BN")
        model = apex.parallel.convert_syncbn_model(model)

    model = model.cuda()

    # initialize tb logging, you don't want to "double log"
    # so only allow GPU0 to launch tb
    if torch.distributed.get_rank() == 0:
        writer = SummaryWriter(comment="_{}_gpux{}_b{}_cpu{}_opt{}".format(
            args.arch, args.world_size, args.batch_size, args.workers,
            args.opt_level))

    # Scale init learning rate based on global batch size
    args.lr = args.lr * float(args.batch_size * args.world_size) / 256.
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Initialize Amp.  Amp accepts either values or strings for the optional override arguments,
    # for convenient interoperation with argparse.
    model, optimizer = amp.initialize(
        model,
        optimizer,
        opt_level=args.opt_level,
        keep_batchnorm_fp32=args.keep_batchnorm_fp32,
        loss_scale=args.loss_scale)

    # For distributed training, wrap the model with apex.parallel.DistributedDataParallel.
    # This must be done AFTER the call to amp.initialize.  If model = DDP(model) is called
    # before model, ... = amp.initialize(model, ...), the call to amp.initialize may alter
    # the types of model's parameters in a way that disrupts or destroys DDP's allreduce hooks.
    if args.distributed:
        # By default, apex.parallel.DistributedDataParallel overlaps communication with
        # computation in the backward pass.
        # model = DDP(model)
        # delay_allreduce delays all communication to the end of the backward pass.
        model = DDP(model, delay_allreduce=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    # Optionally resume from a checkpoint
    if args.resume:
        # Use a local scope to avoid dangling references
        def resume():
            if os.path.isfile(args.resume):
                print("=> loading checkpoint '{}'".format(args.resume))
                checkpoint = torch.load(
                    args.resume,
                    map_location=lambda storage, loc: storage.cuda(args.gpu))
                args.start_epoch = checkpoint['epoch']
                best_prec1 = checkpoint['best_prec1']
                model.load_state_dict(checkpoint['state_dict'])
                optimizer.load_state_dict(checkpoint['optimizer'])
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
            else:
                print("=> no checkpoint found at '{}'".format(args.resume))

        resume()

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')

    if args.arch == "inception_v3":
        raise RuntimeError(
            "Currently, inception_v3 is not supported by this example.")
    else:
        crop_size = 224
        val_size = 256

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(crop_size),
            transforms.RandomHorizontalFlip(),
            # transforms.ToTensor(), Too slow
            # normalize,
        ]))
    val_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(val_size),
            transforms.CenterCrop(crop_size),
        ]))

    # makes sure that each process gets a different slice of the training data
    # during distributed training
    train_sampler = None
    val_sampler = None
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset)

    collate_fn = lambda b: fast_collate(b, memory_format)

    # when the distributed sampler is used, it handles shuffling, so shuffle is disabled
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               collate_fn=collate_fn)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             collate_fn=collate_fn)
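
    # Note: fast_collate is not shown in this excerpt. Assuming it follows the
    # usual apex ImageNet recipe, it stacks the PIL images straight into a uint8
    # NCHW tensor with the requested memory format, which is why ToTensor() and
    # normalize are commented out in the transforms above. Roughly:
    #
    #     def fast_collate(batch, memory_format):
    #         imgs = [img for img, _ in batch]
    #         targets = torch.tensor([t for _, t in batch], dtype=torch.int64)
    #         w, h = imgs[0].size
    #         tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8) \
    #                       .contiguous(memory_format=memory_format)
    #         for i, img in enumerate(imgs):
    #             arr = np.asarray(img, dtype=np.uint8)
    #             if arr.ndim < 3:
    #                 arr = np.expand_dims(arr, axis=-1)
    #             tensor[i] += torch.from_numpy(np.rollaxis(arr, 2).copy())
    #         return tensor, targets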

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    if torch.distributed.get_rank() == 0:
        start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train_throughput, train_batch_time, train_losses, train_top1, train_top5, train_lr = train(
            train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        val_throughput, val_batch_time, val_losses, val_top1, val_top5 = validate(
            val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        # only allow GPU0 to print training states to prevent double logging
        if torch.distributed.get_rank() == 0:
            is_best = val_top1 > best_prec1
            best_prec1 = max(val_top1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                }, is_best, writer.log_dir)

            # log train and val states to tensorboard
            writer.add_scalar('Throughput/train', train_throughput, epoch + 1)
            writer.add_scalar('Throughput/val', val_throughput, epoch + 1)
            writer.add_scalar('Time/train', train_batch_time, epoch + 1)
            writer.add_scalar('Time/val', val_batch_time, epoch + 1)
            writer.add_scalar('Loss/train', train_losses, epoch + 1)
            writer.add_scalar('Loss/val', val_losses, epoch + 1)
            writer.add_scalar('Top1/train', train_top1, epoch + 1)
            writer.add_scalar('Top1/val', val_top1, epoch + 1)
            writer.add_scalar('Top5/train', train_top5, epoch + 1)
            writer.add_scalar('Top5/val', val_top5, epoch + 1)
            writer.add_scalar('Lr', train_lr, epoch + 1)

    if torch.distributed.get_rank() == 0:
        writer.close()
        time_elapse = time.time() - start_time
        mins, secs = divmod(time_elapse, 60)
        hrs, mins = divmod(mins, 60)
        print(
            '### Training Time: {:.2f} hrs {:.2f} mins {:.2f} secs | {:.2f} secs'
            .format(hrs, mins, secs, time_elapse))
        print('### All Arguments:')
        print(args)
    return
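
The save_checkpoint helper called in the epoch loop above is not part of this excerpt. A minimal sketch, assuming the common pattern of writing the latest state and copying it aside whenever a new best top-1 accuracy is reached:

import os
import shutil
import torch

def save_checkpoint(state, is_best, log_dir, filename='checkpoint.pth.tar'):
    # assumed helper: always persist the latest state, keep a copy of the best one
    path = os.path.join(log_dir, filename)
    torch.save(state, path)
    if is_best:
        shutil.copyfile(path, os.path.join(log_dir, 'model_best.pth.tar'))
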
def train(args, train_dataset, model, tokenizer, criterion):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=args.train_batch_size,
        collate_fn=collate_fn,
        num_workers=args.num_workers,
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_f1, n_no_improve = 0, 0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            labels = batch[5]
            inputs = {
                "input_ids": batch[0],
                "input_modal": batch[2],
                "attention_mask": batch[1],
                "modal_start_tokens": batch[3],
                "modal_end_tokens": batch[4],
            }
            outputs = model(**inputs)
            logits = outputs[0]  # model outputs are always tuple in transformers (see doc)
            loss = criterion(logits, labels)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer, criterion)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{"step": global_step}}))

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    torch.save(model_to_save.state_dict(), os.path.join(output_dir, WEIGHTS_NAME))
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

        if args.local_rank == -1:
            results = evaluate(args, model, tokenizer, criterion)
            if results["micro_f1"] > best_f1:
                best_f1 = results["micro_f1"]
                n_no_improve = 0
            else:
                n_no_improve += 1

            if n_no_improve > args.patience:
                train_iterator.close()
                break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
        # Load image
        img_id = self.images[idx]
        image = Image.open(os.path.join(self.root_dir, img_id + ".jpg"))

        # Apply transform
        if self.transform is not None:
            image = self.transform(image)

        return {'image': image, 'label': self.image_labels[idx]}


data_path = "/home/kevin/deep_learning/OpenImages/"
eval_freq = 50

writer = SummaryWriter(log_dir=os.path.join(data_path, "models"))

transform = transforms.Compose([
    transforms.Resize((299, 299)),  # transforms.Scale is deprecated in favor of Resize
    transforms.Grayscale(3),
    transforms.ToTensor()
])

root_dir = os.path.join(data_path, "pics")
csv_path = os.path.join(data_path, "open_image_labels_formatted.csv")
label_name_path = os.path.join(data_path, "label_names.csv")
dataset = ImageDataset(label_file=csv_path,
                       root_dir=root_dir,
                       label_name_path=label_name_path,
                       transform=transform)
Beispiel #18
0
        dist.init_process_group(backend='nccl', init_method='env://')  # distributed backend
        assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count'
        opt.batch_size = opt.total_batch_size // opt.world_size

    # Hyperparameters
    with open(opt.hyp) as f:
        hyp = yaml.safe_load(f)  # load hyps

    # Train
    logger.info(opt)
    if not opt.evolve:
        tb_writer = None  # init loggers
        if opt.global_rank in [-1, 0]:
            prefix = colorstr('tensorboard: ')
            logger.info(f"{prefix}Start with 'tensorboard --logdir {opt.project}', view at http://localhost:6006/")
            tb_writer = SummaryWriter(opt.save_dir)  # Tensorboard
        train(hyp, opt, device, tb_writer)

    # Evolve hyperparameters (optional)
    else:
        # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
        meta = {'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
                'lrf': (1, 0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
                'momentum': (0.3, 0.6, 0.98),  # SGD momentum/Adam beta1
                'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
                'warmup_epochs': (1, 0.0, 5.0),  # warmup epochs (fractions ok)
                'warmup_momentum': (1, 0.0, 0.95),  # warmup initial momentum
                'warmup_bias_lr': (1, 0.0, 0.2),  # warmup initial bias lr
                'box': (1, 0.02, 0.2),  # box loss gain
                'cls': (1, 0.2, 4.0),  # cls loss gain
                'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
Beispiel #19
0
#dataset = datasets.ImageFolder(root="celeb_dataset", transform=transforms)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

gen = Generator(NOISE_DIM, CHANNELS_IMG, FEATURES_GEN).to(device)
disc = Discriminator(CHANNELS_IMG, FEATURES_DISC).to(device)
initialize_weights(gen)
initialize_weights(disc)
# print(gen)   # print the model
# print(disc)

opt_gen = optim.Adam(gen.parameters(), lr=LEARNING_RATE, betas=(0.5, 0.999))
opt_disc = optim.Adam(disc.parameters(), lr=LEARNING_RATE, betas=(0.5, 0.999))
criterion = nn.BCELoss()

fixed_noise = torch.randn(32, NOISE_DIM, 1, 1).to(device)
writer_real = SummaryWriter(f"logs/real")
writer_fake = SummaryWriter(f"logs/fake")
step = 0

gen.train()
disc.train()

G_losses = []   # for plotting the loss curves
D_losses = []
img_list = []


for epoch in range(NUM_EPOCHS):
    # target labels are not needed (unsupervised)
    for batch_id, (real, _) in enumerate(dataloader):
        real = real.to(device)
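
The excerpt stops just inside the batch loop; in this style of DCGAN script the fixed noise and the two writers are typically consumed a few lines further down, roughly like this (a sketch of the usual continuation, not the original code; assumes torchvision is imported):

        # ... discriminator and generator updates happen here ...
        if batch_id % 100 == 0:
            gen.eval()
            with torch.no_grad():
                fake = gen(fixed_noise)
                img_grid_real = torchvision.utils.make_grid(real[:32], normalize=True)
                img_grid_fake = torchvision.utils.make_grid(fake[:32], normalize=True)
                writer_real.add_image("Real", img_grid_real, global_step=step)
                writer_fake.add_image("Fake", img_grid_fake, global_step=step)
            gen.train()
            step += 1
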
Beispiel #20
0
class TensorboardWriter():
    def __init__(self, args):
        name_model = args.model + "_" + args.dataset_name + "_" + datestr()
        self.writer = SummaryWriter(log_dir=args.tb_log_dir,
                                    comment=name_model)

        # self.step = 0
        # self.mode = ''
        self.csv_train, self.csv_val = self.create_stats_files(args.save)
        self.dataset_name = args.dataset_name
        self.label_names = dict_class_names[args.dataset_name]
        self.data = {
            "train": dict((label, 0.0) for label in self.label_names),
            "val": dict((label, 0.0) for label in self.label_names)
        }
        self.data['train']['loss'] = 0.0
        self.data['val']['loss'] = 0.0
        self.data['train']['count'] = 1
        self.data['val']['count'] = 1

        self.data['train']['dsc'] = 0.0
        self.data['val']['dsc'] = 0.0

        # self.tb_writer_ftns = {
        #     'add_scalar', 'add_scalars', 'add_image', 'add_images', 'add_audio',
        #     'add_text', 'add_histogram', 'add_pr_curve', 'add_embedding'
        # }
        #
        # self.timer = datetime.now()

    def display_terminal(self, iter, epoch, mode='train', summary=False):
        """
        :param iter: iteration or partial epoch
        :param epoch: epoch of training
        :param mode: train or val (for training and validation)
        :param summary: print total statistics at the end of the epoch
        """
        if summary:

            info_print = "\n Epoch {} : {} summary Loss : {}".format(
                epoch, mode,
                self.data[mode]['loss'] / self.data[mode]['count'])

            for i in range(len(self.label_names)):
                info_print += " {} : {}".format(
                    self.label_names[i], self.data[mode][self.label_names[i]] /
                    self.data[mode]['count'])

            print(info_print)
        else:

            info_print = "partial epoch: {} Loss:{}".format(
                iter, self.data[mode]['loss'] / self.data[mode]['count'])

            for i in range(len(self.label_names)):
                info_print += " {} : {}".format(
                    self.label_names[i], self.data[mode][self.label_names[i]] /
                    self.data[mode]['count'])
            print(info_print)

    def create_stats_files(self, path):
        train_f = open(os.path.join(path, 'train.csv'), 'w')
        val_f = open(os.path.join(path, 'val.csv'), 'w')
        return train_f, val_f

    def reset(self, mode):
        self.data[mode]['dsc'] = 0.0
        self.data[mode]['loss'] = 0.0
        self.data[mode]['count'] = 1
        for i in range(len(self.label_names)):
            self.data[mode][self.label_names[i]] = 0.0

    def update_scores(self, iter, loss, channel_score, mode, writer_step):
        """
        :param iter: iteration or partial epoch
        :param loss: any loss torch.tensor.item()
        :param channel_score: per channel score or dice coef
        :param mode: train or val (for training and validation)
        :param writer_step: tensorboard writer step
        """
        ## WARNING ASSUMING THAT CHANNELS IN SAME ORDER AS DICTIONARY  ###########

        dice_coeff = np.mean(channel_score) * 100

        num_channels = len(channel_score)
        self.data[mode]['dsc'] += dice_coeff
        self.data[mode]['loss'] += loss
        self.data[mode]['count'] = iter

        for i in range(num_channels):
            self.data[mode][self.label_names[i]] += channel_score[i]
            if self.writer is not None:
                self.writer.add_scalar(mode + '/' + self.label_names[i],
                                       channel_score[i],
                                       global_step=writer_step)

    def _write_end_of_epoch(self, epoch):

        self.writer.add_scalars(
            'DSC/', {
                'train':
                self.data['train']['dsc'] / self.data['train']['count'],
                'val': self.data['val']['dsc'] / self.data['val']['count'],
            }, epoch)
        self.writer.add_scalars(
            'Loss/', {
                'train':
                self.data['train']['loss'] / self.data['train']['count'],
                'val': self.data['val']['loss'] / self.data['val']['count'],
            }, epoch)
        for i in range(len(self.label_names)):
            self.writer.add_scalars(
                self.label_names[i], {
                    'train':
                    self.data['train'][self.label_names[i]] /
                    self.data['train']['count'],
                    'val':
                    self.data['val'][self.label_names[i]] /
                    self.data['val']['count'],
                }, epoch)
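
Taken together, the class is meant to be driven once per batch via update_scores and once per epoch via display_terminal and _write_end_of_epoch. A usage sketch (args, num_epochs, train_loader and train_step are assumptions, not part of this excerpt):

tb = TensorboardWriter(args)
for epoch in range(num_epochs):
    tb.reset('train')
    for it, batch in enumerate(train_loader, start=1):
        loss_value, per_channel_dice = train_step(batch)  # hypothetical training step
        tb.update_scores(it, loss_value, per_channel_dice, 'train',
                         writer_step=epoch * len(train_loader) + it)
    tb.display_terminal(it, epoch, mode='train', summary=True)
    tb._write_end_of_epoch(epoch)
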
Beispiel #21
0
    def train(self, summary=False):

        # build model
        netG, netD = self.build_model()

        # only summarize the model and return
        if summary:
            self.model_summary(netG, netD)
            return

        # define optimizers
        optimizerG, optimizerD = self.define_optimizers(netG, netD)
        scheduler_G = ExponentialLR(optimizerG, gamma=0.95)
        # create log directory
        self.set_logdir()
        # tensorboard
        writer = SummaryWriter()

        total_d_loss = 0.

        step = 1
        for epoch in range(self.epochs):
            # train
            netD.train()
            netG.train()
            for i, data in enumerate(self.dataloader):
                fake, mat_real, mis_real = data[0], data[1], data[2]

                true_voxel, fake_embedding = fake[0].to(
                    self.device,
                    dtype=torch.float), fake[1].to(self.device,
                                                   dtype=torch.float)

                mat_voxel, mat_embedding = mat_real[0].to(
                    self.device,
                    dtype=torch.float), mat_real[1].to(self.device,
                                                       dtype=torch.float)

                mis_voxel, mis_embedding = mis_real[0].to(
                    self.device,
                    dtype=torch.float), mis_real[1].to(self.device,
                                                       dtype=torch.float)

                # train Discriminator
                netD.zero_grad()

                # get G output
                noise = torch.from_numpy(
                    np.random.uniform(
                        low=-self.noise_unif_abs_max,
                        high=self.noise_unif_abs_max,
                        size=[mat_embedding.size(0),
                              self.noise_size])).to(self.device,
                                                    dtype=torch.float)

                fake_voxel = netG(noise, fake_embedding)

                _, fake_logits = netD(fake_voxel.detach(), fake_embedding)
                _, mat_real_logits = netD(mat_voxel, mat_embedding)
                _, mis_real_logits = netD(mis_voxel, mis_embedding)
                d_loss = losses.wasserstein_loss(
                    fake_logits, 'dis_fake') + 2.0 * losses.wasserstein_loss(
                        mat_real_logits, 'dis_real') + losses.wasserstein_loss(
                            mis_real_logits, 'dis_fake')
                d_gp = losses.gradient_penalty(fake_voxel, mat_voxel,
                                               fake_embedding, mat_embedding,
                                               netD, self.device)
                d_loss += d_gp

                total_d_loss = total_d_loss + d_loss.item()
                d_loss.backward()
                optimizerD.step()

                # train G
                if step % self.dstep == 0:
                    #                     print('g time', step)
                    netG.zero_grad()
                    _, fake_logits = netD(fake_voxel, fake_embedding)
                    g_loss = losses.wasserstein_loss(fake_logits, 'gen')

                    g_loss.backward()
                    optimizerG.step()

                    # tensorboard
                    writer.add_scalar('d_loss/train', d_loss.item(),
                                      step // self.dstep)
                    writer.add_scalar('d_gp/train', d_gp.item(),
                                      step // self.dstep)
                    writer.add_scalar('g_loss/train', g_loss.item(),
                                      step // self.dstep)

                if step % (self.decay_step * self.dstep) == 0:
                    print('decaying g_lr')
                    scheduler_G.step()

                if step % (self.print_step * self.dstep) == 0:
                    print('global step:', step)
                    print(f'train--->step:{step//self.dstep}')
                    print(f'd_loss:{d_loss.item()}')
                    print(f'd_gp:{d_gp.item()}')
                    print(f'g_loss:{g_loss.item()}')

                # checkpoint
                if step % (self.cpkt_step * self.dstep) == 0:
                    self.save_cpkt(epoch, step, optimizerG, optimizerD, netG,
                                   netD, d_loss, g_loss)

                # generated shape save
                if step % (self.save_voxel_step * self.dstep) == 0:
                    #                     print(fake_voxel.size())
                    self.save_voxel(fake_voxel, true_voxel, step // self.dstep)

                step = step + 1
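
losses.wasserstein_loss is called above with three mode strings but is not shown here. Assuming the usual WGAN formulation, a sketch consistent with those modes (the real helper may differ):

def wasserstein_loss(logits, mode):
    # assumed WGAN-style critic/generator objectives
    if mode == 'dis_real':
        return -logits.mean()  # critic pushes real scores up
    if mode == 'dis_fake':
        return logits.mean()   # critic pushes fake scores down
    if mode == 'gen':
        return -logits.mean()  # generator pushes fake scores up
    raise ValueError("unknown mode: " + mode)
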
Beispiel #22
0
        opt.global_rank = dist.get_rank()
        assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count'
        opt.batch_size = opt.total_batch_size // opt.world_size

    print(opt)
    with open(opt.hyp) as f:
        hyp = yaml.load(f, Loader=yaml.FullLoader)  # load hyps

    # Train
    if not opt.evolve:
        tb_writer = None
        if opt.global_rank in [-1, 0]:
            print(
                'Start Tensorboard with "tensorboard --logdir %s", view at http://localhost:6006/'
                % opt.logdir)
            tb_writer = SummaryWriter(log_dir=increment_dir(
                Path(opt.logdir) / 'exp', opt.name))  # runs/exp

        train(hyp, opt, device, tb_writer, wandb)

    # Evolve hyperparameters (optional)
    else:
        # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
        meta = {
            'lr0':
            (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
            'momentum': (0.1, 0.6, 0.98),  # SGD momentum/Adam beta1
            'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
            'giou': (1, 0.02, 0.2),  # GIoU loss gain
            'cls': (1, 0.2, 4.0),  # cls loss gain
            'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
            'obj': (1, 0.2, 4.0),  # obj loss gain (scale with pixels)
Beispiel #23
0
        x = self.classifier(x)
        if self.training:
            return [x, out1, out2]
        else:
            return x


if __name__ == '__main__':
    # Temporary define data and target
    batch_size = 5
    x = torch.randn((batch_size, 3, 224, 224))
    y = torch.randint(0, 1000, (batch_size, ))
    num_classes = 1000

    # Add to graph in tensorboard
    writer = SummaryWriter(log_dir='logs/googlenet')
    m = MyGoogleNet()
    # print(m)
    # we have x,o1,o2 = m(x)
    # m(x)[0] means x; m(x)[1] means o1; m(x)[2] means o2
    # o1 and o2 are output from auxclassifier
    print(m(x)[0].shape)
    m.eval()
    print(m.training)
    writer.add_graph(m, x)
    writer.close()

    # Notice here! When you are going to train your network,
    # put these loss values into the training step of your model.
    m.train()
    loss = nn.CrossEntropyLoss()
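
The standard GoogLeNet recipe combines the main output with the two auxiliary outputs during training, usually weighting each auxiliary loss by 0.3. A sketch of such a training step (the 0.3 weights are the commonly used values, not taken from this file):

    x_out, o1, o2 = m(x)  # in train mode the model returns all three outputs
    total_loss = loss(x_out, y) + 0.3 * loss(o1, y) + 0.3 * loss(o2, y)
    total_loss.backward()  # followed by an optimizer step in a real loop
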
    def train(self,
              epochs=100,
              learningRate=0.005,
              dataset="Coco",
              useDatabase=True,
              printUpdateEvery=40,
              visualize=False,
              tensorboard=False):
        self._training = True
        self._initTraining(learningRate, dataset, useDatabase)

        # Deal with tensorboard
        if tensorboard or isinstance(tensorboard, str):
            from torch.utils.tensorboard import SummaryWriter

            if isinstance(tensorboard, str):
                writer = SummaryWriter("./data/tensorboard/" + tensorboard)
            else:
                writer = SummaryWriter("./data/tensorboard/")
            tensorboard = True

        def findBestROI(ROIs, label):
            bestMatch = 0
            bestIndex = -1
            for i, ROI in enumerate(ROIs):
                lbox = np.array(label["bbox"])
                larea = lbox[2:] - lbox[:2]
                larea = larea[0] * larea[1]
                rbox = ROI.bounds
                rarea = rbox[2:] - rbox[:2]
                rarea = rarea[0] * rarea[1]

                SI = np.maximum(0, np.minimum(lbox[2], rbox[2]) - np.maximum(lbox[0], rbox[0])) * \
                     np.maximum(0, np.minimum(lbox[3], rbox[3]) - np.maximum(lbox[1], rbox[1]))
                SU = larea + rarea - SI
                overlap = SI / SU
                if bestMatch < overlap and SU != 0:
                    bestMatch = overlap
                    bestIndex = i
            return bestIndex

        Iterations = len(self.dataset)

        print("Starting training")
        for epoch in range(epochs):
            epochLoss = np.float64(0)
            for i in range(Iterations):
                ROIs, peopleTextures, labels = self._load(i)

                # Figure out what ROI belongs to what label
                groundtruth = np.zeros((len(ROIs), 14), dtype=np.float32)
                for label in labels:
                    mostMatching = findBestROI(ROIs, label)
                    if mostMatching != -1:
                        groundtruth[mostMatching][label["category_id"]] = 1

                # Most items in this dataset will be bypassed because no people were found or overlapping with gt
                if len(ROIs) == 0 or not np.any(groundtruth != 0):
                    continue

                groundtruth = torch.from_numpy(groundtruth).to(device)

                # Apply noise to peopleTextures
                noise = np.random.randn(*peopleTextures.shape) * 5
                b = peopleTextures.astype(np.int32)
                peopleTextures = peopleTextures.astype(
                    np.int32) + noise.astype(np.int32)
                peopleTextures = np.clip(peopleTextures, 0, 255)
                peopleTextures = peopleTextures.astype(np.uint8)

                peopleTextures = torch.Tensor(peopleTextures).to(device)
                predictions = self.classifier.forward(peopleTextures)
                print(groundtruth)
                print(predictions)
                print("\n")

                lossSize = self.lossFunction(predictions, groundtruth)
                lossSize.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()
                lossSize = lossSize.cpu().item()

                epochLoss += lossSize / Iterations
                if (i - 1) % printUpdateEvery == 0:
                    print("Iteration {} / {}, epoch {} / {}".format(
                        i, Iterations, epoch, epochs))
                    print("Loss size: {}\n".format(lossSize /
                                                   printUpdateEvery))

                if tensorboard:
                    absI = i + epoch * Iterations
                    writer.add_scalar("Loss size", lossSize, absI)

                # Show visualization
                if visualize:
                    pass  # TODO
                    """
                    image = self.renderDebug(image)
                    plt.ion()
                    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
                    plt.draw()
                    plt.pause(4)
                    """

            print("Finished epoch {} / {}. Loss size:".format(
                epoch, epochs, epochLoss))
            self.saveModel(self.modelPath)

        self._training = False
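
The ROI-to-label matching above boils down to intersection-over-union (IoU) between two axis-aligned boxes in (x1, y1, x2, y2) form. A minimal, self-contained sketch of that computation (a standalone illustration with a hypothetical iou helper, not part of the class above):

import numpy as np

def iou(box_a, box_b):
    """Intersection-over-union of two boxes given as [x1, y1, x2, y2]."""
    box_a = np.asarray(box_a, dtype=float)
    box_b = np.asarray(box_b, dtype=float)
    inter_w = max(0.0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0]))
    inter_h = max(0.0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1]))
    inter = inter_w * inter_h
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0

# Example: iou([0, 0, 2, 2], [1, 1, 3, 3]) == 1 / 7, roughly 0.143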
Beispiel #25
0
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    # Added here for reproducibility
    set_seed(args)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)
            # model outputs are always a tuple in transformers (see docs)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                # Save model checkpoint
                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(
                        model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
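
The schedule built above with get_linear_schedule_with_warmup ramps the learning rate linearly from 0 to its base value over the warmup steps and then decays it linearly back to 0 at t_total. A rough stand-in using a plain LambdaLR (a sketch of that behaviour, not the transformers implementation):

import torch

def linear_warmup_then_decay(optimizer, num_warmup_steps, num_training_steps):
    """LR multiplier: step / warmup during warmup, then linear decay to 0 at num_training_steps."""
    def lr_lambda(step):
        if step < num_warmup_steps:
            return step / max(1, num_warmup_steps)
        remaining = num_training_steps - step
        return max(0.0, remaining / max(1, num_training_steps - num_warmup_steps))
    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

# Example: AdamW with decoupled weight decay and a 100-step warmup over 1000 total steps.
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)
scheduler = linear_warmup_then_decay(optimizer, num_warmup_steps=100, num_training_steps=1000)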
Beispiel #26
0
def main():
    # Load dataset
    print('Loading dataset ...\n')
    dataset_train = Dataset(train=True)
    dataset_val = Dataset(train=False)
    loader_train = DataLoader(dataset=dataset_train,
                              num_workers=0,
                              batch_size=opt.batchSize,
                              shuffle=True)
    print("# of training samples: %d\n" % int(len(dataset_train)))
    # Build model
    # net = DnCNN(channels=1, num_of_layers=opt.num_of_layers)
    net = NoiseNetwork(4, 4, True)
    # net.apply(weights_init_kaiming)
    criterion = nn.MSELoss(reduction='sum')  # equivalent to the deprecated size_average=False
    # Move to GPU
    device_ids = [0]
    model = nn.DataParallel(net, device_ids=device_ids).cuda()
    criterion.cuda()
    # Optimizer
    optimizer = optim.AdamW(model.parameters(), lr=opt.lr)
    # training
    writer = SummaryWriter(opt.outf)
    step = 0
    noiseL_B = [0, 55]  # ignored when opt.mode == 'S'
    ep = []
    ps = []
    for epoch in range(opt.epochs):
        if epoch < opt.milestone:
            current_lr = opt.lr
        else:
            current_lr = opt.lr / 10.
        # set learning rate
        for param_group in optimizer.param_groups:
            param_group["lr"] = current_lr
        print('learning rate %f' % current_lr)
        # train
        for i, data in enumerate(loader_train, 0):
            # training step
            model.train()
            model.zero_grad()
            optimizer.zero_grad()
            img_train = data
            if opt.mode == 'S':
                noise = torch.FloatTensor(img_train.size()).normal_(
                    mean=0, std=opt.noiseL / 255.)
            if opt.mode == 'B':
                noise = torch.zeros(img_train.size())
                stdN = np.random.uniform(noiseL_B[0],
                                         noiseL_B[1],
                                         size=noise.size()[0])
                for n in range(noise.size()[0]):
                    sizeN = noise[0, :, :, :].size()
                    noise[n, :, :, :] = torch.FloatTensor(sizeN).normal_(
                        mean=0, std=stdN[n] / 255.)
            imgn_train = img_train + noise
            # ================dwt================
            imgn_train = tensor_dwt(imgn_train)
            img_train = tensor_dwt(img_train)
            noise = tensor_dwt(noise)
            # ====================================
            img_train, imgn_train = Variable(img_train.cuda()), Variable(
                imgn_train.cuda())
            noise = Variable(noise.cuda())
            # out_train = torch.clamp(model(imgn_train), 0., 1.)
            out_train = model(imgn_train)
            # out_train = torch.sigmoid(model(imgn_train))
            loss = criterion(out_train, imgn_train) / (imgn_train.size()[0]**2)
            loss.backward()
            #torch.nn.utils.clip_grad_value_(model.parameters(), 0.5)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1, norm_type=2)
            optimizer.step()
            # results
            model.eval()
            out_train = model(imgn_train)
            out_train = tensor_idwt(out_train)
            out_train = torch.clamp(out_train, 0., 1.)
            img_train = tensor_idwt(img_train)
            # out_train = torch.sigmoid(model(imgn_train))
            psnr_train = batch_PSNR(out_train, img_train, 1.)
            print(
                "[epoch %d][%d/%d] loss: %.4f PSNR_train: %.4f" %
                (epoch + 1, i + 1, len(loader_train), loss.item(), psnr_train))
            # if you are using older version of PyTorch, you may need to change loss.item() to loss.data[0]
            if step % 10 == 0:
                # Log the scalar values
                writer.add_scalar('loss', loss.item(), step)
                writer.add_scalar('PSNR on training data', psnr_train, step)
            step += 1
        ## the end of each epoch
        model.eval()
        # validate
        psnr_val = 0
        with torch.no_grad():
            for k in range(len(dataset_val)):
                img_val = torch.unsqueeze(dataset_val[k], 0)
                noise = torch.FloatTensor(img_val.size()).normal_(
                    mean=0, std=opt.val_noiseL / 255.)
                imgn_val = img_val + noise
                #========================dwt=======================
                imgn_val = tensor_dwt(imgn_val)
                img_val = tensor_dwt(img_val)
                noise = tensor_dwt(noise)
                # ========================dwt=======================
                img_val, imgn_val = Variable(img_val.cuda()), Variable(
                    imgn_val.cuda())
                out_val = model(imgn_val)
                out_val = tensor_idwt(out_val)
                out_val = torch.clamp(out_val, 0., 1.)
                img_val = tensor_idwt(img_val)
                # out_val = torch.sigmoid(model(imgn_val))
                psnr_val += batch_PSNR(out_val, img_val, 1.)
        psnr_val /= len(dataset_val)
        print("\n[epoch %d] PSNR_val: %.4f" % (epoch + 1, psnr_val))
        ep.append(epoch + 1)
        ps.append(psnr_val)
        writer.add_scalar('PSNR on validation data', psnr_val, epoch)
        # log the images
        out_train = torch.clamp(model(imgn_train), 0., 1.)
        # out_train = torch.sigmoid(model(imgn_train))
        Img = utils.make_grid(img_train.data,
                              nrow=8,
                              normalize=True,
                              scale_each=True)
        Imgn = utils.make_grid(imgn_train.data,
                               nrow=8,
                               normalize=True,
                               scale_each=True)
        Irecon = utils.make_grid(out_train.data,
                                 nrow=8,
                                 normalize=True,
                                 scale_each=True)
        writer.add_image('clean image', Img, epoch)
        writer.add_image('noisy image', Imgn, epoch)
        writer.add_image('reconstructed image', Irecon, epoch)
        # save model
        name = "net_epoch%d_PSNR%.4f.pth" % (epoch + 1, psnr_val)
        torch.save(model.state_dict(), os.path.join(opt.outf, name))
    # save chart
    plt.plot(ep, ps)
    plt.gca().xaxis.set_major_locator(mticker.MultipleLocator(2))
    plt.xlabel('epoch')
    plt.ylabel('PSNR')
    plt.title("PSNR values during training")
    plt.savefig('./psnr_val.jpg')
    plt.show()
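
batch_PSNR above is a project-specific helper that is not shown in this snippet. PSNR against a clean target with data range R is 10 * log10(R^2 / MSE); a small sketch of how such a helper might be written (an assumption, not the project's implementation):

import torch

def batch_psnr(output, target, data_range=1.0):
    """Average PSNR in dB over a batch of images shaped (N, C, H, W)."""
    mse = ((output - target) ** 2).flatten(1).mean(dim=1)  # per-image mean squared error
    psnr = 10.0 * torch.log10(data_range ** 2 / mse.clamp_min(1e-12))
    return psnr.mean().item()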
Beispiel #27
0
        (3 - len(opt.img_size)))  # extend to 3 sizes (min, max, test)
    device = torch_utils.select_device(opt.device,
                                       apex=mixed_precision,
                                       batch_size=opt.batch_size)
    if device.type == 'cpu':
        mixed_precision = False

    # scale hyp['obj'] by img_size (evolved at 320)
    # hyp['obj'] *= opt.img_size[0] / 320.

    tb_writer = None
    if not opt.evolve:  # Train normally
        print(
            'Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/'
        )
        tb_writer = SummaryWriter(comment=opt.name)
        train(hyp)  # train normally

    else:  # Evolve hyperparameters (optional)
        opt.notest, opt.nosave = True, True  # only test/save final epoch
        if opt.bucket:
            os.system('gsutil cp gs://%s/evolve.txt .' %
                      opt.bucket)  # download evolve.txt if exists

        for _ in range(1):  # generations to evolve
            if os.path.exists(
                    'evolve.txt'
            ):  # if evolve.txt exists: select best hyps and mutate
                # Select parent(s)
                parent = 'single'  # parent selection method: 'single' or 'weighted'
                x = np.loadtxt('evolve.txt', ndmin=2)
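
The evolution branch above loads previous runs from evolve.txt and mutates the hyperparameters of one or more parents; the snippet is cut off before the selection logic. A rough sketch of what 'single' versus 'weighted' parent selection could look like (an assumption based on the comments above, with fitness assumed to be in the first column of each row):

import numpy as np

def select_parent(results, method="single", top_n=5):
    """results: rows of [fitness, hyp_1, ..., hyp_k] as loaded from evolve.txt."""
    results = results[np.argsort(-results[:, 0])][:top_n]  # keep the top_n fittest rows
    if method == "single":
        return results[0, 1:]  # hyperparameters of the single best row
    # 'weighted': fitness-weighted average of the top_n rows
    w = results[:, 0] - results[:, 0].min() + 1e-6
    return (results[:, 1:] * w[:, None]).sum(axis=0) / w.sum()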
Beispiel #28
0
    def train(
        self,
        base_path: Union[Path, str],
        learning_rate: float = 0.1,
        mini_batch_size: int = 32,
        mini_batch_chunk_size: int = None,
        max_epochs: int = 100,
        scheduler=AnnealOnPlateau,
        anneal_factor: float = 0.5,
        patience: int = 3,
        initial_extra_patience=0,
        min_learning_rate: float = 0.0001,
        train_with_dev: bool = False,
        monitor_train: bool = False,
        monitor_test: bool = False,
        embeddings_storage_mode: str = "cpu",
        checkpoint: bool = False,
        save_final_model: bool = True,
        anneal_with_restarts: bool = False,
        anneal_with_prestarts: bool = False,
        batch_growth_annealing: bool = False,
        shuffle: bool = True,
        param_selection_mode: bool = False,
        num_workers: int = 6,
        sampler=None,
        use_amp: bool = False,
        amp_opt_level: str = "O1",
        eval_on_train_fraction=0.0,
        eval_on_train_shuffle=False,
        **kwargs,
    ) -> dict:
        """
        Trains any class that implements the flair.nn.Model interface.
        :param base_path: Main path to which all output during training is logged and models are saved
        :param learning_rate: Initial learning rate
        :param mini_batch_size: Size of mini-batches during training
        :param mini_batch_chunk_size: If mini-batches are larger than this number, they get broken down into chunks of this size for processing purposes
        :param max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed.
        :param anneal_factor: The factor by which the learning rate is annealed
        :param patience: Patience is the number of epochs with no improvement the Trainer waits
         until annealing the learning rate
        :param min_learning_rate: If the learning rate falls below this threshold, training terminates
        :param train_with_dev: If True, training is performed using both train+dev data
        :param monitor_train: If True, training data is evaluated at end of each epoch
        :param monitor_test: If True, test data is evaluated at end of each epoch
        :param embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed),
        'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU)
        :param checkpoint: If True, a full checkpoint is saved at end of each epoch
        :param save_final_model: If True, final model is saved
        :param anneal_with_restarts: If True, the last best model is restored when annealing the learning rate
        :param shuffle: If True, data is shuffled during training
        :param param_selection_mode: If True, testing is performed against dev data. Use this mode when doing
        parameter selection.
        :param num_workers: Number of workers in your data loader.
        :param sampler: You can pass a data sampler here for special sampling of data.
        :param eval_on_train_fraction: the fraction of train data to evaluate on;
        if 0. no evaluation is performed on a fraction of the training data,
        if 'dev' the fraction size is taken from the dev set size
        :param eval_on_train_shuffle: if True, the train data fraction is re-sampled at the beginning
        of each epoch, otherwise it is determined once at the start of training and kept fixed
        :param kwargs: Other arguments for the Optimizer
        :return:
        """

        if self.use_tensorboard:
            try:
                from torch.utils.tensorboard import SummaryWriter

                writer = SummaryWriter()
            except:
                log_line(log)
                log.warning(
                    "ATTENTION! PyTorch >= 1.1.0 and pillow are required for TensorBoard support!"
                )
                log_line(log)
                self.use_tensorboard = False
                pass

        if use_amp:
            if sys.version_info < (3, 0):
                raise RuntimeError(
                    "Apex currently only supports Python 3. Aborting.")
            if amp is None:
                raise RuntimeError(
                    "Failed to import apex. Please install apex from https://www.github.com/nvidia/apex "
                    "to enable mixed-precision training.")

        if mini_batch_chunk_size is None:
            mini_batch_chunk_size = mini_batch_size
        if learning_rate < min_learning_rate:
            min_learning_rate = learning_rate / 10

        initial_learning_rate = learning_rate

        # cast string to Path
        if type(base_path) is str:
            base_path = Path(base_path)

        log_handler = add_file_handler(log, base_path / "training.log")

        log_line(log)
        log.info(f'Model: "{self.model}"')
        log_line(log)
        log.info(f'Corpus: "{self.corpus}"')
        log_line(log)
        log.info("Parameters:")
        log.info(f' - learning_rate: "{learning_rate}"')
        log.info(f' - mini_batch_size: "{mini_batch_size}"')
        log.info(f' - patience: "{patience}"')
        log.info(f' - anneal_factor: "{anneal_factor}"')
        log.info(f' - max_epochs: "{max_epochs}"')
        log.info(f' - shuffle: "{shuffle}"')
        log.info(f' - train_with_dev: "{train_with_dev}"')
        log.info(f' - batch_growth_annealing: "{batch_growth_annealing}"')
        log_line(log)
        log.info(f'Model training base path: "{base_path}"')
        log_line(log)
        log.info(f"Device: {flair.device}")
        log_line(log)
        log.info(f"Embeddings storage mode: {embeddings_storage_mode}")
        if isinstance(self.model, SequenceTagger
                      ) and self.model.weight_dict and self.model.use_crf:
            log_line(log)
            log.warning(
                f'WARNING: Specified class weights will not take effect when using CRF'
            )

        # determine what splits (train, dev, test) to evaluate and log
        log_train = True if monitor_train else False
        log_test = (True if (not param_selection_mode and self.corpus.test
                             and monitor_test) else False)
        log_dev = False
        log_train_part = (True if (eval_on_train_fraction == "dev"
                                   or eval_on_train_fraction > 0.0) else False)

        if log_train_part:
            train_part_size = (len(
                self.corpus.dev) if eval_on_train_fraction == "dev" else int(
                    len(self.corpus.train) * eval_on_train_fraction))
            assert train_part_size > 0
            if not eval_on_train_shuffle:
                train_part_indices = list(range(train_part_size))
                train_part = torch.utils.data.dataset.Subset(
                    self.corpus.train, train_part_indices)

        # prepare loss logging file and set up header
        loss_txt = init_output_file(base_path, "loss.tsv")

        weight_extractor = WeightExtractor(base_path)

        optimizer: torch.optim.Optimizer = self.optimizer(
            self.model.parameters(), lr=learning_rate, **kwargs)

        if use_amp:
            self.model, optimizer = amp.initialize(self.model,
                                                   optimizer,
                                                   opt_level=amp_opt_level)

        # minimize training loss if training with dev data, else maximize dev score
        anneal_mode = "min" if train_with_dev else "max"

        lr_scheduler = scheduler(
            optimizer,
            factor=anneal_factor,
            patience=patience,
            initial_extra_patience=initial_extra_patience,
            mode=anneal_mode,
            verbose=True,
        )

        train_data = self.corpus.train

        # if training also uses dev data, include in training set
        if train_with_dev:
            train_data = ConcatDataset([self.corpus.train, self.corpus.dev])

        # initialize sampler if provided
        if sampler is not None:
            # init with default values if only class is provided
            if inspect.isclass(sampler):
                sampler = sampler()
            # set dataset to sample from
            sampler.set_dataset(train_data)
            shuffle = False

        dev_score_history = []
        dev_loss_history = []
        train_loss_history = []

        micro_batch_size = mini_batch_chunk_size

        # At any point you can hit Ctrl + C to break out of training early.
        try:
            previous_learning_rate = learning_rate

            for self.epoch in range(self.epoch + 1, max_epochs + 1):
                log_line(log)

                if anneal_with_prestarts:
                    last_epoch_model_state_dict = copy.deepcopy(
                        self.model.state_dict())

                if eval_on_train_shuffle:
                    train_part_indices = list(range(len(self.corpus.train)))
                    random.shuffle(train_part_indices)
                    train_part_indices = train_part_indices[:train_part_size]
                    train_part = torch.utils.data.dataset.Subset(
                        self.corpus.train, train_part_indices)

                # get new learning rate
                for group in optimizer.param_groups:
                    learning_rate = group["lr"]

                if learning_rate != previous_learning_rate and batch_growth_annealing:
                    mini_batch_size *= 2

                # reload last best model if annealing with restarts is enabled
                if ((anneal_with_restarts or anneal_with_prestarts)
                        and learning_rate != previous_learning_rate
                        and (base_path / "best-model.pt").exists()):
                    if anneal_with_restarts:
                        log.info("resetting to best model")
                        self.model.load_state_dict(
                            self.model.load(base_path /
                                            "best-model.pt").state_dict())
                    if anneal_with_prestarts:
                        log.info("resetting to pre-best model")
                        self.model.load_state_dict(
                            self.model.load(base_path /
                                            "pre-best-model.pt").state_dict())

                previous_learning_rate = learning_rate

                # stop training if learning rate becomes too small
                if learning_rate < min_learning_rate:
                    log_line(log)
                    log.info("learning rate too small - quitting training!")
                    log_line(log)
                    break

                batch_loader = DataLoader(
                    train_data,
                    batch_size=mini_batch_size,
                    shuffle=shuffle,
                    num_workers=num_workers,
                    sampler=sampler,
                )

                self.model.train()

                train_loss: float = 0

                seen_batches = 0
                total_number_of_batches = len(batch_loader)

                modulo = max(1, int(total_number_of_batches / 10))

                # process mini-batches
                batch_time = 0
                for batch_no, batch in enumerate(batch_loader):
                    start_time = time.time()

                    # zero the gradients on the model and optimizer
                    self.model.zero_grad()
                    optimizer.zero_grad()

                    # if necessary, make batch_steps
                    batch_steps = [batch]
                    if len(batch) > micro_batch_size:
                        batch_steps = [
                            batch[x:x + micro_batch_size]
                            for x in range(0, len(batch), micro_batch_size)
                        ]

                    # forward and backward for batch
                    for batch_step in batch_steps:

                        # forward pass
                        loss = self.model.forward_loss(batch_step)

                        # Backward
                        if use_amp:
                            with amp.scale_loss(loss,
                                                optimizer) as scaled_loss:
                                scaled_loss.backward()
                        else:
                            loss.backward()

                    # do the optimizer step
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   5.0)
                    optimizer.step()

                    seen_batches += 1
                    train_loss += loss.item()

                    # depending on memory mode, embeddings are moved to CPU, GPU or deleted
                    store_embeddings(batch, embeddings_storage_mode)

                    batch_time += time.time() - start_time
                    if seen_batches % modulo == 0:
                        log.info(
                            f"epoch {self.epoch} - iter {seen_batches}/{total_number_of_batches} - loss "
                            f"{train_loss / seen_batches:.8f} - samples/sec: {mini_batch_size * modulo / batch_time:.2f}"
                        )
                        batch_time = 0
                        iteration = self.epoch * total_number_of_batches + batch_no
                        if not param_selection_mode:
                            weight_extractor.extract_weights(
                                self.model.state_dict(), iteration)

                train_loss /= seen_batches

                self.model.eval()

                log_line(log)
                log.info(
                    f"EPOCH {self.epoch} done: loss {train_loss:.4f} - lr {learning_rate:.7f}"
                )

                if self.use_tensorboard:
                    writer.add_scalar("train_loss", train_loss, self.epoch)

                # anneal against train loss if training with dev, otherwise anneal against dev score
                current_score = train_loss

                # evaluate on train / dev / test split depending on training settings
                result_line: str = ""

                if log_train:
                    train_eval_result, train_loss = self.model.evaluate(
                        self.corpus.train,
                        mini_batch_size=mini_batch_chunk_size,
                        num_workers=num_workers,
                        embedding_storage_mode=embeddings_storage_mode,
                    )
                    result_line += f"\t{train_eval_result.log_line}"

                    # depending on memory mode, embeddings are moved to CPU, GPU or deleted
                    store_embeddings(self.corpus.train,
                                     embeddings_storage_mode)

                if log_train_part:
                    train_part_eval_result, train_part_loss = self.model.evaluate(
                        train_part,
                        mini_batch_size=mini_batch_chunk_size,
                        num_workers=num_workers,
                        embedding_storage_mode=embeddings_storage_mode,
                    )
                    result_line += (
                        f"\t{train_part_loss}\t{train_part_eval_result.log_line}"
                    )
                    log.info(
                        f"TRAIN_SPLIT : loss {train_part_loss} - score {round(train_part_eval_result.main_score, 4)}"
                    )

                if log_dev:
                    dev_eval_result, dev_loss = self.model.evaluate(
                        self.corpus.dev,
                        mini_batch_size=mini_batch_chunk_size,
                        num_workers=num_workers,
                        embedding_storage_mode=embeddings_storage_mode,
                    )
                    result_line += f"\t{dev_loss}\t{dev_eval_result.log_line}"
                    log.info(
                        f"DEV : loss {dev_loss} - score {round(dev_eval_result.main_score, 4)}"
                    )
                    # calculate scores using dev data if available
                    # append dev score to score history
                    dev_score_history.append(dev_eval_result.main_score)
                    dev_loss_history.append(dev_loss.item())

                    current_score = dev_eval_result.main_score

                    # depending on memory mode, embeddings are moved to CPU, GPU or deleted
                    store_embeddings(self.corpus.dev, embeddings_storage_mode)

                    if self.use_tensorboard:
                        writer.add_scalar("dev_loss", dev_loss, self.epoch)
                        writer.add_scalar("dev_score",
                                          dev_eval_result.main_score,
                                          self.epoch)

                if log_test:
                    test_eval_result, test_loss = self.model.evaluate(
                        self.corpus.test,
                        mini_batch_size=mini_batch_chunk_size,
                        num_workers=num_workers,
                        out_path=base_path / "test.tsv",
                        embedding_storage_mode=embeddings_storage_mode,
                    )
                    result_line += f"\t{test_loss}\t{test_eval_result.log_line}"
                    log.info(
                        f"TEST : loss {test_loss} - score {round(test_eval_result.main_score, 4)}"
                    )

                    # depending on memory mode, embeddings are moved to CPU, GPU or deleted
                    store_embeddings(self.corpus.test, embeddings_storage_mode)

                    if self.use_tensorboard:
                        writer.add_scalar("test_loss", test_loss, self.epoch)
                        writer.add_scalar("test_score",
                                          test_eval_result.main_score,
                                          self.epoch)

                # determine learning rate annealing through scheduler. Use auxiliary metric for AnnealOnPlateau
                #if not train_with_dev and isinstance(lr_scheduler, AnnealOnPlateau):
                if False:
                    lr_scheduler.step(current_score, dev_loss)
                else:
                    lr_scheduler.step(current_score)

                train_loss_history.append(train_loss)

                # determine bad epoch number
                try:
                    bad_epochs = lr_scheduler.num_bad_epochs
                except:
                    bad_epochs = 0
                for group in optimizer.param_groups:
                    new_learning_rate = group["lr"]
                if new_learning_rate != previous_learning_rate:
                    bad_epochs = patience + 1
                    if previous_learning_rate == initial_learning_rate:
                        bad_epochs += initial_extra_patience

                # log bad epochs
                log.info(f"BAD EPOCHS (no improvement): {bad_epochs}")

                # output log file
                with open(loss_txt, "a") as f:

                    # make headers on first epoch
                    if self.epoch == 1:
                        f.write(
                            f"EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS"
                        )

                        if log_train:
                            f.write("\tTRAIN_" + "\tTRAIN_".join(
                                train_eval_result.log_header.split("\t")))
                        if log_train_part:
                            f.write("\tTRAIN_PART_LOSS\tTRAIN_PART_" +
                                    "\tTRAIN_PART_".join(
                                        train_part_eval_result.log_header.
                                        split("\t")))
                        if log_dev:
                            f.write("\tDEV_LOSS\tDEV_" + "\tDEV_".join(
                                dev_eval_result.log_header.split("\t")))
                        if log_test:
                            f.write("\tTEST_LOSS\tTEST_" + "\tTEST_".join(
                                test_eval_result.log_header.split("\t")))

                    f.write(
                        f"\n{self.epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t{train_loss}"
                    )
                    f.write(result_line)

                # if checkpoint is enabled, save model at each epoch
                if checkpoint and not param_selection_mode:
                    self.save_checkpoint(base_path / "checkpoint.pt")

                # if we use dev data, remember best model based on dev evaluation score
                if ((not train_with_dev or anneal_with_restarts
                     or anneal_with_prestarts) and not param_selection_mode
                        and current_score == lr_scheduler.best
                        and bad_epochs == 0):
                    print("saving best model")
                    self.model.save(base_path / "best-model.pt")

                    if anneal_with_prestarts:
                        current_state_dict = self.model.state_dict()
                        self.model.load_state_dict(last_epoch_model_state_dict)
                        self.model.save(base_path / "pre-best-model.pt")
                        self.model.load_state_dict(current_state_dict)

            # if we do not use dev data for model selection, save final model
            if save_final_model and not param_selection_mode:
                self.model.save(base_path / "final-model.pt")

        except KeyboardInterrupt:
            log_line(log)
            log.info("Exiting from training early.")

            if self.use_tensorboard:
                writer.close()

            if not param_selection_mode:
                log.info("Saving model ...")
                self.model.save(base_path / "final-model.pt")
                log.info("Done.")

        # test best model if test data is present
        #if self.corpus.test:
        if True:
            final_score = self.final_test(base_path, mini_batch_chunk_size,
                                          num_workers)
        else:
            final_score = 0
            from flair.data import Sentence
            sentence: Sentence = Sentence(
                "George Washington went to Washington .")
            self.model.predict(sentence)

            print("Analysing %s" % sentence)
            print("\nThe following NER tags are found: \n")
            print(sentence)
            print(sentence.to_tagged_string())
            log.info("Test data not provided setting final score to 0")

        log.removeHandler(log_handler)

        if self.use_tensorboard:
            writer.close()

        return {
            "test_score": final_score,
            "dev_score_history": dev_score_history,
            "train_loss_history": train_loss_history,
            "dev_loss_history": dev_loss_history,
        }
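
One detail of the training loop above worth isolating: when a mini-batch is larger than mini_batch_chunk_size, it is split into micro-batches whose losses are backpropagated one after another before a single clipped optimizer step, which bounds peak memory without shrinking the effective batch. A stripped-down sketch of that accumulation pattern (generic PyTorch, not flair's code; assumes loss_fn returns a per-chunk mean):

import torch

def accumulate_step(model, optimizer, loss_fn, inputs, targets, micro_batch_size):
    """One optimizer step over a large batch, accumulating gradients chunk by chunk."""
    model.zero_grad()
    optimizer.zero_grad()
    n = len(inputs)
    for start in range(0, n, micro_batch_size):
        chunk_x = inputs[start:start + micro_batch_size]
        chunk_y = targets[start:start + micro_batch_size]
        # weight each chunk by its share of the batch so the summed gradient
        # matches a single full-batch backward pass
        loss = loss_fn(model(chunk_x), chunk_y) * (len(chunk_x) / n)
        loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
    optimizer.step()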
Beispiel #29
0
class Agent():
    def __init__(self, state_size, action_size, action_dim, config):
        self.state_size = state_size
        self.action_size = action_size
        self.action_dim = action_dim
        self.seed = 0
        self.device = 'cuda'
        self.batch_size = config["batch_size"]
        self.lr = 0.005
        self.gamma = 0.99
        self.q_shift_local = QNetwork(state_size, action_size,
                                      self.seed).to(self.device)
        self.q_shift_target = QNetwork(state_size, action_size,
                                       self.seed).to(self.device)
        self.Q_local = QNetwork(state_size, action_size,
                                self.seed).to(self.device)
        self.Q_target = QNetwork(state_size, action_size,
                                 self.seed).to(self.device)
        self.R_local = RNetwork(state_size, action_size,
                                self.seed).to(self.device)
        self.R_target = RNetwork(state_size, action_size,
                                 self.seed).to(self.device)
        self.policy = PolicyNetwork(state_size, action_size,
                                    self.seed).to(self.device)
        self.predicter = Classifier(state_size, action_dim,
                                    self.seed).to(self.device)
        #self.criterion = nn.CrossEntropyLoss()
        # optimizer
        self.optimizer_q_shift = optim.Adam(self.q_shift_local.parameters(),
                                            lr=self.lr)
        self.optimizer_q = optim.Adam(self.Q_local.parameters(), lr=self.lr)
        self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
        self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_pre = optim.Adam(self.predicter.parameters(),
                                        lr=self.lr)
        pathname = "lr {} batch_size {} seed {}".format(
            self.lr, self.batch_size, self.seed)
        tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
        self.writer = SummaryWriter(tensorboard_name)
        self.steps = 0
        self.ratio = 1. / action_dim
        self.all_actions = []
        for a in range(self.action_dim):
            # one-element float tensor holding the action index a
            action = torch.zeros(1) + a
            self.all_actions.append(action.to(self.device))

    def act(self, state):
        dis, action, log_probs, ent = self.policy.sample_action(
            torch.Tensor(state).unsqueeze(0))
        return dis, action, log_probs, ent

    def learn(self, memory):
        states, next_states, actions = memory.expert_policy(self.batch_size)
        # actions = actions[0]
        # print("states ",  states)
        self.state_action_frq(states, actions)
        self.get_action_prob(states, actions)
        self.compute_r_function(states, actions)
        return  # NOTE: the remainder of this method is currently unreachable
        # compute difference between Q_shift and y_sh
        q_sh_value = self.q_shift_local(next_states, actions)
        y_sh = np.empty((self.batch_size, 1), dtype=np.float32)
        for idx, s in enumerate(next_states):
            q = []
            for action in self.all_actions:
                q.append(self.Q_target(s.unsqueeze(0), action.unsqueeze(0)))
            q_max = max(q)
            np.copyto(y_sh[idx], q_max.detach().numpy())

        y_sh = torch.Tensor(y_sh)
        y_sh *= self.gamma
        q_shift_loss = F.mse_loss(q_sh_value, y_sh)
        # Minimize the loss
        self.optimizer_q_shift.zero_grad()
        q_shift_loss.backward()
        self.optimizer_q_shift.step()

        # minimize MSE between predicted Q and y = r'(s,a) + gamma * max Q'(s',a')
        q_current = self.Q_local(states, actions)
        r_hat = self.R_target(states, actions)
        # use y_sh as target
        y_q = r_hat + y_sh

        q_loss = F.mse_loss(q_current, y_q)
        # Minimize the loss
        self.optimizer_q.zero_grad()
        q_loss.backward()
        self.optimizer_q.step()

        #  get predicted reward
        r = self.R_local(states, actions)

    def state_action_frq(self, states, action):
        """ Train classifer to compute state action freq
        """
        self.steps += 1
        output = self.predicter(states)
        # convert actions to long class indices for CrossEntropyLoss
        y = action.type(torch.long)
        y = y.squeeze(1)
        loss = nn.CrossEntropyLoss()(output, y)
        self.optimizer_pre.zero_grad()
        loss.backward()
        self.optimizer_pre.step()
        self.writer.add_scalar('Predict_loss', loss, self.steps)

    def get_action_prob(self, states, actions, dim=False):
        """

        """
        if dim:
            output = self.predicter(states)
            action_prob = output.gather(1, actions.type(torch.long))
            action_prob = torch.log(action_prob)
            return action_prob
        output = self.predicter(states)
        print("Output prob ", output)
        action_prob = output.gather(1, actions.type(torch.long))
        print("action prob ", action_prob)
        action_prob = torch.log(action_prob)
        print("action prob ", action_prob)
        return action_prob

    def compute_r_function(self, states, actions):
        """
        
        """
        actions = actions.type(torch.float)
        y = self.R_local(states, actions)
        y_shift = self.q_shift_target(states, actions)
        y_r_part1 = self.get_action_prob(states, actions) - y_shift
        print("ratio ", self.ratio)
        # sum all other actions
        y_r_part2 = torch.empty((self.batch_size, 1), dtype=torch.float32)
        idx = 0
        for a, s in zip(actions, states):
            y_h = 0
            for b in self.all_actions:
                if torch.eq(a, b):
                    continue
                print("diff ac ", b)
                r_hat = self.R_target(s.unsqueeze(0), b.unsqueeze(0))
                n_b = self.get_action_prob(s.unsqueeze(0), b.unsqueeze(0),
                                           True) - self.q_shift_target(
                                               s.unsqueeze(0), b.unsqueeze(0))
                y_h += (r_hat - n_b)
            y_h = self.ratio * y_h
            y_r_part2[idx] = y_h
            idx += 1
        print("shape of r y ", y.shape)
        print("y r part 1 ", y_r_part1.shape)
        print("y r part 2 ", y_r_part2.shape)


class TabularQLearningAgent:
    """A tabular, epsilon-greedy Q-Learning agent."""
    def __init__(self,
                 env,
                 seed=None,
                 lr=0.001,
                 training_steps=10000,
                 final_epsilon=0.05,
                 exploration_steps=10000,
                 gamma=0.99,
                 verbose=True,
                 **kwargs):

        # This implementation only works for flat actions
        assert env.flat_actions
        self.verbose = verbose
        if self.verbose:
            print("\nRunning Tabular Q-Learning with config:")
            pprint(locals())

        # set seeds (the random module drives epsilon-greedy action selection)
        self.seed = seed
        if self.seed is not None:
            random.seed(self.seed)
            np.random.seed(self.seed)

        # environment setup
        self.env = env

        self.num_actions = self.env.action_space.n
        self.obs_dim = self.env.observation_space.shape

        # logger setup
        self.logger = SummaryWriter()

        # Training related attributes
        self.lr = lr
        self.exploration_steps = exploration_steps
        self.final_epsilon = final_epsilon
        self.epsilon_schedule = np.linspace(1.0, self.final_epsilon,
                                            self.exploration_steps)
        self.discount = gamma
        self.training_steps = training_steps
        self.steps_done = 0

        # Q-Function
        self.qfunc = TabularQFunction(self.num_actions)

    def get_epsilon(self):
        if self.steps_done < self.exploration_steps:
            return self.epsilon_schedule[self.steps_done]
        return self.final_epsilon

    def get_egreedy_action(self, o, epsilon):
        if random.random() > epsilon:
            return self.qfunc.get_action(o)
        return random.randint(0, self.num_actions - 1)

    def optimize(self, s, a, next_s, r, done):
        # get q_val for state and action performed in that state
        q_vals_raw = self.qfunc.forward(s)
        q_val = q_vals_raw[a]

        # get target q val = max val of next state
        target_q_val = self.qfunc.forward(next_s).max()
        target = r + self.discount * (1 - done) * target_q_val

        # calculate error and update
        td_error = target - q_val
        td_delta = self.lr * td_error

        # optimize the model
        self.qfunc.update(s, a, td_delta)

        s_value = q_vals_raw.max()
        return td_error, s_value
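
    # Note: optimize() implements the standard tabular Q-learning update
    #   Q(s, a) <- Q(s, a) + lr * (r + gamma * (1 - done) * max_a' Q(s', a') - Q(s, a)),
    # where td_error is the bracketed term and td_delta = lr * td_error is applied to the table.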

    def train(self):
        if self.verbose:
            print("\nStarting training")

        num_episodes = 0
        training_steps_remaining = self.training_steps

        while self.steps_done < self.training_steps:
            ep_results = self.run_train_episode(training_steps_remaining)
            ep_return, ep_steps, goal = ep_results
            num_episodes += 1
            training_steps_remaining -= ep_steps

            self.logger.add_scalar("episode", num_episodes, self.steps_done)
            self.logger.add_scalar("epsilon", self.get_epsilon(),
                                   self.steps_done)
            self.logger.add_scalar("episode_return", ep_return,
                                   self.steps_done)
            self.logger.add_scalar("episode_steps", ep_steps, self.steps_done)
            self.logger.add_scalar("episode_goal_reached", int(goal),
                                   self.steps_done)

            if num_episodes % 10 == 0 and self.verbose:
                print(f"\nEpisode {num_episodes}:")
                print(f"\tsteps done = {self.steps_done} / "
                      f"{self.training_steps}")
                print(f"\treturn = {ep_return}")
                print(f"\tgoal = {goal}")

        self.logger.close()
        if self.verbose:
            print("Training complete")
            print(f"\nEpisode {num_episodes}:")
            print(f"\tsteps done = {self.steps_done} / {self.training_steps}")
            print(f"\treturn = {ep_return}")
            print(f"\tgoal = {goal}")

    def run_train_episode(self, step_limit):
        s = self.env.reset()
        done = False

        steps = 0
        episode_return = 0

        while not done and steps < step_limit:
            a = self.get_egreedy_action(s, self.get_epsilon())

            next_s, r, done, _ = self.env.step(a)
            self.steps_done += 1
            td_error, s_value = self.optimize(s, a, next_s, r, done)
            self.logger.add_scalar("td_error", td_error, self.steps_done)
            self.logger.add_scalar("s_value", s_value, self.steps_done)

            s = next_s
            episode_return += r
            steps += 1

        return episode_return, steps, self.env.goal_reached()

    def run_eval_episode(self,
                         env=None,
                         render=False,
                         eval_epsilon=0.05,
                         render_mode="readable"):
        if env is None:
            env = self.env
        s = env.reset()
        done = False

        steps = 0
        episode_return = 0

        line_break = "=" * 60
        if render:
            print("\n" + line_break)
            print(f"Running EVALUATION using epsilon = {eval_epsilon:.4f}")
            print(line_break)
            env.render(render_mode)
            input("Initial state. Press enter to continue..")

        while not done:
            a = self.get_egreedy_action(s, eval_epsilon)
            next_s, r, done, _ = env.step(a)
            s = next_s
            episode_return += r
            steps += 1
            if render:
                print("\n" + line_break)
                print(f"Step {steps}")
                print(line_break)
                print(f"Action Performed = {env.action_space.get_action(a)}")
                env.render(render_mode)
                print(f"Reward = {r}")
                print(f"Done = {done}")
                input("Press enter to continue..")

                if done:
                    print("\n" + line_break)
                    print("EPISODE FINISHED")
                    print(line_break)
                    print(f"Goal reached = {env.goal_reached()}")
                    print(f"Total steps = {steps}")
                    print(f"Total reward = {episode_return}")

        return episode_return, steps, env.goal_reached()
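
The agent above depends on a TabularQFunction that is not included in this snippet. A minimal sketch of what it might look like (an assumption: observations are array-like and can be flattened to a hashable tuple):

from collections import defaultdict

import numpy as np

class TabularQFunction:
    """Dict-backed Q-table mapping an observation to a vector of action values."""
    def __init__(self, num_actions):
        self.num_actions = num_actions
        self.q_table = defaultdict(lambda: np.zeros(num_actions, dtype=np.float64))

    def _key(self, obs):
        return tuple(np.asarray(obs).flatten().tolist())

    def forward(self, obs):
        # vector of Q-values for all actions in this observation
        return self.q_table[self._key(obs)]

    def get_action(self, obs):
        return int(np.argmax(self.forward(obs)))

    def update(self, obs, action, delta):
        self.q_table[self._key(obs)][action] += delta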