Example #1
def main(cfg_dict: DictConfig):

    # TODO: erase previous logs in the folder at every run
    config = ConfigParser(cfg_dict)
    logger = config.get_logger('train')

    # setup data_loader instances
    data_loader = config.init_obj('data_loader', module_data)
    valid_data_loader = data_loader.split_validation()

    # build model architecture, then print to console
    model = config.init_obj('arch', module_arch)
    # logger.info(model)

    # get function handles of loss and metrics
    criterion = getattr(module_loss, config['loss'])
    metrics = [getattr(module_metric, met) for met in config['metrics']]

    # build optimizer and learning rate scheduler. Delete every line containing lr_scheduler to disable the scheduler.
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = config.init_obj('optimizer', torch.optim, trainable_params)

    lr_scheduler = config.init_obj('lr_scheduler', torch.optim.lr_scheduler,
                                   optimizer)

    trainer = Trainer(model,
                      criterion,
                      metrics,
                      optimizer,
                      config=config,
                      data_loader=data_loader,
                      valid_data_loader=valid_data_loader,
                      lr_scheduler=lr_scheduler)

    trainer.train()
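
These examples all lean on the same ConfigParser helper from the pytorch-template project, so it helps to see what init_obj typically does before reading them. A minimal sketch, assuming the standard pytorch-template layout (signatures may differ between forks):

def init_obj(self, name, module, *args, **kwargs):
    # Look up self[name]['type'] in `module` and instantiate it with
    # self[name]['args'], e.g. init_obj('data_loader', module_data).
    module_name = self[name]['type']
    module_args = dict(self[name]['args'])
    # keyword overrides must not collide with the JSON-configured args
    assert all(k not in module_args for k in kwargs), \
        'Overwriting kwargs given in config file is not allowed'
    module_args.update(kwargs)
    return getattr(module, module_name)(*args, **module_args)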
Example #2
def main(config: ConfigParser):
    # get_logger returns a logging.getLogger; the default log level is DEBUG
    logger = config.get_logger('train')
    # Data module
    # Look up the data-loader name read from config.json, instantiate it, and fill
    # in the arguments from the JSON
    data_loader = config.init_obj('data_loader', module_data)
    valid_data_loader = data_loader.split_validation()

    # Model module
    model = config.init_obj('arch', module_arch)
    logger.info(model)

    # Loss and metric modules
    criterion = getattr(module_loss, config['loss'])
    # These entries are functions (they could also be classes); the name is obtained via __name__
    metrics = [getattr(module_metric, met) for met in config['metrics']]

    # Optimizer module
    # filter drops parameters whose requires_grad is False
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = config.init_obj('optimizer', torch.optim, trainable_params)
    # Learning-rate decay policy
    lr_scheduler = config.init_obj('lr_scheduler', torch.optim.lr_scheduler, optimizer)

    # Train the model
    trainer = Trainer(model, criterion, metrics, optimizer,
                      config=config,
                      data_loader=data_loader,
                      valid_data_loader=valid_data_loader,
                      lr_scheduler=lr_scheduler)

    trainer.train()
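
data_loader.split_validation() comes from the template's BaseDataLoader, which carves a validation sampler out of the training set. A condensed sketch, assuming the standard pytorch-template base class (_split_sampler builds SubsetRandomSamplers from the split ratio):

from torch.utils.data import DataLoader

class BaseDataLoader(DataLoader):
    def __init__(self, dataset, batch_size, shuffle, validation_split, num_workers):
        # shuffle must be off once explicit samplers are used
        self.sampler, self.valid_sampler = self._split_sampler(validation_split)
        self.init_kwargs = {'dataset': dataset, 'batch_size': batch_size,
                            'shuffle': False, 'num_workers': num_workers}
        super().__init__(sampler=self.sampler, **self.init_kwargs)

    def split_validation(self):
        # returns None when validation_split == 0, hence the guards in later examples
        if self.valid_sampler is None:
            return None
        return DataLoader(sampler=self.valid_sampler, **self.init_kwargs)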
Example #3
def main(config: ConfigParser):
    logger = config.get_logger("train")

    # setup data_loader instances
    data_loader = config.init_obj("data_loader", module_data)
    valid_data_loader = data_loader.split_validation()

    # build model architecture, then print to console
    model = config.init_obj("arch", module_arch)
    logger.info(model)

    # get function handles of loss and metrics
    criterion = config.init_obj("criterion", module_criterion)
    metrics = [getattr(module_metric, met) for met in config["metrics"]]

    # build optimizer and learning rate scheduler. Delete every line containing lr_scheduler to disable the scheduler.
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = config.init_obj("optimizer", module_optim, trainable_params)

    lr_scheduler = config.init_obj("lr_scheduler", torch.optim.lr_scheduler,
                                   optimizer)

    trainer = Trainer(
        model,
        criterion,
        metrics,
        optimizer,
        config=config,
        data_loader=data_loader,
        valid_data_loader=valid_data_loader,
        lr_scheduler=lr_scheduler,
    )

    trainer.train()
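
Example #3 differs from the first two in that the criterion is also built through init_obj from a module_criterion, so the loss can carry constructor arguments. A hypothetical config fragment consistent with that lookup (names and values are illustrative, not from the source):

config_fragment = {
    'criterion': {              # consumed by config.init_obj('criterion', module_criterion)
        'type': 'FocalLoss',    # hypothetical class defined in module_criterion
        'args': {'gamma': 2.0},
    },
}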
Example #4
def main(config: ConfigParser) -> None:
    """
    Main training function.

    Parameters
    ----------
    config : parse_config.ConfigParser
        Parsed configuration JSON file.
    """
    logger: Logger = config.get_logger("train")

    # Setup data_loader instances.
    data_loader: DataLoader = config.initialize("data_loader", module_data)
    valid_data_loader: Optional[DataLoader] = data_loader.split_validation()

    # Build model architecture, then print to console.
    model: Module = config.initialize("arch", module_arch)
    logger.info(model)

    # Get function handles of loss and metrics as well as args.
    loss_fn: Callable = getattr(module_loss, config["loss"]["type"])
    loss_args: Dict[str, Any] = config["loss"]["args"]
    metric_fns: List[Callable] = [
        getattr(module_metric, met) for met in config["metrics"]
    ]
    metric_args: List[Dict[str, Any]] = [
        config["metrics"][met] for met in config["metrics"]
    ]

    # Build optimizer, learning rate scheduler.
    # Delete every line containing lr_scheduler to disable scheduler.
    trainable_params: Iterable[Tensor] = filter(lambda p: p.requires_grad,
                                                model.parameters())
    optimizer: Optimizer = config.initialize("optimizer", torch.optim,
                                             trainable_params)

    lr_scheduler: Optional[Any] = config.initialize(
        "lr_scheduler", torch.optim.lr_scheduler, optimizer
    )

    trainer: Trainer = Trainer(
        model,
        loss_fn,
        loss_args,
        metric_fns,
        metric_args,
        optimizer,
        config=config,
        data_loader=data_loader,
        valid_data_loader=valid_data_loader,
        lr_scheduler=lr_scheduler,
    )

    trainer.train()
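
This variant reads per-loss and per-metric args, and the expression config["metrics"][met] only works if "metrics" is a mapping from metric name to kwargs rather than the plain list the earlier examples use. A hypothetical fragment consistent with those lookups (values are illustrative):

config_fragment = {
    'loss': {'type': 'nll_loss', 'args': {}},
    # indexed by name, so 'metrics' must be a dict, not a list
    'metrics': {
        'accuracy': {},
        'top_k_acc': {'k': 3},
    },
}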
Example #5
    def __init__(
        self,
        model: torch.nn.Module,
        criterion: torch.nn.modules.loss._Loss,
        metric_ftns: List[Callable[..., float]],
        optimizer: torch.optim.Optimizer,
        config: ConfigParser,
        lr_scheduler: Union[torch.optim.lr_scheduler._LRScheduler,
                            torch.optim.lr_scheduler.ReduceLROnPlateau,
                            None] = None,
    ):
        self.config = config
        self.logger = config.get_logger("trainer",
                                        config["trainer"]["verbosity"])

        # setup GPU device if available, move model into configured device
        self.device, device_ids = self._prepare_device(config["n_gpu"])
        self.model = model.to(self.device)
        if len(device_ids) > 1:
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)

        self.criterion = criterion
        self.metric_ftns = metric_ftns
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler

        cfg_trainer = config["trainer"]
        self.epochs = cfg_trainer["epochs"]
        self.save_period = cfg_trainer["save_period"]
        self.monitor = cfg_trainer.get("monitor", "off")
        self.save_last = cfg_trainer.get("save_last", False)

        # configuration to monitor model performance and save best
        if self.monitor == "off":
            self.mnt_mode = "off"
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ["min", "max"]

            self.mnt_best = inf if self.mnt_mode == "min" else -inf
            self.early_stop = cfg_trainer.get("early_stop", inf)

        self.start_epoch = 1

        self.checkpoint_dir = config.model_dir

        # setup visualization writer instance
        self.writer = TensorboardWriter(config.log_dir, self.logger,
                                        cfg_trainer["tensorboard"])

        if config.resume is not None:
            self._resume_checkpoint(config.resume)
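
_prepare_device is the template's standard GPU-setup helper. For reference, a sketch of the usual implementation (lightly condensed from pytorch-template; details may vary by fork):

def _prepare_device(self, n_gpu_use):
    # pick the device and the GPU ids for DataParallel, warning when the
    # config asks for more GPUs than the machine has
    n_gpu = torch.cuda.device_count()
    if n_gpu_use > 0 and n_gpu == 0:
        self.logger.warning('There is no GPU available on this machine; '
                            'training will be performed on CPU.')
        n_gpu_use = 0
    if n_gpu_use > n_gpu:
        self.logger.warning(f'{n_gpu_use} GPUs configured, but only {n_gpu} available.')
        n_gpu_use = n_gpu
    device = torch.device('cuda:0' if n_gpu_use > 0 else 'cpu')
    return device, list(range(n_gpu_use))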
Example #6
def entry_point(config: ConfigParser):
    '''
    Entry-point function for a single worker in distributed training.
    '''

    local_world_size = config['local_world_size']

    # check gpu available
    if torch.cuda.is_available():
        if torch.cuda.device_count() < local_world_size:
            raise RuntimeError(f'the number of GPUs ({torch.cuda.device_count()}) is less than '
                               f'the number of processes ({local_world_size}) running on each node')
        local_master = config['local_rank'] == 0
    else:
        raise RuntimeError('CUDA is not available; distributed training is not supported.')

    if local_master:
        logger = config.get_logger('train')
        logger.info('Distributed training start...')

    # these are the parameters used to initialize the process group
    env_dict = {
        key: os.environ[key]
        for key in ('MASTER_ADDR', 'MASTER_PORT', 'RANK', 'WORLD_SIZE')
    }
    if local_master:
        logger.info(f'[Process {os.getpid()}] Initializing process group with: {env_dict}')

    # init process group
    dist.init_process_group(backend='nccl', init_method='env://')

    if local_master:
        logger.info(f'[Process {os.getpid()}] world_size = {dist.get_world_size()}, '
                    f'rank = {dist.get_rank()}, backend={dist.get_backend()}')

    # start train
    main(config, local_master, logger if local_master else None)

    # tear down the process group
    dist.destroy_process_group()
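
This entry point assumes MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE are already exported, which is what torchrun and the other torch launchers do. For a single node you could also spawn the workers yourself; a minimal sketch under that assumption (the worker body is a stub, not the source's code):

import os
import torch
import torch.multiprocessing as mp

def _worker(local_rank, world_size):
    # the env:// init method in entry_point reads these four variables
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    os.environ['RANK'] = str(local_rank)   # single node: global rank == local rank
    os.environ['WORLD_SIZE'] = str(world_size)
    # a real run would build the ConfigParser (with local_rank/local_world_size
    # filled in) and call entry_point(config) here

if __name__ == '__main__':
    world_size = torch.cuda.device_count()
    mp.spawn(_worker, args=(world_size,), nprocs=world_size)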
Example #7
def main(cfg_dict: DictConfig):

    config = ConfigParser(cfg_dict)
    logger = config.get_logger('test')

    # setup data_loader instances
    data_loader = getattr(module_data, config['data_loader']['type'])(
        config['data_loader']['args']['data_dir'],
        batch_size=512,
        shuffle=False,
        validation_split=0.0,
        training=False,
        num_workers=2)

    # build model architecture
    model = config.init_obj('arch', module_arch)
    logger.info(model)

    # get function handles of loss and metrics
    loss_fn = getattr(module_loss, config['loss'])
    metric_fns = [getattr(module_metric, met) for met in config['metrics']]

    logger.info('Loading checkpoint: {} ...'.format(config['resume']))
    checkpoint = torch.load(config['resume'])
    state_dict = checkpoint['state_dict']
    if config['n_gpu'] > 1:
        model = torch.nn.DataParallel(model)
    model.load_state_dict(state_dict)

    # prepare model for testing
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    total_loss = 0.0
    total_metrics = torch.zeros(len(metric_fns))

    with torch.no_grad():
        for i, (data, target) in enumerate(tqdm(data_loader)):

            # TODO: overlap objects with overlap_objects_from_batch in util.py
            # TODO: check that the model's output is correct for the loss_fn

            data, target = data.to(device), target.to(device)
            output = model(data)

            #
            # save sample images, or do something with output here
            #
            # computing loss, metrics on test set
            loss = loss_fn(output, target)
            batch_size = data.shape[0]
            total_loss += loss.item() * batch_size
            for j, metric in enumerate(metric_fns):
                total_metrics[j] += metric(output, target) * batch_size

    n_samples = len(data_loader.sampler)
    log = {'loss': total_loss / n_samples}
    log.update({
        met.__name__: total_metrics[i].item() / n_samples
        for i, met in enumerate(metric_fns)
    })
    logger.info(log)
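
The metric handles pulled from module_metric are plain functions taking (output, target) and returning a scalar, since they are accumulated per batch above. The stock pytorch-template accuracy metric is a representative example:

import torch

def accuracy(output, target):
    with torch.no_grad():
        pred = torch.argmax(output, dim=1)
        assert pred.shape[0] == len(target)
        correct = torch.sum(pred == target).item()
    return correct / len(target)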
Example #8
import os, sys
from pathlib import Path
import re
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
src_dir = os.path.join(root_dir, "src")
sys.path.insert(0, src_dir)
# change cwd to root dir
os.chdir(root_dir)

from parse_config import ConfigParser
from utils import util_geo

re_img_index = re.compile(r"img\d+")

if __name__ == '__main__':
    config = ConfigParser(ConfigParser.from_file("test/configs/geotest.json"))
    logger = config.get_logger('train')
    data_dir = Path(config['data_loader']['args']['data_dir'])
    data_name = config['data_loader']['args']['data_name']
    img_dir = data_dir / data_name / "RGB-PanSharpen"
    save_dir = data_dir / data_name / 'processed'
    img_save_dir = save_dir / "RGB"

    geojson_dir = data_dir / data_name / "geojson"
    mask_save_dir = save_dir / "labels"
    colors = config['data_loader']['args']["colors"]

    img_save_dir.mkdir(parents=True, exist_ok=True)
    mask_save_dir.mkdir(parents=True, exist_ok=True)
    util_geo.GeoLabelUtil.preprocess(img_dir, geojson_dir, img_save_dir, mask_save_dir, colors)
Example #9
def main(config: ConfigParser):

    logger = config.get_logger('train')

    data_loader = getattr(module_data, config['data_loader']['type'])(
        config['data_loader']['args']['data_dir'],
        batch_size=config['data_loader']['args']['batch_size'],
        shuffle=config['data_loader']['args']['shuffle'],
        validation_split=config['data_loader']['args']['validation_split'],
        num_batches=config['data_loader']['args']['num_batches'],
        training=True,
        num_workers=config['data_loader']['args']['num_workers'],
        pin_memory=config['data_loader']['args']['pin_memory'])

    # valid_data_loader = data_loader.split_validation()
    valid_data_loader = None
    # test_data_loader = None

    test_data_loader = getattr(module_data, config['data_loader']['type'])(
        config['data_loader']['args']['data_dir'],
        batch_size=128,
        shuffle=False,
        validation_split=0.0,
        training=False,
        num_workers=2)  #.split_validation()

    # build model architecture, then print to console
    model = config.initialize('arch', module_arch)

    # get function handles of loss and metrics
    logger.info(config.config)
    if hasattr(data_loader.dataset, 'num_raw_example'):
        num_examp = data_loader.dataset.num_raw_example
    else:
        num_examp = len(data_loader.dataset)

    train_loss = getattr(module_loss, config['train_loss']['type'])(
        num_examp=num_examp,
        num_classes=config['num_classes'],
        beta=config['train_loss']['args']['beta'])

    val_loss = getattr(module_loss, config['val_loss'])
    metrics = [getattr(module_metric, met) for met in config['metrics']]

    # build optimizer and learning rate scheduler. Delete every line containing lr_scheduler to disable the scheduler.
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())

    optimizer = config.initialize('optimizer', torch.optim,
                                  [{
                                      'params': trainable_params
                                  }])

    lr_scheduler = config.initialize('lr_scheduler', torch.optim.lr_scheduler,
                                     optimizer)

    trainer = Trainer(model,
                      train_loss,
                      metrics,
                      optimizer,
                      config=config,
                      data_loader=data_loader,
                      valid_data_loader=valid_data_loader,
                      test_data_loader=test_data_loader,
                      lr_scheduler=lr_scheduler,
                      val_criterion=val_loss)

    trainer.train()
Example #10
    def __init__(
        self,
        model: Module,
        loss_fn: Callable,
        loss_args: Dict[str, Any],
        metric_fns: List[Callable],
        metric_args: List[Dict[str, Any]],
        optimizer: Optimizer,
        config: ConfigParser,
    ):

        self.config: ConfigParser = config
        self.logger: Logger = config.get_logger("trainer",
                                                config["trainer"]["verbosity"])

        # Setup GPU device if available.
        self.device: torch.device
        device_ids: List[int]
        self.device, device_ids = self._prepare_device(config["n_gpu"])

        # Move model into configured device(s).
        self.model: Module = model.to(self.device)
        if len(device_ids) > 1:
            self.model = DataParallel(model, device_ids=device_ids)

        # Set loss function and arguments.
        self.loss_fn: Callable = loss_fn
        self.loss_args: Dict[str, Any] = loss_args

        # Set all metric functions and associated arguments.
        self.metric_fns: List[Callable] = metric_fns
        self.metric_args: List[Dict[str, Any]] = metric_args

        # Set optimizer.
        self.optimizer: Optimizer = optimizer

        # Set training configuration.
        cfg_trainer: Dict[str, Any] = config["trainer"]
        self.epochs: int = cfg_trainer["epochs"]
        self.save_period: int = cfg_trainer["save_period"]
        self.monitor: str = cfg_trainer.get("monitor", "off")

        # Configuration to monitor model performance and save best.
        if self.monitor == "off":
            self.mnt_mode: str = "off"
            self.mnt_best: float = 0
        else:
            self.mnt_metric: str
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ["min", "max"]

            self.mnt_best = inf if self.mnt_mode == "min" else -inf
            self.early_stop: float = cfg_trainer.get("early_stop", inf)

        self.start_epoch: int = 1
        self.checkpoint_dir: Path = config.save_dir

        # Setup visualization writer instance.
        self.writer = TensorboardWriter(config.log_dir, self.logger,
                                        cfg_trainer["tensorboard"])

        if config.resume is not None:
            self._resume_checkpoint(config.resume)
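
This trainer stores per-function kwargs next to the handles, so a training step presumably splats them back in. A sketch of what the corresponding call site inside _train_epoch might look like under that assumption (not taken from the source):

output = self.model(data)
loss = self.loss_fn(output, target, **self.loss_args)
loss.backward()
for fn, fn_args in zip(self.metric_fns, self.metric_args):
    value = fn(output, target, **fn_args)  # per-metric kwargs splatted back in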
Example #11
        vocab_optimal, T_opt = get_optimal_ordering(config, args_outer)

    else:
        with open(args_outer.optimal_taxo_path, "rb") as f:
            T_opt = pickle.load(f)
        vocab_optimal = list(nx.topological_sort(T_opt))

    if args_outer.model:
        vocab_model, T_model = get_insertion_ordering(config, args_outer)

    else:
        with open(args_outer.model_taxo_path, "rb") as f:
            T_model = pickle.load(f)
        vocab_model = list(nx.topological_sort(T_model))

    if args_outer.direct_eval:
        logger = config.get_logger('test')
        logger.info(edge_metrics(T_opt, T_model))
        logger.info(ancestor_metrics(T_opt, T_model))

    if args_outer.insert:
        main_sequential(config, args_outer, vocab_model)

    # reverse optimal
    # rev_optimal = list(reversed(vocab_optimal))
    # main_sequential(config, args_outer, rev_optimal)

    # random order insertion
    # vocab_random = [vocab_optimal[i] for i in np.random.permutation(len(vocab_optimal))]
    # main_sequential(config, args_outer, vocab_random)
Example #12
def main(config: ConfigParser):

    logger = config.get_logger('train')
    logger.info(config.config)

    # setup data_loader instances
    data_loader1 = getattr(module_data, config['data_loader']['type'])(
        config['data_loader']['args']['data_dir'],
        batch_size=config['data_loader']['args']['batch_size'],
        shuffle=config['data_loader']['args']['shuffle'],
        validation_split=config['data_loader']['args']['validation_split'],
        num_batches=config['data_loader']['args']['num_batches'],
        training=True,
        num_workers=config['data_loader']['args']['num_workers'],
        pin_memory=config['data_loader']['args']['pin_memory'])

    data_loader2 = getattr(module_data, config['data_loader']['type'])(
        config['data_loader']['args']['data_dir'],
        batch_size=config['data_loader']['args']['batch_size2'],
        shuffle=config['data_loader']['args']['shuffle'],
        validation_split=config['data_loader']['args']['validation_split'],
        num_batches=config['data_loader']['args']['num_batches'],
        training=True,
        num_workers=config['data_loader']['args']['num_workers'],
        pin_memory=config['data_loader']['args']['pin_memory'])

    valid_data_loader = data_loader1.split_validation()

    test_data_loader = getattr(module_data, config['data_loader']['type'])(
        config['data_loader']['args']['data_dir'],
        batch_size=128,
        shuffle=False,
        validation_split=0.0,
        training=False,
        num_workers=2).split_validation()

    # build model architecture
    model1 = config.initialize('arch1', module_arch)
    model_ema1 = config.initialize('arch1', module_arch)
    model_ema1_copy = config.initialize('arch1', module_arch)
    model2 = config.initialize('arch2', module_arch)
    model_ema2 = config.initialize('arch2', module_arch)
    model_ema2_copy = config.initialize('arch2', module_arch)

    # get function handles of loss and metrics
    device_id = list(range(min(torch.cuda.device_count(), config['n_gpu'])))

    if hasattr(data_loader1.dataset, 'num_raw_example') and hasattr(
            data_loader2.dataset, 'num_raw_example'):
        num_examp1 = data_loader1.dataset.num_raw_example
        num_examp2 = data_loader2.dataset.num_raw_example
    else:
        num_examp1 = len(data_loader1.dataset)
        num_examp2 = len(data_loader2.dataset)

    train_loss1 = getattr(module_loss, config['train_loss']['type'])(
        num_examp=num_examp1,
        num_classes=config['num_classes'],
        device='cuda:' + str(device_id[0]),
        config=config.config,
        beta=config['train_loss']['args']['beta'])
    train_loss2 = getattr(module_loss, config['train_loss']['type'])(
        num_examp=num_examp2,
        num_classes=config['num_classes'],
        device='cuda:' + str(device_id[-1]),
        config=config.config,
        beta=config['train_loss']['args']['beta'])

    val_loss = getattr(module_loss, config['val_loss'])
    metrics = [getattr(module_metric, met) for met in config['metrics']]

    # build optimizer and learning rate scheduler. Delete every line containing lr_scheduler to disable the scheduler.
    trainable_params1 = filter(lambda p: p.requires_grad, model1.parameters())
    trainable_params2 = filter(lambda p: p.requires_grad, model2.parameters())

    optimizer1 = config.initialize('optimizer1', torch.optim,
                                   [{
                                       'params': trainable_params1
                                   }])
    optimizer2 = config.initialize('optimizer2', torch.optim,
                                   [{
                                       'params': trainable_params2
                                   }])

    lr_scheduler1 = config.initialize('lr_scheduler', torch.optim.lr_scheduler,
                                      optimizer1)
    lr_scheduler2 = config.initialize('lr_scheduler', torch.optim.lr_scheduler,
                                      optimizer2)

    trainer = Trainer(model1,
                      model2,
                      model_ema1,
                      model_ema2,
                      train_loss1,
                      train_loss2,
                      metrics,
                      optimizer1,
                      optimizer2,
                      config=config,
                      data_loader1=data_loader1,
                      data_loader2=data_loader2,
                      valid_data_loader=valid_data_loader,
                      test_data_loader=test_data_loader,
                      lr_scheduler1=lr_scheduler1,
                      lr_scheduler2=lr_scheduler2,
                      val_criterion=val_loss,
                      model_ema1_copy=model_ema1_copy,
                      model_ema2_copy=model_ema2_copy)

    trainer.train()
Example #13
def main(config: ConfigParser):

    logger = config.get_logger('train')

    data_loader = getattr(module_data, config['data_loader']['type'])(
        config['data_loader']['args']['data_dir'],
        batch_size=config['data_loader']['args']['batch_size'],
        shuffle=config['data_loader']['args']['shuffle'],
        validation_split=config['data_loader']['args']['validation_split'],
        num_batches=config['data_loader']['args']['num_batches'],
        training=True,
        num_workers=config['data_loader']['args']['num_workers'],
        pin_memory=config['data_loader']['args']['pin_memory'])

    valid_data_loader = data_loader.split_validation()

    test_data_loader = getattr(module_data, config['data_loader']['type'])(
        config['data_loader']['args']['data_dir'],
        batch_size=128,
        shuffle=False,
        validation_split=0.0,
        training=False,
        num_workers=2).split_validation()

    # build model architecture, then print to console
    model = config.initialize('arch', module_arch)

    train_loss = getattr(module_loss, config['train_loss'])
    val_loss = getattr(module_loss, config['val_loss'])
    metrics = [getattr(module_metric, met) for met in config['metrics']]

    logger.info(str(model).split('\n')[-1])

    # build optimizer and learning rate scheduler. Delete every line containing lr_scheduler to disable the scheduler.

    trainable_params = [{
        'params': [
            p for p in model.parameters()
            if not getattr(p, 'bin_gate', False)
            and not getattr(p, 'bin_theta', False)
            and not getattr(p, 'srelu_bias', False)
            and getattr(p, 'requires_grad', False)
        ]
    }, {
        'params': [
            p for p in model.parameters()
            if getattr(p, 'bin_gate', False) and getattr(p, 'requires_grad', False)
        ],
        'lr': config['optimizer']['args']['lr'] * 10,
        'weight_decay': 0
    }, {
        'params': [
            p for p in model.parameters()
            if getattr(p, 'srelu_bias', False) and getattr(p, 'requires_grad', False)
        ],
        'weight_decay': 0
    }, {
        'params': [
            p for p in model.parameters()
            if getattr(p, 'bin_theta', False) and getattr(p, 'requires_grad', False)
        ],
        'lr': config['optimizer']['args']['lr'],
        'weight_decay': 0
    }]

    optimizer = config.initialize('optimizer', torch.optim, trainable_params)

    lr_scheduler = config.initialize('lr_scheduler', torch.optim.lr_scheduler,
                                     optimizer)

    trainer = Trainer(model,
                      train_loss,
                      metrics,
                      optimizer,
                      config=config,
                      data_loader=data_loader,
                      valid_data_loader=valid_data_loader,
                      test_data_loader=test_data_loader,
                      lr_scheduler=lr_scheduler,
                      val_criterion=val_loss)

    trainer.train()
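
The list of dicts built above uses torch.optim's per-parameter-group options: each dict is one group, and keys such as 'lr' or 'weight_decay' override the optimizer-wide defaults for that group only. A self-contained illustration:

import torch

w1 = torch.nn.Parameter(torch.randn(3))
w2 = torch.nn.Parameter(torch.randn(3))
opt = torch.optim.SGD(
    [{'params': [w1]},                                 # inherits lr=0.1, weight_decay=1e-4
     {'params': [w2], 'lr': 1.0, 'weight_decay': 0}],  # overrides both for this group
    lr=0.1, weight_decay=1e-4)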
Example #14
def main(cfg_dict: DictConfig):
    generate = False
    load_gen = True
    save = True
    # remove_eigs = True
    remove_eigs = False

    config = ConfigParser(cfg_dict)
    T_rec, T_pred = config['n_timesteps'], config['seq_length'] - config['n_timesteps']
    logger = config.get_logger('test')

    gt = True
    model_name = 'ddpae-iccv'
    # model_name = 'DRNET'
    # model_name = 'scalor'
    # model_name = 'sqair'
    s_directory = os.path.join(config['data_loader']['args']['data_dir'], 'test_data')
    res_directory = os.path.join(config['data_loader']['args']['data_dir'], 'res_data')
    load_gen_directory = os.path.join(config['data_loader']['args']['data_dir'],
                                      'results')
    # # TODO: Testing features
    # load_gen_directory = os.path.join(config['data_loader']['args']['data_dir'], 'test_data')

    if not os.path.exists(s_directory):
        os.makedirs(s_directory)
    if not os.path.exists(res_directory):
        os.makedirs(res_directory)
    dataset_dir = os.path.join(s_directory, config['data_loader']['args']['dataset_case']+
                               '_Len-'+str(config['seq_length'])+'_Nts-'+str(config['n_timesteps'])+'.npy')
    results_dir = os.path.join(res_directory, config['data_loader']['args']['dataset_case']+
                               '_Len-'+str(config['seq_length'])+'_Nts-'+str(config['n_timesteps'])+'.npz')
    all_data = []
    if not os.path.exists(dataset_dir) and generate:
        config['data_loader']['args']['shuffle'] = False
        config['data_loader']['args']['training'] = False
        config['data_loader']['args']['validation_split'] = 0.0
        data_loader = config.init_obj('data_loader', module_data)

        for i, data in enumerate(tqdm(data_loader)):
            all_data.append(data)
        all_data = torch.cat(all_data, dim=0).numpy()
        print(all_data.shape)
        np.save(dataset_dir, all_data)
        print(config['data_loader']['args']['dataset_case']+ ' data generated in: '+s_directory)
        exit()
    if os.path.exists(dataset_dir):
        print('LOADING EXISTING DATA FROM: ' + dataset_dir)
        inps = torch.from_numpy(np.load(dataset_dir))
        if os.path.exists(load_gen_directory) and load_gen:
            if model_name == 'ddpae-iccv':
                outs = torch.from_numpy(
                    np.load(os.path.join(
                        load_gen_directory,
                        model_name +'--'+config['data_loader']['args']['dataset_case']+
                        '_Len-'+str(config['seq_length'])+'_Nts-'+str(config['n_timesteps'])+'.npy')))

            else:
                with np.load(os.path.join(
                        load_gen_directory,
                        model_name +'_'+config['data_loader']['args']['dataset_case']+'.npz')) as outputs:
                    if model_name == 'scalor':
                        outs = torch.from_numpy(outputs["pred"]).permute(0,1,3,2).unsqueeze(2)
                    elif model_name == 'DRNET':
                        outs = torch.from_numpy(outputs["pred"]).unsqueeze(2).float()
                    else:
                        outs = torch.from_numpy(outputs["pred"]).unsqueeze(2)
                    print('Inps and Outs shapes', inps.shape, outs.shape)
            loaded_dataset = TensorDataset(inps, outs)
        else:
            loaded_dataset = TensorDataset(inps)
        data_loader = DataLoader(loaded_dataset, batch_size=40, shuffle=False, sampler=None,
                            batch_sampler=None, num_workers=2, collate_fn=None,
                            pin_memory=False)
    else:
        print('You messed up if you end up here')
        exit()
        config['data_loader']['args']['shuffle'] = False
        config['data_loader']['args']['training'] = False
        config['data_loader']['args']['validation_split'] = 0.0
        data_loader = config.init_obj('data_loader', module_data)
    # build model architecture
    if not load_gen:
        model = config.init_obj('arch', module_arch)
    # logger.info(model)

    # get function handles of loss and metrics
    loss_fn = getattr(module_loss, config['loss'])
    metric_fns = [getattr(module_metric, met) for met in ["mse", "mae", "bce", "mssim", "mlpips"]]

    if not load_gen:
        logger.info('Loading checkpoint: {} ...'.format(config['resume']))
        checkpoint = torch.load(config['resume'])
        state_dict = checkpoint['state_dict']
        if config['n_gpu'] > 1:
            model = torch.nn.DataParallel(model)
        model.load_state_dict(state_dict)

        # prepare model for testing
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = model.to(device)
        model.eval()

        if remove_eigs:
            A_modified, indices, e = remove_eig_under_t(
                model.koopman.dynamics.dynamics.weight.data, t=0.7)
            A_modified = torch.from_numpy(A_modified.real).to(device)
            model.koopman.dynamics.dynamics.weight.data = A_modified

    total_loss = 0.0
    total_metrics = [torch.zeros(len(metric_fns)), torch.zeros(len(metric_fns))]

    # TODO: here we can change the model's K and crop the eigenvalues under a certain modulus threshold
    # If the new prediction is longer, evaluate only the new part:
    # T_pred = 8
    all_pred, all_rec = [], []
    with torch.no_grad():
        for i, data in enumerate(tqdm(data_loader)):
            if isinstance(data, list) and len(data) == 2:
                target = data[0]
                output = data[1]
                batch_size = target.shape[0]
                # total_loss += loss.item() * batch_size
                pred = output[:, -T_pred:], target[:, -T_pred:]
                rec = output[:, :T_rec], target[:, :T_rec]

                assert T_rec + T_pred == target.shape[1]
                assert target.shape == output.shape
            else:
                if isinstance(data, list) and len(data) == 1:
                    data = data[0]
                # if config["data_loader"]["type"] == "MovingMNISTLoader":
                #     data = overlap_objects_from_batch(data,config['n_objects'])
                target = data  # the input sequence itself serves as the target
                data, target = data.to(device), target.to(device)

                output = model(data, epoch_iter=[-1], test=True)
                # computing loss, metrics on test set
                # loss, loss_particles = loss_fn(output, target,
                #                                epoch_iter=[-1],
                #                                case=config["data_loader"]["args"]["dataset_case"])
                batch_size = data.shape[0]
                # total_loss += loss.item() * batch_size

                pred = output["pred_roll"][:, -T_pred:] , target[:, -T_pred:] #* 0.85
                rec = output["rec_ori"][:, :T_rec] * 0.85, target[:, :T_rec]

                assert T_rec + T_pred == target.shape[1]

            if config['data_loader']['args']['dataset_case'] == 'circles_crop':
                rec_cr, pred_cr = [crop_top_left_keepdim(vid[0], 35) for vid in [rec, pred]]
                rec, pred = (rec_cr, target[:, :T_rec]), (pred_cr, target[:, -T_pred:])

            # Save image sample
            if i == 0:
                idx_gt = 1 if gt else 0
                # sample 11 fails to reconstruct
                idx = 21
                # print(rec.shape, pred.shape)
                # print_u = output["u"].reshape(40, 2, -1, 4)[idx,:,-torch.cat(pred, dim=-2).shape[1]:]\
                #     .cpu()
                # print_u = print_u.abs()*255
                # print_im = torch.cat(pred, dim=-2).permute(0,2,3,1,4)[idx,0,:,:]
                print_im = pred[idx_gt].permute(0,2,3,1,4)[idx,0]
                np.save("/home/acomasma/ool-dynamics/dk/image_sample.npy", print_im.cpu().numpy())
                image = im.fromarray(print_im.reshape(print_im.shape[-3], -1).cpu().numpy()*255)
                image = image.convert('RGB')
                image.save("/home/acomasma/ool-dynamics/dk/image_sample.png")

                # u_plot_o1 = im.fromarray(plot_matrix(print_u[0]).permute(1,0).numpy()).convert('RGB')
                # u_plot_o1.save("/home/acomasma/ool-dynamics/dk/input_sample_o1.png")
                #
                # u_plot_o2 = im.fromarray(plot_matrix(print_u[1]).permute(1,0).numpy()).convert('RGB')
                # u_plot_o2.save("/home/acomasma/ool-dynamics/dk/input_sample_o2.png")
                # exit()
                image = im.fromarray(rec[idx_gt].permute(0,2,3,1,4)[idx,0].reshape(64, -1).cpu().numpy()*255)
                image = image.convert('RGB')
                image.save("/home/acomasma/ool-dynamics/dk/image_sample_rec.png")
                exit()

            all_pred.append(pred[0])
            all_rec.append(rec[0])

            for j, (out, tar) in enumerate([rec, pred]):
                for k, metric in enumerate(metric_fns):
                    # TODO: dataset case in metrics
                    total_metrics[j][k] += metric(out, tar) * batch_size

    n_samples = len(data_loader.sampler)
    print('n_samples', n_samples)
    # log = {'loss': total_loss / n_samples}
    log = {}

    print('Timesteps rec and pred:', T_rec, T_pred)
    for j, name in enumerate(['rec', 'pred']):
        log.update({
            met.__name__: total_metrics[j][i].item() / n_samples for i, met in enumerate(metric_fns)
        })
        print(name)
        logger.info(log)
Example #15
    def pred(self, paths, metas, m_cfg, id):
        print('pred')
        self.cfg = m_cfg
        res = Response()
        if len(paths) != len(metas):
            res.code = -2
            res.msg = "The length of images and meta is not same."
            return res
        # if self.pred_th is not None:
        #     if self.pred_th.is_alive():
        #         res.code = -3
        #         res.msg = "There is a task running, please wait it finish."
        #         return res
        try:
            m_typename = m_cfg["name"].split("-")[1]
            if m_typename == "Deeplab" or m_typename == "UNet":
                from .predthread import SegPredThread
                self.device = torch.device(
                    'cuda:0' if self.n_gpu_use > 0 else 'cpu')
                torch.set_grad_enabled(False)
                m_cfg["save_dir"] = str(self.tmp_path)
                config = ConfigParser(m_cfg, Path(m_cfg["path"]))
                self.logger = config.get_logger('PredServer')
                self.model = config.init_obj('arch', module_arch)
                self.logger.info('Loading checkpoint: {} ...'.format(
                    config.resume))
                if self.n_gpu_use > 1:
                    self.model = torch.nn.DataParallel(self.model)
                if self.n_gpu_use > 0:
                    checkpoint = torch.load(config.resume)
                else:
                    checkpoint = torch.load(config.resume,
                                            map_location=torch.device('cpu'))

                state_dict = checkpoint['state_dict']
                self.model.load_state_dict(state_dict)
                self.model = self.model.to(self.device)
                self.model.eval()

                if "crop_size" in config["tester"]:
                    self.crop_size = config["tester"]["crop_size"]

                if 'postprocessor' in config["tester"]:
                    module_name = config["tester"]['postprocessor']['type']
                    module_args = dict(
                        config["tester"]['postprocessor']['args'])
                    self.postprocessor = getattr(postps_crf,
                                                 module_name)(**module_args)

                self.tmp_path.mkdir(parents=True, exist_ok=True)

                self.pred_ths.append(
                    SegPredThread(self, paths, metas, self.tmp_path, id))
            elif m_typename == "CycleGAN":
                from .predthread import CycleGANPredThread
                from model import CycleGANOptions, CycleGANModel
                # config = ConfigParser(m_cfg, Path(m_cfg["path"]))
                opt = CycleGANOptions(**m_cfg["arch"]["args"])
                opt.batch_size = self.batch_size
                opt.serial_batches = True
                opt.no_flip = True  # no flip;
                opt.display_id = -1  # no visdom display; the test code saves the results to a HTML file.
                opt.isTrain = False
                opt.gpu_ids = list(range(self.n_gpu_use))
                opt.checkpoints_dir = str(self.tmp_path)
                opt.preprocess = "none"
                opt.direction = 'AtoB'
                self.model = CycleGANModel(opt)

                orig_save_dir = self.model.save_dir
                self.model.save_dir = ""
                self.model.load_networks(m_cfg["path"])
                self.model.save_dir = orig_save_dir
                torch.set_grad_enabled(False)
                self.model.set_requires_grad(
                    [self.model.netG_A, self.model.netG_B], False)

                self.pred_ths.append(
                    CycleGANPredThread(self, paths, metas, self.tmp_path, id))
            else:
                raise NotImplementedError("Model type:", m_typename,
                                          "is not supported.")

            print('NotifyStartThread')
            self.pred_ths[-1].start()
            # self.pred_th.is_alive()
        except Exception as e:
            res.code = -1
            res.msg = str(e)
            return res

        res.code = 0
        res.msg = "Success"
        return res
Example #16
def entry_point(config: ConfigParser):
    '''
    Entry-point function for a single worker in distributed training.
    A single worker holds (torch.cuda.device_count() / local_world_size) GPUs.
    '''

    local_world_size = config['local_world_size']

    # check distributed environment cfgs
    if config['distributed']:  # distributed gpu mode
        # check gpu available
        if torch.cuda.is_available():
            if torch.cuda.device_count() < local_world_size:
                raise RuntimeError(
                    f'the number of GPUs ({torch.cuda.device_count()}) is less than '
                    f'the number of processes ({local_world_size}) running on each node'
                )
            local_master = (config['local_rank'] == 0)
        else:
            raise RuntimeError(
                'CUDA is not available; distributed training is not supported.'
            )
    else:  # one gpu or cpu mode
        if config['local_world_size'] != 1:
            raise RuntimeError(
                'local_world_size must be set to 1 if distributed is set to false.'
            )
        config.update_config('local_rank', 0)
        local_master = True
        config.update_config('global_rank', 0)

    logger = config.get_logger('train') if local_master else None
    if local_master:
        if config['distributed']:
            logger.info('Distributed GPU training mode start...')
        else:
            logger.info('One GPU or CPU training mode start...')
    # else:
    #     sys.stdin.close()

    # configure whether CUDNN runs deterministically
    if config['deterministic']:
        fix_random_seed_for_reproduce(config['seed'])
        if local_master:
            logger.warning(
                'You have chosen deterministic training. '
                'This will fix the random seed, turn on the CUDNN deterministic setting, '
                'and turn off the CUDNN benchmark, which can slow down your training considerably!')
    else:
        torch.backends.cudnn.benchmark = True
        if local_master:
            logger.warning(
                'You have chosen to benchmark training. '
                'This will turn on the CUDNN benchmark setting, '
                'which can speed up your training considerably! '
                'You may see unexpected behavior when restarting from checkpoints, '
                'because RandomizedMultiLinearMap needs the deterministic setting turned on.')

    if config['distributed']:
        # init process group
        dist.init_process_group(backend='nccl', init_method='env://')
        config.update_config('global_rank', dist.get_rank())
        # log distributed training cfg
        if local_master:
            logger.info(
                f'[Process {os.getpid()}] world_size = {dist.get_world_size()}, '
                f'rank = {dist.get_rank()}, backend={dist.get_backend()}')

    # start train
    main(config, local_master, logger if local_master else None)

    # tear down the process group (only initialized in distributed mode)
    if config['distributed']:
        dist.destroy_process_group()
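
fix_random_seed_for_reproduce is not shown in this snippet; a typical implementation matching what the warning text describes would be (assumed, not from the source):

import random
import numpy as np
import torch

def fix_random_seed_for_reproduce(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True  # the setting the warning refers to
    torch.backends.cudnn.benchmark = False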
Example #17
def main(config: ConfigParser):

    with open('./pytorch_line_token') as f:
        access_token = f.readline().strip()
    bot = LINENotifyBot(access_token=access_token)

    logger = config.get_logger('train')

    # setup data_loader instances
    data_loader = config.initialize('data_loader', module_data)
    valid_data_loader = data_loader.split_validation()

    # build model architecture, then print to console
    model = config.initialize('arch', module_arch)
    logger.info(model)

    # get function handles of loss and metrics
    loss = getattr(module_loss, config['loss'])
    metrics = [getattr(module_metric, met) for met in config['metrics']]

    # build optimizer and learning rate scheduler. Delete every line containing lr_scheduler to disable the scheduler.
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = config.initialize('optimizer', torch.optim, trainable_params)

    lr_scheduler = config.initialize('lr_scheduler', torch.optim.lr_scheduler,
                                     optimizer)

    trainer = Trainer(model,
                      loss,
                      metrics,
                      optimizer,
                      config=config,
                      data_loader=data_loader,
                      valid_data_loader=valid_data_loader,
                      lr_scheduler=lr_scheduler)

    trainer.train()
    logger = config.get_logger('trainer', config['trainer']['verbosity'])
    cfg_trainer = config['trainer']

    # We need to make __enter__() runnable via mlflow.start_run(). Shelved for now.
    # mlflow = MLFlow(config.log_dir, logger, cfg_trainer['mlflow'])

    with mlflow.start_run() as run:
        # Log args into mlflow
        log_params(config.config)

        # Log results into mlflow
        for loss in trainer.train_loss_list:
            mlflow.log_metric('train_loss', loss)
        for loss in trainer.val_loss_list:
            mlflow.log_metric('val_loss', loss)

        # Log other info
        # mlflow.log_param('loss_type', 'CrossEntropy')

        # Log model
        mlflow.pytorch.log_model(model, 'model')

    bot.send(message=f'Training of {config["name"]} finished. @{socket.gethostname()}')
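
log_params is not defined in this example; since mlflow.log_param takes flat key/value pairs, it presumably flattens the nested config dict. A minimal sketch under that assumption:

import mlflow

def log_params(cfg, prefix=''):
    # recursively flatten nested config sections into dotted mlflow keys
    for key, value in cfg.items():
        if isinstance(value, dict):
            log_params(value, prefix=f'{prefix}{key}.')
        else:
            mlflow.log_param(f'{prefix}{key}', value)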
Example #18
        CustomArgs(['--attn_drop'],
                   type=float,
                   target=('arch', 'args', 'attn_drop')),
        CustomArgs(['--hidden_drop'],
                   type=float,
                   target=('arch', 'args', 'hidden_drop')),
        CustomArgs(['--out_drop'],
                   type=float,
                   target=('arch', 'args', 'out_drop')),
    ]
    config = ConfigParser(args, options)
    args = args.parse_args()
    n_trials = args.n_trials

    if n_trials > 0:
        config.get_logger('train').info(f'number of trials: {n_trials}')
        metrics = config['metrics']
        save_file = config.log_dir / 'evaluations.txt'
        fin = open(save_file, 'w')
        fin.write('\t'.join(metrics) + '\n')

        evaluations = []
        for i in range(n_trials):
            config.set_save_dir(i + 1)
            res = main(config)
            evaluations.append(res)
            fin.write('\t'.join([f'{v:.3f}' for v in res]) + '\n')

        fin.close()

        evaluations = np.array(evaluations)
        means = evaluations.mean(axis=0)
        stds = evaluations.std(axis=0)
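
The snippet ends right after the per-metric means and standard deviations are computed; a plausible way to report them, assuming each trial's result vector lines up with the configured metric names (the source is truncated here):

        logger = config.get_logger('train')
        for name, mean, std in zip(metrics, means, stds):
            logger.info(f'{name}: {mean:.3f} +/- {std:.3f}')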
Example #19
def main(config: ConfigParser) -> None:
    """
    Main testing function.

    Parameters
    ----------
    config : parse_config.ConfigParser
        Parsed configuration JSON file.
    """
    logger: Logger = config.get_logger("test")

    # Setup data_loader instance.
    data_loader: DataLoader = getattr(module_data, config["data_loader"]["type"])(
        config["data_loader"]["args"]["data_dir"],
        batch_size=512,
        shuffle=False,
        validation_split=0.0,
        training=False,
        num_workers=2,
    )

    # Build model architecture.
    model: Module = config.initialize("arch", module_arch)
    logger.info(model)

    # Get function handles of loss and metrics as well as args.
    loss_fn: Callable = getattr(module_loss, config["loss"]["type"])
    loss_args: Dict[str, Any] = config["loss"]["args"]
    metric_fns: List[Callable] = [getattr(module_metric, met) for met in config["metrics"]]
    metric_args: List[Dict[str, Any]] = [config["metrics"][met] for met in config["metrics"]]

    logger.info("Loading checkpoint: {} ...".format(config.resume))
    checkpoint: dict = torch.load(config.resume)
    state_dict: dict = checkpoint["state_dict"]
    if config["n_gpu"] > 1:
        model = torch.nn.DataParallel(model)
    model.load_state_dict(state_dict)

    # Prepare model for testing.
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    total_loss: float = 0.0
    total_metrics: Tensor = torch.zeros(len(metric_fns))

    with torch.no_grad():
        i: int
        data: Tensor
        target: Tensor
        for i, (data, target) in enumerate(tqdm(data_loader)):
            data, target = data.to(device), target.to(device)
            output: Tensor = model(data)

            #
            # save sample images, or do something with output here
            #

            # computing loss, metrics on test set
            loss: Tensor = loss_fn(output, target, **loss_args)
            batch_size: int = data.shape[0]
            total_loss += loss.item() * batch_size

            j: int
            metric: Callable
            for j, metric in enumerate(metric_fns):
                total_metrics[j] += metric(output, target, **metric_args[j]) * batch_size

    n_samples: int = len(data_loader.sampler)
    log: Dict[str, Any] = {"loss": total_loss / n_samples}

    met: Callable
    log.update(
        {met.__name__: total_metrics[i].item() / n_samples for i, met in enumerate(metric_fns)}
    )

    logger.info(log)