def test(config):
    """Test point cloud data loader.
  """
    from torch.utils.data import DataLoader
    from lib.utils import Timer
    timer = Timer()
    DatasetClass = StanfordVoxelization2cmDataset
    transformations = [
        t.RandomHorizontalFlip(DatasetClass.ROTATION_AXIS,
                               DatasetClass.IS_TEMPORAL),
        t.ChromaticAutoContrast(),
        t.ChromaticTranslation(config.data_aug_color_trans_ratio),
        t.ChromaticJitter(config.data_aug_color_jitter_std),
        t.HueSaturationTranslation(config.data_aug_hue_max,
                                   config.data_aug_saturation_max),
    ]

    dataset = DatasetClass(config,
                           input_transform=t.Compose(transformations),
                           augment_data=True,
                           cache=True,
                           elastic_distortion=True)

    data_loader = DataLoader(
        dataset=dataset,
        collate_fn=t.cfl_collate_fn_factory(limit_numpoints=False),
        batch_size=4,
        shuffle=True)

    # Iterate over the first 100 batches and time each one.
    data_iter = iter(data_loader)
    for i in range(100):
        timer.tic()
        data = next(data_iter)
        print(timer.toc())
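
Every loader test in this listing leans on the same tic()/toc() stopwatch imported from lib.utils.Timer (later examples also read a .diff field and call toc(False)). A minimal sketch of that interface, assuming toc() returns the running average by default and the raw interval when average=False; note that the Timer in Examples #3 and #6 takes constructor arguments and is evidently a different class with the same name:

import time

class Timer:
    """Sketch of the tic()/toc() stopwatch assumed by these examples."""

    def __init__(self):
        self.total_time, self.calls = 0.0, 0
        self.diff, self.average_time = 0.0, 0.0
        self._start = 0.0

    def tic(self):
        # Mark the start of a measured interval.
        self._start = time.perf_counter()

    def toc(self, average=True):
        # Close the interval; return the running average or the raw interval.
        self.diff = time.perf_counter() - self._start
        self.total_time += self.diff
        self.calls += 1
        self.average_time = self.total_time / self.calls
        return self.average_time if average else self.diff
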
Example #2
    def __init__(self, net, graph, is_training):
        self.net = net
        self.graph = graph
        self.is_training = is_training
        self.num_epochs = cfg.TRAIN.NUM_EPOCHS
        self.train_timer = Timer()
        self.data_timer = Timer()

        self.global_step = slim.get_or_create_global_step()

        # Build basic ops and tensors
        if self.is_training:
            self.net.build_train_ops(self.global_step)

        if isinstance(net, Classifier):
            self.saver = tf.train.Saver(var_list=net.vars_to_restore,
                                        name='saver')
        else:
            self.saver = tf.train.Saver(max_to_keep=None,
                                        name='saver_all_var')  # Save all vars
        self.init_ops = self.build_init_ops()
        self.val_loss_ph = tf.placeholder(tf.float32,
                                          shape=(),
                                          name='val_loss_ph')
        self.net.build_summary_ops(self.graph)
        self.val_loss_summary = tf.summary.scalar(name='val_loss',
                                                  tensor=self.val_loss_ph)

        print('saver variables:')
        print_tensor_shapes(net.vars_to_restore, prefix='-->')
Example #3
def train(lr=0.001, progress_bar=False, fig_dir='./figs', prefix='NET'):
    loss_all, DCS, P, A, R, loss_epoch = trainer.load_checkpoint(prefix=prefix)
    timer = Timer(args.checkpoint_timer)
    for e in trainer.get_range(EPOCHS):
        if not DEEPVESSEL:
            model.train()
        print('len(loss_all):', len(loss_all), 'type(loss_all):', type(loss_all))

        loss = trainer.train_epoch(lr=lr, progress_bar=progress_bar)
        mean_loss = np.array(loss).mean()
        loss_epoch.append(mean_loss)
        print('EPOCH ', e, 'loss epoch', mean_loss)
        print('len(loss_all):', len(loss_all), 'type(loss_all):', type(loss_all))
        print('len(loss):', len(loss), 'type(loss):', type(loss))
        loss_all = loss_all + loss
        if DEEPVESSEL:
            print('Evaluation Epoch {}/{}...'.format(e, EPOCHS))
            DCS.append(evaluator.DCM(model, progress_bar=progress_bar))
            a, p, r = evaluator.bin_scores(model, progress_bar=progress_bar)
            P.append(p)
            A.append(a)
            R.append(r)
            print('DCS score:', DCS[-1], 'accuracy ', a, 'precision', p,
                  'recall', r)
        else:
            with torch.no_grad():
                print('Evaluation Epoch {}/{}...'.format(e, EPOCHS))
                model.eval()
                DCS.append(evaluator.DCM(model, progress_bar=progress_bar))
                a, p, r = evaluator.bin_scores(model,
                                               progress_bar=progress_bar)
                P.append(p)
                A.append(a)
                R.append(r)
                print('DCS score:', DCS[-1], 'accuracy ', a, 'precision', p,
                      'recall', r)
        if timer.is_time():
            measurements = np.array([DCS, P, A, R, loss_epoch])
            trainer.save_model(MODEL_PATH)
            trainer.save_checkpoint(np.array(loss_all), measurements, prefix,
                                    lr, args.dataset, e, EPOCHS, fig_dir,
                                    args.upload)
    loss_all = np.array(loss_all)
    measurements = np.array([DCS, P, A, R, loss_epoch])
    trainer.save_model(MODEL_PATH)
    trainer.save_checkpoint(loss_all, measurements, prefix, lr, args.dataset,
                            EPOCHS, EPOCHS, fig_dir, args.upload)
Example #4
File: drQt.py Project: DrQueue/drQt
    def __init__(self,parent=None):
        super(drQt,self).__init__(parent=parent)
        
        try:
            lib.utils.get_all_jobs()
        except Exception:
            # Raising a string is invalid in Python; raise a real exception.
            raise RuntimeError("NO MASTER FOUND")
        
        self.setupUi(self)
        self._timer_ = Timer(parent=self)
        self.timer_interrupt = 0
        self.jobs_tab_list = []
        self.nodes_tab_list = []
        self._selected_job_row = None
        
        self.setup_main()
        self.PB_refresh.clicked.connect(self.refresh)
        self.CB_auto_refresh.stateChanged.connect(self.set_autorefresh)
        self.connect(self._timer_,QtCore.SIGNAL("time_elapsed"),self.refresh)
        
        self.SB_refresh_time.setMinimum(1)
        self.SB_refresh_time.setValue(2)

        self.setWindowFlags(QtCore.Qt.Window |
                            QtCore.Qt.WindowMinimizeButtonHint | 
                            QtCore.Qt.WindowCloseButtonHint | 
                            QtCore.Qt.WindowMaximizeButtonHint)   
             
        self.LB_header.setPixmap(QtGui.QPixmap(os.path.join(icons_path, "drQHeader.png")))   
        self.connect(self.TW_job,QtCore.SIGNAL("cellClicked(int,int)"), self._store_selected_job)
        self.connect(self.TW_job,QtCore.SIGNAL("customContextMenuRequested(QPoint)"), self._create_context)
Example #5
def test(config, intensity=False):
    """Test point cloud data loader.
  """
    from torch.utils.data import DataLoader
    from lib.utils import Timer
    import open3d as o3d

    def make_pcd(coords, feats):
        pcd = o3d.geometry.PointCloud()
        pcd.points = o3d.utility.Vector3dVector(coords[:, :3].float().numpy())
        pcd.colors = o3d.utility.Vector3dVector(feats[:, :3].numpy() / 255)
        if intensity:
            # feats[:, 3:3] was an empty slice; take the channels after RGB.
            pcd.intensities = o3d.utility.Vector3dVector(feats[:, 3:].numpy())
        return pcd

    timer = Timer()
    DatasetClass = FacilityArea5Dataset
    transformations = [
        t.RandomHorizontalFlip(DatasetClass.ROTATION_AXIS,
                               DatasetClass.IS_TEMPORAL),
        t.ChromaticAutoContrast(),
        t.ChromaticTranslation(config.data_aug_color_trans_ratio),
        t.ChromaticJitter(config.data_aug_color_jitter_std),
    ]

    dataset = DatasetClass(config,
                           prevoxel_transform=t.ElasticDistortion(
                               DatasetClass.ELASTIC_DISTORT_PARAMS),
                           input_transform=t.Compose(transformations),
                           augment_data=True,
                           cache=True,
                           elastic_distortion=True)

    data_loader = DataLoader(
        dataset=dataset,
        collate_fn=t.cfl_collate_fn_factory(limit_numpoints=False),
        batch_size=1,
        shuffle=True)

    # Iterate over the first 100 batches, visualizing each one.
    data_iter = iter(data_loader)
    for i in range(100):
        timer.tic()
        coords, feats, labels = next(data_iter)
        pcd = make_pcd(coords, feats)
        o3d.visualization.draw_geometries([pcd])
        print(timer.toc())
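
A caveat on the intensity branch above: the stock open3d.geometry.PointCloud exposes only points, colors, and normals, so pcd.intensities presumes a customized build. A hedged workaround, assuming column 3 of feats holds a scalar intensity, is to render it as a grayscale color instead:

import numpy as np
import open3d as o3d

def make_intensity_pcd(coords, feats):
    # Sketch only: fold a scalar intensity channel into the color field,
    # since the standard PointCloud type has no intensity attribute.
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(coords[:, :3].float().numpy())
    intens = feats[:, 3:4].numpy()  # assumed intensity column
    gray = np.repeat(intens / max(float(intens.max()), 1e-6), 3, axis=1)
    pcd.colors = o3d.utility.Vector3dVector(gray)
    return pcd
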
Example #6
    def __init__(self, config):
        self._config = config
        self._train_config = config['train']
        self._model_config = config['model']
        self._data_config = config['data']

        # the folders for model and tensor board
        self._training_folder = None
        self._model_folder = None
        self._tfb_folder = None

        # build model
        self._model = BaseModel.generate_model_from_config(self._model_config)
        self._model_saver = tf.train.Saver(max_to_keep=0)

        # other setting
        self._timer = Timer('m')
Example #7
    def test(self):
        from torch.utils.data import DataLoader
        from lib.utils import Timer
        from config import get_config
        config = get_config()

        dataset = SynthiaVoxelizationDataset(config)
        timer = Timer()

        data_loader = DataLoader(
            dataset=dataset,
            collate_fn=cfl_collate_fn_factory(limit_numpoints=False),
            num_workers=0,
            batch_size=4,
            shuffle=True)

        # Start from index 1
        # for i, batch in enumerate(data_loader, 1):
        data_iter = iter(data_loader)
        for i in range(100):
            timer.tic()
            batch = next(data_iter)
            print(batch, timer.toc())
Example #8
    def __init__(self, log_dir, logger, enabled):
        self.writer = None
        self.selected_module = ""

        if enabled:
            log_dir = str(log_dir)

            # Retrieve vizualization writer.
            succeeded = False
            for module in ["torch.utils.tensorboard", "tensorboardX"]:
                try:
                    self.writer = importlib.import_module(
                        module).SummaryWriter(log_dir)
                    succeeded = True
                    break
                except ImportError:
                    succeeded = False
                self.selected_module = module

            if not succeeded:
                message = "Warning: visualization (Tensorboard) is configured to use, but currently not installed on " \
                    "this machine. Please install either TensorboardX with 'pip install tensorboardx', upgrade " \
                    "PyTorch to version >= 1.1 for using 'torch.utils.tensorboard' or turn off the option in " \
                    "the 'config.json' file."
                logger.warning(message)

        self.step = 0
        self.mode = ''

        self.tb_writer_ftns = {
            'add_scalar', 'add_scalars', 'add_image', 'add_images',
            'add_audio', 'add_text', 'add_histogram', 'add_pr_curve',
            'add_embedding'
        }
        self.tag_mode_exceptions = {'add_histogram', 'add_embedding'}

        self.timer = Timer()
Example #9
File: drQt.py Project: DrQueue/drQt
class drQt(drQtUI.Ui_MainWindow,QtGui.QMainWindow):
    node_properties=["Id", "Hostname", "Arch", "OS", "Nbits", "Procspeed", "CPUs", "Cores each CPU", "Memory", "Load average", "Pools"]
    job_properties=["Id", "Name", "Owner", "Status", "Total tasks", "Tasks left", "Done", "Pool"]
    
    def __init__(self,parent=None):
        super(drQt,self).__init__(parent=parent)
        
        try:
            lib.utils.get_all_jobs()
        except Exception:
            # Raising a string is invalid in Python; raise a real exception.
            raise RuntimeError("NO MASTER FOUND")
        
        self.setupUi(self)
        self._timer_ = Timer(parent=self)
        self.timer_interrupt = 0
        self.jobs_tab_list = []
        self.nodes_tab_list = []
        self._selected_job_row = None
        
        self.setup_main()
        self.PB_refresh.clicked.connect(self.refresh)
        self.CB_auto_refresh.stateChanged.connect(self.set_autorefresh)
        self.connect(self._timer_,QtCore.SIGNAL("time_elapsed"),self.refresh)
        
        self.SB_refresh_time.setMinimum(1)
        self.SB_refresh_time.setValue(2)

        self.setWindowFlags(QtCore.Qt.Window |
                            QtCore.Qt.WindowMinimizeButtonHint | 
                            QtCore.Qt.WindowCloseButtonHint | 
                            QtCore.Qt.WindowMaximizeButtonHint)   
             
        self.LB_header.setPixmap(QtGui.QPixmap(os.path.join(icons_path, "drQHeader.png")))   
        self.connect(self.TW_job,QtCore.SIGNAL("cellClicked(int,int)"), self._store_selected_job)
        self.connect(self.TW_job,QtCore.SIGNAL("customContextMenuRequested(QPoint)"), self._create_context)
        
    def _raise_new_job(self):
        log.debug("start new job")
        newjobD = SendJob(self)
        newjobD.show()    
                    
    def _raise_about(self):
        aboutD = AboutDialog(self)
        aboutD.show()   
        
    def _store_selected_job(self, row,column):
        self._selected_job_row = row
            
    def setup_menu_bar(self):
        menu_bar = self.menuBar()
        job_bar = menu_bar.addMenu("&Job")
        new_job = QtGui.QAction("&New Job",self)
        self.connect(new_job, QtCore.SIGNAL('triggered()'), self._raise_new_job)
        self.connect(self.NewJobButton, QtCore.SIGNAL('clicked()'), self._raise_new_job)
        job_bar.addAction(new_job)
        
        help_bar = menu_bar.addMenu("&Help")
        
        About = QtGui.QAction("&About",self)
        self.connect(About, QtCore.SIGNAL('triggered()'), self._raise_about)
        help_bar.addAction(About)

        
    def setup_main(self):
        self.setWindowTitle("DrQueue Manager")
        self.resize(1000,600)
        
        self.setup_menu_bar()
        self.set_main_icons()
        
        self.setup_jobs()
        self.init_jobs_tabs()
        
        self.setup_slaves()
        self.init_slaves_tabs()

        
    def setup_slaves(self):
        self.TW_node.clear()
        
        self.TW_node.setColumnCount(len(self.node_properties))
        self.TW_node.setHorizontalHeaderLabels(self.node_properties) 
        
        self.TW_node.verticalHeader().hide()
        self.TW_node.setAlternatingRowColors(True)
        self.TW_node.setSelectionBehavior(QtGui.QTableView.SelectRows)
        self.TW_node.setSelectionMode(QtGui.QTableView.SingleSelection) 
                               
    def setup_jobs(self):
        self.TW_job.clear()

        self.TW_job.setColumnCount(len(self.job_properties))
        self.TW_job.setHorizontalHeaderLabels(self.job_properties) 

        self.TW_job.verticalHeader().hide()
        self.TW_job.setAlternatingRowColors(True)
        self.TW_job.setSelectionBehavior(QtGui.QTableView.SelectRows)
        self.TW_job.setSelectionMode(QtGui.QTableView.SingleSelection)

    def refresh(self):
        self.setCursor(QtCore.Qt.WaitCursor)
        self.init_jobs_tabs()
        self.init_slaves_tabs()

        self.TW_job.repaint()
        self.TW_node.repaint()

        if self._selected_job_row is not None:
            log.debug("restore row selection %s" % self._selected_job_row)
            self.TW_job.setCurrentCell(self._selected_job_row, 0)

        self.setCursor(QtCore.Qt.ArrowCursor)
        
    def set_autorefresh(self,status):
        if status:
            log.debug("autorefresh:ON")
            refresh_time = self.SB_refresh_time.value()
            self._timer_.set_run_time(refresh_time)
            self._timer_.start()
        else:
            log.debug("autorefresh:OFF")
            self._timer_.terminate()

    def init_jobs_tabs(self):
        self.jobs_tab_list=[]
        log.debug("building job tabs...")
        self.TW_job.clearContents()
        jobs = lib.utils.get_all_jobs()
        num_jobs = len(jobs)
        log.debug("num jobs %s" % num_jobs)
        self.TW_job.setRowCount(num_jobs)        

        for i in range(num_jobs):
            job_tab = JobTab(jobs[i],parent = self.TW_job)
            job_tab.add_to_table(self.TW_job, i)
            self.connect(job_tab, QtCore.SIGNAL('update'), self.refresh)
            self.jobs_tab_list.append(job_tab)
        
    def init_slaves_tabs(self):
        self.nodes_tab_list = []
        log.debug("building nodes tabs...")
        nodes = lib.utils.get_all_slaves()
        num_nodes = len(nodes)
        self.TW_node.setRowCount(num_nodes)
        for i in range(num_nodes):
            log.debug("create slave Node Tab : %s" % type(nodes[i]))
            node_tab = SlaveNodeTab(nodes[i],parent = self.TW_node)
            node_tab.add_to_table(self.TW_node, i)
            self.connect(node_tab, QtCore.SIGNAL('update'), self.refresh)
            self.nodes_tab_list.append(node_tab)
            
    def set_main_icons(self):
        self.setWindowIcon(QtGui.QIcon(os.path.join(icons_path, "main.svg")))
        self.TW_main.setTabIcon(0,QtGui.QIcon(os.path.join(icons_path, "job.svg")))
        self.TW_main.setTabIcon(1,QtGui.QIcon(os.path.join(icons_path, "network-transmit-receive.svg")))
        
    
    def _create_context(self, QPoint):
        """ create the context menu """
        newAct = QtGui.QAction("&New Job",self)
        newAct.setToolTip("createa new job")
        self.connect(newAct, QtCore.SIGNAL('triggered()'), self._new_job_show)  
        
        menu = QtGui.QMenu("Menu", self)
        menu.addAction(newAct)
        menu.exec_(QtGui.QCursor.pos())  
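
Both drQt snippets wire their callbacks with old-style PyQt4 connections through QtCore.SIGNAL. For reference, a sketch of the new-style equivalents; the built-in widget signals below are standard Qt, while the time_elapsed declaration is an assumption about the custom Timer class:

from PyQt4 import QtCore

class RefreshTimer(QtCore.QThread):
    # Hypothetical new-style declaration matching QtCore.SIGNAL("time_elapsed").
    time_elapsed = QtCore.pyqtSignal()

# Inside drQt.__init__, the old-style connects would then read:
#     self._timer_.time_elapsed.connect(self.refresh)
#     self.TW_job.cellClicked.connect(self._store_selected_job)
#     self.TW_job.customContextMenuRequested.connect(self._create_context)
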
Example #10
    def train(self, train_loader, val_loader=None):
        ''' Given data queues, train the network '''
        # Parameter directory
        save_dir = os.path.join(cfg.DIR.OUT_PATH)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Timer for the training op and parallel data loading op.
        train_timer = Timer()
        data_timer = Timer()
        training_losses = []

        # Setup learning rates
        lr_steps = [int(k) for k in cfg.TRAIN.LEARNING_RATES.keys()]

        #Setup the lr_scheduler
        self.lr_scheduler = lr_scheduler.MultiStepLR(self.optimizer,
                                                     lr_steps,
                                                     gamma=0.1)

        start_iter = 0
        # Resume training
        if cfg.TRAIN.RESUME_TRAIN:
            self.load(cfg.CONST.WEIGHTS)
            start_iter = cfg.TRAIN.INITIAL_ITERATION

        # Main training loop
        train_loader_iter = iter(train_loader)
        for train_ind in range(start_iter, cfg.TRAIN.NUM_ITERATION + 1):
            self.lr_scheduler.step()

            data_timer.tic()
            try:
                batch_img, batch_voxel = next(train_loader_iter)
            except StopIteration:
                # Restart the iterator once the loader is exhausted.
                train_loader_iter = iter(train_loader)
                batch_img, batch_voxel = next(train_loader_iter)

            data_timer.toc()

            if self.net.is_x_tensor4:
                batch_img = batch_img[0]

            # Apply one gradient step
            train_timer.tic()
            loss = self.train_loss(batch_img, batch_voxel)
            train_timer.toc()

            training_losses.append(loss.item())

            # Decrease learning rate at certain points
            if train_ind in lr_steps:
                #for pytorch optimizer, learning rate can only be set when the optimizer is created
                #or using torch.optim.lr_scheduler
                print('Learning rate decreased to %f: ' %
                      cfg.TRAIN.LEARNING_RATES[str(train_ind)])

            # Debugging modules
            #
            # Print status, run validation, check divergence, and save model.
            if train_ind % cfg.TRAIN.PRINT_FREQ == 0:
                # Print the current loss
                print('%s Iter: %d Loss: %f' %
                      (datetime.now(), train_ind, loss))

            if train_ind % cfg.TRAIN.VALIDATION_FREQ == 0 and val_loader is not None:
                # Print test loss and params to check convergence every N iterations

                val_losses = 0
                val_num_iter = min(cfg.TRAIN.NUM_VALIDATION_ITERATIONS,
                                   len(val_loader))
                val_loader_iter = iter(val_loader)
                for i in range(val_num_iter):
                    batch_img, batch_voxel = next(val_loader_iter)
                    val_loss = self.train_loss(batch_img, batch_voxel)
                    val_losses += val_loss
                val_losses_mean = val_losses / val_num_iter
                print('%s Test loss: %f' % (datetime.now(), val_losses_mean))

            if train_ind % cfg.TRAIN.NAN_CHECK_FREQ == 0:
                # Check that the network parameters are all valid
                nan_or_max_param = max_or_nan(self.net.parameters())
                if has_nan(nan_or_max_param):
                    print('NAN detected')
                    break

            if (train_ind % cfg.TRAIN.SAVE_FREQ == 0 and not train_ind == 0) or \
                    (train_ind == cfg.TRAIN.NUM_ITERATION):
                # Save the checkpoint every a few iterations or at the end.
                self.save(training_losses, save_dir, train_ind)

            #loss is a Variable containing torch.FloatTensor of size 1
            if loss.item() > cfg.TRAIN.LOSS_LIMIT:
                print("Cost exceeds the threshold. Stop training")
                break
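
The divergence check above relies on max_or_nan and has_nan from the project's utilities. A minimal sketch of what they plausibly do, assuming max_or_nan reports the largest absolute entry per parameter (a NaN entry propagates through torch.max) and has_nan flags any NaN:

import torch

def max_or_nan(parameters):
    # Sketch: per-parameter max magnitude; a NaN entry makes the max NaN.
    vals = [p.detach().abs().max() for p in parameters]
    return torch.stack(vals)

def has_nan(tensor):
    # Sketch: True if any entry is NaN.
    return bool(torch.isnan(tensor).any())
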
Example #11
def train(model, data_loader, val_data_loader, config, transform_data_fn=None):

    device = config.device_id
    distributed = get_world_size() > 1

    # Set up the train flag for batch normalization
    model.train()

    # Configuration
    writer = SummaryWriter(log_dir=config.log_dir)
    data_timer, iter_timer = Timer(), Timer()
    fw_timer, bw_timer, ddp_timer = Timer(), Timer(), Timer()

    data_time_avg, iter_time_avg = AverageMeter(), AverageMeter()
    fw_time_avg, bw_time_avg, ddp_time_avg = (AverageMeter(), AverageMeter(),
                                              AverageMeter())

    losses, scores = AverageMeter(), AverageMeter()

    optimizer = initialize_optimizer(model.parameters(), config)
    scheduler = initialize_scheduler(optimizer, config)
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)

    # Train the network
    logging.info('===> Start training on {} GPUs, batch-size={}'.format(
        get_world_size(), config.batch_size * get_world_size()))
    best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True

    if config.resume:
        checkpoint_fn = config.resume + '/weights.pth'
        if osp.isfile(checkpoint_fn):
            logging.info("=> loading checkpoint '{}'".format(checkpoint_fn))
            state = torch.load(
                checkpoint_fn,
                map_location=lambda s, l: default_restore_location(s, 'cpu'))
            curr_iter = state['iteration'] + 1
            epoch = state['epoch']
            load_state(model, state['state_dict'])

            if config.resume_optimizer:
                scheduler = initialize_scheduler(optimizer,
                                                 config,
                                                 last_step=curr_iter)
                optimizer.load_state_dict(state['optimizer'])
            if 'best_val' in state:
                best_val_miou = state['best_val']
                best_val_iter = state['best_val_iter']
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_fn, state['epoch']))
        else:
            raise ValueError(
                "=> no checkpoint found at '{}'".format(checkpoint_fn))

    data_iter = iter(data_loader)  # (distributed) infinite sampler
    while is_training:
        for iteration in range(len(data_loader) // config.iter_size):
            optimizer.zero_grad()
            data_time, batch_loss, batch_score = 0, 0, 0
            iter_timer.tic()

            # Set the random seed every iteration so runs are reproducible
            _set_seed(config, curr_iter)

            for sub_iter in range(config.iter_size):
                # Get training data
                data_timer.tic()
                coords, input, target = next(data_iter)

                # For some networks, making the network invariant to even, odd coords is important
                coords[:, :3] += (torch.rand(3) * 100).type_as(coords)

                # Preprocess input
                color = input[:, :3].int()
                if config.normalize_color:
                    input[:, :3] = input[:, :3] / 255. - 0.5
                sinput = SparseTensor(input, coords).to(device)

                data_time += data_timer.toc(False)

                # Feed forward
                fw_timer.tic()

                inputs = (sinput, ) if config.wrapper_type == 'None' else (
                    sinput, coords, color)
                # model.initialize_coords(*init_args)
                soutput = model(*inputs)
                # The output of the network is not sorted
                target = target.long().to(device)

                loss = criterion(soutput.F, target.long())

                # Compute and accumulate gradient
                loss /= config.iter_size

                pred = get_prediction(data_loader.dataset, soutput.F, target)
                score = precision_at_one(pred, target)

                fw_timer.toc(False)
                bw_timer.tic()

                # bp the loss
                loss.backward()

                bw_timer.toc(False)

                # gather information
                logging_output = {
                    'loss': loss.item(),
                    'score': score / config.iter_size
                }

                ddp_timer.tic()
                if distributed:
                    logging_output = all_gather_list(logging_output)
                    logging_output = {
                        w: np.mean([a[w] for a in logging_output])
                        for w in logging_output[0]
                    }

                batch_loss += logging_output['loss']
                batch_score += logging_output['score']
                ddp_timer.toc(False)

            # Update number of steps
            optimizer.step()
            scheduler.step()

            data_time_avg.update(data_time)
            iter_time_avg.update(iter_timer.toc(False))
            fw_time_avg.update(fw_timer.diff)
            bw_time_avg.update(bw_timer.diff)
            ddp_time_avg.update(ddp_timer.diff)

            losses.update(batch_loss, target.size(0))
            scores.update(batch_score, target.size(0))

            if curr_iter >= config.max_iter:
                is_training = False
                break

            if curr_iter % config.stat_freq == 0 or curr_iter == 1:
                lrs = ', '.join(
                    ['{:.3e}'.format(x) for x in scheduler.get_lr()])
                debug_str = "===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format(
                    epoch, curr_iter,
                    len(data_loader) // config.iter_size, losses.avg, lrs)
                debug_str += "Score {:.3f}\tData time: {:.4f}, Forward time: {:.4f}, Backward time: {:.4f}, DDP time: {:.4f}, Total iter time: {:.4f}".format(
                    scores.avg, data_time_avg.avg, fw_time_avg.avg,
                    bw_time_avg.avg, ddp_time_avg.avg, iter_time_avg.avg)
                logging.info(debug_str)
                # Reset timers
                data_time_avg.reset()
                iter_time_avg.reset()
                # Write logs
                writer.add_scalar('training/loss', losses.avg, curr_iter)
                writer.add_scalar('training/precision_at_1', scores.avg,
                                  curr_iter)
                writer.add_scalar('training/learning_rate',
                                  scheduler.get_lr()[0], curr_iter)
                losses.reset()
                scores.reset()

            # Save current status; save before val to prevent occasional mem overflow
            if curr_iter % config.save_freq == 0:
                checkpoint(model, optimizer, epoch, curr_iter, config,
                           best_val_miou, best_val_iter)

            # Validation
            if curr_iter % config.val_freq == 0:
                val_miou = validate(model, val_data_loader, writer, curr_iter,
                                    config, transform_data_fn)
                if val_miou > best_val_miou:
                    best_val_miou = val_miou
                    best_val_iter = curr_iter
                    checkpoint(model, optimizer, epoch, curr_iter, config,
                               best_val_miou, best_val_iter, "best_val")
                logging.info("Current best mIoU: {:.3f} at iter {}".format(
                    best_val_miou, best_val_iter))

                # Recover back
                model.train()

            if curr_iter % config.empty_cache_freq == 0:
                # Clear cache
                torch.cuda.empty_cache()

            # End of iteration
            curr_iter += 1

        epoch += 1

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    # Save the final model
    checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
               best_val_iter)
    val_miou = validate(model, val_data_loader, writer, curr_iter, config,
                        transform_data_fn)
    if val_miou > best_val_miou:
        best_val_miou = val_miou
        best_val_iter = curr_iter
        checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
                   best_val_iter, "best_val")
    logging.info("Current best mIoU: {:.3f} at iter {}".format(
        best_val_miou, best_val_iter))
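
Several of these loops aggregate statistics through an AverageMeter. A minimal sketch of the conventional interface, assuming the usual val/avg/sum/count fields:

class AverageMeter:
    """Sketch of the standard running-average helper used above."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val, self.avg, self.sum, self.count = 0.0, 0.0, 0.0, 0

    def update(self, val, n=1):
        # Record a value observed for n samples and refresh the average.
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
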
Example #12
class TensorboardWriter():
    def __init__(self, log_dir, logger, enabled):
        self.writer = None
        self.selected_module = ""

        if enabled:
            log_dir = str(log_dir)

            # Retrieve vizualization writer.
            succeeded = False
            for module in ["torch.utils.tensorboard", "tensorboardX"]:
                try:
                    self.writer = importlib.import_module(
                        module).SummaryWriter(log_dir)
                    succeeded = True
                    break
                except ImportError:
                    succeeded = False
                self.selected_module = module

            if not succeeded:
                message = "Warning: visualization (Tensorboard) is configured to use, but currently not installed on " \
                    "this machine. Please install either TensorboardX with 'pip install tensorboardx', upgrade " \
                    "PyTorch to version >= 1.1 for using 'torch.utils.tensorboard' or turn off the option in " \
                    "the 'config.json' file."
                logger.warning(message)

        self.step = 0
        self.mode = ''

        self.tb_writer_ftns = {
            'add_scalar', 'add_scalars', 'add_image', 'add_images',
            'add_audio', 'add_text', 'add_histogram', 'add_pr_curve',
            'add_embedding'
        }
        self.tag_mode_exceptions = {'add_histogram', 'add_embedding'}

        self.timer = Timer()

    def set_step(self, step, mode='train'):
        self.mode = mode
        self.step = step
        if step == 0:
            self.timer.reset()
        else:
            duration = self.timer.check()
            self.add_scalar('steps_per_sec', 1 / duration)

    def __getattr__(self, name):
        """
        If visualization is configured to use:
            return add_data() methods of tensorboard with additional information (step, tag) added.
        Otherwise:
            return a blank function handle that does nothing
        """
        if name in self.tb_writer_ftns:
            add_data = getattr(self.writer, name, None)

            def wrapper(tag, data, *args, **kwargs):
                if add_data is not None:
                    # add mode(train/valid) tag
                    if name not in self.tag_mode_exceptions:
                        tag = '{}/{}'.format(tag, self.mode)
                    add_data(tag, data, self.step, *args, **kwargs)

            return wrapper
        else:
            # Default action: normal attribute lookup for methods defined in
            # this class, e.g. set_step().
            try:
                attr = object.__getattribute__(self, name)
            except AttributeError:
                raise AttributeError(
                    "type object '{}' has no attribute '{}'".format(
                        self.selected_module, name))
            return attr
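
A hedged usage sketch of this wrapper; logger, train_loader, and compute_loss stand in for objects from the surrounding training code, and the tagging behavior follows the __getattr__ wrapper above:

# Hypothetical driver code, not a published API.
writer = TensorboardWriter(log_dir='saved/log', logger=logger, enabled=True)
for step, (data, target) in enumerate(train_loader):
    loss = compute_loss(data, target)     # assumed training step
    writer.set_step(step, mode='train')   # also logs steps_per_sec
    writer.add_scalar('loss', loss)       # written under tag 'loss/train'
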
Example #13
def test(model,
         data_loader,
         config,
         transform_data_fn=None,
         has_gt=True,
         validation=None,
         epoch=None):
    device = get_torch_device(config.is_cuda)
    dataset = data_loader.dataset
    num_labels = dataset.NUM_LABELS
    global_timer, data_timer, iter_timer = Timer(), Timer(), Timer()
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)
    alpha, gamma, eps = 1, 2, 1e-6  # Focal Loss parameters
    losses, scores, ious = AverageMeter(), AverageMeter(), 0
    aps = np.zeros((0, num_labels))
    hist = np.zeros((num_labels, num_labels))

    if not config.is_train:
        checkpoint_fn = config.resume + '/weights.pth'
        if osp.isfile(checkpoint_fn):
            logging.info("=> loading checkpoint '{}'".format(checkpoint_fn))
            state = torch.load(checkpoint_fn)
            model.load_state_dict(state['state_dict'])
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_fn, state['epoch']))
        else:
            raise ValueError(
                "=> no checkpoint found at '{}'".format(checkpoint_fn))
    if validation:
        logging.info('===> Start validating')
    else:
        logging.info('===> Start testing')

    global_timer.tic()
    data_iter = iter(data_loader)
    max_iter = len(data_loader)
    max_iter_unique = max_iter

    all_preds, all_labels, batch_losses, batch_loss = [], [], {}, 0

    # Fix batch normalization running mean and std
    model.eval()

    # Clear cache (when run in val mode, cleanup training cache)
    torch.cuda.empty_cache()

    if config.save_prediction or config.test_original_pointcloud:
        if config.save_prediction:
            save_pred_dir = config.save_pred_dir
            os.makedirs(save_pred_dir, exist_ok=True)
        else:
            save_pred_dir = tempfile.mkdtemp()
        if os.listdir(save_pred_dir):
            raise ValueError(f'Directory {save_pred_dir} not empty. '
                             'Please remove the existing prediction.')

    with torch.no_grad():
        for iteration in range(max_iter):
            data_timer.tic()
            if config.return_transformation:
                coords, input, target, transformation = next(data_iter)
            else:
                coords, input, target = next(data_iter)
                transformation = None
            data_time = data_timer.toc(False)

            # Preprocess input
            iter_timer.tic()

            if config.wrapper_type != 'None':
                color = input[:, :3].int()
            if config.normalize_color:
                input[:, :3] = input[:, :3] / 255. - 0.5
            sinput = SparseTensor(input, coords).to(device)

            # Feed forward
            inputs = (sinput, ) if config.wrapper_type == 'None' else (sinput,
                                                                       coords,
                                                                       color)
            soutput = model(*inputs)
            output = soutput.F

            pred = get_prediction(dataset, output, target).int()
            iter_time = iter_timer.toc(False)

            all_preds.append(pred.cpu().detach().numpy())
            all_labels.append(target.cpu().detach().numpy())

            if config.save_prediction or config.test_original_pointcloud:
                save_predictions(coords, pred, transformation, dataset, config,
                                 iteration, save_pred_dir)

            if has_gt:
                if config.evaluate_original_pointcloud:
                    raise NotImplementedError('pointcloud')
                    output, pred, target = permute_pointcloud(
                        coords, pointcloud, transformation, dataset.label_map,
                        output, pred)

                target_np = target.numpy()
                num_sample = target_np.shape[0]
                target = target.to(device)
                """# focal loss
        input_soft = nn.functional.softmax(output, dim=1) + eps
        focal_weight = torch.pow(-input_soft + 1., gamma)
        loss = (-alpha * focal_weight * torch.log(input_soft)).mean()"""

                loss = criterion(output, target.long())

                batch_loss += loss.item()

                losses.update(float(loss), num_sample)
                scores.update(precision_at_one(pred, target), num_sample)
                hist += fast_hist(pred.cpu().numpy().flatten(),
                                  target_np.flatten(), num_labels)
                ious = per_class_iu(hist) * 100

                prob = torch.nn.functional.softmax(output, dim=1)
                ap = average_precision(prob.cpu().detach().numpy(), target_np)
                aps = np.vstack((aps, ap))
                # Due to heavy class imbalance, some classes have no test labels at all
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", category=RuntimeWarning)
                    ap_class = np.nanmean(aps, 0) * 100.

            if iteration % config.test_stat_freq == 0 and iteration > 0:
                preds = np.concatenate(all_preds)
                targets = np.concatenate(all_labels)
                to_ignore = [
                    i for i in range(len(targets)) if targets[i] == 255
                ]
                preds_trunc = [
                    preds[i] for i in range(len(preds)) if i not in to_ignore
                ]
                targets_trunc = [
                    targets[i] for i in range(len(targets))
                    if i not in to_ignore
                ]
                cm = confusion_matrix(targets_trunc,
                                      preds_trunc,
                                      normalize='true')
                np.savetxt(config.log_dir + '/cm_epoch_{0}.txt'.format(epoch),
                           cm)

                reordered_ious = dataset.reorder_result(ious)
                reordered_ap_class = dataset.reorder_result(ap_class)
                class_names = dataset.get_classnames()
                print_info(iteration,
                           max_iter_unique,
                           data_time,
                           iter_time,
                           has_gt,
                           losses,
                           scores,
                           reordered_ious,
                           hist,
                           reordered_ap_class,
                           class_names=class_names)

            if iteration % config.empty_cache_freq == 0:
                # Clear cache
                torch.cuda.empty_cache()

            batch_losses[epoch] = batch_loss

    global_time = global_timer.toc(False)

    reordered_ious = dataset.reorder_result(ious)
    reordered_ap_class = dataset.reorder_result(ap_class)
    class_names = dataset.get_classnames()
    print_info(iteration,
               max_iter_unique,
               data_time,
               iter_time,
               has_gt,
               losses,
               scores,
               reordered_ious,
               hist,
               reordered_ap_class,
               class_names=class_names)

    if not config.is_train:
        preds = np.concatenate(all_preds)
        targets = np.concatenate(all_labels)
        to_ignore = [i for i in range(len(targets)) if targets[i] == 255]
        preds_trunc = [
            preds[i] for i in range(len(preds)) if i not in to_ignore
        ]
        targets_trunc = [
            targets[i] for i in range(len(targets)) if i not in to_ignore
        ]
        cm = confusion_matrix(targets_trunc, preds_trunc, normalize='true')
        np.savetxt(config.log_dir + '/cm.txt', cm)

    if config.test_original_pointcloud:
        logging.info('===> Start testing on original pointcloud space.')
        dataset.test_pointcloud(save_pred_dir)
    logging.info("Finished test. Elapsed time: {:.4f}".format(global_time))

    if validation:
        loss_file_name = "/val_loss.txt"
        with open(config.log_dir + loss_file_name, 'a') as val_loss_file:
            for key in batch_losses:
                val_loss_file.writelines('{0}, {1}\n'.format(
                    batch_losses[key], key))
        return losses.avg, scores.avg, np.nanmean(ap_class), np.nanmean(
            per_class_iu(hist)) * 100, batch_losses

    else:
        return losses.avg, scores.avg, np.nanmean(ap_class), np.nanmean(
            per_class_iu(hist)) * 100
Example #14
def train(model, data_loader, val_data_loader, config, transform_data_fn=None):
    device = get_torch_device(config.is_cuda)
    # Set up the train flag for batch normalization
    model.train()

    # Configuration
    writer = SummaryWriter(log_dir=config.log_dir)
    data_timer, iter_timer = Timer(), Timer()
    data_time_avg, iter_time_avg = AverageMeter(), AverageMeter()
    losses, scores = AverageMeter(), AverageMeter()

    optimizer = initialize_optimizer(model.parameters(), config)
    scheduler = initialize_scheduler(optimizer, config)
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)

    # Train the network
    logging.info('===> Start training')
    best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True

    if config.resume:
        checkpoint_fn = config.resume + '/weights.pth'
        if osp.isfile(checkpoint_fn):
            logging.info("=> loading checkpoint '{}'".format(checkpoint_fn))
            state = torch.load(checkpoint_fn)
            curr_iter = state['iteration'] + 1
            epoch = state['epoch']
            model.load_state_dict(state['state_dict'])
            if config.resume_optimizer:
                scheduler = initialize_scheduler(optimizer,
                                                 config,
                                                 last_step=curr_iter)
                optimizer.load_state_dict(state['optimizer'])
            if 'best_val' in state:
                best_val_miou = state['best_val']
                best_val_iter = state['best_val_iter']
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_fn, state['epoch']))
        else:
            raise ValueError(
                "=> no checkpoint found at '{}'".format(checkpoint_fn))

    data_iter = iter(data_loader)
    while is_training:
        for iteration in range(len(data_loader) // config.iter_size):
            optimizer.zero_grad()
            data_time, batch_loss = 0, 0
            iter_timer.tic()

            for sub_iter in range(config.iter_size):
                # Get training data
                data_timer.tic()
                if config.return_transformation:
                    coords, input, target, pointcloud, transformation = \
                        next(data_iter)
                else:
                    coords, input, target = next(data_iter)

                # For some networks, making the network invariant to even, odd coords is important
                coords[:, 1:] += (torch.rand(3) * 100).type_as(coords)

                # Preprocess input
                if config.normalize_color:
                    input[:, :3] = input[:, :3] / 255. - 0.5
                sinput = SparseTensor(input, coords).to(device)

                data_time += data_timer.toc(False)

                # model.initialize_coords(*init_args)
                soutput = model(sinput)
                # The output of the network is not sorted
                target = target.long().to(device)

                loss = criterion(soutput.F, target.long())

                # Compute and accumulate gradient
                loss /= config.iter_size
                batch_loss += loss.item()
                loss.backward()

            # Update number of steps
            optimizer.step()
            scheduler.step()

            data_time_avg.update(data_time)
            iter_time_avg.update(iter_timer.toc(False))

            pred = get_prediction(data_loader.dataset, soutput.F, target)
            score = precision_at_one(pred, target)
            losses.update(batch_loss, target.size(0))
            scores.update(score, target.size(0))

            if curr_iter >= config.max_iter:
                is_training = False
                break

            if curr_iter % config.stat_freq == 0 or curr_iter == 1:
                lrs = ', '.join(
                    ['{:.3e}'.format(x) for x in scheduler.get_lr()])
                debug_str = "===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format(
                    epoch, curr_iter,
                    len(data_loader) // config.iter_size, losses.avg, lrs)
                debug_str += "Score {:.3f}\tData time: {:.4f}, Iter time: {:.4f}".format(
                    scores.avg, data_time_avg.avg, iter_time_avg.avg)
                logging.info(debug_str)
                # Reset timers
                data_time_avg.reset()
                iter_time_avg.reset()
                # Write logs
                writer.add_scalar('training/loss', losses.avg, curr_iter)
                writer.add_scalar('training/precision_at_1', scores.avg,
                                  curr_iter)
                writer.add_scalar('training/learning_rate',
                                  scheduler.get_lr()[0], curr_iter)
                losses.reset()
                scores.reset()

            # Save current status; save before val to prevent occasional mem overflow
            if curr_iter % config.save_freq == 0:
                checkpoint(model, optimizer, epoch, curr_iter, config,
                           best_val_miou, best_val_iter)

            # Validation
            if curr_iter % config.val_freq == 0:
                val_miou = validate(model, val_data_loader, writer, curr_iter,
                                    config, transform_data_fn)
                if val_miou > best_val_miou:
                    best_val_miou = val_miou
                    best_val_iter = curr_iter
                    checkpoint(model, optimizer, epoch, curr_iter, config,
                               best_val_miou, best_val_iter, "best_val")
                logging.info("Current best mIoU: {:.3f} at iter {}".format(
                    best_val_miou, best_val_iter))

                # Recover back
                model.train()

            # End of iteration
            curr_iter += 1

        epoch += 1

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    # Save the final model
    checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
               best_val_iter)
    val_miou = validate(model, val_data_loader, writer, curr_iter, config,
                        transform_data_fn)
    if val_miou > best_val_miou:
        best_val_miou = val_miou
        best_val_iter = curr_iter
        checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
                   best_val_iter, "best_val")
    logging.info("Current best mIoU: {:.3f} at iter {}".format(
        best_val_miou, best_val_iter))
Example #15
def test(model, data_loader, config, transform_data_fn=None, has_gt=True):
    device = get_torch_device(config.is_cuda)
    dataset = data_loader.dataset
    num_labels = dataset.NUM_LABELS
    global_timer, data_timer, iter_timer = Timer(), Timer(), Timer()
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)
    losses, scores, ious = AverageMeter(), AverageMeter(), 0
    aps = np.zeros((0, num_labels))
    hist = np.zeros((num_labels, num_labels))

    logging.info('===> Start testing')

    global_timer.tic()
    data_iter = iter(data_loader)
    max_iter = len(data_loader)
    max_iter_unique = max_iter

    # Fix batch normalization running mean and std
    model.eval()

    # Clear cache (when run in val mode, cleanup training cache)
    torch.cuda.empty_cache()

    if config.save_prediction or config.test_original_pointcloud:
        if config.save_prediction:
            save_pred_dir = config.save_pred_dir
            os.makedirs(save_pred_dir, exist_ok=True)
        else:
            save_pred_dir = tempfile.mkdtemp()
        if os.listdir(save_pred_dir):
            raise ValueError(f'Directory {save_pred_dir} not empty. '
                             'Please remove the existing prediction.')

    with torch.no_grad():
        for iteration in range(max_iter):
            data_timer.tic()
            if config.return_transformation:
                coords, input, target, transformation = next(data_iter)
            else:
                coords, input, target = next(data_iter)
                transformation = None
            data_time = data_timer.toc(False)

            # Preprocess input
            iter_timer.tic()

            if config.wrapper_type != 'None':
                color = input[:, :3].int()
            if config.normalize_color:
                input[:, :3] = input[:, :3] / 255. - 0.5
            sinput = SparseTensor(input, coords).to(device)

            # Feed forward
            inputs = (sinput, ) if config.wrapper_type == 'None' else (sinput,
                                                                       coords,
                                                                       color)
            soutput = model(*inputs)
            output = soutput.F

            pred = get_prediction(dataset, output, target).int()
            iter_time = iter_timer.toc(False)

            if config.save_prediction or config.test_original_pointcloud:
                save_predictions(coords, pred, transformation, dataset, config,
                                 iteration, save_pred_dir)

            if has_gt:
                if config.evaluate_original_pointcloud:
                    raise NotImplementedError('pointcloud')
                    output, pred, target = permute_pointcloud(
                        coords, pointcloud, transformation, dataset.label_map,
                        output, pred)

                target_np = target.numpy()

                num_sample = target_np.shape[0]

                target = target.to(device)

                cross_ent = criterion(output, target.long())
                losses.update(float(cross_ent), num_sample)
                scores.update(precision_at_one(pred, target), num_sample)
                hist += fast_hist(pred.cpu().numpy().flatten(),
                                  target_np.flatten(), num_labels)
                ious = per_class_iu(hist) * 100

                prob = torch.nn.functional.softmax(output, dim=1)
                ap = average_precision(prob.cpu().detach().numpy(), target_np)
                aps = np.vstack((aps, ap))
                # Due to heavy class imbalance, some classes have no test labels at all
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", category=RuntimeWarning)
                    ap_class = np.nanmean(aps, 0) * 100.

            if iteration % config.test_stat_freq == 0 and iteration > 0:
                reordered_ious = dataset.reorder_result(ious)
                reordered_ap_class = dataset.reorder_result(ap_class)
                class_names = dataset.get_classnames()
                print_info(iteration,
                           max_iter_unique,
                           data_time,
                           iter_time,
                           has_gt,
                           losses,
                           scores,
                           reordered_ious,
                           hist,
                           reordered_ap_class,
                           class_names=class_names)

            if iteration % config.empty_cache_freq == 0:
                # Clear cache
                torch.cuda.empty_cache()

    global_time = global_timer.toc(False)

    reordered_ious = dataset.reorder_result(ious)
    reordered_ap_class = dataset.reorder_result(ap_class)
    class_names = dataset.get_classnames()
    print_info(iteration,
               max_iter_unique,
               data_time,
               iter_time,
               has_gt,
               losses,
               scores,
               reordered_ious,
               hist,
               reordered_ap_class,
               class_names=class_names)

    if config.test_original_pointcloud:
        logging.info('===> Start testing on original pointcloud space.')
        dataset.test_pointcloud(save_pred_dir)

    logging.info("Finished test. Elapsed time: {:.4f}".format(global_time))

    return losses.avg, scores.avg, np.nanmean(ap_class), np.nanmean(
        per_class_iu(hist)) * 100
Example #16
    def train(self, train_queue, val_queue=None):
        ''' Given data queues, train the network '''
        # Parameter directory
        save_dir = os.path.join(cfg.DIR.OUT_PATH)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Timer for the training op and parallel data loading op.
        train_timer = Timer()
        data_timer = Timer()
        training_losses = []

        start_iter = 0
        # Resume training
        if cfg.TRAIN.RESUME_TRAIN:
            self.net.load(cfg.CONST.WEIGHTS)
            start_iter = cfg.TRAIN.INITIAL_ITERATION

        # Setup learning rates
        lr = cfg.TRAIN.DEFAULT_LEARNING_RATE
        lr_steps = [int(k) for k in cfg.TRAIN.LEARNING_RATES.keys()]

        print('Set the learning rate to %f.' % lr)
        self.set_lr(lr)

        # Main training loop
        for train_ind in range(start_iter, cfg.TRAIN.NUM_ITERATION + 1):
            data_timer.tic()
            batch_img, batch_voxel = train_queue.get()
            data_timer.toc()

            if self.net.is_x_tensor4:
                batch_img = batch_img[0]

            # Apply one gradient step
            train_timer.tic()
            loss = self.train_loss(batch_img, batch_voxel)
            train_timer.toc()

            training_losses.append(loss)

            # Decrease learning rate at certain points
            if train_ind in lr_steps:
                # edict only takes string for key. Hacky way
                self.set_lr(float(cfg.TRAIN.LEARNING_RATES[str(train_ind)]))
                print('Learning rate decreased to %f: ' % self.lr.get_value())

            # Debugging modules
            #
            # Print status, run validation, check divergence, and save model.
            if train_ind % cfg.TRAIN.PRINT_FREQ == 0:
                # Print the current loss
                print('%s Iter: %d Loss: %f' %
                      (datetime.now(), train_ind, loss))

            if train_ind % cfg.TRAIN.VALIDATION_FREQ == 0 and val_queue is not None:
                # Print test loss and params to check convergence every N iterations
                val_losses = []
                for i in range(cfg.TRAIN.NUM_VALIDATION_ITERATIONS):
                    batch_img, batch_voxel = val_queue.get()
                    _, val_loss, _ = self.test_output(batch_img, batch_voxel)
                    val_losses.append(val_loss)
                print('%s Test loss: %f' %
                      (datetime.now(), np.mean(val_losses)))

            if train_ind % cfg.TRAIN.NAN_CHECK_FREQ == 0:
                # Check that the network parameters are all valid
                max_param = max_or_nan(self.net.params)
                if np.isnan(max_param):
                    print('NAN detected')
                    break

            if train_ind % cfg.TRAIN.SAVE_FREQ == 0 and not train_ind == 0:
                self.save(training_losses, save_dir, train_ind)

            if loss > cfg.TRAIN.LOSS_LIMIT:
                print("Cost exceeds the threshold. Stop training")
                break
Example #17
class Solver(object):
    """Solver for generic networks.
    """
    def __init__(self, net, graph, is_training):
        self.net = net
        self.graph = graph
        self.is_training = is_training
        self.num_epochs = cfg.TRAIN.NUM_EPOCHS
        self.train_timer = Timer()
        self.data_timer = Timer()

        self.global_step = slim.get_or_create_global_step()

        # Build basic ops and tensors
        if self.is_training:
            self.net.build_train_ops(self.global_step)

        if isinstance(net, Classifier):
            self.saver = tf.train.Saver(var_list=net.vars_to_restore,
                                        name='saver')
        else:
            self.saver = tf.train.Saver(max_to_keep=None,
                                        name='saver_all_var')  # Save all vars
        self.init_ops = self.build_init_ops()
        self.val_loss_ph = tf.placeholder(tf.float32,
                                          shape=(),
                                          name='val_loss_ph')
        self.net.build_summary_ops(self.graph)
        self.val_loss_summary = tf.summary.scalar(name='val_loss',
                                                  tensor=self.val_loss_ph)

        print('saver variables:')
        print_tensor_shapes(net.vars_to_restore, prefix='-->')

    def build_init_ops(self):
        """Builds the init ops.

        Returns:
            init_op: Op that initializes all global variables.
            ready_op: Op that reports any variables still uninitialized.
            local_init_op: Op that initializes local variables and lookup tables.
        """
        with tf.name_scope('init_ops'):
            init_op = tf.global_variables_initializer()
            ready_op = tf.report_uninitialized_variables()
            local_init_op = tf.group(tf.local_variables_initializer(),
                                     tf.tables_initializer())
        return init_op, ready_op, local_init_op
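
    # Typical use of the ops returned above (a sketch; `sess` is a tf.Session
    # created by the caller):
    #   init_op, ready_op, local_init_op = self.build_init_ops()
    #   sess.run([init_op, local_init_op])
    #   sess.run(ready_op)  # returns names of any still-uninitialized variables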

    def restore_checkpoint(self, sess):
        """Restores the network to a previously saved checkpoint if a path is provided from the
        config.

        Args:
            sess: Current session.
        """
        if cfg.DIR.CKPT_PATH is not None:
            tf.logging.info('Restoring checkpoint.')
            self.saver.restore(sess, cfg.DIR.CKPT_PATH)
        else:
            tf.logging.info('Using network with random weights.')

    def train_step(self, sess, train_queue, step):
        """Executes a train step, including saving the summaries if appropriate.

        Args:
            sess: Current session.
            train_queue: Data queue containing train set minibatches.
            step: The current training iteration (0-based indexing).

        Returns:
            print_dict: Dictionary of items such as losses (for just one minibatch) to print.
        """
        print_dict = self.net.train_step(sess,
                                         train_queue,
                                         step,
                                         data_timer=self.data_timer)
        return print_dict

    def val_step(self, sess, val_queue):
        """Executes a validation step, which simply computes the loss.
        Args:
            sess: Current session.
            val_queue: Data queue containing validation set minibatches.
        Returns:
            val_loss: Loss for a single minibatch of validation data.
        """
        raise NotImplementedError('Must be implemented by a subclass.')

    def validate(self, sess, val_queue, step, num_val_iter):
        raise NotImplementedError('Must be implemented by a subclass.')

    def train(self,
              train_iters_per_epoch,
              train_queue,
              val_iters_per_epoch=None,
              val_queue=None):
        """Train the network, computing the validation loss if val_iters_per_epoch and val_queue are
        provided.

        Args:
            train_iters_per_epoch: Number of iterations in a single epoch of train data, as computed
                by the data process.
            train_queue: Data queue containing minibatches of train data.
            val_iters_per_epoch: Optional input describing the number of iterations in a single
                epoch of validation data, as computed by the data process.
            val_queue: Optional input representing the data queue containing minibatches of
                validation data.
        """
        if (val_iters_per_epoch is None and val_queue is not None) or \
                (val_iters_per_epoch is not None and val_queue is None):
            raise ValueError('Need to input both val size and val queue.')
        if val_iters_per_epoch is not None and val_queue is not None:
            run_validation = True
        else:
            run_validation = False

        print('-------------- BEGIN TRAINING --------------')
        num_train_iter = get_num_iterations(train_iters_per_epoch,
                                            num_epochs=cfg.TRAIN.NUM_EPOCHS,
                                            disp=True)
        num_val_iter = 20000 // cfg.CONST.BATCH_SIZE  # Evaluate on roughly 20000 samples
        if val_iters_per_epoch is not None:
            num_val_iter = min(num_val_iter, val_iters_per_epoch)
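        # e.g. with BATCH_SIZE = 32 this gives 20000 // 32 = 625 validation
        # iterations, capped at one full epoch of validation data.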

        with tf.Session() as sess:
            sess.run(self.init_ops)
            self.restore_checkpoint(sess)

            # Train loop
            for step in range(num_train_iter):
                # For randomized model
                # self.save(sess, step)
                # break

                self.train_timer.tic()
                print_dict = self.train_step(sess, train_queue, step)
                self.train_timer.toc()

                if (step + 1) % cfg.CONST.PRINT_FREQ == 0:
                    print_dict['queue size'] = (str(train_queue.qsize()) +
                                                '/' +
                                                str(cfg.CONST.QUEUE_CAPACITY))
                    print_dict[
                        'data fetch (sec/step)'] = '%.2f' % self.data_timer.average_time
                    print_dict[
                        'train step (sec/step)'] = '%.2f' % self.train_timer.average_time
                    print_train_step_data(print_dict, step)

                    # Reset timers
                    self.data_timer.reset()
                    self.train_timer.reset()

                if run_validation and (step + 1) % cfg.TRAIN.VALIDATION_FREQ == 0:
                    validation_val = self.validate(sess, val_queue, step,
                                                   num_val_iter)
                    if validation_val == -1:  # Training termination flag
                        tf.logging.info(
                            'Terminating train loop due to decreasing validation performance.'
                        )
                        break
                    else:
                        val_summary = sess.run(
                            self.val_loss_summary,
                            feed_dict={self.val_loss_ph: validation_val})
                        self.net.summary_writer.add_summary(
                            val_summary, (step + 1))

                if (step + 1) % cfg.TRAIN.CKPT_FREQ == 0:
                    self.save(sess, step)

            # save model after training
            self.save(sess, step)

    def forward_pass_batches(self, sess, minibatch_generator):
        """Forward pass a series of minibatches from the minibatch generator.
        """
        minibatch_list = []
        outputs_list = []
        for step, minibatch in enumerate(minibatch_generator):
            np.random.seed(1234)
            try:
                outputs = self.net.forward_pass(sess, minibatch)
            except KeyError:
                outputs = self.net.forward_pass(sess,
                                                minibatch,
                                                full_feed_dict=True)
            # Reduce size of minibatch so we can pass through entire val set
            if isinstance(self.net, LBA):
                minibatch_save = {
                    'raw_embedding_batch': minibatch['raw_embedding_batch'],
                    'caption_label_batch': minibatch['caption_label_batch'],
                    'category_list': minibatch['category_list'],
                    'model_list': minibatch['model_list'],
                }
                minibatch = minibatch_save
            if isinstance(self.net, Classifier):
                minibatch_save = {
                    'class_label_batch': minibatch['class_label_batch'],
                    'model_id_list': minibatch['model_id_list'],
                }
                minibatch = minibatch_save
            minibatch_list.append(minibatch)
            outputs_list.append(outputs)

            if (step + 1) % 100 == 0:
                tf.logging.info('%s  Step: %d' %
                                (str(datetime.now()), step + 1))

        return minibatch_list, outputs_list

    def val_phase_minibatch_generator(self, val_queue, num_val_iter):
        """Return a minibatch generator for the test phase.
        """
        for step in range(num_val_iter):
            minibatch = val_queue.get()
            minibatch['test_queue'] = True
            yield minibatch

    def evaluate(self, minibatch_list, outputs_list):
        """Do some evaluation of the outputs.
        """
        pass

    def test(self,
             test_process,
             test_queue,
             num_minibatches=None,
             save_outputs=False):
        """Compute (and optionally save) the outputs for the test set. This function only computes
        the outputs for num_minibatches minibatches.

        Args:
            test_process: Data process for the test data.
            test_queue: Queue containing minibatches of test data.
            num_minibatches: Number of minibatches to compute the outputs for.
            save_outputs: Boolean flag for whether or not to save the outputs.
        """
        with tf.Session() as sess:
            if cfg.DIR.CKPT_PATH is None:
                raise ValueError('Please provide a checkpoint.')
            else:
                self.restore_checkpoint(sess)

            def test_phase_minibatch_generator():
                for step, minibatch in enumerate(
                        get_while_running(test_process, test_queue)):
                    if (num_minibatches is not None) and (step
                                                          == num_minibatches):
                        break
                    yield minibatch

            minibatch_generator = test_phase_minibatch_generator()
            minibatch_list, outputs_list = self.forward_pass_batches(
                sess, minibatch_generator)
            self.evaluate(minibatch_list, outputs_list)

        if save_outputs:
            self.save_outputs(minibatch_list, outputs_list)

    def save(self, sess, step):
        """Save a checkpoint.
        """
        ckpt_path = os.path.join(cfg.DIR.LOG_PATH, 'model.ckpt')
        tf.logging.info('Saving checkpoint (step %d).', (step + 1))
        self.saver.save(sess, ckpt_path, global_step=(step + 1))

    def save_outputs(self, minibatch_list, outputs_list, filename=None):
        """Save the outputs (from the self.test).
        """
        raise NotImplementedError('Must be implemented by a subclass.')
Example #18
class ModelRunner(object):
    """
    Train, evaluate, save and restore model.
    """
    def __init__(self, config):
        self._config = config
        self._train_config = config['train']
        self._model_config = config['model']
        self._data_config = config['data']

        # the folders for model and tensor board
        self._training_folder = None
        self._model_folder = None
        self._tfb_folder = None

        # build model
        self._model = BaseModel.generate_model_from_config(self._model_config)
        self._model_saver = tf.train.Saver(max_to_keep=0)

        # other setting
        self._timer = Timer('m')

    @property
    def model(self):
        return self._model

    def train_model(self,
                    sess,
                    train_data_provider,
                    valid_data_provider,
                    test_data_provider=None):

        epoch_num, max_epoch, lr_scheduler, continue_training = self._load_train_status(
        )

        # get logger for this training
        logger = get_logger(os.path.join(self._training_folder,
                                         'training.log'))

        # training from scratch or continue training
        if continue_training:
            # trained model existed, then restore it.
            model_path = self.restore_model(sess)
            epoch_num += 1
            logger.info(f'Restore model from {model_path}')
        else:
            # initialize variables
            sess.run([tf.global_variables_initializer()])

        logger.info(
            f'Training starts on dataset {self._data_config["data_name"]}')
        logger.info(
            f'----------Trainable parameter count: {get_num_trainable_params()} of model {self._model_folder}'
        )

        best_valid_loss = float('inf')
        lr = lr_scheduler.get_lr()
        while lr > 0 and epoch_num <= max_epoch:

            # Train
            loss, _, _, elapse = self._run_epoch(sess,
                                                 train_data_provider,
                                                 lr,
                                                 is_train=True)

            logger.info(
                f'Epoch {epoch_num}: train loss - {loss}, learning rate - {lr}.'
                f' Cost time: {elapse:.3f}{self._timer.unit()}')

            # Valid
            loss, _, _, _ = self._run_epoch(sess,
                                            valid_data_provider,
                                            lr,
                                            is_train=False)

            # Update after train and valid
            # update lr
            lr = lr_scheduler.update_lr(loss=loss, epoch_num=epoch_num)
            # update train_config
            self._update_train_config(lr, epoch_num)

            if loss < best_valid_loss:
                best_valid_loss = loss

                # save best model
                self._save_model_with_config(sess)

                # Test
                loss, preds, labels, elapse = self._run_epoch(
                    sess, test_data_provider, lr, is_train=False)
                metrics = test_data_provider.get_metrics(preds, labels)
                str_metrics = str(metrics)
                logger.info(f'---Test Loss: {loss}, metrics: {str_metrics}. '
                            f'Cost time: {elapse:.3f}{self._timer.unit()}')

            epoch_num += 1
        logger.info('Training Finished!')

    def evaluate_model(self, sess, data_provider):
        self.restore_model(sess)
        loss, preds, labels, _ = self._run_epoch(sess,
                                                 data_provider,
                                                 lr=0,
                                                 is_train=False)
        metrics = data_provider.get_metrics(preds, labels)
        return preds, labels, metrics

    def restore_model(self, sess):
        train_config = self._train_config
        model_path = train_config['model_path']
        self._model_saver.restore(sess, model_path)
        return model_path

    def _load_train_status(self):
        """ Load training status. Create base folders if the config presents a new training.
        :return:
        """
        train_config = self._train_config
        # assign parameters
        epoch_num = train_config.get('epoch')
        max_epoch = train_config.get('max_epoch')

        # get lr scheduler
        lr_scheduler = LRScheduler.generate_scheduler_by_name(
            train_config.get('lr_scheduler'), **train_config)
        model_path = train_config.get('model_path')

        if model_path:
            # continue last training
            continue_training = True
            # read corresponding training path
            self._model_folder = os.path.dirname(model_path)
            self._training_folder = os.path.dirname(self._model_folder)
            self._tfb_folder = create_folder(self._training_folder, 'tfbs')

        else:
            # training from scratch
            continue_training = False
            # create model and tensorflow board folder
            time = datetime.datetime.now()
            timestamp = datetime.datetime.strftime(time, '%m%d%H%M%S')
            model_foldername = make_config_string(
                self._config['model']) + '_' + timestamp

            self._training_folder = create_folder(self._config['base_dir'],
                                                  model_foldername)
            self._model_folder = create_folder(self._training_folder, 'models')
            self._tfb_folder = create_folder(self._training_folder, 'tfbs')

        return epoch_num, max_epoch, lr_scheduler, continue_training

    def _save_model_with_config(self, sess):
        train_config = self._train_config
        # update model path in train config
        train_config['model_path'] = os.path.join(
            self._model_folder, 'model-' + str(train_config['epoch']))

        # save model
        self._model_saver.save(sess, train_config['model_path'])
        # save config to yaml file
        config_path = os.path.join(
            self._model_folder,
            'config-' + str(train_config['epoch']) + '.yaml')
        with open(config_path, 'w') as f:
            yaml.dump(self._config, f)

    def _update_train_config(self, lr, epoch):
        train_config = self._train_config
        train_config['lr'] = lr
        train_config['epoch'] = epoch

    def _run_epoch(self, sess, data_provider, lr, is_train):
        """
        :param sess:
        :param data_provider:
        :param lr:
        :param is_train:
        :return: [epoch_loss, epoch_pred, epoch_label, epoch_cost_time]
        """
        self._timer.start()
        model = self._model
        if is_train:
            run_func = model.train
        else:
            run_func = model.predict
        loss_list = []
        pred_list = []
        real_list = []
        for batch_data in data_provider.iterate_batch_data():
            loss, pred, real = run_func(sess, batch_data, lr=lr)

            loss_list.append(loss)
            pred_list.append(pred)
            real_list.append(real)

        # shape -> [n_items, horizon, D]
        epoch_preds = concat_arrs_of_dict(pred_list)
        epoch_reals = concat_arrs_of_dict(real_list)

        epoch_avg_loss = np.mean(loss_list)
        # inverse scaling data
        epoch_preds = data_provider.epoch_inverse_scaling(epoch_preds)
        epoch_reals = data_provider.epoch_inverse_scaling(epoch_reals)

        return epoch_avg_loss, epoch_preds, epoch_reals, self._timer.end()
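
For context, a minimal sketch of what concat_arrs_of_dict is assumed to do above: concatenate a list of per-batch dicts of arrays key-wise along the item axis (the project's actual helper may differ):

import numpy as np

def concat_arrs_of_dict(dict_list):
    # [{'a': a1}, {'a': a2}, ...] -> {'a': np.concatenate([a1, a2, ...], axis=0)}
    return {key: np.concatenate([d[key] for d in dict_list], axis=0)
            for key in dict_list[0]}
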
def train(model, data_loader, val_data_loader, config, transform_data_fn=None):
    all_losses = []
    device = get_torch_device(config.is_cuda)
    # Set up the train flag for batch normalization
    model.train()

    # Configuration
    writer = SummaryWriter(log_dir=config.log_dir)
    data_timer, iter_timer = Timer(), Timer()
    data_time_avg, iter_time_avg = AverageMeter(), AverageMeter()
    losses, scores, batch_losses = AverageMeter(), AverageMeter(), {}

    optimizer = initialize_optimizer(model.parameters(), config)
    scheduler = initialize_scheduler(optimizer, config)
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)
    alpha, gamma, eps = 1, 2, 1e-6

    # Train the network
    logging.info('===> Start training')
    best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True

    if config.resume:
        checkpoint_fn = config.resume + '/weights.pth'
        if osp.isfile(checkpoint_fn):
            logging.info("=> loading checkpoint '{}'".format(checkpoint_fn))
            state = torch.load(checkpoint_fn)
            curr_iter = state['iteration'] + 1
            epoch = state['epoch']
            model.load_state_dict(state['state_dict'])
            if config.resume_optimizer:
                scheduler = initialize_scheduler(optimizer,
                                                 config,
                                                 last_step=curr_iter)
                optimizer.load_state_dict(state['optimizer'])
            if 'best_val' in state:
                best_val_miou = state['best_val']
                best_val_iter = state['best_val_iter']
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_fn, state['epoch']))
        else:
            raise ValueError(
                "=> no checkpoint found at '{}'".format(checkpoint_fn))

    data_iter = iter(data_loader)

    while is_training:
        print(
            "********************************** epoch N° {0} ************************"
            .format(epoch))
        for iteration in range(len(data_loader) // config.iter_size):
            print("####### Iteration N° {0}".format(iteration))
            optimizer.zero_grad()
            data_time, batch_loss = 0, 0
            iter_timer.tic()
            for sub_iter in range(config.iter_size):
                print("------------- Sub_iteration N° {0}".format(sub_iter))
                # Get training data
                data_timer.tic()
                coords, input, target = next(data_iter)
                print("len of coords : {0}".format(len(coords)))

                # For some networks, making the network invariant to even, odd coords is important
                coords[:, :3] += (torch.rand(3) * 100).type_as(coords)

                # Preprocess input
                color = input[:, :3].int()

                if config.normalize_color:
                    input[:, :3] = input[:, :3] / 255. - 0.5
                sinput = SparseTensor(input, coords).to(device)

                data_time += data_timer.toc(False)

                # Feed forward
                inputs = (sinput, ) if config.wrapper_type == 'None' else (
                    sinput, coords, color)
                # model.initialize_coords(*init_args)
                soutput = model(*inputs)
                # The output of the network is not sorted
                target = target.long().to(device)
                print("count of classes : {0}".format(
                    np.unique(target.cpu().numpy(), return_counts=True)))
                print("target : {0}\ntarget_len : {1}".format(
                    target, len(target)))
                print("target [0]: {0}".format(target[0]))
                input_soft = nn.functional.softmax(soutput.F, dim=1) + eps
                print("input_soft[0] : {0}".format(input_soft[0]))
                focal_weight = torch.pow(-input_soft + 1., gamma)
                print("focal_weight : {0}\nweight[0] : {1}".format(
                    focal_weight, focal_weight[0]))
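                # Focal-style term (cf. Lin et al., 2017): -alpha * (1 - p)^gamma * log(p),
                # here averaged over all class probabilities rather than gathered
                # at the target class; the eps added above keeps the log finite.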
                focal_loss = (-alpha * focal_weight *
                              torch.log(input_soft)).mean()
                loss = criterion(soutput.F, target.long())
                print("focal_loss :{0}\nloss : {1}".format(focal_loss, loss))

                # Compute and accumulate gradient
                loss /= config.iter_size
                batch_loss += loss.item()
                print("batch_loss : {0}".format(batch_loss))
                loss.backward()

            # Update number of steps
            optimizer.step()
            scheduler.step()

            data_time_avg.update(data_time)
            iter_time_avg.update(iter_timer.toc(False))

            pred = get_prediction(data_loader.dataset, soutput.F, target)
            score = precision_at_one(pred, target)
            losses.update(batch_loss, target.size(0))
            scores.update(score, target.size(0))

            if curr_iter >= config.max_iter:
                is_training = False
                break

            if curr_iter % config.stat_freq == 0 or curr_iter == 1:
                lrs = ', '.join(
                    ['{:.3e}'.format(x) for x in scheduler.get_lr()])
                debug_str = "===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format(
                    epoch, curr_iter,
                    len(data_loader) // config.iter_size, losses.avg, lrs)
                debug_str += "Score {:.3f}\tData time: {:.4f}, Total iter time: {:.4f}".format(
                    scores.avg, data_time_avg.avg, iter_time_avg.avg)
                logging.info(debug_str)
                # Reset timers
                data_time_avg.reset()
                iter_time_avg.reset()
                # Write logs
                writer.add_scalar('training/loss', losses.avg, curr_iter)
                writer.add_scalar('training/precision_at_1', scores.avg,
                                  curr_iter)
                writer.add_scalar('training/learning_rate',
                                  scheduler.get_lr()[0], curr_iter)
                losses.reset()
                scores.reset()

            # Save current status; save before val to prevent occasional mem overflow
            if curr_iter % config.save_freq == 0:
                checkpoint(model, optimizer, epoch, curr_iter, config,
                           best_val_miou, best_val_iter)

            # Validation
            if curr_iter % config.val_freq == 0:
                val_miou, val_losses = validate(model, val_data_loader, writer,
                                                curr_iter, config,
                                                transform_data_fn, epoch)

                if val_miou > best_val_miou:
                    best_val_miou = val_miou
                    best_val_iter = curr_iter
                    checkpoint(model, optimizer, epoch, curr_iter, config,
                               best_val_miou, best_val_iter, "best_val")
                logging.info("Current best mIoU: {:.3f} at iter {}".format(
                    best_val_miou, best_val_iter))

                # Recover back
                model.train()

            if curr_iter % config.empty_cache_freq == 0:
                # Clear cache
                torch.cuda.empty_cache()

            batch_losses[epoch] = batch_loss
            # End of iteration
            curr_iter += 1
        with open(config.log_dir + "/train_loss.txt", 'a') as train_loss_log:
            train_loss_log.write('{0}, {1}\n'.format(
                batch_losses[epoch], epoch))
        epoch += 1

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    # Save the final model
    checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
               best_val_iter)
    val_miou = validate(model, val_data_loader, writer, curr_iter, config,
                        transform_data_fn, epoch)[0]
    if val_miou > best_val_miou:
        best_val_miou = val_miou
        best_val_iter = curr_iter
        checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
                   best_val_iter, "best_val")
    logging.info("Current best mIoU: {:.3f} at iter {}".format(
        best_val_miou, best_val_iter))
def test(model, data_loader, config, transform_data_fn=None, has_gt=True, save_pred=False, split=None, submit_dir=None):
    device = get_torch_device(config.is_cuda)
    dataset = data_loader.dataset
    num_labels = dataset.NUM_LABELS
    global_timer, data_timer, iter_timer = Timer(), Timer(), Timer()
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)
    losses, scores, ious = AverageMeter(), AverageMeter(), 0
    aps = np.zeros((0, num_labels))
    hist = np.zeros((num_labels, num_labels))

    # some cfgs concerning the usage of instance-level information
    config.save_pred = save_pred
    if split is not None:
        assert save_pred
    if config.save_pred:
        save_dict = {}
        save_dict['pred'] = []
        save_dict['coord'] = []

    logging.info('===> Start testing')

    global_timer.tic()
    data_iter = iter(data_loader)
    max_iter = len(data_loader)
    max_iter_unique = max_iter

    # Fix batch normalization running mean and std
    model.eval()

    # Clear cache (when run in val mode, cleanup training cache)
    torch.cuda.empty_cache()

    # semantic kitti label inverse mapping
    if config.submit:
        remap_lut = Remap().getRemapLUT()
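        # The LUT presumably maps contiguous train ids back to the raw
        # SemanticKITTI label ids expected by the benchmark submission format.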

    with torch.no_grad():

        # Calc of the iou
        total_correct = np.zeros(num_labels)
        total_seen = np.zeros(num_labels)
        total_positive = np.zeros(num_labels)
        point_nums = np.zeros([19])

        for iteration in range(max_iter):
            data_timer.tic()
            if config.return_transformation:
                coords, input, target, unique_map_list, inverse_map_list, pointcloud, transformation, filename = next(data_iter)
            else:
                coords, input, target, unique_map_list, inverse_map_list, filename = next(data_iter)
            data_time = data_timer.toc(False)

            if config.use_aux:
                assert target.shape[1] == 2
                aux = target[:,1]
                target = target[:,0]
            else:
                aux = None

            # Preprocess input
            iter_timer.tic()

            if config.normalize_color:
                input[:, :3] = input[:, :3] / input[:,:3].max() - 0.5
                coords_norm = coords[:,1:] / coords[:,1:].max() - 0.5

            XYZ_INPUT = config.xyz_input
            # cat xyz into the rgb feature
            if XYZ_INPUT:
                input = torch.cat([coords_norm, input], dim=1)

            sinput = ME.SparseTensor(input, coords, device=device)

            # Feed forward
            if aux is not None:
                soutput = model(sinput)
            else:
                soutput = model(sinput,
                                iter_=iteration / max_iter,
                                enable_point_branch=config.enable_point_branch)
            output = soutput.F
            if torch.isnan(output).sum() > 0:
                import ipdb; ipdb.set_trace()

            pred = get_prediction(dataset, output, target).int()
            assert sum([int(t.shape[0]) for t in unique_map_list]) == len(pred), "number of points in unique_map doesn't match prediction; do not enable preprocessing"
            iter_time = iter_timer.toc(False)

            if config.save_pred or config.submit:
                # troublesome processing for splitting each batch's data, and export
                batch_ids = sinput.C[:,0]
                splits_at = torch.stack([torch.where(batch_ids == i)[0][-1] for i in torch.unique(batch_ids)]).int()
                splits_at = splits_at + 1
                splits_at_leftshift_one = splits_at.roll(shifts=1)
                splits_at_leftshift_one[0] = 0
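                # splits_at holds the exclusive end index of each batch in the
                # flat point list; rolling it right by one yields the start
                # indices, so each (start, end) pair slices out one batch.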
                # len_per_batch = splits_at - splits_at_leftshift_one
                len_sum = 0
                batch_id = 0
                for start, end in zip(splits_at_leftshift_one, splits_at):
                    len_sum += len(pred[int(start):int(end)])
                    pred_this_batch = pred[int(start):int(end)]
                    coord_this_batch = coords[int(start):int(end)]
                    if config.save_pred:
                        save_dict['pred'].append(pred_this_batch[inverse_map_list[batch_id]])
                    else: # save submit result
                        submission_path = filename[batch_id].replace(config.semantic_kitti_path, submit_dir).replace('velodyne', 'predictions').replace('.bin', '.label')
                        parent_dir = Path(submission_path).parent.absolute()
                        if not os.path.exists(parent_dir):
                            os.makedirs(parent_dir)
                        label_pred = pred_this_batch[inverse_map_list[batch_id]].cpu().numpy()
                        label_pred = remap_lut[label_pred].astype(np.uint32)
                        label_pred.tofile(submission_path)
                        print(submission_path)
                    batch_id += 1
                assert len_sum == len(pred)

            # Unpack it to original length
            REVERT_WHOLE_POINTCLOUD = True
            # print('{}/{}'.format(iteration, max_iter))
            if REVERT_WHOLE_POINTCLOUD:
                whole_pred = []
                whole_target = []
                for batch_ in range(config.batch_size):
                    batch_mask_ = (soutput.C[:,0] == batch_).cpu().numpy()
                    if batch_mask_.sum() == 0:  # skip empty batches
                        continue
                    try:
                        whole_pred_ = soutput.F[batch_mask_][inverse_map_list[batch_]]
                    except Exception:
                        import ipdb; ipdb.set_trace()
                    whole_target_ = target[batch_mask_][inverse_map_list[batch_]]
                    whole_pred.append(whole_pred_)
                    whole_target.append(whole_target_)
                whole_pred = torch.cat(whole_pred, dim=0)
                whole_target = torch.cat(whole_target, dim=0)

                pred = get_prediction(dataset, whole_pred, whole_target).int()
                output = whole_pred
                target = whole_target

            if has_gt:
                target_np = target.numpy()
                num_sample = target_np.shape[0]
                target = target.to(device)
                output = output.to(device)

                cross_ent = criterion(output, target.long())
                losses.update(float(cross_ent), num_sample)
                scores.update(precision_at_one(pred, target), num_sample)
                hist += fast_hist(pred.cpu().numpy().flatten(), target_np.flatten(), num_labels) # within fast hist, mark label should >=0 & < num_label to filter out 255 / -1
                ious = per_class_iu(hist) * 100
                prob = torch.nn.functional.softmax(output, dim=-1)

                pred = pred[target != -1]
                target = target[target != -1]

                # for _ in range(num_labels): # debug for SemKITTI: spvnas way of calc miou
                    # total_seen[_] += torch.sum(target == _)
                    # total_correct[_] += torch.sum((pred == target) & (target == _))
                    # total_positive[_] += torch.sum(pred == _)

                # ious_ = []
                # for _ in range(num_labels):
                    # if total_seen[_] == 0:
                        # ious_.append(1)
                    # else:
                        # ious_.append(total_correct[_]/(total_seen[_] + total_positive[_] - total_correct[_]))
                # ious_ = torch.stack(ious_, dim=-1).cpu().numpy()*100
                # print(np.nanmean(per_class_iu(hist)), np.nanmean(ious_))
                # ious = np.array(ious_)*100


                # calc the ratio of total points
                # for i_ in range(19):
                    # point_nums[i_] += (target == i_).sum().detach()

                # skip calculating aps
                ap = average_precision(prob.cpu().detach().numpy(), target_np)
                aps = np.vstack((aps, ap))
                # Due to heavy bias in class, there exists class with no test label at all
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", category=RuntimeWarning)
                    ap_class = np.nanmean(aps, 0) * 100.

            if iteration % config.test_stat_freq == 0 and iteration > 0 and not config.submit:
                reordered_ious = dataset.reorder_result(ious)
                reordered_ap_class = dataset.reorder_result(ap_class)
                # dirty fix: SemanticKITTI has no get_classnames()
                if hasattr(dataset, "get_classnames"):
                    class_names = dataset.get_classnames()
                else:  # SemanticKITTI
                    class_names = None
                print_info(
                        iteration,
                        max_iter_unique,
                        data_time,
                        iter_time,
                        has_gt,
                        losses,
                        scores,
                        reordered_ious,
                        hist,
                        reordered_ap_class,
                        class_names=class_names)

            if iteration % 5 == 0:
                # Clear cache
                torch.cuda.empty_cache()

    if config.save_pred:
        # torch.save(save_dict, os.path.join(config.log_dir, 'preds_{}_with_coord.pth'.format(split)))
        torch.save(save_dict, os.path.join(config.log_dir, 'preds_{}.pth'.format(split)))
        print("===> saved prediction result")

    global_time = global_timer.toc(False)

    save_map(model, config)

    reordered_ious = dataset.reorder_result(ious)
    reordered_ap_class = dataset.reorder_result(ap_class)
    if hasattr(dataset, "class_names"):
        class_names = dataset.get_classnames()
    else:
        class_names = None
    print_info(
            iteration,
            max_iter_unique,
            data_time,
            iter_time,
            has_gt,
            losses,
            scores,
            reordered_ious,
            hist,
            reordered_ap_class,
            class_names=class_names)

    logging.info("Finished test. Elapsed time: {:.4f}".format(global_time))

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    return losses.avg, scores.avg, np.nanmean(ap_class), np.nanmean(per_class_iu(hist)) * 100
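
For reference, a minimal sketch of the two histogram helpers the function above relies on (assumed semantics; the project's own fast_hist and per_class_iu may differ in detail):

import numpy as np

def fast_hist(pred, label, n):
    # Accumulate an n x n confusion matrix, keeping only labels in [0, n)
    # (this is what filters out ignore values such as 255 or -1).
    k = (label >= 0) & (label < n)
    return np.bincount(n * label[k].astype(int) + pred[k],
                       minlength=n ** 2).reshape(n, n)

def per_class_iu(hist):
    # Per-class IoU: diagonal / (row sum + column sum - diagonal).
    return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist) + 1e-10)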
Example #21
output_directory = os.path.join(opts.output_path + "/outputs", model_name)
checkpoint_directory, image_directory = prepare_sub_folder(output_directory)
shutil.copy(opts.config,
            os.path.join(output_directory,
                         'config.yaml'))  # copy config file to output folder

# Start training
iterations = trainer.resume(checkpoint_directory,
                            hyperparameters=config) if opts.resume else 0
while True:
    for it, (images_a,
             images_b) in enumerate(zip(train_loader_a, train_loader_b)):
        trainer.update_learning_rate()
        images_a, images_b = images_a.cuda().detach(), images_b.cuda().detach()

        with Timer("Elapsed time in update: %f"):
            # Main training code
            trainer.dis_update(images_a, images_b, config)
            trainer.gen_update(images_a, images_b, config)
            torch.cuda.synchronize()

        # Dump training stats in log file
        if (iterations + 1) % config['log_iter'] == 0:
            print("Iteration: %08d/%08d" % (iterations + 1, max_iter))
            write_loss(iterations, trainer, train_writer)

        # Write images
        if (iterations + 1) % config['image_save_iter'] == 0:
            with torch.no_grad():
                test_image_outputs = trainer.sample(test_display_images_a,
                                                    test_display_images_b)
Example #22
def train_worker(gpu,
                 num_devices,
                 NetClass,
                 data_loader,
                 val_data_loader,
                 config,
                 transform_data_fn=None):
    if gpu is not None:
        print("Use GPU: {} for training".format(gpu))
        rank = gpu
    port = 23491  # rendezvous port for the process group
    dist.init_process_group(backend="nccl",
                            init_method="tcp://127.0.0.1:{}".format(port),
                            world_size=num_devices,
                            rank=rank)

    # replace with DistributedSampler
    if config.multiprocess:
        from lib.dataloader_dist import InfSampler
        sampler = InfSampler(data_loader.dataset)
        data_loader = DataLoader(dataset=data_loader.dataset,
                                 num_workers=data_loader.num_workers,
                                 batch_size=data_loader.batch_size,
                                 collate_fn=data_loader.collate_fn,
                                 worker_init_fn=data_loader.worker_init_fn,
                                 sampler=sampler)

    if data_loader.dataset.NUM_IN_CHANNEL is not None:
        num_in_channel = data_loader.dataset.NUM_IN_CHANNEL
    else:
        num_in_channel = 3
    num_labels = data_loader.dataset.NUM_LABELS

    # load model
    if config.pure_point:
        model = NetClass(num_class=config.num_labels,
                         N=config.num_points,
                         normal_channel=config.num_in_channel)
    else:
        if config.model == 'MixedTransformer':
            model = NetClass(config,
                             num_class=num_labels,
                             N=config.num_points,
                             normal_channel=num_in_channel)
        elif config.model in ('MinkowskiVoxelTransformer',
                              'MinkowskiTransformerNet'):
            model = NetClass(config, num_in_channel, num_labels)
        else:  # includes the 'Res*' family
            model = NetClass(num_in_channel, num_labels, config)

    if config.weights == 'modelzoo':
        model.preload_modelzoo()
    elif config.weights.lower() != 'none':
        state = torch.load(config.weights)
        # drop the attention-map keys, since their sizes would mismatch on load
        d = {k: v for k, v in state['state_dict'].items() if 'map' not in k}
        if config.weights_for_inner_model:
            model.model.load_state_dict(d)
        else:
            if config.lenient_weight_loading:
                matched_weights = load_state_with_same_shape(
                    model, state['state_dict'])
                model_dict = model.state_dict()
                model_dict.update(matched_weights)
                model.load_state_dict(model_dict)
            else:
                model.load_state_dict(d, strict=False)

    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    # use model with DDP
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[gpu], find_unused_parameters=False)
    # Synchronized batch norm
    model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm(model)

    # Set up the train flag for batch normalization
    model.train()

    # Configuration
    data_timer, iter_timer = Timer(), Timer()
    data_time_avg, iter_time_avg = AverageMeter(), AverageMeter()
    regs, losses, scores = AverageMeter(), AverageMeter(), AverageMeter()

    optimizer = initialize_optimizer(model.parameters(), config)
    scheduler = initialize_scheduler(optimizer, config)
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)

    # Train the network
    if rank == 0:
        setup_logger(config)
        logging.info('===> Start training')

    best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True

    if config.resume:
        # Test loaded ckpt first
        v_loss, v_score, v_mAP, v_mIoU = test(model, val_data_loader, config)

        checkpoint_fn = config.resume + '/weights.pth'
        if osp.isfile(checkpoint_fn):
            logging.info("=> loading checkpoint '{}'".format(checkpoint_fn))
            state = torch.load(checkpoint_fn)
            curr_iter = state['iteration'] + 1
            epoch = state['epoch']
            # we skip attention maps because their shapes won't match when the
            # voxel count differs, e.g. copying a param of shape (23385, 8, 4)
            # to (43529, 8, 4)
            d = {
                k: v
                for k, v in state['state_dict'].items() if 'map' not in k
            }
            # handle those attn maps we don't load from saved dict
            for k in model.state_dict().keys():
                if k in d: continue
                d[k] = model.state_dict()[k]
            model.load_state_dict(d)
            if config.resume_optimizer:
                scheduler = initialize_scheduler(optimizer,
                                                 config,
                                                 last_step=curr_iter)
                optimizer.load_state_dict(state['optimizer'])
            if 'best_val' in state:
                best_val_miou = state['best_val']
                best_val_iter = state['best_val_iter']
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_fn, state['epoch']))
        else:
            raise ValueError(
                "=> no checkpoint found at '{}'".format(checkpoint_fn))

    data_iter = iter(data_loader)
    device = gpu  # in multiprocess training, the gpu id is used as the device
    if config.dataset == "SemanticKITTI":
        num_class = 19
        config.normalize_color = False
        config.xyz_input = False
        val_freq_ = config.val_freq
        config.val_freq = config.val_freq * 10  # val_freq_ keeps the original value
    elif config.dataset == 'S3DIS':
        num_class = 13
        config.normalize_color = False
        config.xyz_input = False
        val_freq_ = config.val_freq
    elif config.dataset == "Nuscenes":
        num_class = 16
        config.normalize_color = False
        config.xyz_input = False
        val_freq_ = config.val_freq
        config.val_freq = config.val_freq * 50
    else:
        val_freq_ = config.val_freq
        num_class = 20

    while is_training:

        total_correct_class = torch.zeros(num_class, device=device)
        total_iou_deno_class = torch.zeros(num_class, device=device)

        for iteration in range(len(data_loader) // config.iter_size):

            optimizer.zero_grad()
            data_time, batch_loss = 0, 0
            iter_timer.tic()

            if curr_iter >= config.max_iter:
                # if curr_iter >= max(config.max_iter, config.epochs * (len(data_loader) // config.iter_size)):
                is_training = False
                break
            elif curr_iter >= config.max_iter * (2 / 3):
                config.val_freq = val_freq_ * 2  # validate more frequently in the final third

            for sub_iter in range(config.iter_size):

                # Get training data
                data_timer.tic()
                if config.return_transformation:
                    coords, input, target, _, _, pointcloud, transformation = next(
                        data_iter)
                else:
                    coords, input, target, _, _ = next(
                        data_iter)  # ignore unique_map and inverse_map

                if config.use_aux:
                    assert target.shape[1] == 2
                    aux = target[:, 1]
                    target = target[:, 0]
                else:
                    aux = None

                # For some networks, making the network invariant to even, odd coords is important
                coords[:, 1:] += (torch.rand(3) * 100).type_as(coords)

                # Preprocess input
                if config.normalize_color:
                    input[:, :3] = input[:, :3] / input[:, :3].max() - 0.5
                    coords_norm = coords[:, 1:] / coords[:, 1:].max() - 0.5

                # cat xyz into the rgb feature
                if config.xyz_input:
                    input = torch.cat([coords_norm, input], dim=1)
                # print(device)

                sinput = SparseTensor(input, coords, device=device)

                # d = {}
                # d['coord'] = sinput.C
                # d['feat'] = sinput.F
                # torch.save(d, 'voxel.pth')
                # import ipdb; ipdb.set_trace()

                data_time += data_timer.toc(False)
                # model.initialize_coords(*init_args)
                if aux is not None:
                    soutput = model(sinput, aux)
                elif config.enable_point_branch:
                    soutput = model(sinput,
                                    iter_=curr_iter / config.max_iter,
                                    enable_point_branch=True)
                else:
                    soutput = model(
                        sinput, iter_=curr_iter / config.max_iter
                    )  # feed in the progress of training for annealing inside the model
                    # soutput = model(sinput)
                # The output of the network is not sorted
                target = target.view(-1).long().to(device)

                loss = criterion(soutput.F, target.long())

                # ====== other loss regs =====
                cur_loss = torch.tensor([0.], device=device)
                if hasattr(model.module, 'block1'):
                    cur_loss = torch.tensor([0.], device=device)

                    if hasattr(model.module.block1[0], 'vq_loss'):
                        if model.module.block1[0].vq_loss is not None:
                            cur_loss = torch.tensor([0.], device=device)
                            for n, m in model.named_children():
                                if 'block' in n:
                                    cur_loss += m[
                                        0].vq_loss  # m is the nn.Sequential obj, m[0] is the TRBlock
                            logging.info(
                                'Cur Loss: {}, Cur vq_loss: {}'.format(
                                    loss, cur_loss))
                            loss += cur_loss

                    if hasattr(model.module.block1[0], 'diverse_loss'):
                        if model.module.block1[0].diverse_loss is not None:
                            cur_loss = torch.tensor([0.], device=device)
                            for n, m in model.named_children():
                                if 'block' in n:
                                    cur_loss += m[
                                        0].diverse_loss  # m is the nn.Sequential obj, m[0] is the TRBlock
                            logging.info(
                                'Cur Loss: {}, Cur diverse _loss: {}'.format(
                                    loss, cur_loss))
                            loss += cur_loss

                    if hasattr(model.module.block1[0], 'label_reg'):
                        if model.module.block1[0].label_reg is not None:
                            cur_loss = torch.tensor([0.], device=device)
                            for n, m in model.named_children():
                                if 'block' in n:
                                    cur_loss += m[
                                        0].label_reg  # m is the nn.Sequential obj, m[0] is the TRBlock
                            # logging.info('Cur Loss: {}, Cur diverse _loss: {}'.format(loss, cur_loss))
                            loss += cur_loss

                # Compute and accumulate gradient
                loss /= config.iter_size
                batch_loss += loss.item()
                if not config.use_sam:
                    loss.backward()
                else:
                    with model.no_sync():
                        loss.backward()

            # Update number of steps
            if not config.use_sam:
                optimizer.step()
            else:
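                # SAM-style two-step update (assumed optimizer API): first_step
                # perturbs the weights toward the worst-case neighborhood, a
                # second forward/backward pass runs there, and second_step
                # applies the final update from the perturbed gradients.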
                optimizer.first_step(zero_grad=True)
                soutput = model(sinput,
                                iter_=curr_iter / config.max_iter,
                                aux=aux)  # second pass at the perturbed weights
                criterion(soutput.F, target.long()).backward()
                optimizer.second_step(zero_grad=True)

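            # Linear warmup (when configured): the learning rate ramps from
            # config.lr / config.lr_warmup up to config.lr over the first
            # config.lr_warmup iterations, after which the scheduler takes over.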
            if config.lr_warmup is None:
                scheduler.step()
            else:
                if curr_iter >= config.lr_warmup:
                    scheduler.step()
                else:
                    for g in optimizer.param_groups:
                        g['lr'] = config.lr * (iteration +
                                               1) / config.lr_warmup

            # CLEAR CACHE!
            torch.cuda.empty_cache()

            data_time_avg.update(data_time)
            iter_time_avg.update(iter_timer.toc(False))

            pred = get_prediction(data_loader.dataset, soutput.F, target)
            score = precision_at_one(pred, target, ignore_label=-1)

            regs.update(cur_loss.item(), target.size(0))
            losses.update(batch_loss, target.size(0))
            scores.update(score, target.size(0))

            # calc the train-iou
            for l in range(num_class):
                total_correct_class[l] += ((pred == l) & (target == l)).sum()
                total_iou_deno_class[l] += (((pred == l) & (target != -1)) |
                                            (target == l)).sum()

            if curr_iter % config.stat_freq == 0 or curr_iter == 1:
                lrs = ', '.join(
                    ['{:.3e}'.format(g['lr']) for g in optimizer.param_groups])
                IoU = ((total_correct_class) /
                       (total_iou_deno_class + 1e-6)).mean() * 100.
                debug_str = "===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format(
                    epoch, curr_iter,
                    len(data_loader) // config.iter_size, losses.avg, lrs)
                debug_str += "Score {:.3f}\tIoU {:.3f}\tData time: {:.4f}, Iter time: {:.4f}".format(
                    scores.avg, IoU.item(), data_time_avg.avg,
                    iter_time_avg.avg)
                if regs.avg > 0:
                    debug_str += "\n Additional Reg Loss {:.3f}".format(
                        regs.avg)

                if rank == 0:
                    logging.info(debug_str)
                # Reset timers
                data_time_avg.reset()
                iter_time_avg.reset()
                # Write logs
                losses.reset()
                scores.reset()

            # only save status on the 1st gpu
            if rank == 0:

                # Save current status; save before val to prevent occasional mem overflow
                if curr_iter % config.save_freq == 0:
                    checkpoint(model,
                               optimizer,
                               epoch,
                               curr_iter,
                               config,
                               best_val_miou,
                               best_val_iter,
                               save_inter=True)

                # Validation
                if curr_iter % config.val_freq == 0:
                    val_miou = validate(model, val_data_loader, None,
                                        curr_iter, config, transform_data_fn
                                        )  # feedin None for SummaryWriter args
                    if val_miou > best_val_miou:
                        best_val_miou = val_miou
                        best_val_iter = curr_iter
                        checkpoint(model,
                                   optimizer,
                                   epoch,
                                   curr_iter,
                                   config,
                                   best_val_miou,
                                   best_val_iter,
                                   "best_val",
                                   save_inter=True)
                    if rank == 0:
                        logging.info(
                            "Current best mIoU: {:.3f} at iter {}".format(
                                best_val_miou, best_val_iter))

                    # Recover back
                    model.train()

            # End of iteration
            curr_iter += 1

        IoU = (total_correct_class) / (total_iou_deno_class + 1e-6)
        if rank == 0:
            logging.info('train point avg class IoU: %f' %
                         ((IoU).mean() * 100.))

        epoch += 1

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    # Save the final model
    if rank == 0:
        checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
                   best_val_iter)
        v_loss, v_score, v_mAP, val_miou = test(model, val_data_loader, config)

        if val_miou > best_val_miou:
            best_val_miou = val_miou
            best_val_iter = curr_iter
            logging.info("Final best mIoU: {} at iter {}".format(
                val_miou, curr_iter))
            checkpoint(model, optimizer, epoch, curr_iter, config,
                       best_val_miou, best_val_iter, "best_val")

            logging.info("Current best mIoU: {:.3f} at iter {}".format(
                best_val_miou, best_val_iter))
Example #23
def findsuitjob(stoper, date):
    processmsg = 'Storedailyinit start findsuit:' + date
    with Timer() as t:
        stoper.findsuit(date)
    processmsg = 'analyst done for %s in %d secs' % (date, t.secs)
    return processmsg
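
The snippet above uses Timer as a context manager that exposes the elapsed seconds. A minimal sketch of such a class, assuming .secs is populated on exit (the project's Timer may differ):

import time

class Timer(object):
    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *exc):
        self.secs = time.time() - self.start
        return False  # do not suppress exceptions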
Example #24
def train_point(model,
                data_loader,
                val_data_loader,
                config,
                transform_data_fn=None):

    device = get_torch_device(config.is_cuda)
    # Set up the train flag for batch normalization
    model.train()

    # Configuration
    data_timer, iter_timer = Timer(), Timer()
    data_time_avg, iter_time_avg = AverageMeter(), AverageMeter()
    losses, scores = AverageMeter(), AverageMeter()

    optimizer = initialize_optimizer(model.parameters(), config)
    scheduler = initialize_scheduler(optimizer, config)
    criterion = nn.CrossEntropyLoss(ignore_index=-1)

    # Train the network
    logging.info('===> Start training')
    best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True

    if config.resume:
        checkpoint_fn = config.resume + '/weights.pth'
        if osp.isfile(checkpoint_fn):
            logging.info("=> loading checkpoint '{}'".format(checkpoint_fn))
            state = torch.load(checkpoint_fn)
            curr_iter = state['iteration'] + 1
            epoch = state['epoch']
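            # drop attention-map params: their shapes depend on the voxel count
            # and will not match across runs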
            d = {
                k: v
                for k, v in state['state_dict'].items() if 'map' not in k
            }
            model.load_state_dict(d)
            if config.resume_optimizer:
                scheduler = initialize_scheduler(optimizer,
                                                 config,
                                                 last_step=curr_iter)
                optimizer.load_state_dict(state['optimizer'])
            if 'best_val' in state:
                best_val_miou = state['best_val']
                best_val_iter = state['best_val_iter']
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_fn, state['epoch']))
        else:
            raise ValueError(
                "=> no checkpoint found at '{}'".format(checkpoint_fn))

    data_iter = iter(data_loader)
    while is_training:

        num_class = 20
        total_correct_class = torch.zeros(num_class, device=device)
        total_iou_deno_class = torch.zeros(num_class, device=device)

        for iteration in range(len(data_loader) // config.iter_size):
            optimizer.zero_grad()
            data_time, batch_loss = 0, 0
            iter_timer.tic()
            for sub_iter in range(config.iter_size):
                # Get training data
                data_timer.tic()  # without this, data_timer.toc() below has no start point
                data = next(data_iter)
                points, target, sample_weight = data
                if config.pure_point:

                    sinput = points.transpose(1, 2).cuda().float()

                    # DEBUG: use the discrete coord for point-based
                    '''

                        feats = torch.unbind(points[:,:,:], dim=0)
                        voxel_size = config.voxel_size
                        coords = torch.unbind(points[:,:,:3]/voxel_size, dim=0)  # 0.05 is the voxel-size
                        coords, feats= ME.utils.sparse_collate(coords, feats)
                        # assert feats.reshape([16, 4096, -1]) == points[:,:,3:]
                        points_ = ME.TensorField(features=feats.float(), coordinates=coords, device=device)
                        tmp_voxel = points_.sparse()
                        sinput_ = tmp_voxel.slice(points_)
                        sinput = torch.cat([sinput_.C[:,1:]*config.voxel_size, sinput_.F[:,3:]],dim=1).reshape([config.batch_size, config.num_points, 6])
                        # sinput = sinput_.F.reshape([config.batch_size, config.num_points, 6])
                        sinput = sinput.transpose(1,2).cuda().float()

                        # sinput = torch.cat([coords[:,1:], feats],dim=1).reshape([config.batch_size, config.num_points, 6])
                        # sinput = sinput.transpose(1,2).cuda().float()
                        '''

                    # For some networks, making the network invariant to even, odd coords is important
                    # coords[:, 1:] += (torch.rand(3) * 100).type_as(coords)

                    # Preprocess input
                    # if config.normalize_color:
                    # feats = feats / 255. - 0.5

                    # torch.save(points[:,:,:3], './sandbox/tensorfield-c.pth')
                    # torch.save(points_.C, './sandbox/points-c.pth')

                else:
                    # feats = torch.unbind(points[:,:,3:], dim=0) # WRONG: should also feed in xyz as input feature
                    voxel_size = config.voxel_size
                    coords = torch.unbind(points[:, :, :3] / voxel_size,
                                          dim=0)  # 0.05 is the voxel-size
                    # Normalize the xyz in feature
                    # points[:,:,:3] = points[:,:,:3] / points[:,:,:3].mean()
                    feats = torch.unbind(points[:, :, :], dim=0)
                    coords, feats = ME.utils.sparse_collate(coords, feats)

                    # For some networks, making the network invariant to even, odd coords is important
                    coords[:, 1:] += (torch.rand(3) * 100).type_as(coords)

                    # Preprocess input
                    # if config.normalize_color:
                    # feats = feats / 255. - 0.5

                    # they are the same
                    points_ = ME.TensorField(features=feats.float(),
                                             coordinates=coords,
                                             device=device)
                    # points_1 = ME.TensorField(features=feats.float(), coordinates=coords, device=device, quantization_mode=ME.SparseTensorQuantizationMode.UNWEIGHTED_AVERAGE)
                    # points_2 = ME.TensorField(features=feats.float(), coordinates=coords, device=device, quantization_mode=ME.SparseTensorQuantizationMode.RANDOM_SUBSAMPLE)
                    sinput = points_.sparse()
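                    # points_.sparse() quantizes the continuous coords into a
                    # SparseTensor; soutput.slice(points_) below maps the voxel
                    # predictions back onto the original points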

                data_time += data_timer.toc(False)
                B, npoint = target.shape

                # model.initialize_coords(*init_args)
                soutput = model(sinput)
                if config.pure_point:
                    soutput = soutput.reshape([B * npoint, -1])
                else:
                    soutput = soutput.slice(points_).F
                    # s1 = soutput.slice(points_)
                    # print(soutput.quantization_mode)
                    # soutput.quantization_mode = ME.SparseTensorQuantizationMode.RANDOM_SUBSAMPLE
                    # s2 = soutput.slice(points_)

                # The output of the network is not sorted
                target = (target - 1).view(-1).long().to(device)

                # catch NAN
                if torch.isnan(soutput).sum() > 0:
                    import ipdb
                    ipdb.set_trace()

                loss = criterion(soutput, target)

                if torch.isnan(loss).sum() > 0:
                    import ipdb
                    ipdb.set_trace()

                loss = (loss * sample_weight.to(device)).mean()

                # Compute and accumulate gradient
                loss /= config.iter_size
                batch_loss += loss.item()
                loss.backward()
                # print(model.input_mlp[0].weight.max())
                # print(model.input_mlp[0].weight.grad.max())

            # Update number of steps
            optimizer.step()
            scheduler.step()

            # CLEAR CACHE!
            torch.cuda.empty_cache()

            data_time_avg.update(data_time)
            iter_time_avg.update(iter_timer.toc(False))

            pred = get_prediction(data_loader.dataset, soutput, target)
            score = precision_at_one(pred, target, ignore_label=-1)
            losses.update(batch_loss, target.size(0))
            scores.update(score, target.size(0))

            # Calc the iou
            for l in range(num_class):
                total_correct_class[l] += ((pred == l) & (target == l)).sum()
                total_iou_deno_class[l] += (((pred == l) & (target >= 0)) |
                                            (target == l)).sum()

            if curr_iter >= config.max_iter:
                is_training = False
                break

            if curr_iter % config.stat_freq == 0 or curr_iter == 1:
                lrs = ', '.join(
                    ['{:.3e}'.format(x) for x in scheduler.get_lr()])
                debug_str = "===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format(
                    epoch, curr_iter,
                    len(data_loader) // config.iter_size, losses.avg, lrs)
                debug_str += "Score {:.3f}\tData time: {:.4f}, Iter time: {:.4f}".format(
                    scores.avg, data_time_avg.avg, iter_time_avg.avg)
                logging.info(debug_str)
                # Reset timers
                data_time_avg.reset()
                iter_time_avg.reset()
                # Write logs
                losses.reset()
                scores.reset()

            # Save current status, save before val to prevent occasional mem overflow
            if curr_iter % config.save_freq == 0:
                checkpoint(model,
                           optimizer,
                           epoch,
                           curr_iter,
                           config,
                           best_val_miou,
                           best_val_iter,
                           save_inter=True)

            # Validation:
            # point-based models should use an alternate dataloader for eval
            # if curr_iter % config.val_freq == 0:
            # val_miou = test_points(model, val_data_loader, None, curr_iter, config, transform_data_fn)
            # if val_miou > best_val_miou:
            # best_val_miou = val_miou
            # best_val_iter = curr_iter
            # checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou, best_val_iter,
            # "best_val")
            # logging.info("Current best mIoU: {:.3f} at iter {}".format(best_val_miou, best_val_iter))

            # # Recover back
            # model.train()

            # End of iteration
            curr_iter += 1

        IoU = (total_correct_class) / (total_iou_deno_class + 1e-6)
        logging.info('train point avg class IoU: %f' % ((IoU).mean() * 100.))

        epoch += 1

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    # Save the final model
    checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
               best_val_iter)

    val_miou = test_points(model, val_data_loader, config)
    if val_miou > best_val_miou:
        best_val_miou = val_miou
        best_val_iter = curr_iter
        checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
                   best_val_iter, "best_val")
    logging.info("Current best mIoU: {:.3f} at iter {}".format(
        best_val_miou, best_val_iter))
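# For reference, the per-class IoU bookkeeping used above can be exercised
# standalone. A minimal sketch with toy tensors (no project code assumed;
# -1 marks ignored points, matching the accumulation in the training loop):
import torch

num_class = 20
pred = torch.randint(0, num_class, (4096,))
target = torch.randint(-1, num_class, (4096,))

correct = torch.zeros(num_class)
iou_deno = torch.zeros(num_class)
for l in range(num_class):
    correct[l] = ((pred == l) & (target == l)).sum()
    # union: predicted-as-l on valid points, or labeled-as-l
    iou_deno[l] = (((pred == l) & (target >= 0)) | (target == l)).sum()
IoU = correct / (iou_deno + 1e-6)
print('mean IoU: %.2f' % (IoU.mean() * 100.).item())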
Example #25
def train(model, data_loader, val_data_loader, config, transform_data_fn=None):

    device = get_torch_device(config.is_cuda)
    # Set up the train flag for batch normalization
    model.train()

    # Configuration
    data_timer, iter_timer = Timer(), Timer()
    data_time_avg, iter_time_avg = AverageMeter(), AverageMeter()
    regs, losses, scores = AverageMeter(), AverageMeter(), AverageMeter()

    optimizer = initialize_optimizer(model.parameters(), config)
    scheduler = initialize_scheduler(optimizer, config)
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)

    # Train the network
    logging.info('===> Start training')
    best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True

    if config.resume:
        # Test loaded ckpt first
        v_loss, v_score, v_mAP, v_mIoU = test(model, val_data_loader, config)

        checkpoint_fn = config.resume + '/weights.pth'
        if osp.isfile(checkpoint_fn):
            logging.info("=> loading checkpoint '{}'".format(checkpoint_fn))
            state = torch.load(checkpoint_fn)
            curr_iter = state['iteration'] + 1
            epoch = state['epoch']
            # we skip attention maps because their shapes won't match: the voxel count differs,
            # e.g. copying a param of shape (23385, 8, 4) onto one of shape (43529, 8, 4)
            d = {
                k: v
                for k, v in state['state_dict'].items() if 'map' not in k
            }
            # handle those attn maps we don't load from saved dict
            for k in model.state_dict().keys():
                if k in d.keys(): continue
                d[k] = model.state_dict()[k]
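            # (same effect as model.load_state_dict(d, strict=False), made explicit
            # so every key is accounted for)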
            model.load_state_dict(d)
            if config.resume_optimizer:
                scheduler = initialize_scheduler(optimizer,
                                                 config,
                                                 last_step=curr_iter)
                optimizer.load_state_dict(state['optimizer'])
            if 'best_val' in state:
                best_val_miou = state['best_val']
                best_val_iter = state['best_val_iter']
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_fn, state['epoch']))
        else:
            raise ValueError(
                "=> no checkpoint found at '{}'".format(checkpoint_fn))

    data_iter = iter(data_loader)
    if config.dataset == "SemanticKITTI":
        num_class = 19
        config.normalize_color = False
        config.xyz_input = False
        val_freq_ = config.val_freq
        config.val_freq = config.val_freq * 10
    elif config.dataset == "S3DIS":
        num_class = 13
        config.normalize_color = False
        config.xyz_input = False
        val_freq_ = config.val_freq
        config.val_freq = config.val_freq
    elif config.dataset == "Nuscenes":
        num_class = 16
        config.normalize_color = False
        config.xyz_input = False
        val_freq_ = config.val_freq
        config.val_freq = config.val_freq * 50
    else:
        num_class = 20
        val_freq_ = config.val_freq

    while is_training:
        total_correct_class = torch.zeros(num_class, device=device)
        total_iou_deno_class = torch.zeros(num_class, device=device)

        for iteration in range(len(data_loader) // config.iter_size):
            optimizer.zero_grad()
            data_time, batch_loss = 0, 0
            iter_timer.tic()

            if curr_iter >= config.max_iter:
                # if curr_iter >= max(config.max_iter, config.epochs * (len(data_loader) // config.iter_size)):
                is_training = False
                break
            elif curr_iter >= config.max_iter * (2 / 3):
                config.val_freq = val_freq_ * 2  # double the validation period (i.e. validate less often) in the final third

            for sub_iter in range(config.iter_size):
                # Get training data
                data_timer.tic()
                pointcloud = None

                if config.return_transformation:
                    coords, input, target, _, _, pointcloud, transformation, _ = next(
                        data_iter)
                else:
                    coords, input, target, _, _, _ = next(
                        data_iter)  # ignore unique_map and inverse_map

                if config.use_aux:
                    assert target.shape[1] == 2
                    aux = target[:, 1]
                    target = target[:, 0]
                else:
                    aux = None

                # For some networks, making the network invariant to even, odd coords is important
                coords[:, 1:] += (torch.rand(3) * 100).type_as(coords)

                # Preprocess input
                if config.normalize_color:
                    input[:, :3] = input[:, :3] / input[:, :3].max() - 0.5

                # cat xyz into the rgb feature
                if config.xyz_input:
                    # coords_norm is needed whenever xyz_input is set, not only
                    # when color is normalized, so compute it here
                    coords_norm = coords[:, 1:] / coords[:, 1:].max() - 0.5
                    input = torch.cat([coords_norm, input], dim=1)
                sinput = SparseTensor(input, coords, device=device)
                starget = SparseTensor(
                    target.unsqueeze(-1).float(),
                    coordinate_map_key=sinput.coordinate_map_key,
                    coordinate_manager=sinput.coordinate_manager,
                    device=device
                )  # must share the same coord-manager to align for sinput

                data_time += data_timer.toc(False)
                # model.initialize_coords(*init_args)

                # d = {}
                # d['c'] = sinput.C
                # d['l'] = starget.F
                # torch.save('./plot/test-label.pth')
                # import ipdb; ipdb.set_trace()

                # Set up profiler
                # memory_profiler = CUDAMemoryProfiler(
                # [model, criterion],
                # filename="cuda_memory.profile"
                # )
                # sys.settrace(memory_profiler)
                # threading.settrace(memory_profiler)

                # with torch.autograd.profiler.profile(enabled=True, use_cuda=True, record_shapes=False, profile_memory=True) as prof0:
                if aux is not None:
                    soutput = model(sinput, aux)
                elif config.enable_point_branch:
                    soutput = model(sinput,
                                    iter_=curr_iter / config.max_iter,
                                    enable_point_branch=True)
                else:
                    # label-aux, feed it in as additional reg
                    soutput = model(
                        sinput, iter_=curr_iter / config.max_iter, aux=starget
                    )  # feed in the progress of training for annealing inside the model

                # The output of the network is not sorted
                target = target.view(-1).long().to(device)
                loss = criterion(soutput.F, target.long())

                # ====== other loss regs =====
                # initialized unconditionally so regs.update() below never sees
                # an undefined name when the model has no block1
                cur_loss = torch.tensor([0.], device=device)
                if hasattr(model, 'block1'):

                    if hasattr(model.block1[0], 'vq_loss'):
                        if model.block1[0].vq_loss is not None:
                            cur_loss = torch.tensor([0.], device=device)
                            for n, m in model.named_children():
                                if 'block' in n:
                                    cur_loss += m[
                                        0].vq_loss  # m is the nn.Sequential obj, m[0] is the TRBlock
                            logging.info(
                                'Cur Loss: {}, Cur vq_loss: {}'.format(
                                    loss, cur_loss))
                            loss += cur_loss

                    if hasattr(model.block1[0], 'diverse_loss'):
                        if model.block1[0].diverse_loss is not None:
                            cur_loss = torch.tensor([0.], device=device)
                            for n, m in model.named_children():
                                if 'block' in n:
                                    cur_loss += m[
                                        0].diverse_loss  # m is the nn.Sequential obj, m[0] is the TRBlock
                            logging.info(
                                'Cur Loss: {}, Cur diverse_loss: {}'.format(
                                    loss, cur_loss))
                            loss += cur_loss

                    if hasattr(model.block1[0], 'label_reg'):
                        if model.block1[0].label_reg is not None:
                            cur_loss = torch.tensor([0.], device=device)
                            for n, m in model.named_children():
                                if 'block' in n:
                                    cur_loss += m[
                                        0].label_reg  # m is the nn.Sequential obj, m[0] is the TRBlock
                            # logging.info('Cur Loss: {}, Cur diverse _loss: {}'.format(loss, cur_loss))
                            loss += cur_loss

                # Compute and accumulate gradient
                loss /= config.iter_size
                batch_loss += loss.item()
                loss.backward()

                # soutput = model(sinput)

            # Update number of steps
            if not config.use_sam:
                optimizer.step()
            else:
                optimizer.first_step(zero_grad=True)
                soutput = model(sinput,
                                iter_=curr_iter / config.max_iter,
                                aux=starget)
                criterion(soutput.F, target.long()).backward()
                optimizer.second_step(zero_grad=True)

            if config.lr_warmup is None:
                scheduler.step()
            else:
                if curr_iter >= config.lr_warmup:
                    scheduler.step()
                else:
                    # linear warmup; only override the LR during warmup so the
                    # scheduler controls it afterwards (note: scales by the
                    # per-epoch iteration counter)
                    for g in optimizer.param_groups:
                        g['lr'] = config.lr * (iteration + 1) / config.lr_warmup

            # CLEAR CACHE!
            torch.cuda.empty_cache()

            data_time_avg.update(data_time)
            iter_time_avg.update(iter_timer.toc(False))

            pred = get_prediction(data_loader.dataset, soutput.F, target)
            score = precision_at_one(pred, target, ignore_label=-1)

            regs.update(cur_loss.item(), target.size(0))
            losses.update(batch_loss, target.size(0))
            scores.update(score, target.size(0))

            # calc the train-iou
            for l in range(num_class):
                total_correct_class[l] += ((pred == l) & (target == l)).sum()
                total_iou_deno_class[l] += (((pred == l) & (target != -1)) |
                                            (target == l)).sum()

            if curr_iter % config.stat_freq == 0 or curr_iter == 1:
                lrs = ', '.join(
                    ['{:.3e}'.format(x) for x in scheduler.get_lr()])
                IoU = ((total_correct_class) /
                       (total_iou_deno_class + 1e-6)).mean() * 100.
                debug_str = "[{}] ===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format(
                    config.log_dir.split('/')[-2], epoch, curr_iter,
                    len(data_loader) // config.iter_size, losses.avg, lrs)
                debug_str += "Score {:.3f}\tIoU {:.3f}\tData time: {:.4f}, Iter time: {:.4f}".format(
                    scores.avg, IoU.item(), data_time_avg.avg,
                    iter_time_avg.avg)
                if regs.avg > 0:
                    debug_str += "\n Additional Reg Loss {:.3f}".format(
                        regs.avg)
                # print(debug_str)
                logging.info(debug_str)
                # Reset timers
                data_time_avg.reset()
                iter_time_avg.reset()
                # Write logs
                losses.reset()
                scores.reset()

            # Save current status, save before val to prevent occasional mem overflow
            if curr_iter % config.save_freq == 0:
                checkpoint(model,
                           optimizer,
                           epoch,
                           curr_iter,
                           config,
                           best_val_miou,
                           best_val_iter,
                           save_inter=True)

            # Validation
            if curr_iter % config.val_freq == 0:
                val_miou = validate(model, val_data_loader, None, curr_iter,
                                    config, transform_data_fn)
                if val_miou > best_val_miou:
                    best_val_miou = val_miou
                    best_val_iter = curr_iter
                    checkpoint(model,
                               optimizer,
                               epoch,
                               curr_iter,
                               config,
                               best_val_miou,
                               best_val_iter,
                               "best_val",
                               save_inter=True)
                logging.info("Current best mIoU: {:.3f} at iter {}".format(
                    best_val_miou, best_val_iter))
                # print("Current best mIoU: {:.3f} at iter {}".format(best_val_miou, best_val_iter))

                # Switch back to train mode
                model.train()

            # End of iteration
            curr_iter += 1

        IoU = (total_correct_class) / (total_iou_deno_class + 1e-6)
        logging.info('train point avg class IoU: %f' % ((IoU).mean() * 100.))

        epoch += 1

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    # Save the final model
    checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
               best_val_iter)
    v_loss, v_score, v_mAP, val_miou = test(model, val_data_loader, config)
    if val_miou > best_val_miou:
        best_val_miou = val_miou
        best_val_iter = curr_iter
        checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
                   best_val_iter, "best_val")
    logging.info("Current best mIoU: {:.3f} at iter {}".format(
        best_val_miou, best_val_iter))
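# The use_sam branch above assumes an optimizer exposing first_step/second_step.
# A minimal sketch of that two-step update, modeled on the common SAM
# (Sharpness-Aware Minimization) formulation; the project's optimizer may differ:
import torch


class SimpleSAM(torch.optim.SGD):
    """SGD with a SAM-style perturb/restore two-step update."""

    def __init__(self, params, lr=0.1, rho=0.05, **kwargs):
        super().__init__(params, lr=lr, **kwargs)
        self.rho = rho

    @torch.no_grad()
    def first_step(self, zero_grad=False):
        # climb to the local worst case: w <- w + rho * g / ||g||
        grads = [p.grad for group in self.param_groups
                 for p in group['params'] if p.grad is not None]
        norm = torch.norm(torch.stack([g.norm() for g in grads]))
        scale = self.rho / (norm + 1e-12)
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                self.state[p]['e_w'] = p.grad * scale
                p.add_(self.state[p]['e_w'])
        if zero_grad:
            self.zero_grad()

    @torch.no_grad()
    def second_step(self, zero_grad=False):
        # restore the original weights, then take the base SGD step
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None or 'e_w' not in self.state[p]:
                    continue
                p.sub_(self.state[p]['e_w'])
        super().step()
        if zero_grad:
            self.zero_grad()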
Example #26
def train_distill(model,
                  data_loader,
                  val_data_loader,
                  config,
                  transform_data_fn=None):
    '''
    Distillation training.
    Some configs are hard-coded below.
    '''

    # distill_lambda = 1
    # distill_lambda = 0.33
    distill_lambda = 0.67

    # TWO_STAGE=True: the Transformer is first trained with an L2 loss to match the
    # ResNet's activations, then fine-tuned with the normal objective in stage two.
    # TWO_STAGE=False: the Transformer trains with the combined loss throughout.

    TWO_STAGE = False
    STAGE_PERCENTAGE = 0.7  # fraction of iterations spent in stage 1 when TWO_STAGE is enabled

    device = get_torch_device(config.is_cuda)
    # Set up the train flag for batch normalization
    model.train()

    # Configuration
    data_timer, iter_timer = Timer(), Timer()
    data_time_avg, iter_time_avg = AverageMeter(), AverageMeter()
    losses, scores = AverageMeter(), AverageMeter()

    optimizer = initialize_optimizer(model.parameters(), config)
    scheduler = initialize_scheduler(optimizer, config)
    criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_label)

    # Train the network
    logging.info('===> Start training')
    best_val_miou, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True

    # TODO:
    # load the sub-model only
    # FIXME: hard-coded checkpoint paths below; only the current setup is supported

    tch_model_cls = load_model('Res16UNet18A')
    tch_model = tch_model_cls(3, 20, config).to(device)

    # checkpoint_fn = "/home/zhaotianchen/project/point-transformer/SpatioTemporalSegmentation-ScanNet/outputs/ScannetSparseVoxelizationDataset/Res16UNet18A/resnet_base/weights.pth"
    checkpoint_fn = "/home/zhaotianchen/project/point-transformer/SpatioTemporalSegmentation-ScanNet/outputs/ScannetSparseVoxelizationDataset/Res16UNet18A/Res18A/weights.pth"  # voxel-size: 0.05
    assert osp.isfile(checkpoint_fn)
    logging.info("=> loading checkpoint '{}'".format(checkpoint_fn))
    state = torch.load(checkpoint_fn)
    d = {k: v for k, v in state['state_dict'].items() if 'map' not in k}
    tch_model.load_state_dict(d)
    if 'best_val' in state:
        best_val_miou = state['best_val']
        best_val_iter = state['best_val_iter']
    logging.info("=> loaded checkpoint '{}' (epoch {})".format(
        checkpoint_fn, state['epoch']))

    if config.resume:
        raise NotImplementedError
        # Test loaded ckpt first

        # checkpoint_fn = config.resume + '/weights.pth'
        # if osp.isfile(checkpoint_fn):
        # logging.info("=> loading checkpoint '{}'".format(checkpoint_fn))
        # state = torch.load(checkpoint_fn)
        # curr_iter = state['iteration'] + 1
        # epoch = state['epoch']
        # d = {k:v for k,v in state['state_dict'].items() if 'map' not in k }
        # model.load_state_dict(d)
        # if config.resume_optimizer:
        # scheduler = initialize_scheduler(optimizer, config, last_step=curr_iter)
        # optimizer.load_state_dict(state['optimizer'])
        # if 'best_val' in state:
        # best_val_miou = state['best_val']
        # best_val_iter = state['best_val_iter']
        # logging.info("=> loaded checkpoint '{}' (epoch {})".format(checkpoint_fn, state['epoch']))
        # else:
        # raise ValueError("=> no checkpoint found at '{}'".format(checkpoint_fn))

    # test after loading the ckpt
    v_loss, v_score, v_mAP, v_mIoU = test(tch_model, val_data_loader, config)
    logging.info('Teacher model tested, best_miou: {}'.format(v_mIoU))

    data_iter = iter(data_loader)
    while is_training:

        num_class = 20
        total_correct_class = torch.zeros(num_class, device=device)
        total_iou_deno_class = torch.zeros(num_class, device=device)

        total_iteration = len(data_loader) // config.iter_size
        for iteration in range(total_iteration):

            # NOTE: for single-stage distillation the L2 loss can dominate early on,
            # so a warmup phase that skips it is supported; the threshold of 0
            # below currently disables that warmup
            if iteration < 0:
                use_distill = False
            else:
                use_distill = True

            # Stage 1 / Stage 2 boundary
            if TWO_STAGE:
                stage_boundary = int(total_iteration * STAGE_PERCENTAGE)

            optimizer.zero_grad()
            data_time, batch_loss = 0, 0
            iter_timer.tic()

            for sub_iter in range(config.iter_size):
                # Get training data
                data_timer.tic()
                if config.return_transformation:
                    coords, input, target, _, _, pointcloud, transformation = next(
                        data_iter)
                else:
                    coords, input, target, _, _ = next(
                        data_iter)  # ignore unique_map and inverse_map

                if config.use_aux:
                    assert target.shape[1] == 2
                    aux = target[:, 1]
                    target = target[:, 0]
                else:
                    aux = None

                # For some networks, making the network invariant to even, odd coords is important
                coords[:, 1:] += (torch.rand(3) * 100).type_as(coords)

                # Preprocess input
                if config.normalize_color:
                    input[:, :3] = input[:, :3] / 255. - 0.5

                # cat xyz into the rgb feature
                if config.xyz_input:
                    # coords_norm is needed whenever xyz_input is set, not only
                    # when color is normalized, so compute it here
                    coords_norm = coords[:, 1:] / coords[:, 1:].max() - 0.5
                    input = torch.cat([coords_norm, input], dim=1)

                sinput = SparseTensor(input, coords, device=device)

                # TODO: return both models' outputs
                # in order not to break the validation interface, use a get_loss to fetch the registered loss

                data_time += data_timer.toc(False)
                # model.initialize_coords(*init_args)
                if aux is not None:
                    raise NotImplementedError

                # flatten ground truth tensor
                target = target.view(-1).long().to(device)

                if TWO_STAGE:
                    if iteration < stage_boundary:
                        # Stage 1: train transformer on L2 loss
                        soutput, anchor = model(sinput, save_anchor=True)
                        # Make sure gradients don't flow into the teacher model
                        with torch.no_grad():
                            _, tch_anchor = tch_model(sinput, save_anchor=True)
                        loss = DistillLoss(tch_anchor, anchor)
                    else:
                        # Stage 2: finetune transformer on Cross-Entropy
                        soutput = model(sinput)
                        loss = criterion(soutput.F, target.long())
                else:
                    if use_distill:  # after warm up
                        soutput, anchor = model(sinput, save_anchor=True)
                        # with a pretrained teacher, don't let gradients flow back and update its params
                        with torch.no_grad():
                            tch_soutput, tch_anchor = tch_model(
                                sinput, save_anchor=True)

                    else:  # warming up
                        soutput = model(sinput)
                    # The output of the network is not sorted
                    loss = criterion(soutput.F, target.long())
                    # add the L2 distillation loss after warmup
                    if use_distill:
                        distill_loss = DistillLoss(tch_anchor,
                                                   anchor) * distill_lambda
                        loss += distill_loss

                # Compute and accumulate gradient
                loss /= config.iter_size
                batch_loss += loss.item()
                loss.backward()

            # Update number of steps
            optimizer.step()
            scheduler.step()

            # CLEAR CACHE!
            torch.cuda.empty_cache()

            data_time_avg.update(data_time)
            iter_time_avg.update(iter_timer.toc(False))

            pred = get_prediction(data_loader.dataset, soutput.F, target)
            score = precision_at_one(pred, target, ignore_label=-1)
            losses.update(batch_loss, target.size(0))
            scores.update(score, target.size(0))

            # calc the train-iou
            for l in range(num_class):
                total_correct_class[l] += ((pred == l) & (target == l)).sum()
                total_iou_deno_class[l] += (((pred == l) & (target != -1)) |
                                            (target == l)).sum()

            if curr_iter >= config.max_iter:
                is_training = False
                break

            if curr_iter % config.stat_freq == 0 or curr_iter == 1:
                lrs = ', '.join(
                    ['{:.3e}'.format(x) for x in scheduler.get_lr()])
                debug_str = "[{}] ===> Epoch[{}]({}/{}): Loss {:.4f}\tLR: {}\t".format(
                    config.log_dir, epoch, curr_iter,
                    len(data_loader) // config.iter_size, losses.avg, lrs)
                debug_str += "Score {:.3f}\tData time: {:.4f}, Iter time: {:.4f}".format(
                    scores.avg, data_time_avg.avg, iter_time_avg.avg)
                logging.info(debug_str)
                if use_distill and not TWO_STAGE:
                    logging.info('Loss {} Distill Loss:{}'.format(
                        loss, distill_loss))
                # Reset timers
                data_time_avg.reset()
                iter_time_avg.reset()
                losses.reset()
                scores.reset()

            # Save current status, save before val to prevent occasional mem overflow
            if curr_iter % config.save_freq == 0:
                checkpoint(model,
                           optimizer,
                           epoch,
                           curr_iter,
                           config,
                           best_val_miou,
                           best_val_iter,
                           save_inter=True)

            # Validation
            if curr_iter % config.val_freq == 0:
                val_miou = validate(model, val_data_loader, None, curr_iter,
                                    config, transform_data_fn)
                if val_miou > best_val_miou:
                    best_val_miou = val_miou
                    best_val_iter = curr_iter
                    checkpoint(model,
                               optimizer,
                               epoch,
                               curr_iter,
                               config,
                               best_val_miou,
                               best_val_iter,
                               "best_val",
                               save_inter=True)
                logging.info("Current best mIoU: {:.3f} at iter {}".format(
                    best_val_miou, best_val_iter))

                # Switch back to train mode
                model.train()

            # End of iteration
            curr_iter += 1

        IoU = (total_correct_class) / (total_iou_deno_class + 1e-6)
        logging.info('train point avg class IoU: %f' % ((IoU).mean() * 100.))

        epoch += 1

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    # Save the final model
    checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
               best_val_iter)
    v_loss, v_score, v_mAP, val_miou = test(model, val_data_loader, config)
    if val_miou > best_val_miou:
        best_val_miou = val_miou
        best_val_iter = curr_iter
        checkpoint(model, optimizer, epoch, curr_iter, config, best_val_miou,
                   best_val_iter, "best_val")
    logging.info("Current best mIoU: {:.3f} at iter {}".format(
        best_val_miou, best_val_iter))
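# train_distill above assumes a DistillLoss comparing student and teacher anchor
# activations. A minimal sketch of an L2 feature-matching loss (an assumption:
# the actual DistillLoss may add normalization or per-layer weighting):
import torch.nn.functional as F


def l2_distill_loss(tch_anchors, stu_anchors):
    """Mean squared error between corresponding teacher/student feature maps."""
    assert len(tch_anchors) == len(stu_anchors)
    loss = 0.
    for t, s in zip(tch_anchors, stu_anchors):
        loss = loss + F.mse_loss(s, t.detach())  # detach: no grad into the teacher
    return loss / len(tch_anchors)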
Example #27
def train(pipeline_model, data_loader, val_data_loader, config):
    # Set up the train flag for batch normalization
    pipeline_model.train()

    num_devices = torch.cuda.device_count()
    num_devices = min(config.max_ngpu, num_devices)
    devices = list(range(num_devices))
    target_device = devices[0]
    pipeline_model.to(target_device)
    if num_devices > 1:
        pipeline_model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm(
            pipeline_model, devices)

    # Configuration
    writer = SummaryWriter(logdir=config.log_dir)
    data_timer, iter_timer = Timer(), Timer()
    data_time_avg, iter_time_avg = AverageMeter(), AverageMeter()
    meters = collections.defaultdict(AverageMeter)
    hists = pipeline_model.initialize_hists()

    optimizer = pipeline_model.initialize_optimizer(config)
    scheduler = pipeline_model.initialize_scheduler(optimizer, config)

    # Train the network
    logging.info('===> Start training')
    best_val, best_val_iter, curr_iter, epoch, is_training = 0, 0, 1, 1, True

    if config.resume:
        if osp.isfile(config.resume):
            logging.info("=> loading checkpoint '{}'".format(config.resume))
            state = torch.load(config.resume)
            curr_iter = state['iteration'] + 1
            epoch = state['epoch']
            pipeline_model.load_state_dict(state['state_dict'])
            if config.resume_optimizer:
                curr_iter = state['iteration'] + 1
                scheduler = pipeline_model.initialize_scheduler(
                    optimizer, config, last_step=curr_iter)
                pipeline_model.load_optimizer(optimizer, state['optimizer'])
            if 'best_val' in state:
                best_val = state['best_val']
                best_val_iter = state['best_val_iter']
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                config.resume, state['epoch']))
        else:
            logging.info("=> no checkpoint found at '{}'".format(
                config.resume))

    data_iter = iter(data_loader)
    while is_training:
        for iteration in range(len(data_loader)):
            pipeline_model.reset_gradient(optimizer)
            iter_timer.tic()

            pipelines = parallel.replicate(pipeline_model, devices)

            # Get training data
            data_timer.tic()
            inputs = []
            for pipeline, device in zip(pipelines, devices):
                with torch.cuda.device(device):
                    while True:
                        datum = pipeline.load_datum(data_iter, has_gt=True)
                        num_boxes = sum(box.shape[0]
                                        for box in datum['bboxes_coords'])
                        if config.skip_empty_boxes and num_boxes == 0:
                            continue
                        break
                    inputs.append(datum)
            data_time_avg.update(data_timer.toc(False))

            outputs = parallel.parallel_apply(pipelines,
                                              [(x, True) for x in inputs],
                                              devices=devices)
            losses = parallel.parallel_apply(
                [pipeline.loss for pipeline in pipelines],
                tuple(zip(inputs, outputs)),
                devices=devices)
            losses = parallel.gather(losses, target_device)
            losses = dict([(k, v.mean()) for k, v in losses.items()])

            meters, hists = pipeline_model.update_meters(meters, hists, losses)

            # Compute and accumulate gradient
            losses['loss'].backward()

            # Update number of steps
            pipeline_model.step_optimizer(losses, optimizer, scheduler,
                                          iteration)

            iter_time_avg.update(iter_timer.toc(False))

            if curr_iter >= config.max_iter:
                is_training = False
                break

            if curr_iter % config.stat_freq == 0 or curr_iter == 1:
                lrs = ', '.join([
                    '{:.3e}'.format(x) for x in scheduler['default'].get_lr()
                ])
                debug_str = "===> Epoch[{}]({}/{}): LR: {}\n".format(
                    epoch, curr_iter, len(data_loader), lrs)
                debug_str += log_meters(meters, log_perclass_meters=False)
                debug_str += f"\n    data time: {data_time_avg.avg:.3f}"
                debug_str += f"    iter time: {iter_time_avg.avg:.3f}"
                logging.info(debug_str)

                # Reset timers
                data_time_avg.reset()
                iter_time_avg.reset()

                # Write logs
                update_writer(writer, meters, curr_iter, 'training')
                writer.add_scalar('training/learning_rate',
                                  scheduler['default'].get_lr()[0], curr_iter)

                # Reset meters
                reset_meters(meters, hists)

            # Save current status, save before val to prevent occasional mem overflow
            if curr_iter % config.save_freq == 0:
                checkpoint(pipeline_model, optimizer, epoch, curr_iter, config,
                           best_val, best_val_iter)

            if config.heldout_save_freq > 0 and curr_iter % config.heldout_save_freq == 0:
                checkpoint(pipeline_model,
                           optimizer,
                           epoch,
                           curr_iter,
                           config,
                           best_val,
                           best_val_iter,
                           heldout_save=True)

            # Validation
            if curr_iter % config.val_freq == 0:
                if num_devices > 1:
                    unconvert_sync_batchnorm(pipeline_model)
                best_val, best_val_iter = validate(pipeline_model,
                                                   val_data_loader, config,
                                                   writer, curr_iter, best_val,
                                                   best_val_iter, optimizer,
                                                   epoch)
                if num_devices > 1:
                    pipeline_model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm(
                        pipeline_model, devices)

            if curr_iter % config.empty_cache_freq == 0:
                # Clear cache
                torch.cuda.empty_cache()

            # End of iteration
            curr_iter += 1

        epoch += 1

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    # Save the final model
    if num_devices > 1:
        unconvert_sync_batchnorm(pipeline_model)
    validate(pipeline_model, val_data_loader, config, writer, curr_iter,
             best_val, best_val_iter, optimizer, epoch)
    if num_devices > 1:
        pipeline_model = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm(
            pipeline_model, devices)
    checkpoint(pipeline_model, optimizer, epoch, curr_iter, config, best_val,
               best_val_iter)
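# Example #27 drives data parallelism manually via replicate / parallel_apply /
# gather. A minimal sketch of that pattern with plain torch.nn.parallel
# primitives on a toy module (the pipeline adds per-device data loading on top):
import torch
from torch.nn import parallel

if torch.cuda.device_count() >= 2:
    devices = [0, 1]
    module = torch.nn.Linear(8, 4).cuda(devices[0])
    replicas = parallel.replicate(module, devices)  # copy weights to each GPU
    inputs = [torch.randn(16, 8, device=d) for d in devices]  # one shard per GPU
    outputs = parallel.parallel_apply(replicas, [(x,) for x in inputs])
    merged = parallel.gather(outputs, devices[0])  # collect on the target device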
Example #28
    def train(self, train_queue, val_queue=None):
        ''' Given data queues, train the network '''
        # Parameter directory
        save_dir = os.path.join(cfg.DIR.OUT_PATH)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Timer for the training op and parallel data loading op.
        train_timer = Timer()
        data_timer = Timer()
        training_losses = []

        # Setup learning rates
        lr_steps = [int(k) for k in cfg.TRAIN.LEARNING_RATES.keys()]

        # Set up the lr_scheduler
        self.lr_scheduler = lr_scheduler.MultiStepLR(self.optimizer,
                                                     lr_steps,
                                                     gamma=0.1)  # gamma is the LR decay factor

        start_iter = 0
        # Resume training
        if cfg.TRAIN.RESUME_TRAIN:
            self.load(cfg.CONST.WEIGHTS)
            start_iter = cfg.TRAIN.INITIAL_ITERATION

        if cfg.TRAIN.SHOW_LOSS:  # plot the loss live during training
            import matplotlib.pyplot as plot
            plot.figure(1, figsize=(12, 5))
            plot.ion()

        # Main training loop
        for train_ind in range(start_iter, cfg.TRAIN.NUM_ITERATION + 1):
            self.lr_scheduler.step()

            data_timer.tic()
            batch_img, batch_voxel = train_queue.get()
            data_timer.toc()

            if self.net.is_x_tensor4:
                batch_img = batch_img[0]

            # Apply one gradient step
            train_timer.tic()
            loss = self.train_loss(batch_img, batch_voxel)
            train_timer.toc()

            #             print(loss)
            # training_losses.append(loss.data)  # convert to a numpy array first
            # print(type(loss))
            # print(loss.data.numpy())
            # print(loss.data.numpy().shape)
            # print(type(loss.data.numpy()))
            if (torch.cuda.is_available()):
                training_losses.append(loss.cpu().data.numpy())
            else:
                training_losses.append(loss.data.numpy())

            # Decrease learning rate at certain points
            if train_ind in lr_steps:
                # for a PyTorch optimizer the learning rate is fixed at creation,
                # unless adjusted via torch.optim.lr_scheduler
                print('Learning rate decreased to %f' %
                      cfg.TRAIN.LEARNING_RATES[str(train_ind)])

            # '''
            # Debugging modules
            # '''

            # Print status, run validation, check divergence, and save model.
            if train_ind % cfg.TRAIN.PRINT_FREQ == 0:  # e.g. every 40 iters
                # Print the current loss
                print('%s Iter: %d Loss: %f' %
                      (datetime.now(), train_ind, loss))
                '''
                @TODO(dingyadong): loss dynamic Visualization
                '''
                # plot
                if (train_ind != 0):
                    steps = np.linspace(0,
                                        train_ind,
                                        train_ind + 1,
                                        dtype=np.float32)
                    if cfg.TRAIN.SHOW_LOSS:  # live loss plotting
                        plot.plot(steps, training_losses, 'b-')
                        plot.draw()
                # plot.pause(0.05)

            if train_ind % cfg.TRAIN.VALIDATION_FREQ == 0 and val_queue is not None:
                # Print test loss and params to check convergence every N iterations

                val_losses = 0
                for i in range(cfg.TRAIN.NUM_VALIDATION_ITERATIONS):
                    batch_img, batch_voxel = val_queue.get()
                    val_loss = self.train_loss(batch_img, batch_voxel)
                    val_losses += val_loss
                val_losses_mean = val_losses / cfg.TRAIN.NUM_VALIDATION_ITERATIONS
                print('%s Test loss: %f' % (datetime.now(), val_losses_mean))

            if train_ind % cfg.TRAIN.NAN_CHECK_FREQ == 0:
                # Check that the network parameters are all valid
                nan_or_max_param = max_or_nan(self.net.parameters())
                if has_nan(nan_or_max_param):
                    print('NAN detected')
                    break

            if train_ind % cfg.TRAIN.SAVE_FREQ == 0 and not train_ind == 0:
                self.save(training_losses, save_dir, train_ind)

            #loss is a Variable containing torch.FloatTensor of size 1
            if loss.data > cfg.TRAIN.LOSS_LIMIT:
                print("Cost exceeds the threshold. Stop training")
                break

        if cfg.TRAIN.SHOW_LOSS:  # live loss plotting
            plot.ioff()
            plot.show()
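# Note: the loop above calls lr_scheduler.step() before the gradient step; since
# PyTorch 1.1, schedulers should be stepped after optimizer.step(). A minimal
# sketch of MultiStepLR with the modern ordering (hypothetical toy model/loss):
import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=[30, 80], gamma=0.1)  # decay the LR at steps 30 and 80

for step in range(100):
    optimizer.zero_grad()
    loss = model(torch.randn(8, 4)).pow(2).mean()
    loss.backward()
    optimizer.step()
    scheduler.step()  # step the scheduler after the optimizer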
Example #29
def test(pipeline_model, data_loader, config, has_gt=True):
    global_timer, data_timer, iter_timer = Timer(), Timer(), Timer()
    meters = collections.defaultdict(AverageMeter)
    hists = pipeline_model.initialize_hists()

    logging.info('===> Start testing')

    global_timer.tic()
    data_iter = iter(data_loader)
    max_iter = len(data_loader)

    # Fix batch normalization running mean and std
    pipeline_model.eval()

    # Clear cache (when run in val mode, cleanup training cache)
    torch.cuda.empty_cache()

    if config.save_prediction or config.test_original_pointcloud:
        if config.save_prediction:
            save_pred_dir = config.save_pred_dir
            os.makedirs(save_pred_dir, exist_ok=True)
        else:
            save_pred_dir = tempfile.mkdtemp()
        if os.listdir(save_pred_dir):
            raise ValueError(f'Directory {save_pred_dir} not empty. '
                             'Please remove the existing prediction.')

    with torch.no_grad():
        for iteration in range(max_iter):
            iter_timer.tic()
            data_timer.tic()
            datum = pipeline_model.load_datum(data_iter, has_gt=has_gt)
            data_time = data_timer.toc(False)

            output_dict = pipeline_model(datum, False)
            iter_time = iter_timer.toc(False)

            if config.save_prediction or config.test_original_pointcloud:
                pipeline_model.save_prediction(datum, output_dict,
                                               save_pred_dir, iteration)

            if config.visualize and iteration % config.visualize_freq == 0:
                pipeline_model.visualize_predictions(datum, output_dict,
                                                     iteration)

            if has_gt:
                loss_dict = pipeline_model.loss(datum, output_dict)
                if config.visualize and iteration % config.visualize_freq == 0:
                    pipeline_model.visualize_groundtruth(datum, iteration)
                loss_dict.update(pipeline_model.evaluate(datum, output_dict))

                meters, hists = pipeline_model.update_meters(
                    meters, hists, loss_dict)

            if iteration % config.test_stat_freq == 0 and iteration > 0:
                debug_str = "===> {}/{}\n".format(iteration, max_iter)
                debug_str += log_meters(meters, log_perclass_meters=True)
                debug_str += f"\n    data time: {data_time:.3f}    iter time: {iter_time:.3f}"
                logging.info(debug_str)

            if iteration % config.empty_cache_freq == 0:
                # Clear cache
                torch.cuda.empty_cache()

    global_time = global_timer.toc(False)

    debug_str = "===> Final test results:\n"
    debug_str += log_meters(meters, log_perclass_meters=True)
    logging.info(debug_str)

    if config.test_original_pointcloud:
        pipeline_model.test_original_pointcloud(save_pred_dir)

    logging.info('Finished test. Elapsed time: {:.4f}'.format(global_time))

    # Explicit memory cleanup
    if hasattr(data_iter, 'cleanup'):
        data_iter.cleanup()

    return meters
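# test() above aggregates statistics with collections.defaultdict(AverageMeter).
# A minimal AverageMeter sketch (an assumption: the project's class may track
# more fields, but update/avg/reset is the interface used here):
class AverageMeter:
    """Tracks a running sum and count, exposing the running average as `avg`."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val, self.sum, self.count, self.avg = 0, 0, 0, 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count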
Example #30
    def train(self, generator_queue, discriminator_queue):
        ''' Given data queues, train the network '''
        # Parameter directory
        save_dir = os.path.join(cfg.DIR.OUT_PATH)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Timer for the training op and parallel data loading op.
        train_timer = Timer()
        data_timer = Timer()

        start_iter = 0
        # Resume training
        if cfg.TRAIN.RESUME_TRAIN:
            self.net.load(cfg.CONST.WEIGHTS)
            start_iter = cfg.TRAIN.INITIAL_ITERATION

        # Setup learning rates
        lr = cfg.TRAIN.DEFAULT_LEARNING_RATE
        lr_steps = [int(k) for k in cfg.TRAIN.LEARNING_RATES.keys()]

        print('Set the learning rate to %f.' % lr)
        gen_lr = lr

        discriminator_losses = []
        generator_losses = []
        mask_losses = []
        generator_idx = 0
        voxel_loss = 0
        generator_loss = 0
        mask_loss = 0
        discriminator_loss = 0

        # Main training loop
        for train_ind in range(start_iter, cfg.TRAIN.NUM_ITERATION + 1):
            self.net.noise.set_value(max(1 - float(train_ind) / 20000., 1e-8))
            data_timer.tic()
            gen_img, gen_camera, _ = generator_queue.get()
            data_timer.toc()
            data_timer.tic()
            _, _, disc_voxel = discriminator_queue.get()
            data_timer.toc()

            # Apply one gradient step
            train_timer.tic()
            if self.net.discriminator_loss is not None:
                error_F, error_R = self.evaluate_discriminator(
                    gen_img, gen_camera, disc_voxel)
                if error_F > 0.2 or error_R > 0.2:
                    self.set_lr(gen_lr / 100.)
                    discriminator_loss = self.discriminator_train_loss(
                        gen_img, gen_camera, disc_voxel)
                    discriminator_losses.append(discriminator_loss)
            self.set_lr(gen_lr)
            results = self.generator_train_loss(gen_img, gen_camera)
            generator_loss = results[0]
            generator_losses.append(generator_loss)
            generator_idx += 1
            mask_loss = results[1]
            mask_losses.append(mask_loss)
            activations = results[2:]
            train_timer.toc()

            # Decrease learning rate at certain points
            if train_ind in lr_steps:
                # edict only takes strings as keys, hence the str() lookup
                gen_lr = float(cfg.TRAIN.LEARNING_RATES[str(train_ind)])
                print('Learning rate decreased to %f' % gen_lr)

            # Debugging modules
            #
            # Print status, run validation, check divergence, and save model.
            if train_ind % cfg.TRAIN.PRINT_FREQ == 0:
                # Print the current loss
                get_mean = lambda x, y: np.mean(x) if x else y
                print("""%s Iter %d: Discriminator loss %f, Generator """
                      """loss %f, Mask loss %f""" %
                      (datetime.now(), train_ind,
                       get_mean(discriminator_losses, discriminator_loss),
                       get_mean(generator_losses, generator_loss),
                       get_mean(mask_losses, mask_loss)))
                discriminator_losses = []
                generator_losses = []
                mask_losses = []

            if train_ind % cfg.TRAIN.NAN_CHECK_FREQ == 0:
                # Check that the network parameters are all valid
                max_param = max_or_nan(self.net.all_params)
                if np.isnan(max_param):
                    print('NAN detected')
                    break

            if train_ind % cfg.TRAIN.SAVE_FREQ == 0 and not train_ind == 0:
                self.save(mask_losses, save_dir, train_ind)