Example no. 1
    def get_embeddings(self, dataloader, evaluate=True):
        args = self.args
        self.model.eval()

        local_step = 0
        push_to_cpu_steps = 32
        idxs_list = []
        embeds_list = []
        master_idxs_list = []
        master_embeds_list = []

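        # helper: all-gather the per-rank embedding/index chunks; rank 0 moves
        # them to CPU and appends them to the master lists, and every rank gets
        # fresh empty lists back so the GPU-side buffers can be reused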
        def _synchronize_lists(_embeds_list, _idxs_list):
            gathered_data = all_gather({
                'embeds_list': _embeds_list,
                'idxs_list': _idxs_list,
            })
            if get_rank() == 0:
                _embeds_list = [d['embeds_list'] for d in gathered_data]
                _embeds_list = flatten(_embeds_list)
                _embeds_list = [x.cpu() for x in _embeds_list]
                _idxs_list = [d['idxs_list'] for d in gathered_data]
                _idxs_list = flatten(_idxs_list)
                _idxs_list = [x.cpu() for x in _idxs_list]
                master_embeds_list.extend(_embeds_list)
                master_idxs_list.extend(_idxs_list)
            synchronize()
            return [], []

        batch_iterator = tqdm(dataloader,
                              desc='Getting embeddings...',
                              disable=(not evaluate or get_rank() != 0
                                       or args.disable_logging))
        for batch in batch_iterator:
            batch = tuple(t.to(args.device, non_blocking=True) for t in batch)
            with torch.no_grad():
                inputs = {
                    'input_ids': batch[1],
                    'attention_mask': batch[2],
                    'token_type_ids': batch[3],
                    'concat_input': False
                }
                embeds_list.append(self.model(**inputs))
                idxs_list.append(batch[0])
                local_step += 1
                if local_step % push_to_cpu_steps == 0:
                    embeds_list, idxs_list = _synchronize_lists(
                        embeds_list, idxs_list)

        embeds_list, idxs_list = _synchronize_lists(embeds_list, idxs_list)

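        # rank 0 concatenates the gathered chunks and de-duplicates them by
        # example index; all other ranks return (None, None)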
        idxs, embeds = None, None
        if get_rank() == 0:
            idxs = torch.cat(master_idxs_list, dim=0).numpy()
            idxs, indices = np.unique(idxs, return_index=True)
            embeds = torch.cat(master_embeds_list, dim=0).numpy()
            embeds = embeds[indices]
        synchronize()
        return idxs, embeds
Example no. 2
    def metric_3D(self, model, cfg):
        p_json = cfg.DATASET.TEST_PERSON_LIST
        datadir_4D = "/root/ACDC_DataSet/4dData"

        with open(p_json, "r") as f:
            persons = json.load(f)

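        # accumulate per-class Dice and Hausdorff distance across all test persons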
        total_segMetrics = {"dice": [[], [], []], "hausdorff": [[], [], []]}
        for i, p in enumerate(persons):
            # imgs, gts = personTo4Ddata(p, val_list)
            if p in self.caches_4D.keys():
                imgs, gts = self.caches_4D[p]
            else:
                imgs = np.load(
                    os.path.join(datadir_4D,
                                 p.split('-')[1], '4d_data.npy'))
                gts = np.load(
                    os.path.join(datadir_4D,
                                 p.split('-')[1], '4d_gt.npy'))
                self.caches_4D[p] = [imgs, gts]

            imgs, gts = imgs.astype(np.float32)[..., None, :], gts.astype(
                np.float32)[..., None, :]
            imgs, gts = joint_transform(imgs, gts, cfg)
            gts = [gt[:, 0, ...].numpy() for gt in gts]

            preds = test_person(
                model, imgs, multi_batches=True,
                used_df=cfg.DATASET.DF_USED)  # (times, slices, H, W)

            segMetrics = {"dice": [], "hausdorff": []}
            for j in range(len(preds)):
                segMetrics["dice"].append(
                    metrics.dice3D(preds[j], gts[j], gts[j].shape))
                segMetrics["hausdorff"].append(metrics.hd_3D(preds[j], gts[j]))

            for k, v in segMetrics.items():
                segMetrics[k] = np.array(v).reshape((-1, 3))

            for k, v in total_segMetrics.items():
                for j in range(3):
                    total_segMetrics[k][j] += segMetrics[k][:, j].tolist()
            # person i is done
            if get_rank() == 0:
                print("\r{}/{} {:.0%}\r".format(i, len(persons),
                                                i / len(persons)),
                      end='')
        if get_rank() == 0: print()

        mean = {}
        for k, v in total_segMetrics.items():
            mean.update({"LV_" + k: np.mean(v[1])})
            mean.update({"MYO_" + k: np.mean(v[2])})
            mean.update({"RV_" + k: np.mean(v[0])})
        return mean
Example no. 3
    def __init__(self, filename, benchmark, organization):
        self.mllogger = mllog.get_mllogger()
        self.comm_rank = comm.get_rank()
        self.comm_size = comm.get_size()
        self.constants = constants

        # create logging dir if it does not exist
        logdir = os.path.dirname(filename)
        if self.comm_rank == 0:
            if not os.path.isdir(logdir):
                os.makedirs(logdir)
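        # wait for rank 0 to create the directory before any rank starts logging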
        if torch.distributed.is_available(
        ) and torch.distributed.is_initialized():
            torch.distributed.barrier()

        # create config
        mllog.config(filename=filename)
        self.mllogger.logger.propagate = False
        self.log_event(key=constants.SUBMISSION_BENCHMARK, value=benchmark)

        self.log_event(key=constants.SUBMISSION_ORG, value=organization)

        self.log_event(key=constants.SUBMISSION_DIVISION, value='closed')

        self.log_event(key=constants.SUBMISSION_STATUS, value='onprem')

        self.log_event(
            key=constants.SUBMISSION_PLATFORM,
            value=f'{self.comm_size}xSUBMISSION_PLATFORM_PLACEHOLDER')
Example no. 4
def main():
    args = parse_args()
    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1
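    # multi-GPU run: bind this process to its GPU and join the NCCL process group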
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl",
            init_method="env://",
        )
        comm.synchronize()

    cfg = get_default_cfg()
    if args.config_file:
        cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.output_dir
    if output_dir:
        misc.mkdir(output_dir)

    logger = setup_logger("EfficientDet", output_dir, comm.get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Loaded configuration file {}".format(args.config_file))
    logger.info("Running with config:\n{}".format(cfg))

    output_config_path = os.path.join(output_dir, 'config.yml')
    logger.info("Saving config into: {}".format(output_config_path))
    misc.save_config(cfg, output_config_path)

    model = train(cfg, args.local_rank, args.distributed)
Example no. 5
    def compute_scores_for_inference(self, clusters_mx, per_example_negs):
        # Embed every unique example referenced by clusters_mx and
        # per_example_negs, then score the positive pairs (same cluster row)
        # and the sampled negative pairs by embedding dot product. Rank 0
        # returns the scores as a sparse coo_matrix; other ranks return None.
        args = self.args

        # get all of the unique examples
        examples = clusters_mx.data.tolist()
        examples.extend(flatten(per_example_negs.tolist()))
        examples = unique(examples)
        examples = list(filter(lambda x: x >= 0, examples))

        # create dataset and dataloader
        dataset = InferenceEmbeddingDataset(args, examples,
                                            args.train_cache_dir)
        dataloader = InferenceEmbeddingDataLoader(args, dataset)

        # get the unique idxs and embeds for each idx
        idxs, embeds = self.get_embeddings(dataloader, evaluate=False)

        sparse_graph = None
        if get_rank() == 0:
            # create inverse index for mapping
            inverse_idxs = {v: k for k, v in enumerate(idxs)}

            ## make the list of pairs of dot products we need
            _row = clusters_mx.row
            # positives:
            local_pos_a, local_pos_b = np.where(
                np.triu(_row[np.newaxis, :] == _row[:, np.newaxis], k=1))
            pos_a = clusters_mx.data[local_pos_a]
            pos_b = clusters_mx.data[local_pos_b]
            # negatives:
            local_neg_a = np.tile(
                np.arange(per_example_negs.shape[0])[:, np.newaxis],
                (1, per_example_negs.shape[1])).flatten()
            neg_a = clusters_mx.data[local_neg_a]
            neg_b = per_example_negs.flatten()

            neg_mask = (neg_b != -1)
            neg_a = neg_a[neg_mask]
            neg_b = neg_b[neg_mask]

            # create subset of the sparse graph we care about
            a = np.concatenate((pos_a, neg_a), axis=0)
            b = np.concatenate((pos_b, neg_b), axis=0)
            edges = list(zip(a, b))
            affinities = [
                np.dot(embeds[inverse_idxs[i]], embeds[inverse_idxs[j]])
                for i, j in edges
            ]

            # convert to coo_matrix
            edges = np.asarray(edges).T
            affinities = np.asarray(affinities)
            _sparse_num = np.max(edges) + 1
            sparse_graph = coo_matrix((affinities, edges),
                                      shape=(_sparse_num, _sparse_num))

        synchronize()
        return sparse_graph
Example no. 6
def train(cfg, local_rank, distributed):

    num_classes = COCODataset(cfg.data.train[0], cfg.data.train[1]).num_classes
    model = EfficientDet(num_classes=num_classes, model_name=cfg.model.name)
    inp_size = model.config['inp_size']
    device = torch.device(cfg.device)
    model.to(device)

    optimizer = build_optimizer(model, **optimizer_kwargs(cfg))
    lr_scheduler = build_lr_scheduler(optimizer, **lr_scheduler_kwargs(cfg))

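    # wrap model and optimizer with apex AMP ('O1' mixed precision, 'O0' pure fp32)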
    use_mixed_precision = cfg.dtype == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
            find_unused_parameters=True)

    arguments = {}
    arguments["iteration"] = 0
    output_dir = cfg.output_dir
    save_to_disk = comm.get_rank() == 0
    checkpointer = Checkpointer(model, optimizer, lr_scheduler, output_dir,
                                save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.model.resume)
    arguments.update(extra_checkpoint_data)

    train_dataloader = build_dataloader(cfg,
                                        inp_size,
                                        is_train=True,
                                        distributed=distributed,
                                        start_iter=arguments["iteration"])

    test_period = cfg.test.test_period
    if test_period > 0:
        val_dataloader = build_dataloader(cfg,
                                          inp_size,
                                          is_train=False,
                                          distributed=distributed)
    else:
        val_dataloader = None

    checkpoint_period = cfg.solver.checkpoint_period
    log_period = cfg.solver.log_period

    do_train(cfg, model, train_dataloader, val_dataloader, optimizer,
             lr_scheduler, checkpointer, device, checkpoint_period,
             test_period, log_period, arguments)

    return model
Example no. 7
    def get_edge_affinities(self, edges, example_dir, knn_index):
        # affinities are computed on rank 0 only; all other ranks fall
        # through and implicitly return None
        if get_rank() == 0:
            idxs, embeds = knn_index.idxs, knn_index.X
            inverse_idxs = {v: k for k, v in enumerate(idxs)}
            affinities = [
                np.dot(embeds[inverse_idxs[i]], embeds[inverse_idxs[j]])
                for i, j in edges
            ]
            return affinities
Example no. 8
def train(cfg,
          local_rank,
          distributed,
          logger=None,
          tblogger=None,
          transfer_weight=False,
          change_lr=False):
    device = torch.device('cuda')

    # create model
    logger.info('Creating model "{}"'.format(cfg.MODEL.ARCHITECTURE))
    model = build_model(cfg).to(device)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=255).to(device)
    optimizer = make_optimizer(cfg, model)
    # model, optimizer = apex.amp.initialize(model, optimizer, opt_level='O2')
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        # model = apex.parallel.DistributedDataParallel(model)
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            broadcast_buffers=True,
        )

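    # only rank 0 writes checkpoints to disk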
    save_to_disk = get_rank() == 0

    # checkpoint
    arguments = {}
    arguments['iteration'] = 0
    arguments['best_iou'] = 0
    checkpointer = Checkpointer(model, optimizer, scheduler, cfg.LOGS.DIR,
                                save_to_disk, logger)
    extra_checkpoint_data = checkpointer.load(
        f=cfg.MODEL.WEIGHT,
        model_weight_only=transfer_weight,
        change_scheduler=change_lr)
    arguments.update(extra_checkpoint_data)

    # data_loader
    logger.info('Loading dataset "{}"'.format(cfg.DATASETS.TRAIN))
    data_loader = make_data_loader(cfg, 'train', distributed)
    data_loader_val = make_data_loader(cfg, 'val', distributed)

    do_train(cfg,
             model=model,
             data_loader=data_loader,
             optimizer=optimizer,
             scheduler=scheduler,
             criterion=criterion,
             checkpointer=checkpointer,
             device=device,
             arguments=arguments,
             tblogger=tblogger,
             data_loader_val=data_loader_val,
             distributed=distributed)
Example no. 9
    def train_step(self, batch):
        args = self.args

        # get the batch of clusters and approx negs for each individual example
        clusters_mx, per_example_negs = batch

        # compute scores using up-to-date model
        #sparse_graph = self.embed_sub_trainer.compute_scores_for_inference(
        #        clusters_mx, per_example_negs)
        #sparse_graph = self._build_temp_sparse_graph(
        #        clusters_mx, per_example_negs)

        # TODO: produce sparse graph w/ concat model in inference mode
        sparse_graph = self.concat_sub_trainer.compute_scores_for_inference(
            clusters_mx, per_example_negs)

        # create custom datasets for training
        embed_dataset_list = None
        concat_dataset_list = None
        dataset_metrics = None
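        # rank 0 builds the training subsets from the affinity graph; the
        # results are then broadcast so every rank sees the same datasets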
        if get_rank() == 0:
            dataset_lists, dataset_metrics = self.dataset_builder(
                clusters_mx, sparse_graph, self.train_metadata)
            embed_dataset_list, concat_dataset_list = dataset_lists
        dataset_metrics = broadcast(dataset_metrics, src=0)
        embed_dataset_list = broadcast(embed_dataset_list, src=0)
        concat_dataset_list = broadcast(concat_dataset_list, src=0)

        # take care of empty dataset list (should only happen when only considering m-m edges)
        if embed_dataset_list is None or concat_dataset_list is None:
            return {}

        ## train on datasets
        #embed_return_dict = self.embed_sub_trainer.train_on_subset(
        #        embed_dataset_list, self.train_metadata
        #)

        concat_return_dict = self.concat_sub_trainer.train_on_subset(
            concat_dataset_list, self.train_metadata)

        #embed_return_dict = broadcast(embed_return_dict, src=0)
        concat_return_dict = broadcast(concat_return_dict, src=0)

        return_dict = {}
        return_dict.update(dataset_metrics)
        #return_dict.update(embed_return_dict)
        return_dict.update(concat_return_dict)

        #if get_rank() == 0:
        #    embed()
        #synchronize()
        #exit()

        return return_dict
Example no. 10
    def _train_softmax(self, dataset_list, metadata):
        args = self.args

        losses = []
        time_per_dataset = []
        dataset_sizes = []

        self.model.train()
        self.model.zero_grad()
        criterion = nn.CrossEntropyLoss()
        for dataset in dataset_list:
            _dataset_start_time = time.time()
            dataset_sizes.append(len(dataset))
            dataloader = SoftmaxEmbeddingDataLoader(args, dataset)
            for batch in dataloader:
                batch = tuple(t.to(args.device) for t in batch)
                inputs = {
                    'input_ids': batch[1],
                    'attention_mask': batch[2],
                    'token_type_ids': batch[3],
                    'concat_input': False
                }
                outputs = self.model(**inputs)
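                # dot products between the anchor embedding (slot 0) and each
                # candidate (slots 1..); cross-entropy with an all-zeros target
                # treats the first candidate as the positive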
                pos_neg_dot_prods = torch.sum(outputs[:, 0:1, :] *
                                              outputs[:, 1:, :],
                                              dim=-1)
                target = torch.zeros(pos_neg_dot_prods.shape[0],
                                     dtype=torch.long).cuda()
                loss = criterion(pos_neg_dot_prods, target)
                losses.append(loss.item())
                loss.backward()

            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           args.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            self.model.zero_grad()
            time_per_dataset.append(time.time() - _dataset_start_time)

        gathered_data = all_gather({
            'losses': losses,
        })

        if get_rank() == 0:
            losses = flatten([d['losses'] for d in gathered_data])
            loss = np.mean(losses)

            synchronize()
            return {'embed_loss': loss}
        else:
            synchronize()
            return None
Example no. 11
    def eval_epoch(self, d_loader):
        self.model.eval()

        eval_dict = {}
        total_loss = 0

        # eval one epoch
        if get_rank() == 0: print("evaluating...")
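        # pick one random batch index to flag for visualization (vis=True)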
        sel_num = np.random.choice(len(d_loader), size=1)
        for i, data in enumerate(d_loader, 0):
            self.optimizer.zero_grad()
            vis = bool(i == sel_num)

            loss, tb_dict, disp_dict = self.model_fn_eval(self.model,
                                                          data,
                                                          self.criterion,
                                                          perfermance=True,
                                                          vis=vis)

            total_loss += loss.item()

            for k, v in tb_dict.items():
                if "vis" not in k:
                    eval_dict[k] = eval_dict.get(k, 0) + v
                else:
                    eval_dict[k] = v
            if get_rank() == 0:
                print("\r{}/{} {:.0%}\r".format(i, len(d_loader),
                                                i / len(d_loader)),
                      end='')
        if get_rank() == 0: print()

        for k, v in tb_dict.items():
            if "vis" not in k:
                eval_dict[k] = eval_dict.get(k, 0) / (i + 1)

        return total_loss / (i + 1), eval_dict, disp_dict
Example no. 12
    def _train_threshold(self, dataset_list, metadata):
        args = self.args

        losses = []
        time_per_dataset = []
        dataset_sizes = []

        self.model.train()
        self.model.zero_grad()
        random.shuffle(dataset_list)
        for dataset in dataset_list:
            _dataset_start_time = time.time()
            dataset_sizes.append(len(dataset))
            dataloader = ScaledPairsEmbeddingDataLoader(args, dataset)
            for batch in dataloader:
                batch = tuple(t.to(args.device) for t in batch)
                inputs = {
                    'input_ids': batch[2],
                    'attention_mask': batch[3],
                    'token_type_ids': batch[4],
                    'concat_input': False
                }
                outputs = self.model(**inputs)
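                # margin/hinge loss on the pair dot products; batch[0] holds
                # the pair label (presumably +/-1), so positive pairs are
                # pushed above the margin and negative pairs below it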
                dot_prods = torch.sum(outputs[:, 0, :] * outputs[:, 1, :],
                                      dim=-1)
                loss = torch.mean(F.relu(args.margin - (batch[0] * dot_prods)))
                losses.append(loss.item())
                loss.backward()

            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           args.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            self.model.zero_grad()
            time_per_dataset.append(time.time() - _dataset_start_time)

        gathered_data = all_gather({
            'losses': losses,
        })

        if get_rank() == 0:
            losses = flatten([d['losses'] for d in gathered_data])
            loss = np.mean(losses)

            synchronize()
            return {'embed_loss': loss}
        else:
            synchronize()
            return None
Example no. 13
    def eval_epoch(self, d_loader):
        self.model.eval()

        eval_dict = {}
        # total_loss = 0

        # eval one epoch
        if get_rank() == 0: print("evaluating...")
        for i, data in enumerate(d_loader, 0):
            self.optimizer.zero_grad()

            _, tb_dict, disp_dict = self.model_fn_eval(self.model, data, self.criterion, perfermance=True)

            # total_loss += loss.item() # removed total loss

            for k, v in tb_dict.items():
                eval_dict[k] = eval_dict.get(k, 0) + v  # dict.get(key, default): default is returned when key is missing
            if get_rank() == 0: print("\r{}/{} {:.0%}\r".format((i+1), len(d_loader), (i+1)/len(d_loader)), end='')
        if get_rank() == 0: print()

        for k, v in tb_dict.items():
            eval_dict[k] = eval_dict.get(k, 0) / (i + 1)
        
        return _, eval_dict, disp_dict  # total_loss / (i + 1) removed
Example no. 14
    def _synchronize_lists(_embeds_list, _idxs_list):
        gathered_data = all_gather({
            'embeds_list': _embeds_list,
            'idxs_list': _idxs_list,
        })
        if get_rank() == 0:
            _embeds_list = [d['embeds_list'] for d in gathered_data]
            _embeds_list = flatten(_embeds_list)
            _embeds_list = [x.cpu() for x in _embeds_list]
            _idxs_list = [d['idxs_list'] for d in gathered_data]
            _idxs_list = flatten(_idxs_list)
            _idxs_list = [x.cpu() for x in _idxs_list]
            master_embeds_list.extend(_embeds_list)
            master_idxs_list.extend(_idxs_list)
        synchronize()
        return [], []
Example no. 15
    def _build_temp_sparse_graph(self, clusters_mx, per_example_negs):
        args = self.args

        # get all of the unique examples
        examples = clusters_mx.data.tolist()
        examples.extend(flatten(per_example_negs.tolist()))
        examples = unique(examples)
        examples = list(filter(lambda x: x >= 0, examples))

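        # same pair construction as compute_scores_for_inference, but every
        # affinity is a 0.0 placeholder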
        sparse_graph = None
        if get_rank() == 0:
            ## make the list of pairs of dot products we need
            _row = clusters_mx.row
            # positives:
            local_pos_a, local_pos_b = np.where(
                np.triu(_row[np.newaxis, :] == _row[:, np.newaxis], k=1))
            pos_a = clusters_mx.data[local_pos_a]
            pos_b = clusters_mx.data[local_pos_b]
            # negatives:
            local_neg_a = np.tile(
                np.arange(per_example_negs.shape[0])[:, np.newaxis],
                (1, per_example_negs.shape[1])).flatten()
            neg_a = clusters_mx.data[local_neg_a]
            neg_b = per_example_negs.flatten()

            neg_mask = (neg_b != -1)
            neg_a = neg_a[neg_mask]
            neg_b = neg_b[neg_mask]

            # create subset of the sparse graph we care about
            a = np.concatenate((pos_a, neg_a), axis=0)
            b = np.concatenate((pos_b, neg_b), axis=0)
            edges = list(zip(a, b))
            affinities = [0.0 for i, j in edges]

            # convert to coo_matrix
            edges = np.asarray(edges).T
            affinities = np.asarray(affinities)
            _sparse_num = np.max(edges) + 1
            sparse_graph = coo_matrix((affinities, edges),
                                      shape=(_sparse_num, _sparse_num))

        synchronize()
        return sparse_graph
Example no. 16
def detection_inference(cfg, model, data_loader_val, device, iteration, summary_writer=None, logger=None,
                        visualize=False, fppi=(0.1, 0.01)):
    mAP = MeanOfAveragePrecision(cfg.DATASET.NUM_CLASS, cfg.DATASET.MAX_OBJECTS, fppi=fppi)
    bar = TqdmBar(data_loader_val, 0, get_rank(), data_loader_val.__len__(),
                  description='Inference', use_bar=cfg.USE_BAR)
    for iteration, record in bar.bar:
        record = move_to_device(record, device)
        prediction = model(record)
        prediction = prediction.cpu().detach()
        record = move_to_device(record, torch.device('cpu'))

        mAP.calculate_overlaps(record, prediction)
        if visualize:
            # TODO vis mod
            pass
    bar.close()

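    # aggregate the accumulated overlaps into mAP@[.5:.95], mAP@.5 and recall
    # at the requested FPPI operating points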
    mAP_5095, mAP_50, m_recall = mAP.calculate_map()

    if logger is not None:
        logger.info('====================================================================================')
        # logger.info('Average inference time per image without post process is: %s' % (
        #         sum(model.inference_time_without_postprocess) / max(len(model.inference_time_without_postprocess),
        #                                                             np.finfo(np.float64).eps)))
        # logger.info('Average inference time per image with post process is: %s' % (
        #         sum(model.inference_time_with_postprocess) / max(len(model.inference_time_with_postprocess),
        #                                                          np.finfo(np.float64).eps)))

        logger.info('mAP(@iou=0.5:0.95): %s' % mAP_5095)
        logger.info('mAP(@iou=0.5): %s' % mAP_50)
        logger.info('Recall(@iou=0.5, @fppi=%s): %s' % (fppi[0], m_recall[0]))
        logger.info('Recall(@iou=0.5, @fppi=%s): %s' % (fppi[1], m_recall[1]))
        logger.info('====================================================================================')
    if summary_writer is not None:
        record = {'mAP_iou_0.5_0.95': mAP_5095, 'mAP_iou_0.5': mAP_50,
                  'Recall_iou_0.5_fppi_{}'.format(fppi[0]): m_recall[0],
                  'Recall_iou_0.5_fppi_{}'.format(fppi[1]): m_recall[1]}
        write_summary(summary_writer, iteration, record=record, group='Evaluations')
Example no. 17
def compute_on_dataset_1stage(model, data_loader, device):
    # single stage inference, for model without memory features
    cpu_device = torch.device("cpu")
    results_dict = {}
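    # tag the progress bar with the rank only when running distributed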
    if get_world_size() == 1:
        extra_args = {}
    else:
        rank = get_rank()
        extra_args = dict(desc="rank {}".format(rank))
    for batch in tqdm(data_loader, **extra_args):
        slow_clips, fast_clips, boxes, objects, extras, video_ids = batch
        slow_clips = slow_clips.to(device)
        fast_clips = fast_clips.to(device)
        boxes = [box.to(device) for box in boxes]
        objects = [None if (box is None) else box.to(device) for box in objects]

        with torch.no_grad():
            output = model(slow_clips, fast_clips, boxes, objects, extras)
            output = [o.to(cpu_device) for o in output]
        results_dict.update(
            {video_id: result for video_id, result in zip(video_ids, output)}
        )

    return results_dict
Example no. 18
def train(cfg, local_rank, distributed):
    logger = logging.getLogger(cfg.NAME)
    # build model
    model = build_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    # build solver
    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {"iteration": 0}

    save_dir = os.path.join(cfg.CHECKPOINTER.DIR, cfg.CHECKPOINTER.NAME)

    save_to_disk = get_rank() == 0
    checkpointer = Checkpointer(
        model=model, optimizer=optimizer, scheduler=scheduler,
        save_dir=save_dir, save_to_disk=save_to_disk, logger=logger
    )
    extra_checkpoint_data = checkpointer.load(cfg.CHECKPOINTER.LOAD_NAME)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

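    # optionally build a validation loader for periodic evaluation inside do_train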
    evaluate = cfg.SOLVER.EVALUATE
    if evaluate:
        synchronize()
        data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed, is_for_period=True)
        synchronize()
    else:
        data_loader_val = None

    save_to_disk = get_rank() == 0
    if cfg.SUMMARY_WRITER and save_to_disk:
        save_dir = os.path.join(cfg.CHECKPOINTER.DIR, cfg.CHECKPOINTER.NAME)
        summary_writer = make_summary_writer(cfg.SUMMARY_WRITER, save_dir, model_name=cfg.MODEL.NAME)
    else:
        summary_writer = None

    do_train(
        cfg,
        model,
        data_loader,
        data_loader_val,
        optimizer,
        scheduler,
        checkpointer,
        device,
        arguments,
        summary_writer
    )

    return model
Example no. 19
def train():
    print(args.local_rank)
    torch.cuda.set_device(args.local_rank)
    # create dataloader & network & optimizer
    model, model_fn_decorator, net_func = create_model(cfg)
    init_weights(model, init_type='kaiming')
    model.cuda()
    root_result_dir = args.output_dir
    os.makedirs(root_result_dir, exist_ok=True)

    log_file = os.path.join(root_result_dir, "log_train.txt")
    logger = create_logger(log_file, get_rank())
    logger.info("**********************Start logging**********************")
    logger.info('TRAINED MODEL:{}'.format(net_func))

    # log to file
    gpu_list = os.environ[
        'CUDA_VISIBLE_DEVICES'] if 'CUDA_VISIBLE_DEVICES' in os.environ.keys(
        ) else 'ALL'
    logger.info("CUDA_VISIBLE_DEVICES=%s" % gpu_list)

    for key, val in vars(args).items():
        logger.info("{:16} {}".format(key, val))

    logger.info("***********************config infos**********************")
    for key, val in vars(cfg).items():
        logger.info("{:16} {}".format(key, val))

    # log tensorboard
    if get_rank() == 0:
        tb_log = SummaryWriter(
            log_dir=os.path.join(root_result_dir, "tensorboard"))
    else:
        tb_log = None

    train_loader, test_loader = create_dataloader()
    # train_loader, test_loader = create_dataloader_Insensee()

    optimizer = create_optimizer(model)

    # load checkpoint if it is possible
    start_epoch = it = best_res = 0
    last_epoch = -1
    if args.ckpt is not None:
        pure_model = model
        it, start_epoch, best_res = load_checkpoint(pure_model, optimizer,
                                                    args.ckpt, logger)
        last_epoch = start_epoch + 1

    lr_scheduler = create_scheduler(optimizer, last_epoch=last_epoch)
    # lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.98, last_epoch=-1)

    criterion = None

    # start training
    logger.info('**********************Start training**********************')
    ckpt_dir = os.path.join(root_result_dir, "ckpt")
    os.makedirs(ckpt_dir, exist_ok=True)
    trainer = train_utils.Trainer(model,
                                  model_fn=model_fn_decorator(),
                                  criterion=criterion,
                                  optimizer=optimizer,
                                  ckpt_dir=ckpt_dir,
                                  lr_scheduler=lr_scheduler,
                                  model_fn_eval=model_fn_decorator(),
                                  tb_log=tb_log,
                                  logger=logger,
                                  eval_frequency=1,
                                  cfg=cfg)

    trainer.train(start_it=it,
                  start_epoch=start_epoch,
                  n_epochs=args.epochs,
                  train_loader=train_loader,
                  test_loader=test_loader,
                  ckpt_save_interval=args.ckpt_save_interval,
                  best_res=best_res)

    logger.info('**********************End training**********************')
Example no. 20
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu  # local rank, local machine cuda id
    args.local_rank = args.gpu
    args.batch_size = args.batch_size_per_gpu
    args.batch_size_total = args.batch_size * args.world_size
    #rescale base lr
    args.lr_scheduler.base_lr = args.lr_scheduler.base_lr * (max(
        1, args.batch_size_total // 256))

    # set random seed, make sure all random subgraph generated would be the same
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpu:
        torch.cuda.manual_seed(args.seed)

    global_rank = args.gpu + args.machine_rank * ngpus_per_node
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=global_rank)

    # Setup logging format.
    logging.setup_logging(args.logging_save_path, 'w')

    logger.info(
        f"Use GPU: {args.gpu}, machine rank {args.machine_rank}, num_nodes {args.num_nodes}, \
                    gpu per node {ngpus_per_node}, world size {args.world_size}"
    )

    # synchronize is needed here to prevent a possible timeout after calling
    # init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    args.rank = comm.get_rank()  # global rank
    args.local_rank = args.gpu
    torch.cuda.set_device(args.gpu)

    # build model
    logger.info("=> creating model '{}'".format(args.arch))
    model = models.model_factory.create_model(args)
    model.cuda(args.gpu)

    # use sync batchnorm
    if getattr(args, 'sync_bn', False):
        model.apply(lambda m: setattr(m, 'need_sync', True))

    model = comm.get_parallel_model(model, args.gpu)  #local rank

    logger.info(model)

    criterion = loss_ops.CrossEntropyLossSmooth(args.label_smoothing).cuda(
        args.gpu)
    soft_criterion = loss_ops.AdaptiveLossSoft(args.alpha_min, args.alpha_max,
                                               args.iw_clip).cuda(args.gpu)

    if not getattr(args, 'inplace_distill', True):
        soft_criterion = None

    ## load dataset, train_sampler: distributed
    train_loader, val_loader, train_sampler = build_data_loader(args)
    args.n_iters_per_epoch = len(train_loader)

    logger.info(f'building optimizer and lr scheduler, \
            local rank {args.gpu}, global rank {args.rank}, world_size {args.world_size}'
                )
    optimizer = build_optimizer(args, model)
    lr_scheduler = build_lr_scheduler(args, optimizer)

    # optionally resume from a checkpoint
    if args.resume:
        saver.load_checkpoints(args, model, optimizer, lr_scheduler, logger)

    logger.info(args)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        args.curr_epoch = epoch
        logger.info('Training lr {}'.format(lr_scheduler.get_lr()[0]))

        # train for one epoch
        acc1, acc5 = train_epoch(epoch, model, train_loader, optimizer, criterion, args, \
                soft_criterion=soft_criterion, lr_scheduler=lr_scheduler)

        if comm.is_master_process() or args.distributed:
            # validate supernet model
            validate(train_loader, val_loader, model, criterion, args)

        if comm.is_master_process():
            # save checkpoints
            saver.save_checkpoint(
                args.checkpoint_save_path,
                model,
                optimizer,
                lr_scheduler,
                args,
                epoch,
            )
Example no. 21
def main(pargs):

    # this should be global
    global have_wandb

    #init distributed training
    comm.init(pargs.wireup_method)
    comm_rank = comm.get_rank()
    comm_local_rank = comm.get_local_rank()
    comm_size = comm.get_size()

    # set up logging
    pargs.logging_frequency = max([pargs.logging_frequency, 1])
    log_file = os.path.normpath(
        os.path.join(pargs.output_dir, "logs", pargs.run_tag + ".log"))
    logger = mll.mlperf_logger(log_file, "deepcam", "Umbrella Corp.")
    logger.log_start(key="init_start", sync=True)
    logger.log_event(key="cache_clear")

    #set seed
    seed = 333
    logger.log_event(key="seed", value=seed)

    # Some setup
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        device = torch.device("cuda", comm_local_rank)
        torch.cuda.manual_seed(seed)
        #necessary for AMP to work
        torch.cuda.set_device(device)

        # TEST: allowed? Valuable?
        #torch.backends.cudnn.benchmark = True
    else:
        device = torch.device("cpu")

    #visualize?
    visualize = (pargs.training_visualization_frequency >
                 0) or (pargs.validation_visualization_frequency > 0)

    #set up directories
    root_dir = os.path.join(pargs.data_dir_prefix)
    output_dir = pargs.output_dir
    plot_dir = os.path.join(output_dir, "plots")
    if comm_rank == 0:
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        if visualize and not os.path.isdir(plot_dir):
            os.makedirs(plot_dir)

    # Setup WandB
    if not pargs.enable_wandb:
        have_wandb = False
    if have_wandb and (comm_rank == 0):
        # get wandb api token
        certfile = os.path.join(pargs.wandb_certdir, ".wandbirc")
        try:
            with open(certfile) as f:
                token = f.readlines()[0].replace("\n", "").split()
                wblogin = token[0]
                wbtoken = token[1]
        except IOError:
            print("Error, cannot open WandB certificate {}.".format(certfile))
            have_wandb = False

        if have_wandb:
            # log in: that call can be blocking, it should be quick
            sp.call(["wandb", "login", wbtoken])

            #init db and get config
            resume_flag = pargs.run_tag if pargs.resume_logging else False
            wandb.init(entity=wblogin,
                       project='deepcam',
                       name=pargs.run_tag,
                       id=pargs.run_tag,
                       resume=resume_flag)
            config = wandb.config

            #set general parameters
            config.root_dir = root_dir
            config.output_dir = pargs.output_dir
            config.max_epochs = pargs.max_epochs
            config.local_batch_size = pargs.local_batch_size
            config.num_workers = comm_size
            config.channels = pargs.channels
            config.optimizer = pargs.optimizer
            config.start_lr = pargs.start_lr
            config.adam_eps = pargs.adam_eps
            config.weight_decay = pargs.weight_decay
            config.model_prefix = pargs.model_prefix
            config.amp_opt_level = pargs.amp_opt_level
            config.loss_weight_pow = pargs.loss_weight_pow
            config.lr_warmup_steps = pargs.lr_warmup_steps
            config.lr_warmup_factor = pargs.lr_warmup_factor

            # lr schedule if applicable
            if pargs.lr_schedule:
                for key in pargs.lr_schedule:
                    config.update(
                        {"lr_schedule_" + key: pargs.lr_schedule[key]},
                        allow_val_change=True)

    # Logging hyperparameters
    logger.log_event(key="global_batch_size",
                     value=(pargs.local_batch_size * comm_size))
    logger.log_event(key="opt_name", value=pargs.optimizer)
    logger.log_event(key="opt_base_learning_rate",
                     value=pargs.start_lr * pargs.lr_warmup_factor)
    logger.log_event(key="opt_learning_rate_warmup_steps",
                     value=pargs.lr_warmup_steps)
    logger.log_event(key="opt_learning_rate_warmup_factor",
                     value=pargs.lr_warmup_factor)
    logger.log_event(key="opt_epsilon", value=pargs.adam_eps)

    # Define architecture
    n_input_channels = len(pargs.channels)
    n_output_channels = 3
    net = deeplab_xception.DeepLabv3_plus(n_input=n_input_channels,
                                          n_classes=n_output_channels,
                                          os=16,
                                          pretrained=False,
                                          rank=comm_rank)
    net.to(device)

    #select loss
    loss_pow = pargs.loss_weight_pow
    #some magic numbers
    class_weights = [
        0.986267818390377**loss_pow, 0.0004578708870701058**loss_pow,
        0.01327431072255291**loss_pow
    ]
    fpw_1 = 2.61461122397522257612
    fpw_2 = 1.71641974795896018744
    criterion = losses.fp_loss

    #select optimizer
    optimizer = None
    if pargs.optimizer == "Adam":
        optimizer = optim.Adam(net.parameters(),
                               lr=pargs.start_lr,
                               eps=pargs.adam_eps,
                               weight_decay=pargs.weight_decay)
    elif pargs.optimizer == "AdamW":
        optimizer = optim.AdamW(net.parameters(),
                                lr=pargs.start_lr,
                                eps=pargs.adam_eps,
                                weight_decay=pargs.weight_decay)
    elif have_apex and (pargs.optimizer == "LAMB"):
        optimizer = aoptim.FusedLAMB(net.parameters(),
                                     lr=pargs.start_lr,
                                     eps=pargs.adam_eps,
                                     weight_decay=pargs.weight_decay)
    else:
        raise NotImplementedError("Error, optimizer {} not supported".format(
            pargs.optimizer))

    if have_apex:
        #wrap model and opt into amp
        net, optimizer = amp.initialize(net,
                                        optimizer,
                                        opt_level=pargs.amp_opt_level)

    #make model distributed
    net = DDP(net)

    #restart from checkpoint if desired
    #if (comm_rank == 0) and (pargs.checkpoint):
    #load it on all ranks for now
    if pargs.checkpoint:
        checkpoint = torch.load(pargs.checkpoint, map_location=device)
        start_step = checkpoint['step']
        start_epoch = checkpoint['epoch']
        optimizer.load_state_dict(checkpoint['optimizer'])
        net.load_state_dict(checkpoint['model'])
        if have_apex:
            amp.load_state_dict(checkpoint['amp'])
    else:
        start_step = 0
        start_epoch = 0

    #select scheduler
    if pargs.lr_schedule:
        scheduler_after = ph.get_lr_schedule(pargs.start_lr,
                                             pargs.lr_schedule,
                                             optimizer,
                                             last_step=start_step)

        # LR warmup
        if pargs.lr_warmup_steps > 0:
            if have_warmup_scheduler:
                scheduler = GradualWarmupScheduler(
                    optimizer,
                    multiplier=pargs.lr_warmup_factor,
                    total_epoch=pargs.lr_warmup_steps,
                    after_scheduler=scheduler_after)
            # Throw an error if the package is not found
            else:
                raise Exception(
                    f'Requested {pargs.lr_warmup_steps} LR warmup steps '
                    'but warmup scheduler not found. Install it from '
                    'https://github.com/ildoonet/pytorch-gradual-warmup-lr')
        else:
            scheduler = scheduler_after

    #broadcast model and optimizer state
    steptens = torch.tensor(np.array([start_step, start_epoch]),
                            requires_grad=False).to(device)
    dist.broadcast(steptens, src=0)

    ##broadcast model and optimizer state
    #hvd.broadcast_parameters(net.state_dict(), root_rank = 0)
    #hvd.broadcast_optimizer_state(optimizer, root_rank = 0)

    #unpack the bcasted tensor
    start_step = steptens.cpu().numpy()[0]
    start_epoch = steptens.cpu().numpy()[1]

    # Set up the data feeder
    # train
    train_dir = os.path.join(root_dir, "train")
    train_set = cam.CamDataset(train_dir,
                               statsfile=os.path.join(root_dir, 'stats.h5'),
                               channels=pargs.channels,
                               allow_uneven_distribution=False,
                               shuffle=True,
                               preprocess=True,
                               comm_size=comm_size,
                               comm_rank=comm_rank)
    train_loader = DataLoader(
        train_set,
        pargs.local_batch_size,
        num_workers=min([pargs.max_inter_threads, pargs.local_batch_size]),
        pin_memory=True,
        drop_last=True)

    # validation: we only want to shuffle the set if we are cutting off validation after a certain number of steps
    validation_dir = os.path.join(root_dir, "validation")
    validation_set = cam.CamDataset(validation_dir,
                                    statsfile=os.path.join(
                                        root_dir, 'stats.h5'),
                                    channels=pargs.channels,
                                    allow_uneven_distribution=True,
                                    shuffle=(pargs.max_validation_steps
                                             is not None),
                                    preprocess=True,
                                    comm_size=comm_size,
                                    comm_rank=comm_rank)
    # use batch size = 1 here to make sure that we do not drop a sample
    validation_loader = DataLoader(
        validation_set,
        1,
        num_workers=min([pargs.max_inter_threads, pargs.local_batch_size]),
        pin_memory=True,
        drop_last=True)

    # log size of datasets
    logger.log_event(key="train_samples", value=train_set.global_size)
    if pargs.max_validation_steps is not None:
        val_size = min([
            validation_set.global_size,
            pargs.max_validation_steps * pargs.local_batch_size * comm_size
        ])
    else:
        val_size = validation_set.global_size
    logger.log_event(key="eval_samples", value=val_size)

    # do sanity check
    if pargs.max_validation_steps is not None:
        logger.log_event(key="invalid_submission")

    #for visualization
    #if visualize:
    #    viz = vizc.CamVisualizer()

    # Train network
    if have_wandb and (comm_rank == 0):
        wandb.watch(net)

    step = start_step
    epoch = start_epoch
    current_lr = pargs.start_lr if not pargs.lr_schedule else scheduler.get_last_lr(
    )[0]
    stop_training = False
    net.train()

    # start training
    logger.log_end(key="init_stop", sync=True)
    logger.log_start(key="run_start", sync=True)

    # training loop
    while True:

        # start epoch
        logger.log_start(key="epoch_start",
                         metadata={
                             'epoch_num': epoch + 1,
                             'step_num': step
                         },
                         sync=True)

        # epoch loop
        for inputs, label, filename in train_loader:

            # send to device
            inputs = inputs.to(device)
            label = label.to(device)

            # forward pass
            outputs = net.forward(inputs)

            # Compute loss and average across nodes
            loss = criterion(outputs,
                             label,
                             weight=class_weights,
                             fpw_1=fpw_1,
                             fpw_2=fpw_2)

            # Backprop
            optimizer.zero_grad()
            if have_apex:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()

            # step counter
            step += 1

            if pargs.lr_schedule:
                current_lr = scheduler.get_last_lr()[0]
                scheduler.step()

            #visualize if requested
            #if (step % pargs.training_visualization_frequency == 0) and (comm_rank == 0):
            #    # Compute predictions
            #    predictions = torch.max(outputs, 1)[1]
            #
            #    # extract sample id and data tensors
            #    sample_idx = np.random.randint(low=0, high=label.shape[0])
            #    plot_input = inputs.detach()[sample_idx, 0,...].cpu().numpy()
            #    plot_prediction = predictions.detach()[sample_idx,...].cpu().numpy()
            #    plot_label = label.detach()[sample_idx,...].cpu().numpy()
            #
            #    # create filenames
            #    outputfile = os.path.basename(filename[sample_idx]).replace("data-", "training-").replace(".h5", ".png")
            #    outputfile = os.path.join(plot_dir, outputfile)
            #
            #    # plot
            #    viz.plot(filename[sample_idx], outputfile, plot_input, plot_prediction, plot_label)
            #
            #    #log if requested
            #    if have_wandb:
            #        img = Image.open(outputfile)
            #        wandb.log({"train_examples": [wandb.Image(img, caption="Prediction vs. Ground Truth")]}, step = step)

            #log if requested
            if (step % pargs.logging_frequency == 0):

                # allreduce for loss
                loss_avg = loss.detach()
                dist.reduce(loss_avg, dst=0, op=dist.ReduceOp.SUM)
                loss_avg_train = loss_avg.item() / float(comm_size)

                # Compute score
                predictions = torch.max(outputs, 1)[1]
                iou = utils.compute_score(predictions,
                                          label,
                                          device_id=device,
                                          num_classes=3)
                iou_avg = iou.detach()
                dist.reduce(iou_avg, dst=0, op=dist.ReduceOp.SUM)
                iou_avg_train = iou_avg.item() / float(comm_size)

                logger.log_event(key="learning_rate",
                                 value=current_lr,
                                 metadata={
                                     'epoch_num': epoch + 1,
                                     'step_num': step
                                 })
                logger.log_event(key="train_accuracy",
                                 value=iou_avg_train,
                                 metadata={
                                     'epoch_num': epoch + 1,
                                     'step_num': step
                                 })
                logger.log_event(key="train_loss",
                                 value=loss_avg_train,
                                 metadata={
                                     'epoch_num': epoch + 1,
                                     'step_num': step
                                 })

                if have_wandb and (comm_rank == 0):
                    wandb.log(
                        {"train_loss": loss_avg.item() / float(comm_size)},
                        step=step)
                    wandb.log(
                        {"train_accuracy": iou_avg.item() / float(comm_size)},
                        step=step)
                    wandb.log({"learning_rate": current_lr}, step=step)
                    wandb.log({"epoch": epoch + 1}, step=step)

            # validation step if desired
            if (step % pargs.validation_frequency == 0):

                logger.log_start(key="eval_start",
                                 metadata={'epoch_num': epoch + 1})

                #eval
                net.eval()

                count_sum_val = torch.Tensor([0.]).to(device)
                loss_sum_val = torch.Tensor([0.]).to(device)
                iou_sum_val = torch.Tensor([0.]).to(device)

                # disable gradients
                with torch.no_grad():

                    # iterate over validation sample
                    step_val = 0
                    # only print once per eval at most
                    visualized = False
                    for inputs_val, label_val, filename_val in validation_loader:

                        #send to device
                        inputs_val = inputs_val.to(device)
                        label_val = label_val.to(device)

                        # forward pass
                        outputs_val = net.forward(inputs_val)

                        # Compute loss and average across nodes
                        loss_val = criterion(outputs_val,
                                             label_val,
                                             weight=class_weights,
                                             fpw_1=fpw_1,
                                             fpw_2=fpw_2)
                        loss_sum_val += loss_val

                        #increase counter
                        count_sum_val += 1.

                        # Compute score
                        predictions_val = torch.max(outputs_val, 1)[1]
                        iou_val = utils.compute_score(predictions_val,
                                                      label_val,
                                                      device_id=device,
                                                      num_classes=3)
                        iou_sum_val += iou_val

                        # Visualize
                        #if (step_val % pargs.validation_visualization_frequency == 0) and (not visualized) and (comm_rank == 0):
                        #    #extract sample id and data tensors
                        #    sample_idx = np.random.randint(low=0, high=label_val.shape[0])
                        #    plot_input = inputs_val.detach()[sample_idx, 0,...].cpu().numpy()
                        #    plot_prediction = predictions_val.detach()[sample_idx,...].cpu().numpy()
                        #    plot_label = label_val.detach()[sample_idx,...].cpu().numpy()
                        #
                        #    #create filenames
                        #    outputfile = os.path.basename(filename[sample_idx]).replace("data-", "validation-").replace(".h5", ".png")
                        #    outputfile = os.path.join(plot_dir, outputfile)
                        #
                        #    #plot
                        #    viz.plot(filename[sample_idx], outputfile, plot_input, plot_prediction, plot_label)
                        #    visualized = True
                        #
                        #    #log if requested
                        #    if have_wandb:
                        #        img = Image.open(outputfile)
                        #        wandb.log({"eval_examples": [wandb.Image(img, caption="Prediction vs. Ground Truth")]}, step = step)

                        #increase eval step counter
                        step_val += 1

                        if (pargs.max_validation_steps is not None
                            ) and step_val > pargs.max_validation_steps:
                            break

                # average the validation loss
                dist.all_reduce(count_sum_val, op=dist.ReduceOp.SUM)
                dist.all_reduce(loss_sum_val, op=dist.ReduceOp.SUM)
                dist.all_reduce(iou_sum_val, op=dist.ReduceOp.SUM)
                loss_avg_val = loss_sum_val.item() / count_sum_val.item()
                iou_avg_val = iou_sum_val.item() / count_sum_val.item()

                # print results
                logger.log_event(key="eval_accuracy",
                                 value=iou_avg_val,
                                 metadata={
                                     'epoch_num': epoch + 1,
                                     'step_num': step
                                 })
                logger.log_event(key="eval_loss",
                                 value=loss_avg_val,
                                 metadata={
                                     'epoch_num': epoch + 1,
                                     'step_num': step
                                 })

                # log in wandb
                if have_wandb and (comm_rank == 0):
                    wandb.log({"eval_loss": loss_avg_val}, step=step)
                    wandb.log({"eval_accuracy": iou_avg_val}, step=step)

                if (iou_avg_val >= pargs.target_iou):
                    logger.log_event(key="target_accuracy_reached",
                                     value=pargs.target_iou,
                                     metadata={
                                         'epoch_num': epoch + 1,
                                         'step_num': step
                                     })
                    stop_training = True

                # set to train
                net.train()

                logger.log_end(key="eval_stop",
                               metadata={'epoch_num': epoch + 1})

            #save model if desired
            if (pargs.save_frequency > 0) and (step % pargs.save_frequency
                                               == 0):
                logger.log_start(key="save_start",
                                 metadata={
                                     'epoch_num': epoch + 1,
                                     'step_num': step
                                 },
                                 sync=True)
                if comm_rank == 0:
                    checkpoint = {
                        'step': step,
                        'epoch': epoch,
                        'model': net.state_dict(),
                        'optimizer': optimizer.state_dict()
                    }
                    if have_apex:
                        checkpoint['amp'] = amp.state_dict()
                    torch.save(
                        checkpoint,
                        os.path.join(
                            output_dir, pargs.model_prefix + "_step_" +
                            str(step) + ".cpt"))
                logger.log_end(key="save_stop",
                               metadata={
                                   'epoch_num': epoch + 1,
                                   'step_num': step
                               },
                               sync=True)

            # Stop training?
            if stop_training:
                break

        # log the epoch
        logger.log_end(key="epoch_stop",
                       metadata={
                           'epoch_num': epoch + 1,
                           'step_num': step
                       },
                       sync=True)
        epoch += 1

        # are we done?
        if epoch >= pargs.max_epochs or stop_training:
            break

    # run done
    logger.log_end(key="run_stop", sync=True, metadata={'status': 'success'})
Example no. 22
def main(pargs):

    #init distributed training
    comm_local_group = comm.init(pargs.wireup_method,
                                 pargs.batchnorm_group_size)
    comm_rank = comm.get_rank()
    comm_local_rank = comm.get_local_rank()
    comm_size = comm.get_size()
    comm_local_size = comm.get_local_size()

    # set up logging
    pargs.logging_frequency = max([pargs.logging_frequency, 1])
    log_file = os.path.normpath(
        os.path.join(pargs.output_dir, "logs", pargs.run_tag + ".log"))
    logger = mll.mlperf_logger(log_file, "deepcam", "Umbrella Corp.")
    logger.log_start(key="init_start", sync=True)
    logger.log_event(key="cache_clear")

    #set seed
    seed = pargs.seed
    logger.log_event(key="seed", value=seed)

    # Some setup
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        device = torch.device("cuda", comm_local_rank)
        torch.cuda.manual_seed(seed)
        torch.cuda.set_device(device)
        torch.backends.cudnn.benchmark = True
    else:
        device = torch.device("cpu")

    #set up directories
    root_dir = os.path.join(pargs.data_dir_prefix)
    output_dir = pargs.output_dir
    plot_dir = os.path.join(output_dir, "plots")
    if comm_rank == 0:
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

    # logging of rank information
    logger.log_event(key="number_of_ranks", value=comm_size)
    logger.log_event(key="number_of_nodes",
                     value=(comm_size // comm_local_size))
    logger.log_event(key="accelerators_per_node", value=comm_local_size)

    # Logging hyperparameters
    logger.log_event(key="global_batch_size",
                     value=(pargs.local_batch_size * comm_size))
    logger.log_event(key="batchnorm_group_size",
                     value=pargs.batchnorm_group_size)
    logger.log_event(key="gradient_accumulation_frequency",
                     value=pargs.gradient_accumulation_frequency)
    logger.log_event(key="checkpoint", value=pargs.checkpoint)

    # Define architecture
    n_input_channels = len(pargs.channels)
    n_output_channels = 3
    net = deeplab_xception.DeepLabv3_plus(n_input=n_input_channels,
                                          n_classes=n_output_channels,
                                          os=16,
                                          pretrained=False,
                                          rank=comm_rank,
                                          process_group=comm_local_group)
    net.to(device)

    #select loss
    #some magic numbers
    loss_pow = -0.125
    class_weights = [
        0.986267818390377**loss_pow, 0.0004578708870701058**loss_pow,
        0.01327431072255291**loss_pow
    ]
    # extract loss
    criterion = losses.CELoss(class_weights).to(device)
    criterion = torch.jit.script(criterion)

    #select optimizer
    optimizer = oh.get_optimizer(pargs, net, logger)

    #restart from checkpoint if desired
    if pargs.checkpoint is not None:
        checkpoint = torch.load(pargs.checkpoint, map_location=device)
        start_step = checkpoint['step']
        start_epoch = checkpoint['epoch']
        optimizer.load_state_dict(checkpoint['optimizer'])
        net.load_state_dict(checkpoint['model'])
    else:
        start_step = 0
        start_epoch = 0

    #broadcast model and optimizer state
    steptens = torch.tensor(np.array([start_step, start_epoch]),
                            requires_grad=False).to(device)
    if dist.is_initialized():
        dist.broadcast(steptens, src=0)

    #unpack the bcasted tensor
    start_step = int(steptens.cpu().numpy()[0])
    start_epoch = int(steptens.cpu().numpy()[1])

    #select scheduler
    scheduler = None
    if pargs.lr_schedule:
        pargs.lr_schedule["lr_warmup_steps"] = pargs.lr_warmup_steps
        pargs.lr_schedule["lr_warmup_factor"] = pargs.lr_warmup_factor
        scheduler = oh.get_lr_schedule(pargs.start_lr,
                                       pargs.lr_schedule,
                                       optimizer,
                                       logger,
                                       last_step=start_step)

    # print parameters
    if comm_rank == 0:
        print(net)
        print("Total number of elements:",
              sum(p.numel() for p in net.parameters() if p.requires_grad))

    # get input shapes for the upcoming model preprocessing
    tshape, _ = get_datashapes(pargs, root_dir)
    input_shape = tuple([tshape[2], tshape[0], tshape[1]])

    #distributed model parameters
    bucket_cap_mb = 25
    if pargs.batchnorm_group_size > 1:
        bucket_cap_mb = 220

    # wrap the model in DistributedDataParallel
    ddp_net = DDP(net,
                  device_ids=[device.index],
                  output_device=device.index,
                  find_unused_parameters=False,
                  broadcast_buffers=False,
                  bucket_cap_mb=bucket_cap_mb,
                  gradient_as_bucket_view=False)

    # get stats handler here
    bnstats_handler = bns.BatchNormStatsSynchronize(ddp_net,
                                                    reduction="mean",
                                                    inplace=True)

    # create handles
    net_validate = ddp_net
    net_train = ddp_net

    # Set up the data feeder
    train_loader, train_size, validation_loader, validation_size = get_dataloaders(
        pargs, root_dir, device, seed, comm_size, comm_rank)

    # log size of datasets
    logger.log_event(key="train_samples", value=train_size)
    val_size = validation_size
    logger.log_event(key="eval_samples", value=val_size)

    # get start steps
    step = start_step
    epoch = start_epoch
    current_lr = pargs.start_lr if not pargs.lr_schedule else scheduler.get_last_lr(
    )[0]
    stop_training = False
    net_train.train()

    # start training
    logger.log_end(key="init_stop", sync=True)
    logger.log_start(key="run_start", sync=True)

    # training loop
    while True:

        # start epoch
        logger.log_start(key="epoch_start",
                         metadata={
                             'epoch_num': epoch + 1,
                             'step_num': step
                         },
                         sync=True)

        train_loader.sampler.set_epoch(epoch)

        # training
        step = train_step(pargs, comm_rank, comm_size, device, step, epoch,
                          net_train, criterion, optimizer, scheduler,
                          train_loader, logger)

        # average BN stats
        bnstats_handler.synchronize()

        # validation
        stop_training = validate(pargs, comm_rank, comm_size, device, step,
                                 epoch, net_validate, criterion,
                                 validation_loader, logger)

        # log the epoch
        logger.log_end(key="epoch_stop",
                       metadata={
                           'epoch_num': epoch + 1,
                           'step_num': step
                       },
                       sync=True)
        epoch += 1

        #save model if desired
        if (pargs.save_frequency > 0) and (epoch % pargs.save_frequency == 0):
            logger.log_start(key="save_start",
                             metadata={
                                 'epoch_num': epoch + 1,
                                 'step_num': step
                             },
                             sync=True)
            if comm_rank == 0:
                checkpoint = {
                    'step': step,
                    'epoch': epoch,
                    'model': net_train.state_dict(),
                    'optimizer': optimizer.state_dict()
                }
                torch.save(
                    checkpoint,
                    os.path.join(
                        output_dir,
                        pargs.model_prefix + "_step_" + str(step) + ".cpt"))
                logger.log_end(key="save_stop",
                               metadata={
                                   'epoch_num': epoch + 1,
                                   'step_num': step
                               },
                               sync=True)

        # are we done?
        if (epoch >= pargs.max_epochs) or stop_training:
            break

    # run done
    logger.log_end(key="run_stop", sync=True, metadata={'status': 'success'})
Example no. 23
def inference(model, criterion, data_loader, dataset_name, save_result=False):
    logger = logging.getLogger('eve.' + __name__)

    device = torch.device('cuda')
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} dataset ({} point clouds).".format(
        dataset_name, len(dataset)))

    if get_world_size() == 1:
        extra_args = {}
    else:
        rank = get_rank()
        extra_args = dict(desc="rank {}".format(rank))

    start_time = time.time()

    model.eval()
    outputs_per_gpu = {}
    targets_per_gpu = {}
    file_path_per_gpu = {}

    times = []

    with torch.no_grad():
        for batch in tqdm(data_loader, **extra_args):
            locs, feats, targets, metadata = batch
            inputs = ME.SparseTensor(feats, coords=locs).to(device)
            targets = targets.to(device, non_blocking=True).long()

            torch.cuda.synchronize()
            start_time = time.time()
            outputs = model(inputs, y=targets)
            torch.cuda.synchronize()
            end_time = time.time()
            times.append(end_time - start_time)

            arch = cfg.MODEL.ARCHITECTURE
            if arch == 'minkunet4d' or arch == 'minkunet_eve':
                for batch_idx in range(len(metadata)):
                    for time_idx in range(cfg.INPUT.VIDEO.NUM_FRAMES):
                        inv_map = metadata[batch_idx][time_idx]['inverse_map']
                        file_path = metadata[batch_idx][time_idx]['file_path']

                        locs_frame = (locs[:, -1] == batch_idx) & \
                            (locs[:, -2] == time_idx)
                        one_output, one_target = compute_one_frame(
                            outputs, targets, locs_frame, inv_map)

                        outputs_per_gpu[file_path] = one_output
                        targets_per_gpu[file_path] = one_target
                        file_path_per_gpu[file_path] = file_path
            else:  # other minknet
                for batch_idx in range(len(metadata)):
                    inv_map = metadata[batch_idx]['inverse_map']
                    file_path = metadata[batch_idx]['file_path']

                    # From MinkowskiEngine v0.3, batch index is on the first column
                    locs_frame = locs[:, -1] == batch_idx
                    one_output, one_target = compute_one_frame(
                        outputs, targets, locs_frame, inv_map)

                    outputs_per_gpu[file_path] = one_output
                    targets_per_gpu[file_path] = one_target
                    file_path_per_gpu[file_path] = file_path

    synchronize()

    logger.info("Total inference time: {}".format(np.sum(times)))

    # NOTE: `all_gather` would lead to CUDA out-of-memory errors here.
    # We use `scatter_gather` instead: each process saves its results
    # under LOGS.DIR/tmp, and the files are cleared after gathering.
    outputs = scatter_gather(outputs_per_gpu)
    targets = scatter_gather(targets_per_gpu)
    file_paths = scatter_gather(file_path_per_gpu)
    if not is_main_process():
        return None

    all_outputs = {k: v.numpy() for o in outputs for k, v in o.items()}
    all_targets = {k: v.numpy() for t in targets for k, v in t.items()}
    all_file_paths = {k: v for f in file_paths for k, v in f.items()}

    assert len(all_outputs) == len(dataset.all_files), \
        '%d vs %d' % (len(all_outputs), len(dataset.all_files))

    if cfg.LOGS.SAVE_RESULT is False:
        all_file_paths = None
    metrics = evaluate(dataset, all_outputs, all_targets, all_file_paths)

    return metrics
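The NOTE in the loop above is the key design choice of this inference routine: all_gather would materialize every rank's prediction tensors at once and can exhaust GPU memory, so scatter_gather lets each process write its results to a shared temporary directory and only the main process reads them back. A minimal sketch of that idea (the file layout and helper name are invented for illustration; the project's scatter_gather may differ):

import os
import torch
import torch.distributed as dist

def scatter_gather_via_disk(data, tmp_dir, rank, world_size):
    # Each rank serializes its partial results to a CPU-side file ...
    os.makedirs(tmp_dir, exist_ok=True)
    torch.save(data, os.path.join(tmp_dir, "part_rank{}.pth".format(rank)))
    if dist.is_initialized():
        dist.barrier()  # wait until every rank has written its file
    if rank != 0:
        return None
    # ... and only rank 0 loads them back one at a time, so the full
    # gathered result never has to fit on a single GPU.
    return [torch.load(os.path.join(tmp_dir, "part_rank{}.pth".format(r)),
                       map_location="cpu") for r in range(world_size)]

Rank 0 can remove the temporary files once the parts are merged, which is what the "cleared after gathering" remark refers to.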
Example no. 24
def compute_on_dataset_2stage(model, data_loader, device, logger):
    # two stage inference, for model with memory features.
    # first extract features and then do the inference
    cpu_device = torch.device("cpu")
    num_devices = get_world_size()
    dataset = data_loader.dataset
    if num_devices == 1:
        extra_args = {}
    else:
        rank = get_rank()
        extra_args = dict(desc="rank {}".format(rank))

    loader_len = len(data_loader)
    person_feature_pool = MemoryPool()
    batch_info_list = [None]*loader_len
    logger.info("Stage 1: extracting clip features.")
    start_time = time.time()

    for i, batch in enumerate(tqdm(data_loader, **extra_args)):
        slow_clips, fast_clips, boxes, objects, extras, video_ids = batch
        slow_clips = slow_clips.to(device)
        fast_clips = fast_clips.to(device)
        boxes = [box.to(device) for box in boxes]
        objects = [None if (box is None) else box.to(device) for box in objects]
        movie_ids = [e["movie_id"] for e in extras]
        timestamps = [e["timestamp"] for e in extras]
        with torch.no_grad():
            feature = model(slow_clips, fast_clips, boxes, objects, part_forward=0)
            person_feature = [ft.to(cpu_device) for ft in feature[0]]
            object_feature = [ft.to(cpu_device) for ft in feature[1]]
        # store person features into memory pool
        for movie_id, timestamp, p_ft, o_ft in zip(movie_ids, timestamps, person_feature, object_feature):
            person_feature_pool[movie_id, timestamp] = p_ft
        # store other information in list, for further inference
        batch_info_list[i] = (movie_ids, timestamps, video_ids, object_feature)

    # gather feature pools from different ranks
    synchronize()
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    logger.info(
        "Stage 1 time: {} ({} s / video per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset), num_devices
        )
    )
    feature_pool = all_gather(person_feature_pool)
    all_feature_pool_p = MemoryPool()
    all_feature_pool_p.update_list(feature_pool)
    del feature_pool, person_feature_pool

    # do the inference
    results_dict = {}
    logger.info("Stage 2: predicting with extracted feature.")
    start_time = time.time()
    for movie_ids, timestamps, video_ids, object_feature in tqdm(batch_info_list, **extra_args):
        current_feat_p = [all_feature_pool_p[movie_id, timestamp].to(device)
                          for movie_id, timestamp in zip(movie_ids, timestamps)]
        current_feat_o = [ft_o.to(device) for ft_o in object_feature]
        extras = dict(
            person_pool=all_feature_pool_p,
            movie_ids=movie_ids,
            timestamps=timestamps,
            current_feat_p=current_feat_p,
            current_feat_o=current_feat_o,
        )
        with torch.no_grad():
            output = model(None, None, None, None, extras=extras, part_forward=1)
            output = [o.to(cpu_device) for o in output]
        results_dict.update(
            {video_id: result for video_id, result in zip(video_ids, output)}
        )
    synchronize()
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    logger.info(
        "Stage 2 time: {} ({} s / video per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset), num_devices
        )
    )

    return results_dict
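compute_on_dataset_2stage uses person_feature_pool as a mapping from (movie_id, timestamp) keys to cached person features, plus an update_list method for merging the pools gathered from all ranks. A dictionary-backed stand-in with that interface (an assumption made for illustration, not the project's actual MemoryPool) could be as small as:

class SimpleMemoryPool(dict):
    """Mapping from (movie_id, timestamp) tuples to cached feature tensors."""

    def update_list(self, pools):
        # Merge a list of pools, e.g. one pool per rank after all_gather.
        for pool in pools:
            self.update(pool)

# usage (the tensor name is a placeholder):
# pool = SimpleMemoryPool()
# pool["movie_0001", 902] = person_feature_tensor
# merged = SimpleMemoryPool()
# merged.update_list(all_gather(pool))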
def do_train(cfg, model, data_loader_train, data_loader_val, optimizer,
             scheduler, checkpointer, device, arguments, summary_writer):
    # get logger
    logger = logging.getLogger(cfg.NAME)
    logger.info("Start training ...")
    logger.info("Size of training dataset: %s" %
                (data_loader_train.dataset.__len__()))
    logger.info("Size of validation dataset: %s" %
                (data_loader_val.dataset.__len__()))

    model.train()

    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader_train)
    start_iter = arguments["iteration"]
    start_training_time = time.time()
    end = time.time()
    bar = TqdmBar(data_loader_train,
                  start_iter,
                  get_rank(),
                  data_loader_train.__len__(),
                  description="Training",
                  use_bar=cfg.USE_BAR)

    for iteration, record in bar.bar:
        data_time = time.time() - end
        iteration += 1
        arguments["iteration"] = iteration
        record = move_to_device(record, device)

        loss, _ = model(record)
        optimizer.zero_grad()
        loss["total_loss"].backward()
        optimizer.step()
        scheduler.step()

        # convert losses to CPU scalars for logging purposes
        loss_reduced = {key: value.cpu().item() for key, value in loss.items()}
        meters.update(**loss_reduced)

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
        lr = optimizer.param_groups[0]["lr"]
        bar.set_postfix({"lr": lr, "total_loss": loss_reduced["total_loss"]})

        if iteration % cfg.SOLVER.LOGGER_PERIOD == 0 or iteration == max_iter:
            bar.clear(nolock=True)
            logger.info(
                meters.delimiter.join([
                    "iter: {iter:06d}",
                    "lr: {lr:.6f}",
                    "{meters}",
                    "eta: {eta}",
                    "mem: {memory:.0f}",
                ]).format(
                    iter=iteration,
                    lr=lr,
                    meters=str(meters),
                    eta=eta_string,
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))

            if summary_writer:
                write_summary(summary_writer,
                              iteration,
                              record=loss,
                              group='Losses')
                write_summary(summary_writer,
                              iteration,
                              record={'lr': lr},
                              group='LR')

        if iteration % cfg.SOLVER.CHECKPOINT_PERIOD == 0:
            bar.clear(nolock=True)
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
            if data_loader_val is not None:
                do_evaluation(cfg, model, data_loader_val, device, arguments,
                              summary_writer)

    checkpointer.save("model_final", **arguments)

    bar.close()
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
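The logging in do_train estimates the remaining time from the smoothed per-iteration time: eta_seconds is the average iteration time multiplied by the number of remaining iterations. Stripped of the MetricLogger machinery, that estimate is just:

import datetime

def estimate_eta(avg_iter_seconds: float, current_iter: int, max_iter: int) -> str:
    # Remaining iterations multiplied by the smoothed seconds per iteration.
    eta_seconds = avg_iter_seconds * (max_iter - current_iter)
    return str(datetime.timedelta(seconds=int(eta_seconds)))

# e.g. estimate_eta(0.85, current_iter=12000, max_iter=90000) -> '18:25:00'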
Example no. 26
        delimiter="  ",
    ) for test_tb_log_dir in test_tb_log_dirs
]

if cfg.EVALUATE:
    for task_name, testloader, test_meter in zip(task_names, testloaders,
                                                 test_meters):
        logging.info("Evaluating dataset: {}".format(task_name))
        validate(testloader,
                 net,
                 criterion_eval,
                 cfg,
                 test_meter,
                 global_step=0,
                 device=device,
                 local_rank=get_rank())

############## training code #############################
if not cfg.EVALUATE:
    scaler = torch.cuda.amp.GradScaler(enabled=cfg.AMP.ENABLED)
    # start from epoch 0 or last checkpoint epoch
    start_epoch = checkpointer.epoch
    for epoch in range(start_epoch, cfg.OPTIM.EPOCHS):
        # wait for all processes before every epoch
        synchronize()
        logging.info("PROGRESS: {}%".format(
            round(100 * epoch / cfg.OPTIM.EPOCHS, 4)))
        global_step = epoch * len(trainloader)

        # an empirical rule for redrawing projections in Performer
        if cfg.MODEL.ARCH.startswith(
Example no. 27
def main(pargs):

    #init distributed training
    comm.init(pargs.wireup_method)
    comm_rank = comm.get_rank()
    comm_local_rank = comm.get_local_rank()
    comm_size = comm.get_size()

    #set seed
    seed = 333

    # Some setup
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        printr("Using GPUs", 0)
        device = torch.device("cuda", comm_local_rank)
        torch.cuda.manual_seed(seed)
        #necessary for AMP to work
        torch.cuda.set_device(device)
    else:
        printr("Using CPUs", 0)
        device = torch.device("cpu")

    #set up directories
    root_dir = os.path.join(pargs.data_dir_prefix)
    output_dir = pargs.output_dir
    plot_dir = os.path.join(output_dir, "plots")
    if comm_rank == 0:
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

    # Define architecture
    n_input_channels = len(pargs.channels)
    n_output_channels = 3
    net = deeplab_xception.DeepLabv3_plus(n_input=n_input_channels,
                                          n_classes=n_output_channels,
                                          os=16,
                                          pretrained=False,
                                          rank=comm_rank)
    net.to(device)

    #select loss
    loss_pow = pargs.loss_weight_pow
    #some magic numbers
    class_weights = [
        0.986267818390377**loss_pow, 0.0004578708870701058**loss_pow,
        0.01327431072255291**loss_pow
    ]
    fpw_1 = 2.61461122397522257612
    fpw_2 = 1.71641974795896018744
    criterion = losses.fp_loss

    #select optimizer
    optimizer = None
    if pargs.optimizer == "Adam":
        optimizer = optim.Adam(net.parameters(),
                               lr=pargs.start_lr,
                               eps=pargs.adam_eps,
                               weight_decay=pargs.weight_decay)
    elif pargs.optimizer == "AdamW":
        optimizer = optim.AdamW(net.parameters(),
                                lr=pargs.start_lr,
                                eps=pargs.adam_eps,
                                weight_decay=pargs.weight_decay)
    elif have_apex and (pargs.optimizer == "LAMB"):
        optimizer = aoptim.FusedLAMB(net.parameters(),
                                     lr=pargs.start_lr,
                                     eps=pargs.adam_eps,
                                     weight_decay=pargs.weight_decay)
    else:
        raise NotImplementedError("Error, optimizer {} not supported".format(
            pargs.optimizer))

    if have_apex:
        #wrap model and opt into amp
        net, optimizer = amp.initialize(net,
                                        optimizer,
                                        opt_level=pargs.amp_opt_level)

    #make model distributed
    net = DDP(net)

    #select scheduler
    if pargs.lr_schedule:
        scheduler = ph.get_lr_schedule(pargs.start_lr,
                                       pargs.lr_schedule,
                                       optimizer,
                                       last_step=0)

    # Set up the data feeder
    # train
    train_dir = os.path.join(root_dir, "train")
    train_set = cam.CamDataset(train_dir,
                               statsfile=os.path.join(root_dir, 'stats.h5'),
                               channels=pargs.channels,
                               shuffle=True,
                               preprocess=True,
                               comm_size=comm_size,
                               comm_rank=comm_rank)
    train_loader = DataLoader(
        train_set,
        pargs.local_batch_size,
        num_workers=min([pargs.max_inter_threads, pargs.local_batch_size]),
        drop_last=True)

    printr(
        '{:14.4f} REPORT: starting warmup'.format(
            dt.datetime.now().timestamp()), 0)
    step = 0
    current_lr = pargs.start_lr if not pargs.lr_schedule else scheduler.get_last_lr(
    )[0]
    net.train()
    while True:

        #for inputs_raw, labels, source in train_loader:
        for inputs, label, filename in train_loader:

            # Print status
            if step == pargs.num_warmup_steps:
                printr(
                    '{:14.4f} REPORT: starting profiling'.format(
                        dt.datetime.now().timestamp()), 0)

            # Forward pass
            with Profile(pargs, "Forward", step):

                #send data to device
                inputs = inputs.to(device)
                label = label.to(device)

                # Compute output
                outputs = net.forward(inputs)

                # Compute loss
                loss = criterion(outputs,
                                 label,
                                 weight=class_weights,
                                 fpw_1=fpw_1,
                                 fpw_2=fpw_2)

            # allreduce for loss
            loss_avg = loss.detach()
            dist.reduce(loss_avg, dst=0, op=dist.ReduceOp.SUM)

            # Compute score
            predictions = torch.max(outputs, 1)[1]
            iou = utils.compute_score(predictions,
                                      label,
                                      device_id=device,
                                      num_classes=3)
            iou_avg = iou.detach()
            dist.reduce(iou_avg, dst=0, op=dist.ReduceOp.SUM)

            # Backprop
            with Profile(pargs, "Backward", step):

                # reset grads
                optimizer.zero_grad()

                # compute grads
                if have_apex:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

            # weight update
            with Profile(pargs, "Optimizer", step):
                # update weights
                optimizer.step()

            # advance the scheduler
            if pargs.lr_schedule:
                current_lr = scheduler.get_last_lr()[0]
                scheduler.step()

            #step counter
            step += 1

            #are we done?
            if step >= (pargs.num_warmup_steps + pargs.num_profile_steps):
                break

        #need to check here too
        if step >= (pargs.num_warmup_steps + pargs.num_profile_steps):
            break

    printr(
        '{:14.4f} REPORT: finishing profiling'.format(
            dt.datetime.now().timestamp()), 0)
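The class weights used in Examples no. 22 and 27 are not arbitrary: the three "magic numbers" sum to one, which suggests they are per-class frequencies, and each is raised to a negative power so that rarer classes are weighted more heavily. The worked computation below reproduces the constants; for loss_pow = -0.125 the two rarer-class weights coincide numerically with fpw_1 and fpw_2 from Example no. 27, which suggests those constants were derived the same way (the helper itself is only illustrative):

def frequency_weights(class_freqs, loss_pow=-0.125):
    # A smaller frequency combined with a negative exponent gives a larger
    # weight, so rare classes contribute more to the loss.
    return [freq**loss_pow for freq in class_freqs]

freqs = [0.986267818390377, 0.0004578708870701058, 0.01327431072255291]
print(frequency_weights(freqs))
# -> approximately [1.0017, 2.6146, 1.7164]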
Example no. 28
def printr(msg, rank=0):
    if comm.get_rank() == rank:
        print(msg)
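A brief usage note for printr: the message is printed by exactly one rank (the one passed as the second argument), which keeps multi-process runs from repeating every log line once per process. For example:

# printed once, by rank 0 only
printr("REPORT: starting warmup", 0)

# printed only by rank 1, e.g. for rank-specific debugging
printr("rank 1 reached this point", 1)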
Example no. 29
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Inference")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    # Merge config file.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # Print experimental infos.
    save_dir = ""
    logger = setup_logger("AlphAction", save_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(cfg)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + get_pretty_env_info())

    # Build the model.
    model = build_detection_model(cfg)
    model.to("cuda")

    # load weight.
    output_dir = cfg.OUTPUT_DIR
    checkpointer = ActionCheckpointer(cfg, model, save_dir=output_dir)
    checkpointer.load(cfg.MODEL.WEIGHT)

    output_folders = [None] * len(cfg.DATASETS.TEST)
    dataset_names = cfg.DATASETS.TEST
    mem_active = has_memory(cfg.IA_STRUCTURE)
    if cfg.OUTPUT_DIR:
        for idx, dataset_name in enumerate(dataset_names):
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference",
                                         dataset_name)
            os.makedirs(output_folder, exist_ok=True)
            output_folders[idx] = output_folder

    # Do inference.
    data_loaders_test = make_data_loader(cfg,
                                         is_train=False,
                                         is_distributed=distributed)
    for output_folder, dataset_name, data_loader_test in zip(
            output_folders, dataset_names, data_loaders_test):
        inference(
            model,
            data_loader_test,
            dataset_name,
            mem_active=mem_active,
            output_folder=output_folder,
        )
        synchronize()
def main():
    # Add augments
    parser = argparse.ArgumentParser(description="Vision Research Toolkit by PyTorch")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true"
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER
    )

    args = parser.parse_args()

    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", init_method="env://"
        )
        synchronize()

    # make config
    cfg = make_config(args.config_file, args.opts)

    # obtain absolute dir of project
    project_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))

    if cfg.CHECKPOINTER.DIR:
        if cfg.CHECKPOINTER.DIR[0] != os.sep:
            # if the saver_dir is not absolute dir
            cfg.CHECKPOINTER.DIR = os.path.join(project_dir, cfg.CHECKPOINTER.DIR)
    else:
        cfg.CHECKPOINTER.DIR = os.path.join(project_dir, 'log')

    if not cfg.CHECKPOINTER.NAME:
        cfg.CHECKPOINTER.NAME = strftime("%Y-%m-%d-%H-%M-%S", localtime())

    cfg.freeze()

    save_dir = os.path.join(cfg.CHECKPOINTER.DIR, cfg.CHECKPOINTER.NAME)
    mkdir(save_dir)

    # Init logger
    logger = setup_logger(cfg.NAME, save_dir, get_rank())
    logger.info("Using {} GPU".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info ...")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    logger.info("Running with config:\n{}".format(cfg))

    output_config_path = os.path.join(save_dir, os.path.basename(args.config_file))
    logger.info("Saving config into: {}".format(output_config_path))
    # save overloaded model config in the output directory
    save_config(cfg, output_config_path)

    train(cfg, args.local_rank, args.distributed)
    return