def main():
    args = parse_args()
    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl",
            init_method="env://",
        )
        comm.synchronize()

    cfg = get_default_cfg()
    if args.config_file:
        cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.output_dir
    if output_dir:
        misc.mkdir(output_dir)

    logger = setup_logger("EfficientDet", output_dir, comm.get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Loaded configuration file {}".format(args.config_file))
    logger.info("Running with config:\n{}".format(cfg))

    output_config_path = os.path.join(output_dir, 'config.yml')
    logger.info("Saving config into: {}".format(output_config_path))
    misc.save_config(cfg, output_config_path)

    model = train(cfg, args.local_rank, args.distributed)
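For context, the WORLD_SIZE variable and --local_rank argument read above are normally injected by the PyTorch distributed launcher; the sketch below is a hypothetical illustration of that contract, not part of this example.

# Hypothetical illustration: launched as
#   python -m torch.distributed.launch --nproc_per_node=N train.py ...
# (or torchrun), the launcher sets WORLD_SIZE/RANK and provides a local rank.
import os

world_size = int(os.environ.get("WORLD_SIZE", 1))  # total number of processes
local_rank = int(os.environ.get("LOCAL_RANK", 0))  # GPU index on this node
distributed = world_size > 1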
Example #2
    def compute_scores_for_inference(self, clusters_mx, per_example_negs):
        # Compute dot-product affinities between the embeddings of all positive
        # (within-cluster) and sampled negative pairs; rank 0 assembles them into
        # a sparse affinity graph (coo_matrix), other ranks return None.
        args = self.args

        # get all of the unique examples
        examples = clusters_mx.data.tolist()
        examples.extend(flatten(per_example_negs.tolist()))
        examples = unique(examples)
        examples = list(filter(lambda x: x >= 0, examples))

        # create dataset and dataloader
        dataset = InferenceEmbeddingDataset(args, examples,
                                            args.train_cache_dir)
        dataloader = InferenceEmbeddingDataLoader(args, dataset)

        # get the unique idxs and embeds for each idx
        idxs, embeds = self.get_embeddings(dataloader, evaluate=False)

        sparse_graph = None
        if get_rank() == 0:
            # create inverse index for mapping
            inverse_idxs = {v: k for k, v in enumerate(idxs)}

            ## make the list of pairs of dot products we need
            _row = clusters_mx.row
            # positives:
            local_pos_a, local_pos_b = np.where(
                np.triu(_row[np.newaxis, :] == _row[:, np.newaxis], k=1))
            pos_a = clusters_mx.data[local_pos_a]
            pos_b = clusters_mx.data[local_pos_b]
            # negatives:
            local_neg_a = np.tile(
                np.arange(per_example_negs.shape[0])[:, np.newaxis],
                (1, per_example_negs.shape[1])).flatten()
            neg_a = clusters_mx.data[local_neg_a]
            neg_b = per_example_negs.flatten()

            neg_mask = (neg_b != -1)
            neg_a = neg_a[neg_mask]
            neg_b = neg_b[neg_mask]

            # create subset of the sparse graph we care about
            a = np.concatenate((pos_a, neg_a), axis=0)
            b = np.concatenate((pos_b, neg_b), axis=0)
            edges = list(zip(a, b))
            affinities = [
                np.dot(embeds[inverse_idxs[i]], embeds[inverse_idxs[j]])
                for i, j in edges
            ]

            # convert to coo_matrix
            edges = np.asarray(edges).T
            affinities = np.asarray(affinities)
            _sparse_num = np.max(edges) + 1
            sparse_graph = coo_matrix((affinities, edges),
                                      shape=(_sparse_num, _sparse_num))

        synchronize()
        return sparse_graph
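As an aside, the np.triu trick above enumerates every within-cluster pair exactly once. A small self-contained sketch with made-up ids (not data from the repo) makes the intermediate arrays concrete:

import numpy as np
from scipy.sparse import coo_matrix

# toy clusters_mx: cluster 0 holds ids {10, 11, 12}, cluster 1 holds {20, 21}
row = np.array([0, 0, 0, 1, 1])
col = np.array([0, 1, 2, 0, 1])
data = np.array([10, 11, 12, 20, 21])
clusters_mx = coo_matrix((data, (row, col)), shape=(2, 3))

_row = clusters_mx.row
# strict upper triangle of the row-equality matrix -> each within-cluster (i, j) pair once
local_pos_a, local_pos_b = np.where(
    np.triu(_row[np.newaxis, :] == _row[:, np.newaxis], k=1))
pos_a = clusters_mx.data[local_pos_a]   # array([10, 10, 11, 20])
pos_b = clusters_mx.data[local_pos_b]   # array([11, 12, 12, 21])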
Example #3
    def get_embeddings(self, dataloader, evaluate=True):
        args = self.args
        self.model.eval()

        local_step = 0
        push_to_cpu_steps = 32
        idxs_list = []
        embeds_list = []
        master_idxs_list = []
        master_embeds_list = []

        def _synchronize_lists(_embeds_list, _idxs_list):
            gathered_data = all_gather({
                'embeds_list': _embeds_list,
                'idxs_list': _idxs_list,
            })
            if get_rank() == 0:
                _embeds_list = [d['embeds_list'] for d in gathered_data]
                _embeds_list = flatten(_embeds_list)
                _embeds_list = [x.cpu() for x in _embeds_list]
                _idxs_list = [d['idxs_list'] for d in gathered_data]
                _idxs_list = flatten(_idxs_list)
                _idxs_list = [x.cpu() for x in _idxs_list]
                master_embeds_list.extend(_embeds_list)
                master_idxs_list.extend(_idxs_list)
            synchronize()
            return [], []

        batch_iterator = tqdm(dataloader,
                              desc='Getting embeddings...',
                              disable=(not evaluate or get_rank() != 0
                                       or args.disable_logging))
        for batch in batch_iterator:
            batch = tuple(t.to(args.device, non_blocking=True) for t in batch)
            with torch.no_grad():
                inputs = {
                    'input_ids': batch[1],
                    'attention_mask': batch[2],
                    'token_type_ids': batch[3],
                    'concat_input': False
                }
                embeds_list.append(self.model(**inputs))
                idxs_list.append(batch[0])
                local_step += 1
                if local_step % push_to_cpu_steps == 0:
                    embeds_list, idxs_list = _synchronize_lists(
                        embeds_list, idxs_list)

        embeds_list, idxs_list = _synchronize_lists(embeds_list, idxs_list)

        idxs, embeds = None, None
        if get_rank() == 0:
            idxs = torch.cat(master_idxs_list, dim=0).numpy()
            idxs, indices = np.unique(idxs, return_index=True)
            embeds = torch.cat(master_embeds_list, dim=0).numpy()
            embeds = embeds[indices]
        synchronize()
        return idxs, embeds
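Because the same example can be embedded more than once (across batches and processes), rank 0 deduplicates with np.unique(..., return_index=True). A tiny illustration with made-up ids:

import numpy as np

idxs = np.array([3, 1, 3, 2, 1])
uniq, first_pos = np.unique(idxs, return_index=True)
# uniq      -> array([1, 2, 3])
# first_pos -> array([1, 3, 0]); used above to select the matching rows of `embeds`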
Example #4
def setup_gpu():
    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1
    if distributed:
        print("use gpu...")
        torch.cuda.set_device(0)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()
    return distributed
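All of these snippets call a synchronize() helper after init_process_group; in maskrcnn-benchmark-style codebases this is a thin barrier wrapper. A minimal sketch of such a helper (an approximation, not the exact code of any repo shown here):

import torch.distributed as dist

def synchronize():
    # no-op if torch.distributed is unavailable, uninitialized, or single-process;
    # otherwise block until every process reaches this barrier
    if not dist.is_available() or not dist.is_initialized():
        return
    if dist.get_world_size() == 1:
        return
    dist.barrier()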
Example #5
    def _train_softmax(self, dataset_list, metadata):
        args = self.args

        losses = []
        time_per_dataset = []
        dataset_sizes = []

        self.model.train()
        self.model.zero_grad()
        criterion = nn.CrossEntropyLoss()
        for dataset in dataset_list:
            _dataset_start_time = time.time()
            dataset_sizes.append(len(dataset))
            dataloader = SoftmaxEmbeddingDataLoader(args, dataset)
            for batch in dataloader:
                batch = tuple(t.to(args.device) for t in batch)
                inputs = {
                    'input_ids': batch[1],
                    'attention_mask': batch[2],
                    'token_type_ids': batch[3],
                    'concat_input': False
                }
                outputs = self.model(**inputs)
                pos_neg_dot_prods = torch.sum(outputs[:, 0:1, :] *
                                              outputs[:, 1:, :],
                                              dim=-1)
                target = torch.zeros(pos_neg_dot_prods.shape[0],
                                     dtype=torch.long).cuda()
                loss = criterion(pos_neg_dot_prods, target)
                losses.append(loss.item())
                loss.backward()

            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           args.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            self.model.zero_grad()
            time_per_dataset.append(time.time() - _dataset_start_time)

        gathered_data = all_gather({
            'losses': losses,
        })

        if get_rank() == 0:
            losses = flatten([d['losses'] for d in gathered_data])
            loss = np.mean(losses)

            synchronize()
            return {'embed_loss': loss}
        else:
            synchronize()
            return None
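For readability, a minimal shape sketch of the loss above (sizes are hypothetical): each row of `outputs` is assumed to pack [anchor, positive, negatives...], so the positive's score lands in column 0 and the all-zeros target makes CrossEntropyLoss push it above the negatives.

import torch
import torch.nn as nn

B, K, D = 4, 5, 8                      # batch size, 1 positive + 4 negatives, embed dim
outputs = torch.randn(B, 1 + K, D)     # index 0 = anchor, 1 = positive, 2.. = negatives
scores = torch.sum(outputs[:, 0:1, :] * outputs[:, 1:, :], dim=-1)  # [B, K]
target = torch.zeros(B, dtype=torch.long)   # column 0 (the positive) is the "class"
loss = nn.CrossEntropyLoss()(scores, target)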
Example #6
    def _train_threshold(self, dataset_list, metadata):
        args = self.args

        losses = []
        time_per_dataset = []
        dataset_sizes = []

        self.model.train()
        self.model.zero_grad()
        random.shuffle(dataset_list)
        for dataset in dataset_list:
            _dataset_start_time = time.time()
            dataset_sizes.append(len(dataset))
            dataloader = ScaledPairsEmbeddingDataLoader(args, dataset)
            for batch in dataloader:
                batch = tuple(t.to(args.device) for t in batch)
                inputs = {
                    'input_ids': batch[2],
                    'attention_mask': batch[3],
                    'token_type_ids': batch[4],
                    'concat_input': False
                }
                outputs = self.model(**inputs)
                dot_prods = torch.sum(outputs[:, 0, :] * outputs[:, 1, :],
                                      dim=-1)
                loss = torch.mean(F.relu(args.margin - (batch[0] * dot_prods)))
                losses.append(loss.item())
                loss.backward()

            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           args.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            self.model.zero_grad()
            time_per_dataset.append(time.time() - _dataset_start_time)

        gathered_data = all_gather({
            'losses': losses,
        })

        if get_rank() == 0:
            losses = flatten([d['losses'] for d in gathered_data])
            loss = np.mean(losses)

            synchronize()
            return {'embed_loss': loss}
        else:
            synchronize()
            return None
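Here batch[0] appears to carry a signed (possibly scaled) pair label, which makes this a pairwise hinge loss; a tiny numeric sketch under that assumption:

import torch
import torch.nn.functional as F

margin = 0.5
labels = torch.tensor([1.0, -1.0, 1.0])     # assumed: +1 same cluster, -1 different
dot_prods = torch.tensor([0.9, 0.2, -0.1])  # similarities of the three pairs
loss = torch.mean(F.relu(margin - labels * dot_prods))
# per-pair hinge: [0.0, 0.7, 0.6] -> loss ~= 0.4333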
Example #7
def inference(
        model,
        data_loader,
        dataset_name,
        iou_types=("bbox", ),
        box_only=False,
        device="cuda",
        expected_results=(),
        expected_results_sigma_tol=4,
        output_folder=None,
):
    # convert to a torch.device for efficiency
    device = torch.device(device)
    num_devices = (torch.distributed.get_world_size()
                   if torch.distributed.is_initialized() else 1)
    logger = logging.getLogger("maskrcnn_benchmark.inference")
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} dataset({} images).".format(
        dataset_name, len(dataset)))
    start_time = time.time()
    predictions = compute_on_dataset(model, data_loader, device)
    # wait for all processes to complete before measuring the time
    synchronize()
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    logger.info(
        "Total inference time: {} ({} s / img per device, on {} devices)".
        format(total_time_str, total_time * num_devices / len(dataset),
               num_devices))

    predictions = _accumulate_predictions_from_multiple_gpus(predictions)
    if not is_main_process():
        return

    if output_folder:
        torch.save(predictions, os.path.join(output_folder, "predictions.pth"))

    extra_args = dict(
        box_only=box_only,
        iou_types=iou_types,
        expected_results=expected_results,
        expected_results_sigma_tol=expected_results_sigma_tol,
    )

    return evaluate(dataset=dataset,
                    predictions=predictions,
                    output_folder=output_folder,
                    **extra_args)
Example #8
def _synchronize_lists(_embeds_list, _idxs_list):
    gathered_data = all_gather({
        'embeds_list': _embeds_list,
        'idxs_list': _idxs_list,
    })
    if get_rank() == 0:
        _embeds_list = [d['embeds_list'] for d in gathered_data]
        _embeds_list = flatten(_embeds_list)
        _embeds_list = [x.cpu() for x in _embeds_list]
        _idxs_list = [d['idxs_list'] for d in gathered_data]
        _idxs_list = flatten(_idxs_list)
        _idxs_list = [x.cpu() for x in _idxs_list]
        master_embeds_list.extend(_embeds_list)
        master_idxs_list.extend(_idxs_list)
    synchronize()
    return [], []
Example #9
def init_pretrained_weights(key):
    """Initializes model with pretrained weights.

    Layers that don't match the pretrained layers in name or size are kept unchanged.
    """
    import os
    import errno
    import gdown

    def _get_torch_home():
        ENV_TORCH_HOME = 'TORCH_HOME'
        ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME'
        DEFAULT_CACHE_DIR = '~/.cache'
        torch_home = os.path.expanduser(
            os.getenv(
                ENV_TORCH_HOME,
                os.path.join(os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR),
                             'torch')))
        return torch_home

    torch_home = _get_torch_home()
    model_dir = os.path.join(torch_home, 'checkpoints')
    try:
        os.makedirs(model_dir)
    except OSError as e:
        if e.errno == errno.EEXIST:
            # Directory already exists, ignore.
            pass
        else:
            # Unexpected OSError, re-raise.
            raise

    filename = model_urls[key].split('/')[-1]

    cached_file = os.path.join(model_dir, filename)

    if not os.path.exists(cached_file):
        if comm.is_main_process():
            gdown.download(model_urls[key], cached_file, quiet=False)

    comm.synchronize()

    logger.info(f"Loading pretrained model from {cached_file}")
    state_dict = torch.load(cached_file, map_location=torch.device('cpu'))

    return state_dict
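This example only downloads and returns the state_dict; the docstring's promise (mismatched layers are left unchanged) is realized by a filtered load further downstream. A hedged sketch of what such a load usually looks like (load_matching_weights is illustrative, not a function from the repo):

import torch

def load_matching_weights(model, state_dict):
    # keep only entries whose name and shape match the target model
    model_dict = model.state_dict()
    matched = {k: v for k, v in state_dict.items()
               if k in model_dict and v.size() == model_dict[k].size()}
    model_dict.update(matched)
    model.load_state_dict(model_dict)
    return len(matched)  # number of layers actually initialized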
Example #10
    def _build_temp_sparse_graph(self, clusters_mx, per_example_negs):
        args = self.args

        # get all of the unique examples
        examples = clusters_mx.data.tolist()
        examples.extend(flatten(per_example_negs.tolist()))
        examples = unique(examples)
        examples = list(filter(lambda x: x >= 0, examples))

        sparse_graph = None
        if get_rank() == 0:
            ## make the list of pairs of dot products we need
            _row = clusters_mx.row
            # positives:
            local_pos_a, local_pos_b = np.where(
                np.triu(_row[np.newaxis, :] == _row[:, np.newaxis], k=1))
            pos_a = clusters_mx.data[local_pos_a]
            pos_b = clusters_mx.data[local_pos_b]
            # negatives:
            local_neg_a = np.tile(
                np.arange(per_example_negs.shape[0])[:, np.newaxis],
                (1, per_example_negs.shape[1])).flatten()
            neg_a = clusters_mx.data[local_neg_a]
            neg_b = per_example_negs.flatten()

            neg_mask = (neg_b != -1)
            neg_a = neg_a[neg_mask]
            neg_b = neg_b[neg_mask]

            # create subset of the sparse graph we care about
            a = np.concatenate((pos_a, neg_a), axis=0)
            b = np.concatenate((pos_b, neg_b), axis=0)
            edges = list(zip(a, b))
            affinities = [0.0 for i, j in edges]

            # convert to coo_matrix
            edges = np.asarray(edges).T
            affinities = np.asarray(affinities)
            _sparse_num = np.max(edges) + 1
            sparse_graph = coo_matrix((affinities, edges),
                                      shape=(_sparse_num, _sparse_num))

        synchronize()
        return sparse_graph
Example #11
def cache_url(url, model_dir=None, progress=True):
    r"""Loads the Torch serialized object at the given URL.
    If the object is already present in `model_dir`, it's deserialized and
    returned. The filename part of the URL should follow the naming convention
    ``filename-<sha256>.ext`` where ``<sha256>`` is the first eight or more
    digits of the SHA256 hash of the contents of the file. The hash is used to
    ensure unique names and to verify the contents of the file.
    The default value of `model_dir` is ``$TORCH_HOME/models`` where
    ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be
    overridden with the ``$TORCH_MODEL_ZOO`` environment variable.
    Args:
        url (string): URL of the object to download
        model_dir (string, optional): directory in which to save the object
        progress (bool, optional): whether or not to display a progress bar to stderr
    Example:
        >>> cached_file = utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')
    """
    if model_dir is None:
        torch_home = os.path.expanduser(os.getenv("TORCH_HOME", "~/.torch"))
        model_dir = os.getenv("TORCH_MODEL_ZOO",
                              os.path.join(torch_home, "models"))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    parts = urlparse(url)
    filename = os.path.basename(parts.path)
    if filename == "model_final.pkl":
        # workaround as pre-trained Caffe2 models from Detectron have all the same filename
        # so make the full path the filename by replacing / with _
        filename = parts.path.replace("/", "_")
    cached_file = os.path.join(model_dir, filename)
    if not os.path.exists(cached_file) and is_main_process():
        sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file))
        hash_prefix = HASH_REGEX.search(filename)
        if hash_prefix is not None:
            hash_prefix = hash_prefix.group(1)
            # workaround: Caffe2 models don't have a hash, but follow the R-50 convention,
            # which matches the hash PyTorch uses. So we skip the hash matching
            # if the hash_prefix is less than 6 characters
            if len(hash_prefix) < 6:
                hash_prefix = None
        _download_url_to_file(url, cached_file, hash_prefix, progress=progress)
    synchronize()
    return cached_file
Example #12
def val_in_train(model, criterion, dataset_name_val, data_loader_val,
                 tblogger, iteration, checkpointer, distributed):
    logger = logging.getLogger('eve.' + __name__)

    if distributed:
        model_val = model.module
    else:
        model_val = model

    # only main process will return result
    metrics = inference(model_val, criterion,
                        data_loader_val, dataset_name_val)

    synchronize()

    if is_main_process():
        if tblogger is not None:
            for k, v in metrics.items():
                tblogger.add_scalar('val/' + k, v, iteration)
                logger.info("{}: {}".format(k, v))
        return metrics
    else:
        return None
Example #13
def inference(
        model,
        data_loader,
        dataset_name,
        mem_active=False,
        output_folder=None,
):
    # convert to a torch.device for efficiency
    device = torch.device("cuda")
    num_devices = get_world_size()
    logger = logging.getLogger("AlphAction.inference")
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} dataset({} videos).".format(dataset_name, len(dataset)))
    start_time = time.time()
    predictions = compute_on_dataset(model, data_loader, device, logger, mem_active)
    # wait for all processes to complete before measuring the time
    synchronize()
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    logger.info(
        "Total inference time: {} ({} s / video per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset), num_devices
        )
    )

    predictions = _accumulate_predictions_from_multiple_gpus(predictions)
    if not is_main_process():
        return

    if output_folder:
        torch.save(predictions, os.path.join(output_folder, "predictions.pth"))

    return evaluate(
        dataset=dataset,
        predictions=predictions,
        output_folder=output_folder,
    )
Example #14
                 net,
                 criterion_eval,
                 cfg,
                 test_meter,
                 global_step=0,
                 device=device,
                 local_rank=get_rank())

############## training code #############################
if not cfg.EVALUATE:
    scaler = torch.cuda.amp.GradScaler(enabled=cfg.AMP.ENABLED)
    # start from epoch 0 or last checkpoint epoch
    start_epoch = checkpointer.epoch
    for epoch in range(start_epoch, cfg.OPTIM.EPOCHS):
        # wait for all processes before every epoch
        synchronize()
        logging.info("PROGRESS: {}%".format(
            round(100 * epoch / cfg.OPTIM.EPOCHS, 4)))
        global_step = epoch * len(trainloader)

        # an empirical rule for redrawing random projections in Performer
        if cfg.MODEL.ARCH.startswith(
                'msvit') and cfg.MODEL.VIT.MSVIT.ATTN_TYPE == "performer":
            if hasattr(net, 'module'):
                net.module.feature_redraw_interval = 1 + 5 * epoch
            else:
                net.feature_redraw_interval = 1 + 5 * epoch

        if cfg.MODEL.ARCH.startswith(
                'msvit') and cfg.MODEL.VIT.MSVIT.ATTN_TYPE.startswith(
                    'longformer'):
Example #15
def do_infer(
    model,
    data_loader,
    dataset_name,
    device="cuda",
    output_folder=None,
):
    # convert to a torch.device for efficiency
    device = torch.device(device)
    num_devices = get_world_size()
    logger = logging.getLogger("EfficientDet.inference")
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} dataset({} images).".format(
        dataset_name, len(dataset)))
    total_timer = Timer()
    inference_timer = Timer()
    total_timer.tic()
    predictions = compute_on_dataset(model, data_loader, device,
                                     inference_timer)
    # wait for all processes to complete before measuring the time
    synchronize()
    total_time = total_timer.toc()
    total_time_str = get_time_str(total_time)
    logger.info(
        "Total run time: {} ({} s / img per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset),
            num_devices))
    total_infer_time = get_time_str(inference_timer.total_time)
    logger.info(
        "Model inference time: {} ({} s / img per device, on {} devices)".
        format(
            total_infer_time,
            inference_timer.total_time * num_devices / len(dataset),
            num_devices,
        ))

    predictions = _accumulate_predictions_from_multiple_gpus(predictions)
    if not is_main_process():
        return

    coco_results = []
    image_ids = []
    for image_id, prediction in enumerate(predictions):
        original_id = dataset.image_ids[image_id]
        image_ids.append(original_id)
        coco_results.extend([{
            "image_id": original_id,
            "category_id": dataset.return_coco_label(e['class']),
            "bbox": e['bbox'],
            "score": e['score']
        } for e in prediction])

    map_05_09 = 0
    with tempfile.NamedTemporaryFile() as f:
        file_path = f.name
        output_folder = './'
        if output_folder:
            file_path = os.path.join(output_folder, 'bbox_results.json')
        with open(file_path, "w") as w_obj:
            json.dump(coco_results, w_obj)

        # load results in COCO evaluation tool
        coco_true = dataset.coco
        coco_pred = coco_true.loadRes(file_path)

        # run COCO evaluation
        coco_eval = COCOeval(coco_true, coco_pred, 'bbox')
        coco_eval.params.imgIds = image_ids
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()
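        # (stats[0] below is the standard COCO AP averaged over IoU=0.50:0.95, all areas, maxDets=100)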

        map_05_09 = coco_eval.stats[0]
    return map_05_09
Example #16
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Inference")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    # Merge config file.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # Print experimental infos.
    save_dir = ""
    logger = setup_logger("AlphAction", save_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(cfg)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + get_pretty_env_info())

    # Build the model.
    model = build_detection_model(cfg)
    model.to("cuda")

    # load weight.
    output_dir = cfg.OUTPUT_DIR
    checkpointer = ActionCheckpointer(cfg, model, save_dir=output_dir)
    checkpointer.load(cfg.MODEL.WEIGHT)

    output_folders = [None] * len(cfg.DATASETS.TEST)
    dataset_names = cfg.DATASETS.TEST
    mem_active = has_memory(cfg.IA_STRUCTURE)
    if cfg.OUTPUT_DIR:
        for idx, dataset_name in enumerate(dataset_names):
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference",
                                         dataset_name)
            os.makedirs(output_folder, exist_ok=True)
            output_folders[idx] = output_folder

    # Do inference.
    data_loaders_test = make_data_loader(cfg,
                                         is_train=False,
                                         is_distributed=distributed)
    for output_folder, dataset_name, data_loader_test in zip(
            output_folders, dataset_names, data_loaders_test):
        inference(
            model,
            data_loader_test,
            dataset_name,
            mem_active=mem_active,
            output_folder=output_folder,
        )
        synchronize()
Example #17
def main():
    # Add augments
    parser = argparse.ArgumentParser(description="Vision Research Toolkit by PyTorch")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true"
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER
    )

    args = parser.parse_args()

    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", init_method="env://"
        )
        synchronize()

    # make config
    cfg = make_config(args.config_file, args.opts)

    # obtain absolute dir of project
    project_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))

    if cfg.CHECKPOINTER.DIR:
        if cfg.CHECKPOINTER.DIR[0] != os.sep:
            # if the checkpoint dir is not an absolute path, make it relative to the project dir
            cfg.CHECKPOINTER.DIR = os.path.join(project_dir, cfg.CHECKPOINTER.DIR)
    else:
        cfg.CHECKPOINTER.DIR = os.path.join(project_dir, 'log')

    if not cfg.CHECKPOINTER.NAME:
        cfg.CHECKPOINTER.NAME = strftime("%Y-%m-%d-%H-%M-%S", localtime())

    cfg.freeze()

    save_dir = os.path.join(cfg.CHECKPOINTER.DIR, cfg.CHECKPOINTER.NAME)
    mkdir(save_dir)

    # Init logger
    logger = setup_logger(cfg.NAME, save_dir, get_rank())
    logger.info("Using {} GPU".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info ...")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    logger.info("Running with config:\n{}".format(cfg))

    output_config_path = os.path.join(save_dir, os.path.basename(args.config_file))
    logger.info("Saving config into: {}".format(output_config_path))
    # save overloaded model config in the output directory
    save_config(cfg, output_config_path)

    train(cfg, args.local_rank, args.distributed)
    return
Example #18
def train(cfg, local_rank, distributed):
    logger = logging.getLogger(cfg.NAME)
    # build model
    model = build_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    # build solver
    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {"iteration": 0}

    save_dir = os.path.join(cfg.CHECKPOINTER.DIR, cfg.CHECKPOINTER.NAME)

    save_to_disk = get_rank() == 0
    checkpointer = Checkpointer(
        model=model, optimizer=optimizer, scheduler=scheduler,
        save_dir=save_dir, save_to_disk=save_to_disk, logger=logger
    )
    extra_checkpoint_data = checkpointer.load(cfg.CHECKPOINTER.LOAD_NAME)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    evaluate = cfg.SOLVER.EVALUATE
    if evaluate:
        synchronize()
        data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed, is_for_period=True)
        synchronize()
    else:
        data_loader_val = None

    save_to_disk = get_rank() == 0
    if cfg.SUMMARY_WRITER and save_to_disk:
        save_dir = os.path.join(cfg.CHECKPOINTER.DIR, cfg.CHECKPOINTER.NAME)
        summary_writer = make_summary_writer(cfg.SUMMARY_WRITER, save_dir, model_name=cfg.MODEL.NAME)
    else:
        summary_writer = None

    do_train(
        cfg,
        model,
        data_loader,
        data_loader_val,
        optimizer,
        scheduler,
        checkpointer,
        device,
        arguments,
        summary_writer
    )

    return model
Example #19
def eval_wdoc(args, example_dir, metadata, sub_trainer, save_fname=None):
    assert save_fname is not None

    logger.info('Building within doc sparse graphs...')
    doc_level_graphs = []
    per_doc_coref_clusters = []
    for doc_clusters in tqdm(metadata.wdoc_clusters.values(),
                             disable=(get_rank() != 0)):
        per_doc_coref_clusters.append([[x for x in v if x != k]
                                       for k, v in doc_clusters.items()])
        doc_mentions = np.asarray(
            [x for k, v in doc_clusters.items() for x in v if x != k])
        doc_mentions = np.sort(doc_mentions)
        doc_level_graphs.append(
            build_sparse_affinity_graph(args,
                                        doc_mentions,
                                        example_dir,
                                        metadata,
                                        None,
                                        sub_trainer,
                                        build_coref_graph=True,
                                        build_linking_graph=True))

    logger.info('Done.')

    # don't need other processes at this point
    if get_rank() != 0:
        synchronize()
        return

    # build everything needed to compute metrics and compute them!
    coref_graphs, linking_graphs = [], []
    for coref_graph, linking_graph in doc_level_graphs:
        coref_graphs.append(coref_graph)
        linking_graphs.append(linking_graph)

    # build the joint whole graph
    joint_whole_graph = deepcopy(
        _merge_sparse_graphs(coref_graphs + linking_graphs))

    logger.info('Computing coref metrics...')
    coref_metrics = compute_coref_metrics(per_doc_coref_clusters, coref_graphs,
                                          args.eval_coref_threshold)
    logger.info('Done.')

    logger.info('Computing linking metrics...')
    linking_metrics, slim_linking_graph = compute_linking_metrics(
        metadata, linking_graphs)
    logger.info('Done.')

    ########################################################################
    ## FIXME: hacking to get HAC working
    #
    #joint_whole_graph = _merge_sparse_graphs(coref_graphs + linking_graphs)

    #hierarchy_tree = np.full((2*joint_whole_graph.shape[0], 2), -1)
    #proposed_merges = np.vstack((joint_whole_graph.row, joint_whole_graph.col)).T

    #def _get_leaves(hierarchy_tree, internal_node):
    #    q = [internal_node]
    #    leaves = []
    #    while len(q) > 0:
    #        curr_node = q.pop()
    #        left_child = hierarchy_tree[curr_node][0]
    #        right_child = hierarchy_tree[curr_node][1]
    #        if left_child == -1:
    #            assert right_child == -1
    #            leaves.append(curr_node)
    #        else:
    #            q.append(left_child)
    #            q.append(right_child)
    #    return leaves

    #def _avg_linkage(joint_whole_graph, leaves_a, leaves_b):
    #    row_mask = np.isin(joint_whole_graph.row, leaves_a)\
    #               ^ np.isin(joint_whole_graph.row, leaves_b)
    #    col_mask = np.isin(joint_whole_graph.col, leaves_a)\
    #               ^ np.isin(joint_whole_graph.col, leaves_b)
    #    edge_weights = joint_whole_graph.data[row_mask & col_mask]
    #    if edge_weights.size == 0:
    #        return -np.inf
    #    return np.mean(edge_weights)
    #
    #merge_node_id = joint_whole_graph.shape[0] # start with the next possible index
    #valid_merge_exists = True
    #count = 0
    #while valid_merge_exists:
    #    valid_merge_exists = False
    #    max_linkage = 0.0
    #    max_a, max_b = None, None
    #    for pair in proposed_merges:
    #        a, b = tuple(pair)
    #        if a == b:
    #            continue
    #        valid_merge_exists = True
    #        leaves_a = _get_leaves(hierarchy_tree, a)
    #        leaves_b = _get_leaves(hierarchy_tree, b)
    #        linkage_score = _avg_linkage(joint_whole_graph, leaves_a, leaves_b)
    #        if linkage_score > max_linkage:
    #            max_a = a
    #            max_b = b
    #            max_linkage = linkage_score

    #    if not valid_merge_exists:
    #        continue

    #    # create new node in the hierarchy with id = `merge_node_id`
    #    hierarchy_tree[merge_node_id][0] = max_a
    #    hierarchy_tree[merge_node_id][1] = max_b

    #    # update all the relevant edges in `proposed_merges`
    #    join_mask = np.isin(proposed_merges, [max_a, max_b])
    #    proposed_merges[join_mask] = merge_node_id

    #    # increment for next merger
    #    merge_node_id += 1

    #    count += 1
    #    print(count)

    ########################################################################

    logger.info('Computing joint metrics...')
    slim_coref_graph = _get_global_maximum_spanning_tree(coref_graphs)
    joint_metrics = compute_joint_metrics(
        metadata, [slim_coref_graph, slim_linking_graph])
    logger.info('Done.')

    metrics = {
        'coref_fmi': coref_metrics['fmi'],
        'coref_rand_index': coref_metrics['rand_index'],
        'coref_threshold': coref_metrics['threshold'],
        'vanilla_recall': linking_metrics['vanilla_recall'],
        'vanilla_accuracy': linking_metrics['vanilla_accuracy'],
        'joint_accuracy': joint_metrics['joint_accuracy'],
        'joint_cc_recall': joint_metrics['joint_cc_recall']
    }

    # save all of the predictions for later analysis
    save_data = {}
    save_data.update(coref_metrics)
    save_data.update(linking_metrics)
    save_data.update(joint_metrics)
    save_data.update({'metadata': metadata})
    save_data.update({'joint_whole_graph': joint_whole_graph})

    with open(save_fname, 'wb') as f:
        pickle.dump(save_data, f)

    synchronize()
    return metrics
Example #20
def do_train(
    cfg,
    model,
    train_dataloader,
    val_dataloader,
    optimizer,
    lr_scheduler,
    checkpointer,
    device,
    checkpoint_period,
    test_period,
    log_period,
    arguments,
):
    logger = logging.getLogger("EfficientDet.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(train_dataloader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()

    for iteration, (images, targets, _) in enumerate(train_dataloader,
                                                     start_iter):

        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device)
        targets = targets.to(device)

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        # Note: If mixed precision is not used, this ends up doing nothing
        # Otherwise apply loss scaling for mixed-precision recipe
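        # (`amp` here refers to NVIDIA apex.amp, assuming model/optimizer were prepared with amp.initialize)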
        with amp.scale_loss(losses, optimizer) as scaled_losses:
            scaled_losses.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        # lr_scheduler.step(losses_reduced)
        lr_scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % log_period == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))
        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if val_dataloader is not None and (
            (test_period > 0 and iteration % test_period == 0)
                or iteration == max_iter):
            meters_val = MetricLogger(delimiter="  ")
            synchronize()
            map_05_09 = do_infer(  # The result can be used for additional logging, e.g. for TensorBoard
                model,
                val_dataloader,
                dataset_name="[Validation]",
                device=cfg.device,
                output_folder=None,
            )
            logger.info("Validation MAP 0.5:0.9 ===> {}".format(map_05_09))
            synchronize()
            model.train()
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))
Example #21
File: misc.py Project: iesl/lerac
def initialize_exp(args, logger_filename='train.log'):
    """
    Initialize the experiment:
    - dump parameters
    - create a logger
    - set the random seed
    - setup distributed computation
    """
    # setup cuda using torch's distributed framework
    setup_cuda_and_distributed(args)

    # random seed
    set_seed(args)

    # don't overwrite previous output directory
    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError("Output directory ({}) already exists and is not "
                         "empty. Use --overwrite_output_dir "
                         "to overcome.".format(args.output_dir))

    # create output directory and dump parameters
    if get_rank() == 0:
        # create output directory
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
            time.sleep(3)

        # args file prefix
        if args.do_train:
            prefix = "train"
        elif args.do_train_eval:
            prefix = "train_eval"
        elif args.do_val:
            prefix = "val"
        elif args.do_test:
            prefix = "test"
        else:
            raise ValueError("No valid train or validation mode selected")
        args_file = prefix + "_args.pkl"
        pickle.dump(args, open(os.path.join(args.output_dir, args_file), "wb"))
    synchronize()

    # get running command
    command = ["python", sys.argv[0]]
    for x in sys.argv[1:]:
        if x.startswith('--'):
            assert '"' not in x and "'" not in x
            command.append(x)
        else:
            assert "'" not in x
            command.append("'%s'" % x)
    command = ' '.join(command)
    args.command = command

    # create a logger
    logger = create_logger(args, logger_filename)
    logger.info('============ Initialized logger ============')
    logger.info('\n'.join(
        ['%s: %s' % (k, str(v)) for k, v in sorted(dict(vars(args)).items())]))
    logger.info('The experiment will be stored in %s\n' % args.output_dir)
    logger.info('Running command: %s\n' % args.command)
    return logger
Example #22
def inference(model, criterion, data_loader, dataset_name, save_result=False):
    logger = logging.getLogger('eve.' + __name__)

    device = torch.device('cuda')
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} dataset ({} point clouds).".format(
        dataset_name, len(dataset)))

    if get_world_size() == 1:
        extra_args = {}
    else:
        rank = get_rank()
        extra_args = dict(desc="rank {}".format(rank))

    start_time = time.time()

    model.eval()
    outputs_per_gpu = {}
    targets_per_gpu = {}
    file_path_per_gpu = {}

    times = []

    with torch.no_grad():
        for batch in tqdm(data_loader, **extra_args):
            locs, feats, targets, metadata = batch
            inputs = ME.SparseTensor(feats, coords=locs).to(device)
            targets = targets.to(device, non_blocking=True).long()

            torch.cuda.synchronize()
            start_time = time.time()
            outputs = model(inputs, y=targets)
            torch.cuda.synchronize()
            end_time = time.time()
            times.append(end_time - start_time)

            arch = cfg.MODEL.ARCHITECTURE
            if arch == 'minkunet4d' or arch == 'minkunet_eve':
                for batch_idx in range(len(metadata)):
                    for time_idx in range(cfg.INPUT.VIDEO.NUM_FRAMES):
                        inv_map = metadata[batch_idx][time_idx]['inverse_map']
                        file_path = metadata[batch_idx][time_idx]['file_path']

                        locs_frame = (locs[:, -1] == batch_idx) & \
                            (locs[:, -2] == time_idx)
                        one_output, one_target = compute_one_frame(
                            outputs, targets, locs_frame, inv_map)

                        outputs_per_gpu[file_path] = one_output
                        targets_per_gpu[file_path] = one_target
                        file_path_per_gpu[file_path] = file_path
            else:  # other minknet
                for batch_idx in range(len(metadata)):
                    inv_map = metadata[batch_idx]['inverse_map']
                    file_path = metadata[batch_idx]['file_path']

                    # From MinkowskiEngine v0.3, batch index is on the first column
                    locs_frame = locs[:, -1] == batch_idx
                    one_output, one_target = compute_one_frame(
                        outputs, targets, locs_frame, inv_map)

                    outputs_per_gpu[file_path] = one_output
                    targets_per_gpu[file_path] = one_target
                    file_path_per_gpu[file_path] = file_path

    synchronize()

    logger.info("Total inference time: {}".format(np.sum(times)))

    # NOTE: `all_gather` can lead to CUDA out of memory here, so we use
    # `scatter_gather` to save each process's results in LOGS.DIR/tmp;
    # the temporary files are cleared after gathering.
    outputs = scatter_gather(outputs_per_gpu)
    targets = scatter_gather(targets_per_gpu)
    file_paths = scatter_gather(file_path_per_gpu)
    if not is_main_process():
        return None

    all_outputs = {k: v.numpy() for o in outputs for k, v in o.items()}
    all_targets = {k: v.numpy() for t in targets for k, v in t.items()}
    all_file_paths = {k: v for f in file_paths for k, v in f.items()}

    assert len(all_outputs) == len(dataset.all_files), \
        '%d vs %d' % (len(all_outputs), len(dataset.all_files))

    if cfg.LOGS.SAVE_RESULT is False:
        all_file_paths = None
    metrics = evaluate(dataset, all_outputs, all_targets, all_file_paths)

    return metrics
Example #23
    def _train_triplet(self, dataset_list, metadata):
        args = self.args

        losses = []
        time_per_dataset = []
        dataset_sizes = []
        pos_m_neg_m_losses = []
        pos_m_neg_e_losses = []
        pos_e_neg_m_losses = []
        pos_e_neg_e_losses = []

        self.model.train()
        self.model.zero_grad()

        for dataset in dataset_list:
            _dataset_start_time = time.time()
            dataset_sizes.append(len(dataset))
            dataloader = TripletEmbeddingDataLoader(args, dataset)
            for batch in dataloader:
                batch = tuple(t.to(args.device) for t in batch)
                inputs = {
                    'input_ids': batch[1],
                    'attention_mask': batch[2],
                    'token_type_ids': batch[3],
                    'concat_input': False
                }
                outputs = self.model(**inputs)

                pos_neg_dot_prods = torch.sum(outputs[:, 0:1, :] *
                                              outputs[:, 1:, :],
                                              dim=-1)

                if args.training_method == 'triplet_max_margin':
                    # max-margin
                    per_triplet_loss = F.relu(
                        pos_neg_dot_prods[:, 1]  # negative dot products
                        - pos_neg_dot_prods[:, 0]  # positive dot products
                        + args.margin)
                elif args.training_method == 'triplet_bpr':
                    # BPR
                    per_triplet_loss = torch.sigmoid(
                        pos_neg_dot_prods[:, 1]  # negative dot products
                        - pos_neg_dot_prods[:, 0]  # positive dot products
                        + args.margin)
                else:
                    raise ValueError('unsupported training_method')

                # record triplet specific losses
                _detached_per_triplet_loss = per_triplet_loss.clone().detach().cpu()
                _mask = batch[0] < metadata.num_entities
                pos_m_neg_m_mask = ~_mask[:, 1] & ~_mask[:, 2]
                pos_m_neg_e_mask = ~_mask[:, 1] & _mask[:, 2]
                pos_e_neg_m_mask = _mask[:, 1] & ~_mask[:, 2]
                pos_e_neg_e_mask = _mask[:, 1] & _mask[:, 2]

                pos_m_neg_m_losses.extend(
                    _detached_per_triplet_loss[pos_m_neg_m_mask].numpy(
                    ).tolist())
                pos_m_neg_e_losses.extend(
                    _detached_per_triplet_loss[pos_m_neg_e_mask].numpy(
                    ).tolist())
                pos_e_neg_m_losses.extend(
                    _detached_per_triplet_loss[pos_e_neg_m_mask].numpy(
                    ).tolist())
                pos_e_neg_e_losses.extend(
                    _detached_per_triplet_loss[pos_e_neg_e_mask].numpy(
                    ).tolist())
                loss = torch.mean(per_triplet_loss)
                loss.backward()

                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               args.max_grad_norm)
                self.optimizer.step()
                self.scheduler.step()
                self.model.zero_grad()

            time_per_dataset.append(time.time() - _dataset_start_time)

        gathered_data = all_gather({
            'pos_m_neg_m_losses': pos_m_neg_m_losses,
            'pos_m_neg_e_losses': pos_m_neg_e_losses,
            'pos_e_neg_m_losses': pos_e_neg_m_losses,
            'pos_e_neg_e_losses': pos_e_neg_e_losses
        })

        if get_rank() == 0:
            pos_m_neg_m_losses = flatten(
                [d['pos_m_neg_m_losses'] for d in gathered_data])
            pos_m_neg_e_losses = flatten(
                [d['pos_m_neg_e_losses'] for d in gathered_data])
            pos_e_neg_m_losses = flatten(
                [d['pos_e_neg_m_losses'] for d in gathered_data])
            pos_e_neg_e_losses = flatten(
                [d['pos_e_neg_e_losses'] for d in gathered_data])
            losses = pos_m_neg_m_losses + pos_m_neg_e_losses + pos_e_neg_m_losses + pos_e_neg_e_losses

            pos_m_neg_m_loss = 0.0 if len(
                pos_m_neg_m_losses) == 0 else np.mean(pos_m_neg_m_losses)
            pos_m_neg_e_loss = 0.0 if len(
                pos_m_neg_e_losses) == 0 else np.mean(pos_m_neg_e_losses)
            pos_e_neg_m_loss = 0.0 if len(
                pos_e_neg_m_losses) == 0 else np.mean(pos_e_neg_m_losses)
            pos_e_neg_e_loss = 0.0 if len(
                pos_e_neg_e_losses) == 0 else np.mean(pos_e_neg_e_losses)
            loss = np.mean(losses)

            synchronize()
            return {
                'embed_loss': loss,
                'embed_num_examples': len(losses),
                'embed_pos_m_neg_m_loss': pos_m_neg_m_loss,
                'embed_pos_m_neg_e_loss': pos_m_neg_e_loss,
                'embed_pos_e_neg_m_loss': pos_e_neg_m_loss,
                'embed_pos_e_neg_e_loss': pos_e_neg_e_loss,
                'embed_pos_m_neg_m_num_examples': len(pos_m_neg_m_losses),
                'embed_pos_m_neg_e_num_examples': len(pos_m_neg_e_losses),
                'embed_pos_e_neg_m_num_examples': len(pos_e_neg_m_losses),
                'embed_pos_e_neg_e_num_examples': len(pos_e_neg_e_losses)
            }
        else:
            synchronize()
            return None
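A tiny numeric sketch (made-up scores) of the two per-triplet losses above: the max-margin variant only penalizes triplets where the negative comes within `margin` of the positive, while the 'triplet_bpr' variant is its smooth sigmoid counterpart.

import torch
import torch.nn.functional as F

margin = 0.3
pos_scores = torch.tensor([0.8, 0.4])   # anchor . positive
neg_scores = torch.tensor([0.2, 0.6])   # anchor . negative

max_margin = F.relu(neg_scores - pos_scores + margin)        # [0.0, 0.5]
bpr_like = torch.sigmoid(neg_scores - pos_scores + margin)   # [~0.43, ~0.62]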
Example #24
    def train(self):
        args = self.args

        # set up data structures for choosing available negatives on-the-fly
        if args.clustering_domain == 'within_doc':
            self._neg_choosing_prep()
        else:
            raise NotImplementedError('xdoc not implemented yet')

        global_step = 0
        log_return_dicts = []

        logger.info('Starting training...')

        batch = None
        for epoch in range(args.num_train_epochs):
            logger.info(
                '********** [START] epoch: {} **********'.format(epoch))

            num_batches = None
            if get_rank() == 0:
                data_iterator = iter(self.train_dataloader)
                num_batches = len(data_iterator)
            num_batches = broadcast(num_batches, src=0)

            logger.info('num_batches: {}'.format(num_batches))

            for _ in trange(num_batches,
                            desc='Epoch: {} - Batches'.format(epoch),
                            disable=(get_rank() != 0 or args.disable_logging)):

                ### FIXME: hack for hyperparameter scheduling
                #if global_step > 400:
                #    args.training_edges_considered = 'all'
                #if global_step % 200 == 199:
                #    if get_rank() == 0:
                #        self.embed_sub_trainer.save_model(global_step)
                #    synchronize()
                #    val_metrics = self.evaluate(
                #            split='val',
                #            suffix='checkpoint-{}'.format(global_step)
                #    )
                #    synchronize()
                #    exit()

                # get batch from rank0 and broadcast it to the other processes
                if get_rank() == 0:
                    try:
                        next_batch = next(data_iterator)
                        # make sure the cluster_mx is sorted correctly
                        _row, _col, _data = [], [], []
                        current_row = 0
                        ctr = 0
                        for r, d in sorted(zip(next_batch.row,
                                               next_batch.data)):
                            if current_row != r:
                                current_row = r
                                ctr = 0
                            _row.append(r)
                            _col.append(ctr)
                            _data.append(d)
                            ctr += 1
                        next_batch = coo_matrix((_data, (_row, _col)),
                                                shape=next_batch.shape)
                        negs = self._choose_negs(next_batch)
                        batch = (next_batch, negs)
                    except StopIteration:
                        batch = None

                batch = broadcast(batch, src=0)

                if batch is None:
                    break

                # run train_step
                log_return_dicts.append(self.train_step(batch))
                global_step += 1

                # logging stuff for babysitting
                if global_step % args.logging_steps == 0:
                    avg_return_dict = reduce(dict_merge_with, log_return_dicts)
                    for stat_name, stat_value in avg_return_dict.items():
                        logger.info('Average %s: %s at global step: %s',
                                    stat_name,
                                    str(stat_value / args.logging_steps),
                                    str(global_step))
                    logger.info('Using {} edges for training'.format(
                        args.training_edges_considered))
                    log_return_dicts = []

                # refresh the knn index
                if args.knn_refresh_steps > 0 and global_step % args.knn_refresh_steps == 0:
                    logger.info('Refreshing kNN index...')
                    self.train_knn_index.refresh_index()
                    logger.info('Done.')

            # save the model at the end of every epoch
            if get_rank() == 0:
                #self.embed_sub_trainer.save_model(global_step)
                self.concat_sub_trainer.save_model(global_step)
            synchronize()

            logger.info('********** [END] epoch: {} **********'.format(epoch))

            # run full evaluation at the end of each epoch
            #if args.evaluate_during_training and epoch % 10 == 9:
            if args.evaluate_during_training:
                if args.do_train_eval:
                    train_eval_metrics = self.evaluate(
                        split='train',
                        suffix='checkpoint-{}'.format(global_step))
                if args.do_val:
                    val_metrics = self.evaluate(
                        split='val',
                        suffix='checkpoint-{}'.format(global_step))
Example #25
def compute_on_dataset_2stage(model, data_loader, device, logger):
    # two stage inference, for model with memory features.
    # first extract features and then do the inference
    cpu_device = torch.device("cpu")
    num_devices = get_world_size()
    dataset = data_loader.dataset
    if num_devices == 1:
        extra_args = {}
    else:
        rank = get_rank()
        extra_args = dict(desc="rank {}".format(rank))

    loader_len = len(data_loader)
    person_feature_pool = MemoryPool()
    batch_info_list = [None]*loader_len
    logger.info("Stage 1: extracting clip features.")
    start_time = time.time()

    for i, batch in enumerate(tqdm(data_loader, **extra_args)):
        slow_clips, fast_clips, boxes, objects, extras, video_ids = batch
        slow_clips = slow_clips.to(device)
        fast_clips = fast_clips.to(device)
        boxes = [box.to(device) for box in boxes]
        objects = [None if (box is None) else box.to(device) for box in objects]
        movie_ids = [e["movie_id"] for e in extras]
        timestamps = [e["timestamp"] for e in extras]
        with torch.no_grad():
            feature = model(slow_clips, fast_clips, boxes, objects, part_forward=0)
            person_feature = [ft.to(cpu_device) for ft in feature[0]]
            object_feature = [ft.to(cpu_device) for ft in feature[1]]
        # store person features into memory pool
        for movie_id, timestamp, p_ft, o_ft in zip(movie_ids, timestamps, person_feature, object_feature):
            person_feature_pool[movie_id, timestamp] = p_ft
        # store the remaining batch information for the second inference stage
        batch_info_list[i] = (movie_ids, timestamps, video_ids, object_feature)

    # wait for every rank to finish stage-1 feature extraction
    synchronize()
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    logger.info(
        "Stage 1 time: {} ({} s / video per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset), num_devices
        )
    )
    # gather the per-rank feature pools so every process sees the full pool
    feature_pool = all_gather(person_feature_pool)
    all_feature_pool_p = MemoryPool()
    all_feature_pool_p.update_list(feature_pool)
    del feature_pool, person_feature_pool

    # do the inference
    results_dict = {}
    logger.info("Stage 2: predicting with extracted feature.")
    start_time = time.time()
    for movie_ids, timestamps, video_ids, object_feature in tqdm(batch_info_list, **extra_args):
        current_feat_p = [all_feature_pool_p[movie_id, timestamp].to(device)
                          for movie_id, timestamp in zip(movie_ids, timestamps)]
        current_feat_o = [ft_o.to(device) for ft_o in object_feature]
        extras = dict(
            person_pool=all_feature_pool_p,
            movie_ids=movie_ids,
            timestamps=timestamps,
            current_feat_p=current_feat_p,
            current_feat_o=current_feat_o,
        )
        with torch.no_grad():
            output = model(None, None, None, None, extras=extras, part_forward=1)
            output = [o.to(cpu_device) for o in output]
        results_dict.update(
            {video_id: result for video_id, result in zip(video_ids, output)}
        )
    synchronize()
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    logger.info(
        "Stage 2 time: {} ({} s / video per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset), num_devices
        )
    )

    return results_dict
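
# Minimal sketch (an assumed stand-in, not the repo's own `all_gather`) of how a
# per-rank feature pool can be gathered so that every process ends up with the
# complete pool, mirroring `all_gather(person_feature_pool)` above.
import torch.distributed as dist

def all_gather_objects(obj):
    """Return a list holding each rank's copy of `obj` (must be picklable)."""
    if not dist.is_available() or not dist.is_initialized():
        return [obj]
    gathered = [None] * dist.get_world_size()
    dist.all_gather_object(gathered, obj)
    return gathered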
Ejemplo n.º 26
0
def eval_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu  # local rank, local machine cuda id
    args.local_rank = args.gpu
    args.batch_size = args.batch_size_per_gpu

    global_rank = args.gpu + args.machine_rank * ngpus_per_node
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=global_rank)

    # Setup logging format.
    logging.setup_logging("stdout.log", 'w')

    # synchronize is needed here to prevent a possible timeout after calling
    # init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    args.rank = comm.get_rank()  # global rank
    torch.cuda.set_device(args.gpu)

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # build the supernet
    logger.info("=> creating model '{}'".format(args.arch))
    model = models.model_factory.create_model(args)
    model.cuda(args.gpu)
    model = comm.get_parallel_model(model, args.gpu)  #local rank

    # define loss function (criterion)
    criterion = nn.CrossEntropyLoss().cuda()

    ## load dataset, train_sampler: distributed
    train_loader, val_loader, train_sampler = build_data_loader(args)

    # evaluation requires a pretrained supernet checkpoint
    assert args.resume, 'args.resume must point to a pretrained checkpoint'
    model.module.load_weights_from_pretrained_models(args.resume)

    if train_sampler:
        train_sampler.set_epoch(0)

    targeted_min_flops = args.evo_search.targeted_min_flops
    targeted_max_flops = args.evo_search.targeted_max_flops

    # run evolutionary search
    parent_popu = []
    for idx in range(args.evo_search.parent_popu_size):
        if idx == 0:
            cfg = model.module.sample_min_subnet()
        else:
            cfg = model.module.sample_active_subnet_within_range(
                targeted_min_flops, targeted_max_flops)
        cfg['net_id'] = f'net_{idx % args.world_size}_evo_0_{idx}'
        parent_popu.append(cfg)

    pareto_global = {}
    for evo in range(args.evo_search.evo_iter):
        # partition the set of candidate sub-networks
        # and send them to each GPU for parallel evaluation

        # sub-networks to be evaluated on this rank's GPU
        my_subnets_to_be_evaluated = {}
        n_evaluated = len(parent_popu) // args.world_size * args.world_size
        for cfg in parent_popu[:n_evaluated]:
            if cfg['net_id'].startswith(f'net_{args.rank}_'):
                my_subnets_to_be_evaluated[cfg['net_id']] = cfg

        # evaluate the assigned sub-networks and aggregate results from all ranks
        eval_results = attentive_nas_eval.validate(
            my_subnets_to_be_evaluated,
            train_loader,
            val_loader,
            model,
            criterion,
            args,
            logger,
        )

        # update the Pareto frontier
        # in this case, we search the best FLOPs vs. accuracy trade-offs
        for cfg in eval_results:
            f = round(
                cfg['flops'] / args.evo_search.step) * args.evo_search.step
            if f not in pareto_global or pareto_global[f]['acc1'] < cfg['acc1']:
                pareto_global[f] = cfg

        # next batch of sub-networks to be evaluated
        parent_popu = []
        # mutate
        for idx in range(args.evo_search.mutate_size):
            while True:
                old_cfg = random.choice(list(pareto_global.values()))
                cfg = model.module.mutate_and_reset(
                    old_cfg, prob=args.evo_search.mutate_prob)
                flops = model.module.compute_active_subnet_flops()
                if targeted_min_flops <= flops <= targeted_max_flops:
                    break
            cfg['net_id'] = f'net_{idx % args.world_size}_evo_{evo}_mutate_{idx}'
            parent_popu.append(cfg)

        # cross over
        for idx in range(args.evo_search.crossover_size):
            while True:
                cfg1 = random.choice(list(pareto_global.values()))
                cfg2 = random.choice(list(pareto_global.values()))
                cfg = model.module.crossover_and_reset(cfg1, cfg2)
                flops = model.module.compute_active_subnet_flops()
                if targeted_min_flops <= flops <= targeted_max_flops:
                    break
            cfg['net_id'] = f'net_{idx % args.world_size}_evo_{evo}_crossover_{idx}'
            parent_popu.append(cfg)
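
# Standalone sketch (illustration only, not project code) of the Pareto-frontier
# update performed above: candidates are bucketed by FLOPs rounded to `step`,
# and only the most accurate sub-network per bucket is kept.
def update_pareto_frontier(pareto_global, eval_results, step):
    """`eval_results` holds dicts with 'flops' and 'acc1'; returns the updated frontier."""
    for cfg in eval_results:
        bucket = round(cfg['flops'] / step) * step
        if bucket not in pareto_global or pareto_global[bucket]['acc1'] < cfg['acc1']:
            pareto_global[bucket] = cfg
    return pareto_global

# e.g. update_pareto_frontier({}, [{'flops': 412.0, 'acc1': 77.1}], step=25)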
Ejemplo n.º 27
0
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu  # local rank, local machine cuda id
    args.local_rank = args.gpu
    args.batch_size = args.batch_size_per_gpu
    args.batch_size_total = args.batch_size * args.world_size
    # linearly rescale the base learning rate with the total batch size
    args.lr_scheduler.base_lr = args.lr_scheduler.base_lr * max(
        1, args.batch_size_total // 256)

    # set random seeds so that every rank samples the same random sub-networks
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpu is not None:
        torch.cuda.manual_seed(args.seed)

    global_rank = args.gpu + args.machine_rank * ngpus_per_node
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=global_rank)

    # Setup logging format.
    logging.setup_logging(args.logging_save_path, 'w')

    logger.info(
        f"Use GPU: {args.gpu}, machine rank {args.machine_rank}, "
        f"num_nodes {args.num_nodes}, gpus per node {ngpus_per_node}, "
        f"world size {args.world_size}")

    # synchronize is needed here to prevent a possible timeout after calling
    # init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    args.rank = comm.get_rank()  # global rank
    args.local_rank = args.gpu
    torch.cuda.set_device(args.gpu)

    # build model
    logger.info("=> creating model '{}'".format(args.arch))
    model = models.model_factory.create_model(args)
    model.cuda(args.gpu)

    # use sync batchnorm
    if getattr(args, 'sync_bn', False):
        model.apply(lambda m: setattr(m, 'need_sync', True))

    model = comm.get_parallel_model(model, args.gpu)  #local rank

    logger.info(model)

    criterion = loss_ops.CrossEntropyLossSmooth(args.label_smoothing).cuda(
        args.gpu)
    soft_criterion = loss_ops.AdaptiveLossSoft(args.alpha_min, args.alpha_max,
                                               args.iw_clip).cuda(args.gpu)

    if not getattr(args, 'inplace_distill', True):
        soft_criterion = None

    ## load dataset, train_sampler: distributed
    train_loader, val_loader, train_sampler = build_data_loader(args)
    args.n_iters_per_epoch = len(train_loader)

    logger.info(
        f'building optimizer and lr scheduler, local rank {args.gpu}, '
        f'global rank {args.rank}, world_size {args.world_size}')
    optimizer = build_optimizer(args, model)
    lr_scheduler = build_lr_scheduler(args, optimizer)

    # optionally resume from a checkpoint
    if args.resume:
        saver.load_checkpoints(args, model, optimizer, lr_scheduler, logger)

    logger.info(args)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        args.curr_epoch = epoch
        logger.info('Training lr {}'.format(lr_scheduler.get_lr()[0]))

        # train for one epoch
        acc1, acc5 = train_epoch(epoch, model, train_loader, optimizer, criterion, args, \
                soft_criterion=soft_criterion, lr_scheduler=lr_scheduler)

        if comm.is_master_process() or args.distributed:
            # validate supernet model
            validate(train_loader, val_loader, model, criterion, args)

        if comm.is_master_process():
            # save checkpoints
            saver.save_checkpoint(
                args.checkpoint_save_path,
                model,
                optimizer,
                lr_scheduler,
                args,
                epoch,
            )
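
# Minimal launcher sketch (an assumption; the original entry point is not shown):
# one training process per local GPU via torch.multiprocessing.spawn, which calls
# main_worker(gpu, ngpus_per_node, args) with gpu = 0..ngpus_per_node-1.
import torch
import torch.multiprocessing as mp

def launch_training(args):
    ngpus_per_node = torch.cuda.device_count()
    args.world_size = ngpus_per_node * args.num_nodes  # assumes args.num_nodes is set
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))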
Ejemplo n.º 28
0
def train():
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend="nccl", init_method="env://")
    synchronize()

    # create dataloader & network & optimizer
    model, model_fn_decorator = create_model(cfg)
    init_weights(model, init_type='kaiming')
    # model.to('cuda')
    model.cuda()
    model = nn.parallel.DistributedDataParallel(model,
                                                device_ids=[args.local_rank],
                                                output_device=args.local_rank)

    root_result_dir = args.output_dir
    os.makedirs(root_result_dir, exist_ok=True)

    log_file = os.path.join(root_result_dir, "log_train.txt")
    logger = create_logger(log_file, get_rank())
    logger.info("**********************Start logging**********************")

    # log the GPUs visible to this process
    gpu_list = os.environ.get('CUDA_VISIBLE_DEVICES', 'ALL')
    logger.info("CUDA_VISIBLE_DEVICES=%s" % gpu_list)

    for key, val in vars(args).items():
        logger.info("{:16} {}".format(key, val))

    logger.info("***********************config infos**********************")
    for key, val in vars(cfg).items():
        logger.info("{:16} {}".format(key, val))

    # log tensorboard
    if get_rank() == 0:
        tb_log = SummaryWriter(
            log_dir=os.path.join(root_result_dir, "tensorboard"))
    else:
        tb_log = None

    train_loader, test_loader = create_dataloader(logger)

    optimizer = create_optimizer(model)

    # load a checkpoint if one was provided
    start_epoch = it = best_res = 0
    last_epoch = -1
    if args.ckpt is not None:
        pure_model = model.module if isinstance(
            model, (torch.nn.DataParallel,
                    torch.nn.parallel.DistributedDataParallel)) else model
        it, start_epoch, best_res = load_checkpoint(pure_model, optimizer,
                                                    args.ckpt, logger)
        last_epoch = start_epoch + 1

    lr_scheduler = create_scheduler(model,
                                    optimizer,
                                    total_steps=len(train_loader) *
                                    args.epochs,
                                    last_epoch=last_epoch)

    if cfg.DATASET.DF_USED:
        criterion = Total_loss(boundary=cfg.DATASET.BOUNDARY)
    else:
        criterion = nn.CrossEntropyLoss()

    # start training
    logger.info('**********************Start training**********************')
    ckpt_dir = os.path.join(root_result_dir, "ckpt")
    os.makedirs(ckpt_dir, exist_ok=True)
    trainer = train_utils.Trainer(model,
                                  model_fn=model_fn_decorator(),
                                  criterion=criterion,
                                  optimizer=optimizer,
                                  ckpt_dir=ckpt_dir,
                                  lr_scheduler=lr_scheduler,
                                  model_fn_eval=model_fn_decorator(),
                                  tb_log=tb_log,
                                  logger=logger,
                                  eval_frequency=1,
                                  grad_norm_clip=cfg.TRAIN.GRAD_NORM_CLIP,
                                  cfg=cfg)

    trainer.train(start_it=it,
                  start_epoch=start_epoch,
                  n_epochs=args.epochs,
                  train_loader=train_loader,
                  test_loader=test_loader,
                  ckpt_save_interval=args.ckpt_save_interval,
                  lr_scheduler_each_iter=False,
                  best_res=best_res)

    logger.info('**********************End training**********************')
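
# How a script built around this `train()` is typically launched (assumed command
# line; the original entry point and argument names are not shown, and `args`/`cfg`
# are presumed to be parsed at module level as the function body implies). The
# env:// init method expects MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE, which
# the launcher sets; --local_rank is passed to the script by torch.distributed.launch:
#
#   python -m torch.distributed.launch --nproc_per_node=4 train.py
#
if __name__ == '__main__':
    train()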