Example #1
def main(args: Namespace, cfg: AttrDict):
    setup_logging(__name__, output_dir=get_checkpoint_folder(cfg))

    # Extract the features if feature extraction is enabled
    if cfg.CLUSTERFIT.FEATURES.EXTRACT:

        # Automatic extraction cannot be used with more than 1 node: otherwise we
        # would have to run this script on several nodes and would thus end up with
        # several parallel clusterings of the features. Automatic extraction is only
        # a shortcut when running on a single node
        assert (cfg.DISTRIBUTED.NUM_NODES == 1
                ), "Automatic extraction can only work with 1 node"

        # Make sure to dump the features at the desired path
        cfg.CHECKPOINT.DIR = cfg.CLUSTERFIT.FEATURES.PATH
        cfg.CHECKPOINT.APPEND_DISTR_RUN_ID = False

        # Run the extraction of features
        set_env_vars(local_rank=0, node_id=0, cfg=cfg)
        logging.info("Setting seed....")
        set_seeds(cfg, args.node_id)
        launch_distributed(
            cfg,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )

    # Otherwise, set up the path manager explicitly (the feature extraction
    # branch above already does this through set_env_vars)
    else:
        setup_path_manager()

    cluster_features(cfg)
    shutdown_logging()
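
Every example in this listing shares the same preamble: build an AttrDict configuration and call set_env_vars once per process before launching any engine. Below is a minimal, self-contained sketch of that preamble; it reuses the Hydra compose pattern from the setup_pathmanager example further down, and the import paths as well as the quick_swav test config are assumptions, not something taken from the examples above.

from hydra import compose, initialize_config_module  # assumed import location (Hydra >= 1.1)

from vissl.config import AttrDict         # assumed location of AttrDict
from vissl.utils.env import set_env_vars  # assumed location of set_env_vars


def build_cfg_and_setup_env() -> AttrDict:
    # Compose the defaults with a small test config, exactly as setup_pathmanager does
    with initialize_config_module(config_module="vissl.config"):
        hydra_cfg = compose(
            "defaults",
            overrides=["config=test/integration_test/quick_swav"],
        )
    cfg = AttrDict(hydra_cfg).config
    # One call per process, before any training or feature extraction is launched
    set_env_vars(local_rank=0, node_id=0, cfg=cfg)
    return cfg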
Example #2
def main(args: Namespace, cfg: AttrDict):
    # setup logging
    setup_logging(__name__)

    # print the cfg
    print_cfg(cfg)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=cfg)

    output_dir = get_checkpoint_folder(cfg)

    assert cfg.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON, (
        "Feature eval mode is not ON. Can't run train_svm. "
        "Set config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON=True "
        "in your config or from command line.")
    extract_low_shot_features(args, cfg, output_dir)

    # Get the names of the layers for which features were extracted. If the user
    # doesn't specify the features to evaluate, we use the full model output and
    # freeze both head and trunk as a precaution.
    layers = get_trunk_output_feature_names(cfg.MODEL)
    if len(layers) == 0:
        layers = ["heads"]

    # train low shot svm for each layer.
    output = {}
    for layer in layers:
        results = train_svm_low_shot(cfg, output_dir, layer)
        output[layer] = results
    logging.info(f"Results: {output}")

    # close the logging streams including the filehandlers
    shutdown_logging()
Example #3
def train_sample_places_low_shot(
    low_shot_trainer: SVMLowShotTrainer,
    k_values: List[int],
    sample_inds: List[int],
    sample_num: int,
    output_dir: str,
    layername: str,
    cfg: AttrDict,
):
    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=cfg)

    for low_shot_kvalue in k_values:
        checkpoint_dir = f"{output_dir}/sample{sample_num}_k{low_shot_kvalue}"
        train_data = merge_features(checkpoint_dir, "train", layername)
        train_features = train_data["features"]
        train_targets = train_data["targets"]
        checkpoint_dir = f"{output_dir}/sample{sample_inds[0]}_k{k_values[0]}"
        test_data = merge_features(checkpoint_dir, "test", layername)
        test_features = test_data["features"]
        test_targets = test_data["targets"]
        low_shot_trainer.train(train_features, train_targets, sample_num,
                               low_shot_kvalue)
        low_shot_trainer.test(test_features, test_targets, sample_num,
                              low_shot_kvalue)
Example #4
def main(args: Namespace, config: AttrDict):
    # setup logging
    setup_logging(__name__)

    # print the configuration used
    print_cfg(config)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    # Extract the features if no path to already-extracted features is provided
    if not config.NEAREST_NEIGHBOR.FEATURES.PATH:
        launch_distributed(
            config,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )
        config.NEAREST_NEIGHBOR.FEATURES.PATH = get_checkpoint_folder(config)

    # Run KNN on all the extracted features
    run_knn_at_all_layers(config)

    # close the logging streams including the filehandlers
    shutdown_logging()
Example #5
def main(args: Namespace, config: AttrDict):
    # setup logging
    setup_logging(__name__)

    # print the configuration used
    print_cfg(config)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    # extract the features
    launch_distributed(
        config,
        args.node_id,
        engine_name="extract_features",
        hook_generator=default_hook_generator,
    )

    # Get the names of the features that we are extracting. If the user doesn't
    # specify the features to evaluate, we use the full model output and freeze
    # both head and trunk as a precaution.
    feat_names = get_trunk_output_feature_names(config.MODEL)
    if len(feat_names) == 0:
        feat_names = ["heads"]

    for layer in feat_names:
        top1, top5 = nearest_neighbor_test(config, layer_name=layer)
        logging.info(f"layer: {layer} Top1: {top1}, Top5: {top5}")
    # close the logging streams including the filehandlers
    shutdown_logging()
Example #6
def extract_clusters(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes the cluster assignment extraction workflow on one node
    """

    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")
    # print the environment info for the current node
    if local_rank == 0:
        current_env = os.environ.copy()
        print_system_env_info(current_env)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We set the CUDA device here as well as a safe solution for all downstream
    # `torch.cuda.current_device()` calls to return correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    # Build the SSL trainer to set up distributed training and then
    # extract the cluster assignments for all entries in the dataset
    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    cluster_assignments = trainer.extract_clusters()

    # Save the cluster assignments in the output folder
    if dist_rank == 0:
        ClusterAssignmentLoader.save_cluster_assignment(
            output_dir=get_checkpoint_folder(cfg),
            assignments=ClusterAssignment(
                config=cfg, cluster_assignments=cluster_assignments),
        )

    # close the logging streams including the file handlers
    logging.info("All Done!")
    shutdown_logging()
Example #7
def setup_pathmanager():
    """
    Setup PathManager. A bit hacky -- we use the #set_env_vars method to setup pathmanager
    and as such we need to create a dummy config, and dummy values for local_rank and node_id.
    """
    with initialize_config_module(config_module="vissl.config"):
        cfg = compose(
            "defaults",
            overrides=["config=test/integration_test/quick_swav"],
        )
    config = AttrDict(cfg).config
    set_env_vars(local_rank=0, node_id=0, cfg=config)
Example #8
def main(args: Namespace, config: AttrDict):
    # setup the logging
    setup_logging(__name__)

    # print the config
    print_cfg(config)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    instance_retrieval_test(args, config)
    # close the logging streams including the filehandlers
    shutdown_logging()
Example #9
def main(args: Namespace, config: AttrDict):
    config = validate_and_infer_config(config)
    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    # setup the logging
    checkpoint_folder = get_checkpoint_folder(config)
    setup_logging(__name__, output_dir=checkpoint_folder)

    # print the config
    print_cfg(config)

    instance_retrieval_test(args, config)
    # close the logging streams including the filehandlers
    shutdown_logging()
Example #10
def train_svm(cfg: AttrDict, output_dir: str, layername: str):
    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=cfg)

    # train the svm
    logging.info(f"Training SVM for layer: {layername}")
    trainer = SVMTrainer(cfg["SVM"], layer=layername, output_dir=output_dir)
    train_data = merge_features(output_dir, "train", layername)
    train_features, train_targets = train_data["features"], train_data["targets"]
    trainer.train(train_features, train_targets)

    # test the svm
    test_data = merge_features(output_dir, "test", layername)
    test_features, test_targets = test_data["features"], test_data["targets"]
    trainer.test(test_features, test_targets)
    logging.info("All Done!")
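
The train_svm helper above is usually driven once per extracted layer. A short sketch of that outer loop follows, combining get_trunk_output_feature_names as used in Example #2 with train_svm from this example; imports are omitted here as in the other examples, and the ["heads"] fallback simply mirrors the convention used throughout this listing.

def train_svms_for_all_layers(cfg: AttrDict, output_dir: str):
    # Layers for which features were extracted; fall back to the full model
    # output ("heads") when the user did not specify any, as in Example #2
    layers = get_trunk_output_feature_names(cfg.MODEL)
    if len(layers) == 0:
        layers = ["heads"]
    for layername in layers:
        train_svm(cfg, output_dir, layername)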
Example #11
def main(args: Namespace, config: AttrDict, node_id=0):
    config = validate_and_infer_config(config)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=node_id, cfg=config)

    # setup the logging
    checkpoint_folder = get_checkpoint_folder(config)
    setup_logging(__name__,
                  output_dir=checkpoint_folder,
                  rank=os.environ["RANK"])

    if (config.IMG_RETRIEVAL.USE_FEATURE_EXTRACTION_ENGINE
            and not config.IMG_RETRIEVAL.FEATURE_EXTRACTION_DIR):
        # extract the train/database features.
        config = adapt_train_database_extract_config(config, checkpoint_folder)

        logging.info("Beginning extract features for database set.")
        launch_distributed(
            config,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )

        # extract the query features.
        config = adapt_query_extract_config(config, checkpoint_folder)

        logging.info("Beginning extract features for query set.")

        launch_distributed(
            config,
            args.node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )

    # print the config
    print_cfg(config)

    instance_retrieval_test(args, config)
    logging.info(f"Performance time breakdown:\n{PERF_STATS.report_str()}")

    # close the logging streams including the filehandlers
    shutdown_logging()
Example #12
    def _test_synch_bn_pytorch_worker(gpu_id: int, world_size: int,
                                      group_size: int, sync_file: str):
        torch.cuda.set_device(gpu_id)
        init_distributed_on_file(world_size=world_size,
                                 gpu_id=gpu_id,
                                 sync_file=sync_file)

        config = AttrDict({
            "MODEL": {
                "SYNC_BN_CONFIG": {
                    "SYNC_BN_TYPE": "pytorch",
                    "GROUP_SIZE": group_size,
                }
            },
            "DISTRIBUTED": {
                "NUM_PROC_PER_NODE": world_size,
                "NUM_NODES": 1,
                "NCCL_DEBUG": False,
                "NCCL_SOCKET_NTHREADS": 4,
            },
        })
        set_env_vars(local_rank=gpu_id, node_id=0, cfg=config)

        channels = 8
        model = nn.Sequential(
            nn.BatchNorm2d(num_features=channels),
            nn.AdaptiveAvgPool2d(output_size=(1, 1)),
        )
        model = convert_sync_bn(config, model).cuda(gpu_id)
        model = DistributedDataParallel(model, device_ids=[gpu_id])
        x = torch.full(size=(5, channels, 4, 4), fill_value=float(gpu_id))
        model(x)
        running_mean = model.module[0].running_mean.cpu()
        print(gpu_id, running_mean)
        if group_size == 1:
            if gpu_id == 0:
                assert torch.allclose(running_mean,
                                      torch.full(size=(8, ), fill_value=0.0))
            elif gpu_id == 1:
                assert torch.allclose(running_mean,
                                      torch.full(size=(8, ), fill_value=0.1))
        else:
            if gpu_id in {0, 1}:
                assert torch.allclose(running_mean,
                                      torch.full(size=(8, ), fill_value=0.05))
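
The worker above is meant to be launched once per GPU. A hypothetical driver is sketched below; it assumes two GPUs are available, that the worker is importable (for instance as a module-level function or a staticmethod on the test class), and that init_distributed_on_file only needs an empty file to rendezvous on.

import tempfile

import torch

if __name__ == "__main__":
    # Empty temporary file used as the file:// rendezvous point
    sync_file = tempfile.NamedTemporaryFile(delete=False).name
    # Spawn one process per GPU; each receives (gpu_id, world_size, group_size, sync_file)
    torch.multiprocessing.spawn(
        _test_synch_bn_pytorch_worker,
        args=(2, 1, sync_file),  # world_size=2, group_size=1 -> per-GPU BN statistics
        nprocs=2,
    )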
Example #13
def extract_features_and_run_knn(node_id: int, config: AttrDict):
    setup_logging(__name__)
    print_cfg(config)
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    # Extract the features if no path to already-extracted features is provided
    if not config.NEAREST_NEIGHBOR.FEATURES.PATH:
        launch_distributed(
            config,
            node_id,
            engine_name="extract_features",
            hook_generator=default_hook_generator,
        )
        config.NEAREST_NEIGHBOR.FEATURES.PATH = get_checkpoint_folder(config)

    # Run KNN on all the extracted features
    run_knn_at_all_layers(config)

    # close the logging streams including the file handlers
    shutdown_logging()
Example #14
def train_svm(cfg: AttrDict, output_dir: str, layername: str):
    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=cfg)
    features_dir = cfg.SVM_FEATURES_PATH

    # train the svm
    logging.info(f"Training SVM for layer: {layername}")
    trainer = SVMTrainer(cfg["SVM"], layer=layername, output_dir=output_dir)
    train_data = ExtractedFeaturesLoader.load_features(features_dir,
                                                       "train",
                                                       layername,
                                                       flatten_features=True)
    trainer.train(train_data["features"], train_data["targets"])

    # test the svm
    test_data = ExtractedFeaturesLoader.load_features(features_dir,
                                                      "test",
                                                      layername,
                                                      flatten_features=True)
    trainer.test(test_data["features"], test_data["targets"])
    logging.info("All Done!")
Example #15
def main(args: Namespace, config: AttrDict):
    # setup logging
    setup_logging(__name__)

    # print the configuration used
    print_cfg(config)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=config)

    # extract the label predictions on the test set
    launch_distributed(
        config,
        args.node_id,
        engine_name="extract_label_predictions",
        hook_generator=default_hook_generator,
    )

    geolocalization_test(config)

    # close the logging streams including the filehandlers
    shutdown_logging()
Example #16
def main(args: Namespace, cfg: AttrDict):
    # setup logging
    setup_logging(__name__)

    # setup the environment variables
    set_env_vars(local_rank=0, node_id=0, cfg=cfg)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, args.node_id)

    # extract the features; feature extraction is enabled for this run as well
    launch_distributed(
        cfg,
        args.node_id,
        engine_name="extract_features",
        hook_generator=default_hook_generator,
    )

    # cluster the extracted features
    cluster_features_and_label(args, cfg)
    # close the logging streams including the filehandlers
    shutdown_logging()
Example #17
def launch_distributed(
    cfg: AttrDict,
    node_id: int,
    engine_name: str,
    hook_generator: Callable[[Any], List[ClassyHook]],
):
    """
    Launch the distributed training across gpus of the current node according to the cfg.

    If more than 1 node is needed for training, this function should be called on each
    of the different nodes, each time with a unique node_id in the range [0..N-1], where N
    is the total number of nodes taking part in the training.

    Alternatively, you can use SLURM or any cluster management system to run this function
    for you.

    Configures the node_id and dist_run_id, and sets up the environment variables.

    Args:
        cfg (AttrDict): VISSL yaml configuration
        node_id (int): node_id for this node
        engine_name (str): what engine to run: train or extract_features
        hook_generator (Callable): Callback to generate all the ClassyVision hooks
            for this engine
    """

    setup_logging(__name__)
    node_id = get_node_id(node_id)
    dist_run_id = get_dist_run_id(cfg, cfg.DISTRIBUTED.NUM_NODES)
    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    set_env_vars(local_rank=0, node_id=node_id, cfg=cfg)
    _copy_to_local(cfg)

    # given the checkpoint folder, we check that there's not already a final checkpoint
    checkpoint_folder = get_checkpoint_folder(cfg)
    if is_training_finished(cfg, checkpoint_folder=checkpoint_folder):
        logging.info(f"Training already succeeded on node: {node_id}, exiting.")
        return

    # Get the checkpoint where to load from. The load_checkpoints function will
    # automatically take care of detecting whether it's a resume or not.
    symlink_checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    if cfg.CHECKPOINT.USE_SYMLINK_CHECKPOINT_FOR_RESUME and PathManager.exists(
        symlink_checkpoint_path
    ):
        checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    else:
        checkpoint_path = get_resume_checkpoint(
            cfg, checkpoint_folder=checkpoint_folder
        )

    try:
        if world_size > 1:
            torch.multiprocessing.spawn(
                _distributed_worker,
                nprocs=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
                args=(
                    cfg,
                    node_id,
                    dist_run_id,
                    engine_name,
                    checkpoint_path,
                    checkpoint_folder,
                    hook_generator,
                ),
                daemon=False,
            )
        else:
            _distributed_worker(
                local_rank=0,
                cfg=cfg,
                node_id=node_id,
                dist_run_id=dist_run_id,
                engine_name=engine_name,
                checkpoint_path=checkpoint_path,
                checkpoint_folder=checkpoint_folder,
                hook_generator=hook_generator,
            )

    except (KeyboardInterrupt, RuntimeError) as e:
        logging.error(f"Wrapping up, caught exception: {e}")
        if isinstance(e, RuntimeError):
            raise e
    finally:
        _cleanup_local_dir(cfg)

    logging.info("All Done!")
    shutdown_logging()
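
As the docstring above explains, a multi-node run simply calls launch_distributed once on every node with a unique node_id. A hypothetical per-node driver could look like the following (the "train" engine name is one of the two values the docstring mentions):

def launch_on_this_node(cfg: AttrDict, this_node_id: int):
    # Called once on each of the N nodes, with this_node_id unique in [0..N-1]
    launch_distributed(
        cfg,
        this_node_id,
        engine_name="train",  # or "extract_features"
        hook_generator=default_hook_generator,
    )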
Example #18
def extract_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes feature extraction workflow per machine.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss, meters etc
                        settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to specify
                           how the gpus are going to rendezvous. This requires specifying
                           the communication method: file, tcp and the unique rendezvous
                           run_id that is specific to 1 run.
                           We recommend:
                                1) for 1node: use init_method=tcp and run_id=auto
                                2) for multi-node, use init_method=tcp and specify
                                run_id={master_node}:{port}
        local_rank (int): id of the current device on the machine. If using gpus,
                        local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for multi-gpu
    """

    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")
    # print the environment info for the current node
    if local_rank == 0:
        current_env = os.environ.copy()
        print_system_env_info(current_env)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We set the CUDA device here as well as a safe solution for all downstream
    # `torch.cuda.current_device()` calls to return correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    features = trainer.extract()

    for split in features.keys():
        logging.info(f"============== Split: {split} =======================")
        for layer_name, layer_features in features[split].items():
            out_feat_file = os.path.join(
                checkpoint_folder,
                f"rank{dist_rank}_{split}_{layer_name}_features.npy")
            out_target_file = os.path.join(
                checkpoint_folder,
                f"rank{dist_rank}_{split}_{layer_name}_targets.npy")
            out_inds_file = os.path.join(
                checkpoint_folder,
                f"rank{dist_rank}_{split}_{layer_name}_inds.npy")
            feat_shape = layer_features["features"].shape
            logging.info(
                f"Saving extracted features of {layer_name} with shape {feat_shape} to: {out_feat_file}"
            )
            save_file(layer_features["features"], out_feat_file)
            logging.info(
                f"Saving extracted targets of {layer_name} to: {out_target_file}"
            )
            save_file(layer_features["targets"], out_target_file)
            logging.info(
                f"Saving extracted indices of {layer_name} to: {out_inds_file}"
            )
            save_file(layer_features["inds"], out_inds_file)

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
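
For reference, the dist_run_id rendezvous values recommended in the docstring above could look like this; the host name and port below are placeholders, not values taken from the examples.

# Single node: set run_id to "auto" in the config and let VISSL resolve the rendezvous.
# Multi node:  set run_id to "{master_node}:{port}", for example:
multi_node_run_id = "node0.cluster.local:40050"  # host and port are made up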
Example #19
def extract_label_predictions_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes label predictions workflow per machine. Runs the
    model in eval mode only to extract the label predicted per class.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss, meters etc
                        settings relevant for the feature extraction.
        dist_run_id (str): For multi-gpu training with PyTorch, we have to specify
                           how the gpus are going to rendezvous. This requires specifying
                           the communication method: file, tcp and the unique rendezvous
                           run_id that is specific to 1 run.
                           We recommend:
                                1) for 1node: use init_method=tcp and run_id=auto
                                2) for multi-node, use init_method=tcp and specify
                                run_id={master_node}:{port}
        local_rank (int): id of the current device on the machine. If using gpus,
                        local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for multi-gpu
    """

    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    # setup the multiprocessing to be forkserver. See https://fb.quip.com/CphdAGUaM5Wf
    logging.info(
        f"Setting multiprocessing method: {cfg.MULTI_PROCESSING_METHOD}")
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We set the CUDA device here as well as a safe solution for all downstream
    # `torch.cuda.current_device()` calls to return correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    # print the environment info for the current node
    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")
    if local_rank == 0:
        current_env = os.environ.copy()
        print_system_env_info(current_env)
        print_cfg(cfg)
        logging.info(f"System config:\n{collect_env_info()}")

    # Identify the hooks to run for the extract label engine
    # TODO - we need to plug this better with the engine registry
    #  - we either need to use the global hooks registry
    #  - or we need to create specific hook registry by engine
    hooks = extract_label_hook_generator(cfg)

    trainer = SelfSupervisionTrainer(cfg, dist_run_id, hooks=hooks)
    trainer.extract(
        output_folder=cfg.EXTRACT_FEATURES.OUTPUT_DIR or checkpoint_folder,
        extract_features=False,
        extract_predictions=True,
    )

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
Example #20
def train_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_path: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
    hook_generator: Callable[[Any], List[ClassyHook]] = default_hook_generator,
):
    """
    Sets up and executes training workflow per machine.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss, meters etc
                        settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to specify
                           how the gpus are going to rendezvous. This requires specifying
                           the communication method: file, tcp and the unique rendezvous
                           run_id that is specific to 1 run.
                           We recommend:
                                1) for 1node: use init_method=tcp and run_id=auto
                                2) for multi-node, use init_method=tcp and specify
                                run_id={master_node}:{port}
        checkpoint_path (str): if the training is being resumed from a checkpoint, path to
                          the checkpoint. The tools/run_distributed_engines.py automatically
                          looks for the checkpoint in the checkpoint directory.
        checkpoint_folder (str): what directory to use for checkpointing. The
                          tools/run_distributed_engines.py creates the directory based on user
                          input in the yaml config file.
        local_rank (int): id of the current device on the machine. If using gpus,
                        local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for multi-gpu
        hook_generator (Callable): The utility function that prepares all the hooks that will
                         be used in training based on user selection. Some basic hooks are used
                         by default.
    """

    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")
    # print the environment info for the current node
    if local_rank == 0:
        current_env = os.environ.copy()
        print_system_env_info(current_env)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We set the CUDA device here as well as a safe solution for all downstream
    # `torch.cuda.current_device()` calls to return correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    # get the hooks - these hooks are executed per replica
    hooks = hook_generator(cfg)

    # build the SSL trainer. The trainer first prepares a "task" object which
    # acts as a container for various things needed in a training: datasets,
    # dataloader, optimizers, losses, hooks, etc. "Task" will also have information
    # about phases (train, test) both. The trainer then sets up distributed
    # training.
    trainer = SelfSupervisionTrainer(
        cfg, dist_run_id, checkpoint_path, checkpoint_folder, hooks
    )
    trainer.train()
    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
Example #21
def launch_distributed(
    cfg: AttrDict,
    node_id: int,
    engine_name: str,
    hook_generator: Callable[[Any], List[ClassyHook]],
):
    """
    Launch the distributed training across gpus, according to the cfg

    Args:
        cfg  -- VISSL yaml configuration
        node_id -- node_id for this node
        engine_name -- what engine to run: train or extract_features
        hook_generator -- Callback to generate all the ClassyVision hooks for this engine
    """
    node_id = get_node_id(node_id)
    dist_run_id = get_dist_run_id(cfg, cfg.DISTRIBUTED.NUM_NODES)
    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    set_env_vars(local_rank=0, node_id=node_id, cfg=cfg)
    copy_to_local(cfg)

    # given the checkpoint folder, we check that there's not already a final checkpoint
    checkpoint_folder = get_checkpoint_folder(cfg)
    if is_training_finished(cfg, checkpoint_folder=checkpoint_folder):
        logging.info(
            f"Training already succeeded on node: {node_id}, exiting.")
        return

    # Get the checkpoint where to load from. The load_checkpoints function will
    # automatically take care of detecting whether it's a resume or not.
    symlink_checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    if cfg.CHECKPOINT.USE_SYMLINK_CHECKPOINT_FOR_RESUME and PathManager.exists(
            symlink_checkpoint_path):
        checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    else:
        checkpoint_path = get_resume_checkpoint(
            cfg, checkpoint_folder=checkpoint_folder)

    try:
        if world_size > 1:
            torch.multiprocessing.spawn(
                _distributed_worker,
                nprocs=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
                args=(
                    cfg,
                    node_id,
                    dist_run_id,
                    engine_name,
                    checkpoint_path,
                    checkpoint_folder,
                    hook_generator,
                ),
                daemon=False,
            )
        else:
            _distributed_worker(
                local_rank=0,
                cfg=cfg,
                node_id=node_id,
                dist_run_id=dist_run_id,
                engine_name=engine_name,
                checkpoint_path=checkpoint_path,
                checkpoint_folder=checkpoint_folder,
                hook_generator=hook_generator,
            )

    except (KeyboardInterrupt, RuntimeError) as e:
        logging.error(f"Wrapping up, caught exception: {e}")
        if isinstance(e, RuntimeError):
            raise e
    finally:
        cleanup_local_dir(cfg)

    logging.info("All Done!")
Example #22
def extract_main(cfg: AttrDict,
                 dist_run_id: str,
                 local_rank: int = 0,
                 node_id: int = 0):
    """
    Sets up and executes feature extraction workflow per machine.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss, meters etc
                        settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to specify
                           how the gpus are going to rendezvous. This requires specifying
                           the communication method: file, tcp and the unique rendezvous
                           run_id that is specific to 1 run.
                           We recommend:
                                1) for 1node: use init_method=tcp and run_id=auto
                                2) for multi-node, use init_method=tcp and specify
                                run_id={master_node}:{port}
        local_rank (int): id of the current device on the machine. If using gpus,
                        local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for multi-gpu
    """

    # setup logging
    setup_logging(__name__)
    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg)

    # print the training settings and system settings
    local_rank, _ = get_machine_local_and_dist_rank()
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    output_dir = get_checkpoint_folder(cfg)
    trainer = SelfSupervisionTrainer(cfg, dist_run_id)
    features = trainer.extract()

    for split in features.keys():
        logging.info(f"============== Split: {split} =======================")
        layers = features[split].keys()
        for layer in layers:
            out_feat_file = (
                f"{output_dir}/rank{local_rank}_{split}_{layer}_features.npy")
            out_target_file = (
                f"{output_dir}/rank{local_rank}_{split}_{layer}_targets.npy")
            out_inds_file = f"{output_dir}/rank{local_rank}_{split}_{layer}_inds.npy"
            logging.info("Saving extracted features: {} {} to: {}".format(
                layer, features[split][layer]["features"].shape,
                out_feat_file))
            save_file(features[split][layer]["features"], out_feat_file)
            logging.info("Saving extracted targets: {} to: {}".format(
                features[split][layer]["targets"].shape, out_target_file))
            save_file(features[split][layer]["targets"], out_target_file)
            logging.info("Saving extracted indices: {} to: {}".format(
                features[split][layer]["inds"].shape, out_inds_file))
            save_file(features[split][layer]["inds"], out_inds_file)
    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
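
The per-rank files written above (rank{N}_{split}_{layer}_features.npy, _targets.npy, _inds.npy) are what the SVM examples read back. A hedged sketch of the two read paths that appear in this listing follows: merge_features as used in Examples #3 and #10, and ExtractedFeaturesLoader.load_features as used in Example #14 (imports omitted, as in the other examples).

def load_extracted_split(output_dir: str, split: str, layer: str):
    # Concatenate the per-rank .npy shards for one split/layer, as the SVM examples do
    data = merge_features(output_dir, split, layer)
    return data["features"], data["targets"]


def load_extracted_split_flat(features_dir: str, split: str, layer: str):
    # Newer loader from Example #14: also flattens the features for SVM training
    data = ExtractedFeaturesLoader.load_features(
        features_dir, split, layer, flatten_features=True
    )
    return data["features"], data["targets"]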
Example #23
def extract_features_main(
    cfg: AttrDict,
    dist_run_id: str,
    checkpoint_folder: str,
    local_rank: int = 0,
    node_id: int = 0,
):
    """
    Sets up and executes feature extraction workflow per machine.

    Args:
        cfg (AttrDict): user specified input config that has optimizer, loss, meters etc
                        settings relevant to the training
        dist_run_id (str): For multi-gpu training with PyTorch, we have to specify
                           how the gpus are going to rendezvous. This requires specifying
                           the communication method: file, tcp and the unique rendezvous
                           run_id that is specific to 1 run.
                           We recommend:
                                1) for 1node: use init_method=tcp and run_id=auto
                                2) for multi-node, use init_method=tcp and specify
                                run_id={master_node}:{port}
        checkpoint_folder (str): what directory to use for checkpointing. This folder
                                 will be used to output the extracted features as well
                                 in case config.EXTRACT_FEATURES.OUTPUT_DIR is not set
        local_rank (int): id of the current device on the machine. If using gpus,
                        local_rank = gpu number on the current machine
        node_id (int): id of the current machine. starts from 0. valid for multi-gpu
    """

    # setup the environment variables
    set_env_vars(local_rank, node_id, cfg)
    dist_rank = int(os.environ["RANK"])

    # setup logging
    setup_logging(__name__, output_dir=checkpoint_folder, rank=dist_rank)

    logging.info(f"Env set for rank: {local_rank}, dist_rank: {dist_rank}")
    # print the environment info for the current node
    if local_rank == 0:
        current_env = os.environ.copy()
        print_system_env_info(current_env)

    # setup the multiprocessing to be forkserver.
    # See https://fb.quip.com/CphdAGUaM5Wf
    setup_multiprocessing_method(cfg.MULTI_PROCESSING_METHOD)

    # set seeds
    logging.info("Setting seed....")
    set_seeds(cfg, dist_rank)

    # We set the CUDA device here as well as a safe solution for all downstream
    # `torch.cuda.current_device()` calls to return correct device.
    if cfg.MACHINE.DEVICE == "gpu" and torch.cuda.is_available():
        local_rank, _ = get_machine_local_and_dist_rank()
        torch.cuda.set_device(local_rank)

    # print the training settings and system settings
    if local_rank == 0:
        print_cfg(cfg)
        logging.info("System config:\n{}".format(collect_env_info()))

    # Identify the hooks to run for the extract features engine
    # TODO - we need to plug this better with the engine registry
    #  - we either need to use the global hooks registry
    #  - or we need to create specific hook registry by engine
    hooks = extract_features_hook_generator(cfg)

    # Run the feature extraction
    trainer = SelfSupervisionTrainer(cfg, dist_run_id, hooks=hooks)
    output_dir = cfg.EXTRACT_FEATURES.OUTPUT_DIR or checkpoint_folder
    trainer.extract(
        output_folder=cfg.EXTRACT_FEATURES.OUTPUT_DIR or checkpoint_folder,
        extract_features=True,
        extract_predictions=False,
    )

    # TODO (prigoyal): merge this function with _extract_features
    if dist_rank == 0 and cfg.EXTRACT_FEATURES.MAP_FEATURES_TO_IMG_NAME:
        # Get the names of the layers for which features were extracted. If the user
        # doesn't specify the features to evaluate, we use the full model output and
        # freeze both head and trunk as a precaution.
        layers = get_trunk_output_feature_names(cfg.MODEL)
        if len(layers) == 0:
            layers = ["heads"]
        available_splits = [
            item.lower() for item in trainer.task.available_splits
        ]
        for split in available_splits:
            image_paths = trainer.task.datasets[split].get_image_paths()[0]
            for layer in layers:
                ExtractedFeaturesLoader.map_features_to_img_filepath(
                    image_paths=image_paths,
                    input_dir=output_dir,
                    split=split,
                    layer=layer,
                )

    logging.info("All Done!")
    # close the logging streams including the filehandlers
    shutdown_logging()
Example #24
def launch_distributed(
    cfg: AttrDict,
    node_id: int,
    engine_name: str,
    hook_generator: Callable[[Any], List[ClassyHook]],
):
    """
    Launch the distributed training across gpus of the current node according to the cfg.

    If more than 1 node is needed for training, this function should be called on each
    of the different nodes, each time with a unique node_id in the range [0..N-1], where N
    is the total number of nodes taking part in the training.

    Alternatively, you can use SLURM or any cluster management system to run this function
    for you.

    Configures the node_id and dist_run_id, and sets up the environment variables.

    Args:
        cfg (AttrDict): VISSL yaml configuration
        node_id (int): node_id for this node
        engine_name (str): what engine to run: train or extract_features
        hook_generator (Callable): Callback to generate all the ClassyVision hooks
            for this engine
    """

    setup_logging(__name__)
    node_id = get_node_id(node_id)
    dist_run_id = get_dist_run_id(cfg, cfg.DISTRIBUTED.NUM_NODES)

    # If using gpus, check that the user has not requested more gpus than are available on this system.
    if cfg.MACHINE.DEVICE == "gpu":
        assert cfg.DISTRIBUTED.NUM_PROC_PER_NODE <= torch.cuda.device_count(
        ), (f"User system doesn't have requested {cfg.DISTRIBUTED.NUM_PROC_PER_NODE} gpus "
            f"available. Number of gpus found on user system={torch.cuda.device_count()}. "
            "Please set the DISTRIBUTED.NUM_PROC_PER_NODE properly.")

    # set the environment variables including local rank, node id etc.
    set_env_vars(local_rank=0, node_id=node_id, cfg=cfg)

    # given the checkpoint folder, we check whether a final checkpoint already exists
    # and whether the user has chosen to override/ignore an existing final checkpoint
    checkpoint_folder = get_checkpoint_folder(cfg)
    if is_training_finished(cfg, checkpoint_folder=checkpoint_folder):
        logging.info(
            f"Training already succeeded on node: {node_id}, exiting.")
        return

    # Get the checkpoint where to resume from. The get_resume_checkpoint function will
    # automatically take care of detecting whether it's a resume or not.
    symlink_checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    if cfg.CHECKPOINT.USE_SYMLINK_CHECKPOINT_FOR_RESUME and g_pathmgr.exists(
            symlink_checkpoint_path):
        checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    else:
        checkpoint_path = get_resume_checkpoint(
            cfg, checkpoint_folder=checkpoint_folder)

    # assert that if the user set the PARAMS_FILE, it must exist and be valid.
    # we only use the PARAMS_FILE init if the checkpoint doesn't exist for the
    # given training. This ensures that if the same training resumes, then it
    # resumes from the checkpoint and not the weight init
    if checkpoint_path is None and cfg["MODEL"]["WEIGHTS_INIT"]["PARAMS_FILE"]:
        params_file = cfg["MODEL"]["WEIGHTS_INIT"]["PARAMS_FILE"]
        error_message = f"Specified PARAMS_FILE does NOT exist: {params_file}"
        assert g_pathmgr.exists(params_file), error_message

    # copy the data to local if user wants. This can speed up dataloading.
    _copy_to_local(cfg)

    try:
        torch.multiprocessing.spawn(
            _distributed_worker,
            nprocs=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
            args=(
                cfg,
                node_id,
                dist_run_id,
                engine_name,
                checkpoint_path,
                checkpoint_folder,
                hook_generator,
            ),
            daemon=False,
        )
    except (KeyboardInterrupt, RuntimeError) as e:
        logging.error(f"Wrapping up, caught exception: {e}")
        if isinstance(e, RuntimeError):
            raise e
    finally:
        _cleanup_local_dir(cfg)

    logging.info("All Done!")
    shutdown_logging()