Code Example #1
def dist_is_initialized():
    if dist.is_available():
        if dist.is_initialized():
            return True
    return False
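This guard pattern lets the same code run with and without torch.distributed. A minimal usage sketch (illustrative only; the averaging helper below is not part of the quoted example):

import torch
import torch.distributed as dist

def average_across_workers(t: torch.Tensor) -> torch.Tensor:
    # Only issue the collective when a process group actually exists,
    # so single-process runs take the no-op path.
    if dist_is_initialized():
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
        t /= dist.get_world_size()
    return t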
Code Example #2
def get_rank() -> int:
    if not dist.is_available():
        return 0
    if not dist.is_initialized():
        return 0
    return dist.get_rank()
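Normalizing the rank to 0 when torch.distributed is unavailable or uninitialized keeps rank-dependent side effects correct in single-process runs. A hedged usage sketch (the checkpoint helper and path are placeholders):

import torch

def save_checkpoint(model, path="checkpoint.pt"):
    # Only the (possibly sole) rank-0 process writes to disk.
    if get_rank() == 0:
        torch.save(model.state_dict(), path)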
Code Example #3
File: run_test.py  Project: garymm/pytorch
def get_selected_tests(options):
    # First, make sure options for running specific test cases are processed.
    if options.run_specified_test_cases:
        if options.use_specified_test_cases_by == "include":
            options.include = list(SPECIFIED_TEST_CASES_DICT.keys())
        elif options.use_specified_test_cases_by == "bring-to-front":
            options.bring_to_front = list(SPECIFIED_TEST_CASES_DICT.keys())

    selected_tests = options.include

    # filter if the JIT-only or distributed-only test options are set
    if options.jit:
        selected_tests = list(
            filter(lambda test_name: "jit" in test_name, selected_tests))

    if options.distributed_tests:
        selected_tests = list(
            filter(lambda test_name: test_name in DISTRIBUTED_TESTS,
                   selected_tests))

    # Filter to only run core tests when --core option is specified
    if options.core:
        selected_tests = list(
            filter(lambda test_name: test_name in CORE_TEST_LIST,
                   selected_tests))

    # process reordering
    if options.bring_to_front:
        to_front = set(options.bring_to_front)
        selected_tests = options.bring_to_front + list(
            filter(lambda name: name not in to_front, selected_tests))

    if options.first:
        first_index = find_test_index(options.first, selected_tests)
        selected_tests = selected_tests[first_index:]

    if options.last:
        last_index = find_test_index(options.last,
                                     selected_tests,
                                     find_last_index=True)
        selected_tests = selected_tests[:last_index + 1]

    # process exclusion
    if options.exclude_jit_executor:
        options.exclude.extend(JIT_EXECUTOR_TESTS)

    if options.exclude_distributed_tests:
        options.exclude.extend(DISTRIBUTED_TESTS)

    selected_tests = exclude_tests(options.exclude, selected_tests)

    if sys.platform == "win32" and not options.ignore_win_blocklist:
        target_arch = os.environ.get("VSCMD_ARG_TGT_ARCH")
        if target_arch != "x64":
            WINDOWS_BLOCKLIST.append("cpp_extensions_aot_no_ninja")
            WINDOWS_BLOCKLIST.append("cpp_extensions_aot_ninja")
            WINDOWS_BLOCKLIST.append("cpp_extensions_jit")
            WINDOWS_BLOCKLIST.append("jit")
            WINDOWS_BLOCKLIST.append("jit_fuser")

        # These exclusions are caused by this issue: https://github.com/pytorch/pytorch/issues/69460
        # The code below should be removed once that issue is solved
        if torch.version.cuda is not None and LooseVersion(
                torch.version.cuda) >= "11.5":
            WINDOWS_BLOCKLIST.append("test_cpp_extensions_aot")
            WINDOWS_BLOCKLIST.append("test_cpp_extensions_aot_ninja")
            WINDOWS_BLOCKLIST.append("test_cpp_extensions_aot_no_ninja")

        selected_tests = exclude_tests(WINDOWS_BLOCKLIST, selected_tests,
                                       "on Windows")

    elif TEST_WITH_ROCM:
        selected_tests = exclude_tests(ROCM_BLOCKLIST, selected_tests,
                                       "on ROCm")

    # sharding
    if options.shard:
        assert len(options.shard) == 2, "Unexpected shard format"
        assert min(options.shard) > 0, "Shards must be positive numbers"
        which_shard, num_shards = options.shard
        assert (
            which_shard <= num_shards
        ), "Selected shard must be less than or equal to total number of shards"
        assert num_shards <= len(
            selected_tests
        ), f"Number of shards must be less than or equal to {len(selected_tests)}"
        # TODO: fix this to use test_times_filename, but currently this is not working
        # because setting the export arg immediately halts the test execution.
        selected_tests = get_shard_based_on_S3(which_shard, num_shards,
                                               selected_tests, TEST_TIMES_FILE)

    # skip all distributed tests if distributed package is not available.
    if not dist.is_available():
        selected_tests = exclude_tests(
            DISTRIBUTED_TESTS, selected_tests,
            "PyTorch is built without distributed support.")

    # skip tests that require LAPACK when it's not available
    if not torch._C.has_lapack:
        selected_tests = exclude_tests(
            TESTS_REQUIRING_LAPACK, selected_tests,
            "PyTorch is built without LAPACK support.")

    return selected_tests
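The --shard branch above delegates the actual split to get_shard_based_on_S3, which balances shards using recorded test times. As a rough sketch of the contract only (a plain round-robin split with a 1-indexed shard id, not the time-balanced implementation):

def naive_shard(which_shard, num_shards, tests):
    # which_shard is 1-indexed, matching the asserts in get_selected_tests
    return [t for i, t in enumerate(tests) if i % num_shards == which_shard - 1]

For example, naive_shard(1, 2, ["test_a", "test_b", "test_c"]) returns ["test_a", "test_c"].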
Code Example #4
    'distributed/_pipeline/sync/test_deferred_batch_norm',
    'distributed/_pipeline/sync/test_dependency',
    'distributed/_pipeline/sync/test_inplace',
    'distributed/_pipeline/sync/test_microbatch',
    'distributed/_pipeline/sync/test_phony',
    'distributed/_pipeline/sync/test_pipe',
    'distributed/_pipeline/sync/test_pipeline',
    'distributed/_pipeline/sync/test_stream',
    'distributed/_pipeline/sync/test_transparency',
    'distributed/_pipeline/sync/test_worker',
]
_DEP_MODULES_CACHE: Dict[str, set] = {}

DISTRIBUTED_TESTS_CONFIG = {}

if dist.is_available():
    DISTRIBUTED_TESTS_CONFIG['test'] = {'WORLD_SIZE': '1'}
    if not TEST_WITH_ROCM and dist.is_mpi_available():
        DISTRIBUTED_TESTS_CONFIG['mpi'] = {
            'WORLD_SIZE': '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-mpi'
        }
    if dist.is_nccl_available():
        DISTRIBUTED_TESTS_CONFIG['nccl'] = {
            'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-nccl'
        }
    if dist.is_gloo_available():
        DISTRIBUTED_TESTS_CONFIG['gloo'] = {
            'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-gloo'
Code Example #5
    def __init__(
        self,
        config: ConfigSchema,
        model: Optional[MultiRelationEmbedder] = None,
        trainer: Optional[AbstractBatchProcessor] = None,
        evaluator: Optional[AbstractBatchProcessor] = None,
        rank: Rank = RANK_ZERO,
        subprocess_init: Optional[Callable[[], None]] = None,
    ):
        """Each epoch/pass, for each partition pair, loads in embeddings and edgelist
        from disk, runs HOGWILD training on them, and writes partitions back to disk.
        """
        tag_logs_with_process_name(f"Trainer-{rank}")
        self.config = config
        if config.verbose > 0:
            import pprint
            pprint.PrettyPrinter().pprint(config.to_dict())

        logger.info("Loading entity counts...")
        entity_storage = ENTITY_STORAGES.make_instance(config.entity_path)
        entity_counts: Dict[str, List[int]] = {}
        for entity, econf in config.entities.items():
            entity_counts[entity] = []
            for part in range(econf.num_partitions):
                entity_counts[entity].append(
                    entity_storage.load_count(entity, part))

        # Figure out how many lhs and rhs partitions we need
        holder = self.holder = EmbeddingHolder(config)

        logger.debug(
            f"nparts {holder.nparts_lhs} {holder.nparts_rhs} "
            f"types {holder.lhs_partitioned_types} {holder.rhs_partitioned_types}"
        )

        # We know ahead of time that we will need 1-2 storages for each embedding type,
        # as well as the max size of this storage (num_entities x D).
        # We allocate these storages in advance in `embedding_storage_freelist`.
        # When we need storage for an entity type, we pop it from this free list,
        # and then add it back when we 'delete' the embedding table.
        embedding_storage_freelist: Dict[
            EntityName, Set[torch.FloatStorage]] = defaultdict(set)
        for entity_type, counts in entity_counts.items():
            max_count = max(counts)
            num_sides = (
                (1 if entity_type in holder.lhs_partitioned_types else 0) +
                (1 if entity_type in holder.rhs_partitioned_types else 0) +
                (1 if entity_type in (holder.lhs_unpartitioned_types |
                                      holder.rhs_unpartitioned_types) else 0))
            for _ in range(num_sides):
                embedding_storage_freelist[entity_type].add(
                    allocate_shared_tensor((max_count, config.dimension),
                                           dtype=torch.float).storage())

        # create the handlers, threads, etc. for distributed training
        if config.num_machines > 1 or config.num_partition_servers > 0:
            if not 0 <= rank < config.num_machines:
                raise RuntimeError("Invalid rank for trainer")
            if not td.is_available():
                raise RuntimeError(
                    "The installed PyTorch version doesn't provide "
                    "distributed training capabilities.")
            ranks = ProcessRanks.from_num_invocations(
                config.num_machines, config.num_partition_servers)

            num_ps_groups = config.num_groups_for_partition_server
            groups: List[List[int]] = [ranks.trainers]  # barrier group
            groups += [ranks.trainers + ranks.partition_servers
                       ] * num_ps_groups  # ps groups
            group_idxs_for_partition_servers = range(1, len(groups))

            if rank == RANK_ZERO:
                logger.info("Setup lock server...")
                start_server(
                    LockServer(
                        num_clients=len(ranks.trainers),
                        nparts_lhs=holder.nparts_lhs,
                        nparts_rhs=holder.nparts_rhs,
                        entities_lhs=holder.lhs_partitioned_types,
                        entities_rhs=holder.rhs_partitioned_types,
                        entity_counts=entity_counts,
                        init_tree=config.distributed_tree_init_order,
                    ),
                    process_name="LockServer",
                    init_method=config.distributed_init_method,
                    world_size=ranks.world_size,
                    server_rank=ranks.lock_server,
                    groups=groups,
                    subprocess_init=subprocess_init,
                )

            self.bucket_scheduler = DistributedBucketScheduler(
                server_rank=ranks.lock_server,
                client_rank=ranks.trainers[rank],
            )

            logger.info("Setup param server...")
            start_server(
                ParameterServer(num_clients=len(ranks.trainers)),
                process_name=f"ParamS-{rank}",
                init_method=config.distributed_init_method,
                world_size=ranks.world_size,
                server_rank=ranks.parameter_servers[rank],
                groups=groups,
                subprocess_init=subprocess_init,
            )

            parameter_sharer = ParameterSharer(
                process_name=f"ParamC-{rank}",
                client_rank=ranks.parameter_clients[rank],
                all_server_ranks=ranks.parameter_servers,
                init_method=config.distributed_init_method,
                world_size=ranks.world_size,
                groups=groups,
                subprocess_init=subprocess_init,
            )

            if config.num_partition_servers == -1:
                start_server(
                    ParameterServer(
                        num_clients=len(ranks.trainers),
                        group_idxs=group_idxs_for_partition_servers,
                        log_stats=True,
                    ),
                    process_name=f"PartS-{rank}",
                    init_method=config.distributed_init_method,
                    world_size=ranks.world_size,
                    server_rank=ranks.partition_servers[rank],
                    groups=groups,
                    subprocess_init=subprocess_init,
                )

            groups = init_process_group(
                rank=ranks.trainers[rank],
                world_size=ranks.world_size,
                init_method=config.distributed_init_method,
                groups=groups,
            )
            trainer_group, *groups_for_partition_servers = groups
            self.barrier_group = trainer_group

            if len(ranks.partition_servers) > 0:
                partition_client = PartitionClient(
                    ranks.partition_servers,
                    groups=groups_for_partition_servers,
                    log_stats=True,
                )
            else:
                partition_client = None
        else:
            self.barrier_group = None
            self.bucket_scheduler = SingleMachineBucketScheduler(
                holder.nparts_lhs, holder.nparts_rhs, config.bucket_order)
            parameter_sharer = None
            partition_client = None
            hide_distributed_logging()

        # fork early for HOGWILD threads
        logger.info("Creating workers...")
        self.num_workers = get_num_workers(config.workers)
        self.pool = create_pool(
            self.num_workers,
            subprocess_name=f"TWorker-{rank}",
            subprocess_init=subprocess_init,
        )

        checkpoint_manager = CheckpointManager(
            config.checkpoint_path,
            rank=rank,
            num_machines=config.num_machines,
            partition_client=partition_client,
            subprocess_name=f"BackgRW-{rank}",
            subprocess_init=subprocess_init,
        )
        self.checkpoint_manager = checkpoint_manager
        checkpoint_manager.register_metadata_provider(
            ConfigMetadataProvider(config))
        if rank == 0:
            checkpoint_manager.write_config(config)

        num_edge_chunks = get_num_edge_chunks(config)

        self.iteration_manager = IterationManager(
            config.num_epochs,
            config.edge_paths,
            num_edge_chunks,
            iteration_idx=checkpoint_manager.checkpoint_version)
        checkpoint_manager.register_metadata_provider(self.iteration_manager)

        logger.info("Initializing global model...")
        if model is None:
            model = make_model(config)
        model.share_memory()
        if trainer is None:
            trainer = Trainer(
                model_optimizer=make_optimizer(config, model.parameters(),
                                               False),
                loss_fn=config.loss_fn,
                margin=config.margin,
                relations=config.relations,
            )
        if evaluator is None:
            evaluator = TrainingRankingEvaluator(
                override_num_batch_negs=config.eval_num_batch_negs,
                override_num_uniform_negs=config.eval_num_uniform_negs,
            )

        if config.init_path is not None:
            self.loadpath_manager = CheckpointManager(config.init_path)
        else:
            self.loadpath_manager = None

        # load model from checkpoint or loadpath, if available
        state_dict, optim_state = checkpoint_manager.maybe_read_model()
        if state_dict is None and self.loadpath_manager is not None:
            state_dict, optim_state = self.loadpath_manager.maybe_read_model()
        if state_dict is not None:
            model.load_state_dict(state_dict, strict=False)
        if optim_state is not None:
            trainer.model_optimizer.load_state_dict(optim_state)

        logger.debug("Loading unpartitioned entities...")
        for entity in holder.lhs_unpartitioned_types | holder.rhs_unpartitioned_types:
            count = entity_counts[entity][0]
            s = embedding_storage_freelist[entity].pop()
            embs = torch.FloatTensor(s).view(-1, config.dimension)[:count]
            embs, optimizer = self._load_embeddings(entity,
                                                    Partition(0),
                                                    out=embs)
            holder.unpartitioned_embeddings[entity] = embs
            trainer.unpartitioned_optimizers[entity] = optimizer

        # start communicating shared parameters with the parameter server
        if parameter_sharer is not None:
            shared_parameters: Set[int] = set()
            for name, param in model.named_parameters():
                if id(param) in shared_parameters:
                    continue
                shared_parameters.add(id(param))
                key = f"model.{name}"
                logger.info(
                    f"Adding {key} ({param.numel()} params) to parameter server"
                )
                parameter_sharer.set_param(key, param.data)
            for entity, embs in holder.unpartitioned_embeddings.items():
                key = f"entity.{entity}"
                logger.info(
                    f"Adding {key} ({embs.numel()} params) to parameter server"
                )
                parameter_sharer.set_param(key, embs.data)

        # store everything in self
        self.model = model
        self.trainer = trainer
        self.evaluator = evaluator
        self.rank = rank
        self.entity_counts = entity_counts
        self.embedding_storage_freelist = embedding_storage_freelist

        self.strict = False
Code Example #6
File: distributed.py  Project: EXYNOS-999/DeepMeMes
def is_dist_initialized():
    return dist.is_available() and dist.is_initialized()
Code Example #7
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="results",
        type=str,
        help="The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        default="config/bert_config.json",
        type=str,
        help="The config file which specifies the model details.",
    )
    parser.add_argument(
        "--no_cuda", action="store_true", help="Whether not to use CUDA when available"
    )
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help="Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="local_rank for distributed training on gpus",
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="random seed for initialization"
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=16,
        help="Number of workers in the dataloader.",
    )
    parser.add_argument(
        "--save_name", default="", type=str, help="save name for training."
    )
    parser.add_argument(
        "--use_chunk",
        default=0,
        type=float,
        help="whether to use chunking for parallel training.",
    )
    parser.add_argument(
        "--tasks", default="", type=str, help="1-2-3... training tasks separated by -"
    )
    parser.add_argument(
        "--in_memory",
        default=False,
        type=bool,
        help="whether to load the dataset in memory.",
    )
    parser.add_argument(
        "--baseline", action="store_true", help="whether to use the single-stream baseline."
    )
    parser.add_argument(
        "--zero_shot", action="store_true", help="whether to run zero-shot evaluation."
    )
    parser.add_argument("--split", default="", type=str, help="which split to use.")
    parser.add_argument("--batch_size", default=1, type=int, help="which split to use.")
    parser.add_argument(
        "--clean_train_sets",
        default=True,
        type=bool,
        help="whether to clean train sets for multi-task data.",
    )
    parser.add_argument(
        "--task_specific_tokens",
        action="store_true",
        help="whether to use task specific tokens for the multi-task learning.",
    )
    args = parser.parse_args()
    with open("vilbert_tasks.yml", "r") as f:
        task_cfg = edict(yaml.safe_load(f))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
    else:
        from vilbert.vilbert import BertConfig

    task_names = []
    for i, task_id in enumerate(args.tasks.split("-")):
        task = "TASK" + task_id
        name = task_cfg[task]["name"]
        task_names.append(name)

    # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0]
    if "/" in args.from_pretrained:
        timeStamp = args.from_pretrained.split("/")[1]
    else:
        timeStamp = args.from_pretrained

    savePath = os.path.join(args.output_dir, timeStamp)

    config = BertConfig.from_json_file(args.config_file)
    bert_weight_name = json.load(
        open("config/" + args.bert_model + "_weight_name.json", "r")
    )

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
        )
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            device, n_gpu, bool(args.local_rank != -1), args.fp16
        )
    )

    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu and not os.path.exists(savePath):
        os.makedirs(savePath)

    task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val = LoadDatasetEval(
        args, task_cfg, args.tasks.split("-")
    )

    num_labels = max([dataset.num_labels for dataset in task_datasets_val.values()])

    if args.task_specific_tokens:
        config.task_specific_tokens = True

    config.fast_mode = True
    if args.zero_shot:
        model = BertForMultiModalPreTraining.from_pretrained(
            args.from_pretrained, config
        )
    else:
        model = VILBertForVLTasks.from_pretrained(
            args.from_pretrained,
            config=config,
            num_labels=num_labels,
            default_gpu=default_gpu,
        )

    task_losses = LoadLosses(args, task_cfg, args.tasks.split("-"))
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)

    elif n_gpu > 1:
        model = nn.DataParallel(model)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    print("***** Running training *****")
    print("  Num Iters: ", task_num_iters)
    print("  Batch size: ", task_batch_size)

    model.eval()
    # when running evaluation, we run each task sequentially.
    for task_id in task_ids:
        results = []
        others = []

        score_matrix = np.zeros((5000, 1000))
        target_matrix = np.zeros((5000, 1000))
        rank_matrix = np.ones((5000)) * 1000
        count = 0

        for i, batch in enumerate(task_dataloader_val[task_id]):
            batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch)
            features, spatials, image_mask, question, input_mask, segment_ids, target, caption_idx, image_idx = (
                batch
            )
            task_tokens = (
                question.new().resize_(question.size(0), 1).fill_(int(task_id[4:]))
            )

            if task_id in ["TASK7", "TASK8"]:
                batch_size = features.size(0)
                features = features.squeeze(0)
                spatials = spatials.squeeze(0)
                image_mask = image_mask.squeeze(0)

            with torch.no_grad():
                if args.zero_shot:
                    _, _, vil_logit, _ = model(
                        question,
                        features,
                        spatials,
                        segment_ids,
                        input_mask,
                        image_mask,
                        task_ids=task_tokens,
                    )

                    score_matrix[
                        caption_idx, image_idx * 500 : (image_idx + 1) * 500
                    ] = (torch.softmax(vil_logit, dim=1)[:, 0].view(-1).cpu().numpy())
                    target_matrix[
                        caption_idx, image_idx * 500 : (image_idx + 1) * 500
                    ] = (target.view(-1).float().cpu().numpy())

                else:
                    _, _, vil_logit, _, _, _, _, _, _ = model(
                        question,
                        features,
                        spatials,
                        segment_ids,
                        input_mask,
                        image_mask,
                        task_ids=task_tokens,
                    )
                    score_matrix[
                        caption_idx, image_idx * 500 : (image_idx + 1) * 500
                    ] = (vil_logit.view(-1).cpu().numpy())
                    target_matrix[
                        caption_idx, image_idx * 500 : (image_idx + 1) * 500
                    ] = (target.view(-1).float().cpu().numpy())

                if image_idx.item() == 1:
                    rank = np.where(
                        (
                            np.argsort(-score_matrix[caption_idx])
                            == np.where(target_matrix[caption_idx] == 1)[0][0]
                        )
                        == 1
                    )[0][0]
                    rank_matrix[caption_idx] = rank

                    rank_matrix_tmp = rank_matrix[: caption_idx + 1]
                    r1 = 100.0 * np.sum(rank_matrix_tmp < 1) / len(rank_matrix_tmp)
                    r5 = 100.0 * np.sum(rank_matrix_tmp < 5) / len(rank_matrix_tmp)
                    r10 = 100.0 * np.sum(rank_matrix_tmp < 10) / len(rank_matrix_tmp)

                    medr = np.floor(np.median(rank_matrix_tmp) + 1)
                    meanr = np.mean(rank_matrix_tmp) + 1
                    print(
                        "%d Final r1:%.3f, r5:%.3f, r10:%.3f, medr:%.3f, meanr:%.3f"
                        % (count, r1, r5, r10, medr, meanr)
                    )

                    results.append(np.argsort(-score_matrix[caption_idx]).tolist()[:20])
            count += 1

        r1 = 100.0 * np.sum(rank_matrix < 1) / len(rank_matrix)
        r5 = 100.0 * np.sum(rank_matrix < 5) / len(rank_matrix)
        r10 = 100.0 * np.sum(rank_matrix < 10) / len(rank_matrix)

        medr = np.floor(np.median(rank_matrix) + 1)
        meanr = np.mean(rank_matrix) + 1

        print("************************************************")
        print(
            "Final r1:%.3f, r5:%.3f, r10:%.3f, medr:%.3f, meanr:%.3f"
            % (r1, r5, r10, medr, meanr)
        )
        print("************************************************")

        if args.split:
            json_path = os.path.join(savePath, args.split)
        else:
            json_path = os.path.join(savePath, task_cfg[task_id]["val_split"])
        json.dump(results, open(json_path + "_result.json", "w"))
        json.dump(others, open(json_path + "_others.json", "w"))
Code Example #8
def print_all(msg):
    if (not dist.is_available()):
        print(msg)
    elif dist.get_rank() % 8 == 0:
        print(f'{dist.get_rank()//8}: {msg}')
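The hard-coded 8 presumably corresponds to 8 GPUs per node, so only each node's first rank prints and the prefix is the node index. Note the original also calls dist.get_rank() whenever the package is available, which assumes the process group is already initialized. A slightly more defensive sketch with the node size made explicit (the parameter name is an assumption):

def print_per_node(msg, gpus_per_node=8):
    if not dist.is_available() or not dist.is_initialized():
        print(msg)
    elif dist.get_rank() % gpus_per_node == 0:
        print(f'{dist.get_rank() // gpus_per_node}: {msg}')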
Code Example #9
File: distributed.py  Project: Pandinosaurus/catalyst
def _is_torch_distributed_initialized() -> bool:
    """Checks if torch.distributed is available and initialized."""
    return dist.is_available() and dist.is_initialized()
Code Example #10
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="results",
        type=str,
        help="The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        default="config/bert_config.json",
        type=str,
        help="The config file which specifies the model details.",
    )
    parser.add_argument(
        "--no_cuda", action="store_true", help="Whether not to use CUDA when available"
    )
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help="Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument(
        "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument(
        "--num_workers", type=int, default=10, help="Number of workers in the dataloader."
    )
    parser.add_argument(
        "--save_name",
        default='',
        type=str,
        help="save name for training.", 
    )
    parser.add_argument(
        "--batch_size", default=1000, type=int, help="what is the batch size?"
    )
    parser.add_argument(
        "--tasks", default='', type=str, help="1-2-3... training tasks separated by -"
    )
    parser.add_argument(
        "--in_memory", default=False, type=bool, help="whether to load the dataset in memory."
    )
    parser.add_argument(
        "--baseline", action="store_true", help="whether to use the single-stream baseline."
    )
    parser.add_argument(
        "--split", default="", type=str, help="which split to use."
    )

    '''
    Thil: for the test split, change the yml file's eval split and ann file. If the cache file is outdated, erase it and rerun.
    '''

    args = parser.parse_args()
    with open('vlbert_tasks.yml', 'r') as f:
        task_cfg = edict(yaml.safe_load(f))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
        from vilbert.basebert import BaseBertForVLTasks     
    else:
        from vilbert.vilbert import BertConfig
        from vilbert.vilbert import VILBertForVLTasks

    task_names = []
    for i, task_id in enumerate(args.tasks.split('-')):
        task = 'TASK' + task_id
        name = task_cfg[task]['name']
        task_names.append(name)

    # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0]
    timeStamp = args.from_pretrained.split('/')[1] + '-' + args.save_name
    savePath = os.path.join(args.output_dir, timeStamp)

    config = BertConfig.from_json_file(args.config_file)
    bert_weight_name = json.load(open("config/" + args.bert_model + "_weight_name.json", "r"))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend="nccl")
    
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            device, n_gpu, bool(args.local_rank != -1), args.fp16
        )
    )
    
    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu and not os.path.exists(savePath):
        os.makedirs(savePath)

    task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val \
                        = LoadDatasetEval(args, task_cfg, args.tasks.split('-'))

    tbLogger = utils.tbLogger(timeStamp, savePath, task_names, task_ids, task_num_iters, 1, save_logger=False, txt_name='eval.txt')

    num_labels = max([dataset.num_labels for dataset in task_datasets_val.values()])

    if args.baseline:
        model = BaseBertForVLTasks.from_pretrained(
            args.from_pretrained, config, num_labels=num_labels, default_gpu=default_gpu
            )
    else:
        model = VILBertForVLTasks.from_pretrained(
            args.from_pretrained, config, num_labels=num_labels, default_gpu=default_gpu
            )

    task_losses = LoadLosses(args, task_cfg, args.tasks.split('-'))
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)

    elif n_gpu > 1:
        model = nn.DataParallel(model)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    print("  Num Iters: ", task_num_iters)
    print("  Batch size: ", task_batch_size)    

    model.eval()
    for task_id in task_ids:
        results = []
        others = []
        predictions = []
        for i, batch in enumerate(task_dataloader_val[task_id]):
            loss, score, batch_size, results, others, predictions = EvaluatingModel(args, task_cfg, device, \
                    task_id, batch, model, task_dataloader_val, task_losses, results, others, predictions)

            tbLogger.step_val(0, float(loss), float(score), task_id, batch_size, 'val')

            sys.stdout.write('%d/%d\r' % (i, len(task_dataloader_val[task_id])))
            sys.stdout.flush()
        # save the result or evaluate the result.
        ave_score = tbLogger.showLossVal()

        if args.split:
            json_path = os.path.join(savePath, args.split)           
        else:
            json_path = os.path.join(savePath, task_cfg[task_id]['val_split'])

        json.dump(results, open(json_path+ '_result.json', 'w'))
        json.dump(others, open(json_path+ '_others.json', 'w'))
        json.dump(predictions, open(json_path+ '_predictions.json', 'w'))
Code Example #11
def init_env(is_cuda: Union[bool, int] = True, is_benchmark: bool = False, is_train: bool = True,
             config_path: Optional[str] = None,
             experiments_root: str = "experiment", rand_seed: Union[bool, str, int] = False,
             cv2_num_threads: int = -1, verbosity: bool = True, log_stdout: Union[bool, str] = False,
             local_rank: Optional[int] = None, silence_non_master_rank: Optional[bool] = False) \
        -> Tuple[torch.device, Optional[Dict]]:
    """Init torch training environment

    Args:
        is_cuda (Union[bool, int]): If False, always use CPU. If True, use GPU and set GPU:0 as the default device. If
            int, set GPU:i as the default device.
        is_benchmark (bool): If True, set torch.backends.cudnn.benchmark = True
        is_train (bool): If False, disable grad
        config_path (Optional[str]): The path of yaml config
        experiments_root (str): The path where experiment results are stored
        rand_seed (Union[bool, str, int]): If True, fix the random seed of torch, numpy, and Python's random module
            from config.RAND_SEED. If False, don't fix the seed. If rand_seed is an int or str, fix the seed
            according to rand_seed. (Default: False)
        cv2_num_threads (int): Set cv2 threads num by cv2.setNumThreads(cv2_num_threads). If < 0,
            don't set. (Default: -1)
        verbosity (bool): If True, print detailed info
        log_stdout (Union[bool, str]): If True, stdout will be logged to the corresponding experiment dir. If False,
            stdout will not be logged. If log_stdout is a str, it will be treated as a path and stdout will be logged
            to that path. (Default: False)
        local_rank (Optional[int]): If not None, init the distributed parallel env with rank = local_rank and
            init_method = "env://". The default device will also be set to "cuda:local_rank". Make sure the
            environment is pre-set.
        silence_non_master_rank (bool): If True, non master rank's (rank > 0) print will be silenced. (Default: False)

    Returns: Default device and config (If config_path is None, config is None)
    """
    # * Set distributed, verbosity is delayed.
    if local_rank is not None:
        # ** Check distributed env
        if not dist.is_available():
            raise ValueError(
                f"local_rank = {local_rank} while torch.distributed is not available"
            )
        if not torch.cuda.is_available():
            raise ValueError(
                f"init_env only supports CUDA-device distributed training with the nccl backend. "
                f"local_rank = {local_rank} while torch.cuda is not available")

        # ** Set cuda device id
        if is_cuda is False:
            raise ValueError(
                f"When local_rank is set, cuda is needed. However, is_cuda = {is_cuda}"
            )
        if isinstance(is_cuda, int) and (is_cuda != local_rank):
            raise ValueError(
                f"local_rank = {local_rank} must be equal to is_cuda = {is_cuda}")
        is_cuda = local_rank

        # ** set device & init_process_group
        torch.cuda.set_device(local_rank)
        dist.init_process_group(backend='nccl',
                                init_method="env://",
                                rank=local_rank)

    # * Read CONFIG, verbosity is delayed.
    config = parse_config(
        config_path, experiments_root) if config_path is not None else None

    def get_stdout_log_dir_from_config():
        stdout_log_dir = osp.join(config.rslt_dir, 'stdout')

        return stdout_log_dir

    def get_stdout_log_file(stdout_log_dir):
        if local_rank is not None:
            prefix = '.' if dist.get_rank() > 0 else ''
            file_name = prefix + '-'.join(
                ['stdout', f"rank{dist.get_rank()}",
                 get_local_time_str()]) + '.log'
        else:
            file_name = '-'.join(
                ['stdout', get_local_time_str(for_file_name=True)]) + '.log'
        return osp.join(stdout_log_dir, file_name)

    # * Log stdout
    # ** Get log file
    if isinstance(log_stdout, bool):
        stdout_log_file = get_stdout_log_file(
            get_stdout_log_dir_from_config()) if log_stdout else None
    elif isinstance(log_stdout, str):
        stdout_log_file = get_stdout_log_file(log_stdout)
    else:
        raise ValueError(
            f"log_stdout: {log_stdout} should be bool or path str")
    # ** Set logger
    if stdout_log_file is not None:
        if local_rank is not None:
            silence = silence_non_master_rank and (dist.get_rank() > 0)
            Logger(stdout_log_file, real_time=True, silence=silence)
            if verbosity:
                print(f"Log stdout at {stdout_log_file}. Silence = {silence}")
        else:
            Logger(stdout_log_file, real_time=True)
            if verbosity:
                print(f"Log stdout at {stdout_log_file}")

    # * Welcome & Show system info
    if verbosity:
        welcome()
        print(f"Current working dir is {os.getcwd()}")
        print(f"Current python environment path is\n{sys.path}")
        print(f"Current Process Info: ")
        pprint(get_process_info())
        print("\n")

    # * Print distributed delaying verbosity
    if verbosity and (local_rank is not None):
        print(
            f"Using torch.distributed. Current cuda device id is set to local_rank = {local_rank}. \n"
            f"    Process Group Rank: {dist.get_rank()}\n"
            f"    World Size: {dist.get_world_size()}\n"
            f"    Local Rank: {local_rank}")

    # * Print config's delaying verbosity
    if verbosity and config is not None:
        print(
            "\033[32m------------------------------- CONFIG -------------------------------\033[0m"
        )
        pprint(dict(config))
        print(
            "\033[32m----------------------------- CONFIG END -----------------------------\033[0m"
        )
        print("\n")

    if verbosity:
        print(
            "\033[32m-------------------------------- INIT --------------------------------\033[0m"
        )

    # *  Get device
    if isinstance(is_cuda, bool):
        device = get_device(is_cuda, cuda_id=0, verbosity=verbosity)
    elif isinstance(is_cuda, int):
        device = get_device(is_cuda=True, cuda_id=is_cuda, verbosity=verbosity)
    else:
        raise ValueError(f"Parameter is_cuda = {is_cuda} must be bool or int")

    # * Set benchmark
    torch.backends.cudnn.benchmark = is_benchmark
    if verbosity:
        print(f"torch.backends.cudnn.benchmark = {is_benchmark}")

    # * Set cv2 threads num
    if cv2_num_threads >= 0:
        cv2.setNumThreads(cv2_num_threads)
        if verbosity:
            print(f"cv2.setNumThreads({cv2_num_threads})")

    # * If test, disable grad
    torch.set_grad_enabled(is_train)
    if verbosity:
        print(f"torch.set_grad_enabled({is_train})")

    def get_rand_seed_from_config():
        if 'rand_seed' not in config:
            raise ValueError("CONFIG doesn't have key rand_seed")
        return config.rand_seed

    # * Set rand seed
    if isinstance(rand_seed, bool):
        rand_seed_ = get_rand_seed_from_config() if rand_seed else None
    elif isinstance(rand_seed, int) or isinstance(rand_seed, str):
        rand_seed_ = rand_seed
    else:
        raise ValueError(f"rand_seed should be bool, int or str.")

    if rand_seed_ is not None:
        set_rand_seed(rand_seed_)
        if verbosity:
            print(f"Set rand seed {rand_seed_}")

    # * End of init
    if verbosity:
        print(
            "\033[32m------------------------------ INIT END ------------------------------\033[0m"
        )
        print("\n")

    # * Return
    return device, config
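A hypothetical call matching the signature above; the config path and seed are placeholders, and local_rank would normally come from the launcher for distributed runs:

device, config = init_env(
    is_cuda=True,
    is_train=True,
    config_path="experiment/configs/train.yml",  # placeholder path
    rand_seed=42,
    local_rank=None,  # set to the launcher-provided rank for DDP runs
)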
Code Example #12
    def __init__(self,
                 model_creator,
                 data_creator,
                 optimizer_creator,
                 loss_creator,
                 train_function=None,
                 validation_function=None,
                 initialization_hook=None,
                 config=None,
                 num_replicas=1,
                 use_gpu=False,
                 batch_size=16,
                 backend="auto"):
        """Sets up the PyTorch trainer.

        Args:
            model_creator (dict -> torch.nn.Module): creates the model
                using the config.
            data_creator (int, dict -> DataLoader, DataLoader): Function that
                takes in (batch_size, config) and returns two Torch DataLoader
                objects.
            optimizer_creator (torch.nn.Module, dict -> optimizer):
                creates the optimizer using the model and the config.
            loss_creator (dict -> loss): Creates the loss function/criterion
                using the config.
            train_function: Trains a model for an epoch. This takes in (
                model, train_dataloader, criterion, optimizer, config), and
                returns a dict of training stats.
            validation_function: Runs validation. This takes in (
                model, val_dataloader, criterion, config) and returns a dict of
                validation stats.
            config (dict): configuration passed to "model_creator",
                "data_creator", "optimizer_creator", and "loss_creator".
            num_replicas (int): the number of workers used in distributed
                training.
            use_gpu (bool): Sets resource allocation for workers to 1 GPU
                if true.
            batch_size (int): batch size for an update.
            backend (string): backend used by distributed PyTorch.
        """
        # TODO: add support for mixed precision
        # TODO: add support for callbacks
        if num_replicas > 1 and not dist.is_available():
            raise ValueError(
                ("Distributed PyTorch is not supported on macOS. "
                 "To run without distributed PyTorch, set 'num_replicas=1'. "
                 "For more information, see "
                 "https://github.com/pytorch/examples/issues/467."))

        self.model_creator = model_creator
        self.data_creator = data_creator
        self.train_function = train_function
        self.optimizer_creator = optimizer_creator
        self.loss_creator = loss_creator
        self.validation_function = validation_function
        self.initialization_hook = initialization_hook
        self.config = {} if config is None else config
        self.optimizer_timer = utils.TimerStat(window_size=1)

        if backend == "auto":
            backend = "nccl" if use_gpu else "gloo"

        logger.info("Using {} as backend.".format(backend))
        self.backend = backend
        self.use_gpu = use_gpu
        self.batch_size = batch_size
        self.max_replicas = num_replicas
        self.temp_dir = tempfile.mkdtemp(prefix="raysgd")
        self._num_failures = 0
        self._last_resize = float("-inf")
        self._start_workers(self.max_replicas)
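A sketch of the creator-function API described in the docstring above; the model, data, and hyperparameters are purely illustrative, and the trainer class name is assumed from context rather than shown in the snippet:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

def model_creator(config):
    return nn.Linear(1, 1)

def data_creator(batch_size, config):
    x = torch.randn(256, 1)
    ds = TensorDataset(x, 3 * x + 1)
    return DataLoader(ds, batch_size=batch_size), DataLoader(ds, batch_size=batch_size)

def optimizer_creator(model, config):
    return torch.optim.SGD(model.parameters(), lr=config.get("lr", 0.01))

def loss_creator(config):
    return nn.MSELoss()

trainer = PyTorchTrainer(  # class name assumed for the __init__ shown above
    model_creator, data_creator, optimizer_creator, loss_creator,
    num_replicas=1, batch_size=16)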
Code Example #13
File: utils.py  Project: xcmyz/BERT-RACE.2
def barrier():
    if dist.is_available() and dist.is_initialized():
        dist.barrier()
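A typical use of such a barrier helper is to let rank 0 perform a one-off setup step (download, cache build) while the other ranks wait; this sketch is illustrative and the prepare function is a placeholder:

def prepare_once(prepare_fn):
    is_worker = dist.is_available() and dist.is_initialized() and dist.get_rank() != 0
    if is_worker:
        barrier()    # wait until rank 0 has finished preparing
        return
    prepare_fn()     # rank 0 (or the only process) does the work
    barrier()        # release the waiting ranks (no-op when not distributed)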
Code Example #14
File: torch_trainer.py  Project: zivzone/ray
    def __init__(
        self,
        *,
        training_operator_cls,
        initialization_hook=None,
        config=None,
        num_workers=1,
        num_cpus_per_worker=1,
        use_gpu="auto",
        backend="auto",
        wrap_ddp=True,
        timeout_s=1800,
        use_fp16=False,
        use_tqdm=False,
        add_dist_sampler=True,
        scheduler_step_freq=None,
        use_local=False,
        # Deprecated Args.
        num_replicas=None,
        batch_size=None,
        model_creator=None,
        data_creator=None,
        optimizer_creator=None,
        scheduler_creator=None,
        loss_creator=None,
        serialize_data_creation=None,
        data_loader_args=None,
        apex_args=None,
    ):
        if (model_creator or data_creator or optimizer_creator
                or scheduler_creator or loss_creator):
            raise DeprecationWarning(
                "Creator functions are deprecated. You should create a "
                "custom TrainingOperator, override setup, and register all "
                "training state there. See TrainingOperator for more info. "
                "If you would still like to use creator functions, you can "
                "do CustomOperator = TrainingOperator.from_creators("
                "model_creator, ...) and pass in CustomOperator into "
                "TorchTrainer.")

        if use_local and log_once("use_local"):
            logger.warning("use_local is set to True. This could lead to "
                           "issues with Cuda devices. If you are seeing this "
                           "issue, try setting use_local to False. For more "
                           "information, see "
                           "https://github.com/ray-project/ray/issues/9202.")

        if num_workers > 1 and not dist.is_available():
            raise ValueError(
                ("Distributed PyTorch is not supported on macOS. "
                 "To run without distributed PyTorch, set 'num_workers=1'. "
                 "For more information, see "
                 "https://github.com/pytorch/examples/issues/467."))

        if num_replicas is not None:
            raise DeprecationWarning(
                "num_replicas is deprecated. Use num_workers instead.")

        if batch_size is not None:
            raise DeprecationWarning(
                "batch_size is deprecated. Use config={'batch_size': N} to "
                "specify a batch size for each worker or "
                "config={ray.util.sgd.utils.BATCH_SIZE: N} to specify a "
                "batch size to be used across all workers.")

        if apex_args is not None:
            raise DeprecationWarning(
                "apex_args is deprecated. Pass in apex_args when calling "
                "`register` in the `setup` method of your `TrainingOperator` "
                "instead.")

        if serialize_data_creation is True:
            if log_once("serialize_data_creation"):
                logging.warning(
                    "serialize_data_creation is deprecated and will be "
                    "ignored. If you require serialized data loading you "
                    "should implement this in TrainingOperator.setup. "
                    "You may find FileLock useful here.")

        if data_loader_args:
            raise DeprecationWarning(
                "data_loader_args is deprecated. You can return a "
                "torch.utils.data.DataLoader in data_creator. Ray will "
                "automatically set a DistributedSampler if a DataLoader is "
                "returned and num_workers > 1.")

        self.training_operator_cls = training_operator_cls

        self.initialization_hook = initialization_hook
        self.config = {} if config is None else config
        if use_gpu == "auto":
            use_gpu = torch.cuda.is_available()

        _remind_gpu_usage(use_gpu)

        if backend == "auto":
            backend = "nccl" if use_gpu else "gloo"

        if backend == "nccl":
            timeout_s = NCCL_TIMEOUT_S

        logger.debug(f"Using {backend} as backend.")
        self.backend = backend
        self.num_cpus_per_worker = num_cpus_per_worker
        self.use_gpu = use_gpu
        self.max_replicas = num_workers

        self.serialize_data_creation = serialize_data_creation
        self.wrap_ddp = wrap_ddp
        self.timeout_s = timeout_s
        self.use_fp16 = use_fp16
        self.use_tqdm = use_tqdm
        self.add_dist_sampler = add_dist_sampler
        self.use_local = use_local

        self.temp_dir = tempfile.mkdtemp(prefix="raysgd")
        self._num_failures = 0
        self._last_resize = float("-inf")

        if scheduler_step_freq:
            _validate_scheduler_step_freq(scheduler_step_freq)

        self.scheduler_step_freq = scheduler_step_freq

        if not ray.is_initialized() and self.max_replicas > 1:
            logger.info("Automatically initializing single-node Ray. To use "
                        "multi-node training, be sure to run `ray.init("
                        "address='auto')` before instantiating the Trainer.")
            ray.init()
        startup_success = self._start_workers(self.max_replicas)
        if not startup_success:
            raise RuntimeError("Worker startup failed. "
                               "Are you sure you have enough resources to "
                               "start the specified number of workers?")
Code Example #15
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="results",
        type=str,
        help=
        "The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        default="config/bert_config.json",
        type=str,
        help="The config file which specifies the model details.",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="local_rank for distributed training on gpus",
    )
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=16,
        help="Number of workers in the dataloader.",
    )
    parser.add_argument("--save_name",
                        default="",
                        type=str,
                        help="save name for training.")
    parser.add_argument(
        "--use_chunk",
        default=0,
        type=float,
        help="whether to use chunking for parallel training.",
    )
    parser.add_argument("--batch_size",
                        default=30,
                        type=int,
                        help="what is the batch size?")
    parser.add_argument("--tasks",
                        default="",
                        type=str,
                        help="1-2-3... training tasks separated by -")
    parser.add_argument(
        "--in_memory",
        default=False,
        type=bool,
        help="whether to load the dataset in memory.",
    )
    parser.add_argument("--baseline",
                        action="store_true",
                        help="whether to use the single-stream baseline.")
    parser.add_argument("--split",
                        default="",
                        type=str,
                        help="which split to use.")
    parser.add_argument(
        "--dynamic_attention",
        action="store_true",
        help="whether use dynamic attention.",
    )
    parser.add_argument(
        "--clean_train_sets",
        default=True,
        type=bool,
        help="whether clean train sets for multitask data.",
    )
    parser.add_argument(
        "--visual_target",
        default=0,
        type=int,
        help="which target to use for visual branch. \
        0: soft label, \
        1: regress the feature, \
        2: NCE loss.",
    )
    parser.add_argument(
        "--task_specific_tokens",
        action="store_true",
        help="whether to use task specific tokens for the multi-task learning.",
    )

    args = parser.parse_args()
    with open("vilbert_tasks.yml", "r") as f:
        task_cfg = edict(yaml.safe_load(f))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if args.baseline:
        from pytorch_transformers.modeling_bert import BertConfig
        from vilbert.basebert import BaseBertForVLTasks
    else:
        from vilbert.vilbert import BertConfig
        from vilbert.vilbert import VILBertForVLTasks

    task_names = []
    for i, task_id in enumerate(args.tasks.split("-")):
        task = "TASK" + task_id
        name = task_cfg[task]["name"]
        task_names.append(name)

    # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0]
    timeStamp = args.from_pretrained.split("/")[-1] + "-" + args.save_name
    savePath = os.path.join(args.output_dir, timeStamp)
    config = BertConfig.from_json_file(args.config_file)

    if args.task_specific_tokens:
        config.task_specific_tokens = True

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu and not os.path.exists(savePath):
        os.makedirs(savePath)

    task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val = LoadDatasetEval(
        args, task_cfg, args.tasks.split("-"))

    tbLogger = utils.tbLogger(
        timeStamp,
        savePath,
        task_names,
        task_ids,
        task_num_iters,
        1,
        save_logger=False,
        txt_name="eval.txt",
    )
    num_labels = max(
        [dataset.num_labels for dataset in task_datasets_val.values()])

    if args.dynamic_attention:
        config.dynamic_attention = True
    if "roberta" in args.bert_model:
        config.model = "roberta"

    if args.visual_target == 0:
        config.v_target_size = 1601
        config.visual_target = args.visual_target
    else:
        config.v_target_size = 2048
        config.visual_target = args.visual_target

    if args.task_specific_tokens:
        config.task_specific_tokens = True

    if args.baseline:
        model = BaseBertForVLTasks.from_pretrained(
            args.from_pretrained,
            config=config,
            num_labels=num_labels,
            default_gpu=default_gpu,
        )
    else:
        model = VILBertForVLTasks.from_pretrained(
            args.from_pretrained,
            config=config,
            num_labels=num_labels,
            default_gpu=default_gpu,
        )

    task_losses = LoadLosses(args, task_cfg, args.tasks.split("-"))
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)

    elif n_gpu > 1:
        model = nn.DataParallel(model)

    print("***** Running evaluation *****")
    print("  Num Iters: ", task_num_iters)
    print("  Batch size: ", task_batch_size)

    model.eval()
    # when running evaluation, we run each task sequentially.
    for task_id in task_ids:
        results = []
        others = []
        for i, batch in enumerate(task_dataloader_val[task_id]):
            loss, score, batch_size, results, others = EvaluatingModel(
                args,
                task_cfg,
                device,
                task_id,
                batch,
                model,
                task_dataloader_val,
                task_losses,
                results,
                others,
            )

            tbLogger.step_val(0, float(loss), float(score), task_id,
                              batch_size, "val")

            sys.stdout.write("%d/%d\r" %
                             (i, len(task_dataloader_val[task_id])))
            sys.stdout.flush()
        # save the result or evaluate the result.
        ave_score = tbLogger.showLossVal(task_id)

        if args.split:
            json_path = os.path.join(savePath, args.split)
        else:
            json_path = os.path.join(savePath, task_cfg[task_id]["val_split"])

        with open(json_path + "_result.json", "w") as f:
            json.dump(results, f)
        with open(json_path + "_others.json", "w") as f:
            json.dump(others, f)
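
Both this evaluation script and the pre-training script in Code Example #21 below repeat the same local_rank / no_cuda device-selection logic. A minimal reusable sketch of that logic, assuming the helper name setup_device (the helper is illustrative, not part of the original code):

import torch
import torch.distributed as dist

def setup_device(local_rank, no_cuda):
    # local_rank == -1 means a single process (optionally multi-GPU via DataParallel);
    # otherwise there is one process per GPU using the NCCL backend.
    if local_rank == -1 or no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(local_rank)
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        dist.init_process_group(backend="nccl")
    return device, n_gpu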
Code Example #16
0
def main():

    parser = argparse.ArgumentParser(description='PyTorch BLERSSI Example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=1,
                        metavar='N',
                        help='number of epochs to train (default: 1)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--beta1', type=float, default=0.1, help='Beta1 value')
    parser.add_argument('--beta2', type=float, default=0.5, help='Beta2 value')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=True,
                        help='For Saving the current Model')
    parser.add_argument('--dir',
                        default='logs',
                        metavar='L',
                        help='directory where summary logs are stored')
    if dist.is_available():
        parser.add_argument(
            '--backend',
            type=str,
            help='Distributed backend',
            choices=[dist.Backend.GLOO, dist.Backend.NCCL, dist.Backend.MPI],
            default=dist.Backend.GLOO)
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    if use_cuda:
        print('Using CUDA')

    writer = SummaryWriter(args.dir)

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    if should_distribute():
        print('Using distributed PyTorch with {} backend'.format(args.backend))
        dist.init_process_group(backend=args.backend)

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    model = Net().to(device)

    if is_distributed():
        Distributor = nn.parallel.DistributedDataParallel if use_cuda \
            else nn.parallel.DistributedDataParallelCPU
        model = Distributor(model)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 betas=(args.beta1, args.beta2))
    for epoch in range(1, args.epochs + 1):
        train(args, model, optimizer, epoch, writer)

    if (args.save_model):
        torch.save(model.state_dict(), "/var/blerssi_cnn.pt")
        print("Model Saved")
Code Example #17
0
def train_and_report_stats(
    config: ConfigSchema,
    model: Optional[MultiRelationEmbedder] = None,
    trainer: Optional[AbstractBatchProcessor] = None,
    evaluator: Optional[AbstractBatchProcessor] = None,
    rank: Rank = RANK_ZERO,
    subprocess_init: Optional[Callable[[], None]] = None,
) -> Generator[Tuple[int, Optional[Stats], Stats, Optional[Stats]], None,
               None]:
    """Each epoch/pass, for each partition pair, loads in embeddings and edgelist
    from disk, runs HOGWILD training on them, and writes partitions back to disk.
    """
    tag_logs_with_process_name(f"Trainer-{rank}")

    if config.verbose > 0:
        import pprint
        pprint.PrettyPrinter().pprint(config.to_dict())

    logger.info("Loading entity counts...")
    entity_storage = ENTITY_STORAGES.make_instance(config.entity_path)
    entity_counts: Dict[str, List[int]] = {}
    for entity, econf in config.entities.items():
        entity_counts[entity] = []
        for part in range(econf.num_partitions):
            entity_counts[entity].append(
                entity_storage.load_count(entity, part))

    # Figure out how many lhs and rhs partitions we need
    nparts_lhs, lhs_partitioned_types = get_partitioned_types(config, Side.LHS)
    nparts_rhs, rhs_partitioned_types = get_partitioned_types(config, Side.RHS)
    logger.debug(f"nparts {nparts_lhs} {nparts_rhs} "
                 f"types {lhs_partitioned_types} {rhs_partitioned_types}")
    total_buckets = nparts_lhs * nparts_rhs

    sync: AbstractSynchronizer
    bucket_scheduler: AbstractBucketScheduler
    parameter_sharer: Optional[ParameterSharer]
    partition_client: Optional[PartitionClient]
    if config.num_machines > 1:
        if not 0 <= rank < config.num_machines:
            raise RuntimeError("Invalid rank for trainer")
        if not td.is_available():
            raise RuntimeError("The installed PyTorch version doesn't provide "
                               "distributed training capabilities.")
        ranks = ProcessRanks.from_num_invocations(config.num_machines,
                                                  config.num_partition_servers)

        if rank == RANK_ZERO:
            logger.info("Setup lock server...")
            start_server(
                LockServer(
                    num_clients=len(ranks.trainers),
                    nparts_lhs=nparts_lhs,
                    nparts_rhs=nparts_rhs,
                    lock_lhs=len(lhs_partitioned_types) > 0,
                    lock_rhs=len(rhs_partitioned_types) > 0,
                    init_tree=config.distributed_tree_init_order,
                ),
                process_name="LockServer",
                init_method=config.distributed_init_method,
                world_size=ranks.world_size,
                server_rank=ranks.lock_server,
                groups=[ranks.trainers],
                subprocess_init=subprocess_init,
            )

        bucket_scheduler = DistributedBucketScheduler(
            server_rank=ranks.lock_server,
            client_rank=ranks.trainers[rank],
        )

        logger.info("Setup param server...")
        start_server(
            ParameterServer(num_clients=len(ranks.trainers)),
            process_name=f"ParamS-{rank}",
            init_method=config.distributed_init_method,
            world_size=ranks.world_size,
            server_rank=ranks.parameter_servers[rank],
            groups=[ranks.trainers],
            subprocess_init=subprocess_init,
        )

        parameter_sharer = ParameterSharer(
            process_name=f"ParamC-{rank}",
            client_rank=ranks.parameter_clients[rank],
            all_server_ranks=ranks.parameter_servers,
            init_method=config.distributed_init_method,
            world_size=ranks.world_size,
            groups=[ranks.trainers],
            subprocess_init=subprocess_init,
        )

        if config.num_partition_servers == -1:
            start_server(
                ParameterServer(num_clients=len(ranks.trainers),
                                log_stats=True),
                process_name=f"PartS-{rank}",
                init_method=config.distributed_init_method,
                world_size=ranks.world_size,
                server_rank=ranks.partition_servers[rank],
                groups=[ranks.trainers],
                subprocess_init=subprocess_init,
            )

        if len(ranks.partition_servers) > 0:
            partition_client = PartitionClient(ranks.partition_servers,
                                               log_stats=True)
        else:
            partition_client = None

        groups = init_process_group(
            rank=ranks.trainers[rank],
            world_size=ranks.world_size,
            init_method=config.distributed_init_method,
            groups=[ranks.trainers],
        )
        trainer_group, = groups
        sync = DistributedSynchronizer(trainer_group)

    else:
        sync = DummySynchronizer()
        bucket_scheduler = SingleMachineBucketScheduler(
            nparts_lhs, nparts_rhs, config.bucket_order)
        parameter_sharer = None
        partition_client = None
        hide_distributed_logging()

    # fork early for HOGWILD threads
    logger.info("Creating workers...")
    num_workers = get_num_workers(config.workers)
    pool = create_pool(
        num_workers,
        subprocess_name=f"TWorker-{rank}",
        subprocess_init=subprocess_init,
    )

    def make_optimizer(params: Iterable[torch.nn.Parameter],
                       is_emb: bool) -> Optimizer:
        params = list(params)
        if len(params) == 0:
            optimizer = DummyOptimizer()
        elif is_emb:
            optimizer = RowAdagrad(params, lr=config.lr)
        else:
            if config.relation_lr is not None:
                lr = config.relation_lr
            else:
                lr = config.lr
            optimizer = Adagrad(params, lr=lr)
        optimizer.share_memory()
        return optimizer

    # background_io is only supported in single-machine mode
    background_io = config.background_io and config.num_machines == 1

    checkpoint_manager = CheckpointManager(
        config.checkpoint_path,
        background=background_io,
        rank=rank,
        num_machines=config.num_machines,
        partition_client=partition_client,
        subprocess_name=f"BackgRW-{rank}",
        subprocess_init=subprocess_init,
    )
    checkpoint_manager.register_metadata_provider(
        ConfigMetadataProvider(config))
    checkpoint_manager.write_config(config)

    iteration_manager = IterationManager(
        config.num_epochs,
        config.edge_paths,
        config.num_edge_chunks,
        iteration_idx=checkpoint_manager.checkpoint_version)
    checkpoint_manager.register_metadata_provider(iteration_manager)

    if config.init_path is not None:
        loadpath_manager = CheckpointManager(config.init_path)
    else:
        loadpath_manager = None

    def load_embeddings(
        entity: EntityName,
        part: Partition,
        strict: bool = False,
        force_dirty: bool = False,
    ) -> Tuple[torch.nn.Parameter, Optional[OptimizerStateDict]]:
        if strict:
            embs, optim_state = checkpoint_manager.read(
                entity, part, force_dirty=force_dirty)
        else:
            # Strict is only false during the first iteration, because in that
            # case the checkpoint may not contain any data (unless a previous
            # run was resumed) so we fall back on initial values.
            embs, optim_state = checkpoint_manager.maybe_read(
                entity, part, force_dirty=force_dirty)
            if embs is None and loadpath_manager is not None:
                embs, optim_state = loadpath_manager.maybe_read(entity, part)
            if embs is None:
                embs, optim_state = init_embs(entity,
                                              entity_counts[entity][part],
                                              config.dimension,
                                              config.init_scale)
        assert embs.is_shared()
        return torch.nn.Parameter(embs), optim_state

    logger.info("Initializing global model...")

    if model is None:
        model = make_model(config)
    model.share_memory()
    if trainer is None:
        trainer = Trainer(
            global_optimizer=make_optimizer(model.parameters(), False),
            loss_fn=config.loss_fn,
            margin=config.margin,
            relations=config.relations,
        )
    if evaluator is None:
        evaluator = TrainingRankingEvaluator(
            override_num_batch_negs=config.eval_num_batch_negs,
            override_num_uniform_negs=config.eval_num_uniform_negs,
        )
    eval_batch_size = round_up_to_nearest_multiple(config.batch_size,
                                                   config.eval_num_batch_negs)

    state_dict, optim_state = checkpoint_manager.maybe_read_model()

    if state_dict is None and loadpath_manager is not None:
        state_dict, optim_state = loadpath_manager.maybe_read_model()
    if state_dict is not None:
        model.load_state_dict(state_dict, strict=False)
    if optim_state is not None:
        trainer.global_optimizer.load_state_dict(optim_state)

    logger.debug("Loading unpartitioned entities...")
    for entity, econfig in config.entities.items():
        if econfig.num_partitions == 1:
            embs, optim_state = load_embeddings(entity, Partition(0))
            model.set_embeddings(entity, embs, Side.LHS)
            model.set_embeddings(entity, embs, Side.RHS)
            optimizer = make_optimizer([embs], True)
            if optim_state is not None:
                optimizer.load_state_dict(optim_state)
            trainer.entity_optimizers[(entity, Partition(0))] = optimizer

    # start communicating shared parameters with the parameter server
    if parameter_sharer is not None:
        parameter_sharer.share_model_params(model)

    strict = False

    def swap_partitioned_embeddings(
        old_b: Optional[Bucket],
        new_b: Optional[Bucket],
    ):
        # 0. given the old and new buckets, construct data structures to keep
        #    track of old and new embedding (entity, part) tuples

        io_bytes = 0
        logger.info(f"Swapping partitioned embeddings {old_b} {new_b}")

        types = ([(e, Side.LHS) for e in lhs_partitioned_types] +
                 [(e, Side.RHS) for e in rhs_partitioned_types])
        old_parts = {(e, old_b.get_partition(side)): side
                     for e, side in types if old_b is not None}
        new_parts = {(e, new_b.get_partition(side)): side
                     for e, side in types if new_b is not None}

        to_checkpoint = set(old_parts) - set(new_parts)
        preserved = set(old_parts) & set(new_parts)

        # 1. checkpoint embeddings that will not be used in the next pair
        #
        if old_b is not None:  # there are previous embeddings to checkpoint
            logger.info("Writing partitioned embeddings")
            for entity, part in to_checkpoint:
                side = old_parts[(entity, part)]
                side_name = side.pick("lhs", "rhs")
                logger.debug(f"Checkpointing ({entity} {part} {side_name})")
                embs = model.get_embeddings(entity, side)
                optim_key = (entity, part)
                optim_state = OptimizerStateDict(
                    trainer.entity_optimizers[optim_key].state_dict())
                io_bytes += embs.numel() * embs.element_size()  # ignore optim state
                checkpoint_manager.write(entity, part, embs.detach(),
                                         optim_state)
                if optim_key in trainer.entity_optimizers:
                    del trainer.entity_optimizers[optim_key]
                # these variables are holding large objects; let them be freed
                del embs
                del optim_state

            bucket_scheduler.release_bucket(old_b)

        # 2. copy old embeddings that will be used in the next pair
        #    into a temporary dictionary
        #
        tmp_emb = {
            x: model.get_embeddings(x[0], old_parts[x])
            for x in preserved
        }

        for entity, _ in types:
            model.clear_embeddings(entity, Side.LHS)
            model.clear_embeddings(entity, Side.RHS)

        if new_b is None:  # there are no new embeddings to load
            return io_bytes

        bucket_logger = BucketLogger(logger, bucket=new_b)

        # 3. load new embeddings into the model/optimizer, either from disk
        #    or the temporary dictionary
        #
        bucket_logger.info("Loading entities")
        for entity, side in types:
            part = new_b.get_partition(side)
            part_key = (entity, part)
            if part_key in tmp_emb:
                bucket_logger.debug(
                    f"Loading ({entity}, {part}) from preserved")
                embs, optim_state = tmp_emb[part_key], None
            else:
                bucket_logger.debug(f"Loading ({entity}, {part})")

                force_dirty = bucket_scheduler.check_and_set_dirty(
                    entity, part)
                embs, optim_state = load_embeddings(entity,
                                                    part,
                                                    strict=strict,
                                                    force_dirty=force_dirty)
                io_bytes += embs.numel() * embs.element_size()  # ignore optim state

            model.set_embeddings(entity, embs, side)
            tmp_emb[part_key] = embs

            optim_key = (entity, part)
            if optim_key not in trainer.entity_optimizers:
                bucket_logger.debug(f"Resetting optimizer {optim_key}")
                optimizer = make_optimizer([embs], True)
                if optim_state is not None:
                    bucket_logger.debug("Setting optim state")
                    optimizer.load_state_dict(optim_state)

                trainer.entity_optimizers[optim_key] = optimizer

        return io_bytes

    if rank == RANK_ZERO:
        for stats in checkpoint_manager.maybe_read_stats():
            yield (
                stats["index"],
                Stats.from_dict(stats["eval_stats_before"]),
                Stats.from_dict(stats["stats"]),
                Stats.from_dict(stats["eval_stats_after"]),
            )

    # Start of the main training loop.
    for epoch_idx, edge_path_idx, edge_chunk_idx in iteration_manager:
        logger.info(
            f"Starting epoch {epoch_idx + 1} / {iteration_manager.num_epochs}, "
            f"edge path {edge_path_idx + 1} / {iteration_manager.num_edge_paths}, "
            f"edge chunk {edge_chunk_idx + 1} / {iteration_manager.num_edge_chunks}"
        )
        edge_storage = EDGE_STORAGES.make_instance(iteration_manager.edge_path)
        logger.info(f"Edge path: {iteration_manager.edge_path}")

        sync.barrier()
        dist_logger.info("Lock client new epoch...")
        bucket_scheduler.new_pass(
            is_first=iteration_manager.iteration_idx == 0)
        sync.barrier()

        remaining = total_buckets
        cur_b = None
        while remaining > 0:
            old_b = cur_b
            io_time = 0.
            io_bytes = 0
            cur_b, remaining = bucket_scheduler.acquire_bucket()
            logger.info(f"still in queue: {remaining}")
            if cur_b is None:
                if old_b is not None:
                    # if you couldn't get a new pair, release the lock
                    # to prevent a deadlock!
                    tic = time.time()
                    io_bytes += swap_partitioned_embeddings(old_b, None)
                    io_time += time.time() - tic
                time.sleep(1)  # don't hammer td
                continue

            bucket_logger = BucketLogger(logger, bucket=cur_b)

            tic = time.time()

            io_bytes += swap_partitioned_embeddings(old_b, cur_b)

            current_index = \
                (iteration_manager.iteration_idx + 1) * total_buckets - remaining

            next_b = bucket_scheduler.peek()
            if next_b is not None and background_io:
                # Ensure the previous bucket finished writing to disk.
                checkpoint_manager.wait_for_marker(current_index - 1)

                bucket_logger.debug("Prefetching")
                for entity in lhs_partitioned_types:
                    checkpoint_manager.prefetch(entity, next_b.lhs)
                for entity in rhs_partitioned_types:
                    checkpoint_manager.prefetch(entity, next_b.rhs)

                checkpoint_manager.record_marker(current_index)

            bucket_logger.debug("Loading edges")
            edges = edge_storage.load_chunk_of_edges(cur_b.lhs, cur_b.rhs,
                                                     edge_chunk_idx,
                                                     config.num_edge_chunks)
            num_edges = len(edges)
            # this might be off in the case of tensorlist or extra edge fields
            io_bytes += edges.lhs.tensor.numel() * edges.lhs.tensor.element_size()
            io_bytes += edges.rhs.tensor.numel() * edges.rhs.tensor.element_size()
            io_bytes += edges.rel.numel() * edges.rel.element_size()

            bucket_logger.debug("Shuffling edges")
            # Fix a seed to get the same permutation every time; have it
            # depend on all and only what affects the set of edges.
            g = torch.Generator()
            g.manual_seed(
                hash((edge_path_idx, edge_chunk_idx, cur_b.lhs, cur_b.rhs)))

            num_eval_edges = int(num_edges * config.eval_fraction)
            if num_eval_edges > 0:
                edge_perm = torch.randperm(num_edges, generator=g)
                eval_edge_perm = edge_perm[-num_eval_edges:]
                num_edges -= num_eval_edges
                edge_perm = edge_perm[torch.randperm(num_edges)]
            else:
                edge_perm = torch.randperm(num_edges)

            # HOGWILD evaluation before training
            eval_stats_before: Optional[Stats] = None
            if num_eval_edges > 0:
                bucket_logger.debug(
                    "Waiting for workers to perform evaluation")
                future_all_eval_stats_before = pool.map_async(
                    call, [
                        partial(
                            process_in_batches,
                            batch_size=eval_batch_size,
                            model=model,
                            batch_processor=evaluator,
                            edges=edges,
                            indices=eval_edge_perm[s],
                        ) for s in split_almost_equally(eval_edge_perm.size(0),
                                                        num_parts=num_workers)
                    ])
                all_eval_stats_before = \
                    get_async_result(future_all_eval_stats_before, pool)
                eval_stats_before = Stats.sum(all_eval_stats_before).average()
                bucket_logger.info(
                    f"Stats before training: {eval_stats_before}")

            io_time += time.time() - tic
            tic = time.time()
            # HOGWILD training
            bucket_logger.debug("Waiting for workers to perform training")
            # FIXME should we only delay if iteration_idx == 0?
            future_all_stats = pool.map_async(call, [
                partial(
                    process_in_batches,
                    batch_size=config.batch_size,
                    model=model,
                    batch_processor=trainer,
                    edges=edges,
                    indices=edge_perm[s],
                    delay=config.hogwild_delay
                    if epoch_idx == 0 and rank > 0 else 0,
                ) for rank, s in enumerate(
                    split_almost_equally(edge_perm.size(0),
                                         num_parts=num_workers))
            ])
            all_stats = get_async_result(future_all_stats, pool)
            stats = Stats.sum(all_stats).average()
            compute_time = time.time() - tic

            bucket_logger.info(
                f"bucket {total_buckets - remaining} / {total_buckets} : "
                f"Processed {num_edges} edges in {compute_time:.2f} s "
                f"( {num_edges / compute_time / 1e6:.2g} M/sec ); "
                f"io: {io_time:.2f} s ( {io_bytes / io_time / 1e6:.2f} MB/sec )"
            )
            bucket_logger.info(f"{stats}")

            # HOGWILD eval after training
            eval_stats_after: Optional[Stats] = None
            if num_eval_edges > 0:
                bucket_logger.debug(
                    "Waiting for workers to perform evaluation")
                future_all_eval_stats_after = pool.map_async(
                    call, [
                        partial(
                            process_in_batches,
                            batch_size=eval_batch_size,
                            model=model,
                            batch_processor=evaluator,
                            edges=edges,
                            indices=eval_edge_perm[s],
                        ) for s in split_almost_equally(eval_edge_perm.size(0),
                                                        num_parts=num_workers)
                    ])
                all_eval_stats_after = \
                    get_async_result(future_all_eval_stats_after, pool)
                eval_stats_after = Stats.sum(all_eval_stats_after).average()
                bucket_logger.info(f"Stats after training: {eval_stats_after}")

            # Add train/eval metrics to queue
            checkpoint_manager.append_stats({
                "index": current_index,
                "eval_stats_before": eval_stats_before.to_dict(),
                "stats": stats.to_dict(),
                "eval_stats_after": eval_stats_after.to_dict(),
            })
            yield current_index, eval_stats_before, stats, eval_stats_after

        swap_partitioned_embeddings(cur_b, None)

        # Distributed Processing: all machines can leave the barrier now.
        sync.barrier()

        # Preserving a checkpoint requires two steps:
        # - create a snapshot (w/ symlinks) after it's first written;
        # - don't delete it once the following one is written.
        # These two happen in two successive iterations of the main loop: the
        # one just before and the one just after the epoch boundary.
        preserve_old_checkpoint = should_preserve_old_checkpoint(
            iteration_manager, config.checkpoint_preservation_interval)
        preserve_new_checkpoint = should_preserve_old_checkpoint(
            iteration_manager + 1, config.checkpoint_preservation_interval)

        # Write metadata: for multiple machines, write from rank-0
        logger.info(
            f"Finished epoch {epoch_idx + 1} / {iteration_manager.num_epochs}, "
            f"edge path {edge_path_idx + 1} / {iteration_manager.num_edge_paths}, "
            f"edge chunk {edge_chunk_idx + 1} / {iteration_manager.num_edge_chunks}"
        )
        if rank == 0:
            for entity, econfig in config.entities.items():
                if econfig.num_partitions == 1:
                    embs = model.get_embeddings(entity, Side.LHS)
                    optimizer = trainer.entity_optimizers[(entity,
                                                           Partition(0))]

                    checkpoint_manager.write(
                        entity, Partition(0), embs.detach(),
                        OptimizerStateDict(optimizer.state_dict()))

            sanitized_state_dict: ModuleStateDict = {}
            for k, v in ModuleStateDict(model.state_dict()).items():
                if k.startswith('lhs_embs') or k.startswith('rhs_embs'):
                    # skipping state that's an entity embedding
                    continue
                sanitized_state_dict[k] = v

            logger.info("Writing the metadata")
            checkpoint_manager.write_model(
                sanitized_state_dict,
                OptimizerStateDict(trainer.global_optimizer.state_dict()),
            )

        logger.info("Writing the checkpoint")
        checkpoint_manager.write_new_version(config)

        dist_logger.info(
            "Waiting for other workers to write their parts of the checkpoint")
        sync.barrier()
        dist_logger.info("All parts of the checkpoint have been written")

        logger.info("Switching to the new checkpoint version")
        checkpoint_manager.switch_to_new_version()

        dist_logger.info(
            "Waiting for other workers to switch to the new checkpoint version"
        )
        sync.barrier()
        dist_logger.info(
            "All workers have switched to the new checkpoint version")

        # After all the machines have finished committing
        # checkpoints, we either remove the old checkpoint
        # or preserve it.
        if preserve_new_checkpoint:
            # Add 1 so that the index is a multiple of the interval; it looks nicer.
            checkpoint_manager.preserve_current_version(config, epoch_idx + 1)
        if not preserve_old_checkpoint:
            checkpoint_manager.remove_old_version(config)

        # now we're sure that all partition files exist,
        # so be strict about loading them
        strict = True

    # quiescence
    pool.close()
    pool.join()

    sync.barrier()

    checkpoint_manager.close()
    if loadpath_manager is not None:
        loadpath_manager.close()

    # FIXME join distributed workers (not really necessary)

    logger.info("Exiting")
Code Example #18
0
def print_once(msg):
    if (not dist.is_available()) or dist.get_rank() == 0:
        print(msg)
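
This helper assumes the default process group is already initialized whenever torch.distributed is available; otherwise dist.get_rank() raises an error. A slightly more defensive variant (an assumption, not part of the original example):

import torch.distributed as dist

def print_once(msg):
    # Print from rank 0 only; print everywhere when torch.distributed is
    # unavailable or the process group has not been initialized yet.
    if not dist.is_available() or not dist.is_initialized() or dist.get_rank() == 0:
        print(msg)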
Code Example #19
0
def is_available():
    return dist.is_available() and cda.device_count() > 1
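
Here cda is presumably an alias for torch.cuda, so the check reads "distributed support is compiled in and more than one GPU is visible". A self-contained version under that assumption:

import torch.cuda as cda
import torch.distributed as dist

def is_available():
    # Multi-GPU training only makes sense with distributed support and
    # at least two visible CUDA devices.
    return dist.is_available() and cda.device_count() > 1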
Code Example #20
0
def distributed_is_initialized():
    if distributed.is_available():
        if distributed.is_initialized():
            return True
    return False
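
A common use of this guard is deciding whether a DataLoader needs a DistributedSampler. A short sketch, assuming distributed refers to torch.distributed and that dataset is already built (the helper name make_loader is illustrative):

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

def make_loader(dataset, batch_size):
    sampler = DistributedSampler(dataset) if distributed_is_initialized() else None
    # shuffle and sampler are mutually exclusive in DataLoader.
    return DataLoader(dataset,
                      batch_size=batch_size,
                      shuffle=(sampler is None),
                      sampler=sampler)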
Code Example #21
0
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--file_path",
        default="data/conceptual_caption/",
        type=str,
        help="The input train corpus.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-base-uncased, roberta-base, roberta-large, ",
    )
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, roberta-base",
    )
    parser.add_argument(
        "--output_dir",
        default="save",
        type=str,
        # required=True,
        help="The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        type=str,
        default="config/bert_base_6layer_6conect.json",
        help="The config file which specified the model details.",
    )
    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=36,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.",
    )
    parser.add_argument(
        "--train_batch_size",
        default=512,
        type=int,
        help="Total batch size for training.",
    )
    parser.add_argument(
        "--learning_rate",
        default=1e-4,
        type=float,
        help="The initial learning rate for Adam.",
    )
    parser.add_argument(
        "--num_train_epochs",
        default=10.0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--start_epoch",
        default=0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.",
    )
    parser.add_argument(
        "--img_weight", default=1, type=float, help="weight for image loss"
    )
    parser.add_argument(
        "--no_cuda", action="store_true", help="Whether not to use CUDA when available"
    )
    parser.add_argument(
        "--on_memory",
        action="store_true",
        help="Whether to load train samples into memory or use disk",
    )
    parser.add_argument(
        "--do_lower_case",
        type=bool,
        default=True,
        help="Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="local_rank for distributed training on gpus",
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="random seed for initialization"
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumualte before performing a backward/update pass.",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument(
        "--dynamic_attention",
        action="store_true",
        help="whether use dynamic attention.",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=25,
        help="Number of workers in the dataloader.",
    )
    parser.add_argument(
        "--save_name", default="", type=str, help="save name for training."
    )
    parser.add_argument(
        "--baseline",
        action="store_true",
        help="Wheter to use the baseline model (single bert).",
    )
    parser.add_argument(
        "--freeze",
        default=-1,
        type=int,
        help="till which layer of textual stream of vilbert need to fixed.",
    )
    parser.add_argument(
        "--distributed",
        action="store_true",
        help="whether use chunck for parallel training.",
    )
    parser.add_argument(
        "--without_coattention", action="store_true", help="whether pair loss."
    )
    parser.add_argument(
        "--visual_target",
        default=0,
        type=int,
        help="which target to use for visual branch. \
        0: soft label, \
        1: regress the feature, \
        2: NCE loss.",
    )

    parser.add_argument(
        "--objective",
        default=0,
        type=int,
        help="which objective to use \
        0: with ICA loss, \
        1: with ICA loss, for the not aligned pair, no masking objective, \
        2: without ICA loss, do not sample negative pair.",
    )
    parser.add_argument(
        "--num_negative", default=255, type=int, help="num of negative to use"
    )

    parser.add_argument(
        "--resume_file", default="", type=str, help="Resume from checkpoint"
    )
    parser.add_argument(
        "--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer."
    )

    args = parser.parse_args()

    if args.baseline:
        from pytorch_pretrained_bert.modeling import BertConfig
        from vilbert.basebert import BertForMultiModalPreTraining
    else:
        from vilbert.vilbert import BertForMultiModalPreTraining, BertConfig

    if args.save_name:
        prefix = "-" + args.save_name
    else:
        prefix = ""

    timeStamp = args.config_file.split("/")[1].split(".")[0] + prefix
    savePath = os.path.join(args.output_dir, timeStamp)

    bert_weight_name = json.load(
        open("config/" + args.from_pretrained + "_weight_name.json", "r")
    )

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
        )
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            device, n_gpu, bool(args.local_rank != -1), args.fp16
        )
    )

    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu:
        if not os.path.exists(savePath):
            os.makedirs(savePath)

    config = BertConfig.from_json_file(args.config_file)

    if default_gpu:
        # save all the hidden parameters.
        with open(os.path.join(savePath, "command.txt"), "w") as f:
            print(args, file=f)  # Python 3.x
            print("\n", file=f)
            print(config, file=f)

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    cache = 5000
    if dist.is_available() and args.local_rank != -1:
        num_replicas = dist.get_world_size()
        args.train_batch_size = args.train_batch_size // num_replicas
        args.num_workers = args.num_workers // num_replicas
        cache = cache // num_replicas

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if "roberta" in args.bert_model:
        tokenizer = RobertaTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case
        )
    else:
        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case
        )
    num_train_optimization_steps = None
    train_dataset = ConceptCapLoaderTrain(
        args.file_path,
        tokenizer,
        args.bert_model,
        seq_len=args.max_seq_length,
        batch_size=args.train_batch_size,
        visual_target=args.visual_target,
        num_workers=args.num_workers,
        local_rank=args.local_rank,
        objective=args.objective,
        cache=cache,
    )

    validation_dataset = ConceptCapLoaderVal(
        args.file_path,
        tokenizer,
        args.bert_model,
        seq_len=args.max_seq_length,
        batch_size=args.train_batch_size,
        visual_target=args.visual_target,
        num_workers=2,
        objective=args.objective,
    )

    num_train_optimization_steps = int(
        train_dataset.num_dataset
        / args.train_batch_size
        / args.gradient_accumulation_steps
    ) * (args.num_train_epochs - args.start_epoch)

    task_names = ["Conceptual_Caption"]
    task_ids = ["TASK0"]
    task_num_iters = {"TASK0": train_dataset.num_dataset / args.train_batch_size}

    logdir = os.path.join("logs", timeStamp)
    if default_gpu:
        tbLogger = utils.tbLogger(
            logdir,
            savePath,
            task_names,
            task_ids,
            task_num_iters,
            args.gradient_accumulation_steps,
        )

    if args.visual_target == 0:
        config.v_target_size = 1601
        config.visual_target = args.visual_target
    else:
        config.v_target_size = 2048
        config.visual_target = args.visual_target

    if "roberta" in args.bert_model:
        config.model = "roberta"

    if args.freeze > config.t_biattention_id[0]:
        config.fixed_t_layer = config.t_biattention_id[0]

    if args.without_coattention:
        config.with_coattention = False

    if args.dynamic_attention:
        config.dynamic_attention = True

    if args.from_pretrained:
        model = BertForMultiModalPreTraining.from_pretrained(
            args.from_pretrained, config=config, default_gpu=default_gpu
        )
    else:
        model = BertForMultiModalPreTraining(config)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    if args.freeze != -1:
        bert_weight_name_filtered = []
        for name in bert_weight_name:
            if "embeddings" in name:
                bert_weight_name_filtered.append(name)
            elif "encoder" in name:
                layer_num = name.split(".")[2]
                if int(layer_num) <= args.freeze:
                    bert_weight_name_filtered.append(name)

        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            if key[12:] in bert_weight_name_filtered:
                value.requires_grad = False

        if default_gpu:
            print("filtered weight")
            print(bert_weight_name_filtered)

    if not args.from_pretrained:
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
    else:
        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            if value.requires_grad:
                if key[12:] in bert_weight_name:
                    lr = args.learning_rate * 0.1
                else:
                    lr = args.learning_rate

                if any(nd in key for nd in no_decay):
                    optimizer_grouped_parameters += [
                        {"params": [value], "lr": lr, "weight_decay": 0.0}
                    ]

                if not any(nd in key for nd in no_decay):
                    optimizer_grouped_parameters += [
                        {"params": [value], "lr": lr, "weight_decay": 0.01}
                    ]

        if default_gpu:
            print(
                len(list(model.named_parameters())), len(optimizer_grouped_parameters)
            )

    # set different parameters for vision branch and language branch.
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            bias_correction=False,
            max_grad_norm=1.0,
        )
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

    else:

        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            eps=args.adam_epsilon,
            betas=(0.9, 0.98),
        )

    scheduler = WarmupLinearSchedule(
        optimizer,
        warmup_steps=args.warmup_proportion * num_train_optimization_steps,
        t_total=num_train_optimization_steps,
    )

    startIterID = 0
    global_step = 0

    if args.resume_file != "" and os.path.exists(args.resume_file):
        checkpoint = torch.load(args.resume_file, map_location="cpu")
        new_dict = {}
        for attr in checkpoint["model_state_dict"]:
            if attr.startswith("module."):
                new_dict[attr.replace("module.", "", 1)] = checkpoint[
                    "model_state_dict"
                ][attr]
            else:
                new_dict[attr] = checkpoint["model_state_dict"][attr]
        model.load_state_dict(new_dict)
        scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        global_step = checkpoint["global_step"]
        del checkpoint

    model.cuda()

    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.cuda()

    if args.fp16:
        model.half()
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if default_gpu:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", train_dataset.num_dataset)
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

    for epochId in range(int(args.start_epoch), int(args.num_train_epochs)):
        model.train()
        for step, batch in enumerate(train_dataset):

            iterId = startIterID + step + (epochId * len(train_dataset))
            image_ids = batch[-1]
            batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch[:-1])

            input_ids, input_mask, segment_ids, lm_label_ids, is_next, image_feat, image_loc, image_target, image_label, image_mask = (
                batch
            )

            if args.objective == 1:
                image_label = image_label * (is_next == 0).long().unsqueeze(1)
                image_label[image_label == 0] = -1

                lm_label_ids = lm_label_ids * (is_next == 0).long().unsqueeze(1)
                lm_label_ids[lm_label_ids == 0] = -1

            masked_loss_t, masked_loss_v, next_sentence_loss = model(
                input_ids,
                image_feat,
                image_loc,
                segment_ids,
                input_mask,
                image_mask,
                lm_label_ids,
                image_label,
                image_target,
                is_next,
            )

            if args.objective == 2:
                next_sentence_loss = next_sentence_loss * 0

            masked_loss_v = masked_loss_v * args.img_weight
            loss = masked_loss_t + masked_loss_v + next_sentence_loss

            if n_gpu > 1:
                loss = loss.mean()
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                next_sentence_loss = next_sentence_loss.mean()

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_optimization_steps,
                        args.warmup_proportion,
                    )
                    for param_group in optimizer.param_groups:
                        param_group["lr"] = lr_this_step

                scheduler.step()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                if default_gpu:
                    tbLogger.step_train_CC(
                        epochId,
                        iterId,
                        float(masked_loss_t),
                        float(masked_loss_v),
                        float(next_sentence_loss),
                        optimizer.param_groups[0]["lr"],
                        "TASK0",
                        "train",
                    )

            if (
                step % (20 * args.gradient_accumulation_steps) == 0
                and step != 0
                and default_gpu
            ):
                tbLogger.showLossTrainCC()

        # Do the evaluation
        torch.set_grad_enabled(False)
        numBatches = len(validation_dataset)

        model.eval()
        for step, batch in enumerate(validation_dataset):
            image_ids = batch[-1]
            batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch[:-1])

            input_ids, input_mask, segment_ids, lm_label_ids, is_next, image_feat, image_loc, image_target, image_label, image_mask = (
                batch
            )

            batch_size = input_ids.size(0)
            masked_loss_t, masked_loss_v, next_sentence_loss = model(
                input_ids,
                image_feat,
                image_loc,
                segment_ids,
                input_mask,
                image_mask,
                lm_label_ids,
                image_label,
                image_target,
                is_next,
            )

            masked_loss_v = masked_loss_v * args.img_weight
            loss = masked_loss_t + masked_loss_v + next_sentence_loss

            if n_gpu > 1:
                loss = loss.mean()
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                next_sentence_loss = next_sentence_loss.mean()

            if default_gpu:
                tbLogger.step_val_CC(
                    epochId,
                    float(masked_loss_t),
                    float(masked_loss_v),
                    float(next_sentence_loss),
                    "TASK0",
                    batch_size,
                    "val",
                )
                sys.stdout.write("%d / %d \r" % (step, numBatches))
                sys.stdout.flush()

        if default_gpu:
            ave_score = tbLogger.showLossValCC()

        torch.set_grad_enabled(True)

        if default_gpu:
            # Save a trained model
            logger.info("** ** * Saving fine - tuned model ** ** * ")
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Only save the model itself
            output_model_file = os.path.join(
                savePath, "pytorch_model_" + str(epochId) + ".bin"
            )
            output_checkpoint = os.path.join(
                savePath, "pytorch_ckpt_" + str(epochId) + ".tar"
            )
            torch.save(model_to_save.state_dict(), output_model_file)
            torch.save(
                {
                    "model_state_dict": model_to_save.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "scheduler_state_dict": scheduler.state_dict(),
                    "global_step": global_step,
                },
                output_checkpoint,
            )

    if default_gpu:
        tbLogger.txt_close()
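The checkpoint written above bundles the model, optimizer, and scheduler state together with the global step, so training can be resumed later. A minimal resume sketch, assuming the same model, optimizer, and scheduler objects have already been re-created (the helper name below is ours, not part of the original script):

import torch

def resume_from_checkpoint(path, model, optimizer, scheduler, device="cpu"):
    # Load the ".tar" checkpoint written at the end of each epoch above.
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
    # Resume counting optimizer updates from where the previous run stopped.
    return checkpoint["global_step"]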
Code Example #22
File: test_pg_wrapper.py  Project: ydcjeff/pytorch
import os
import sys
import unittest
from datetime import timedelta

import torch
import torch.distributed as c10d

if not c10d.is_available():
    print("c10d not available, skipping tests", file=sys.stderr)
    sys.exit(0)

from torch.testing._internal.common_distributed import (
    MultiProcessTestCase,
    requires_nccl,
    requires_gloo,
    skip_if_lt_x_gpu,
    with_dist_debug_levels,
    create_device,
)
from torch.testing._internal.common_utils import (
    run_tests,
    TEST_WITH_TSAN,
)
from test_c10d_common import LOOPBACK


class AbstractProcessGroupWrapperTest(MultiProcessTestCase):
    def setUp(self):
        super(AbstractProcessGroupWrapperTest, self).setUp()
        # On Windows, Python's multiprocessing does not support fork, so the
        # start method has to be switched to spawn here.
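The snippet is cut off here. As an illustration of the comment above (not the test class's actual mechanism, which uses its own process-spawning helpers), choosing a multiprocessing start method per platform looks roughly like this:

import multiprocessing
import sys

def default_start_method():
    # Windows has no fork(), so multiprocessing must use "spawn" there;
    # on other platforms "fork" remains a workable default for such tests.
    return "spawn" if sys.platform == "win32" else "fork"

# Hypothetical usage: pick the start method before creating worker processes.
ctx = multiprocessing.get_context(default_start_method())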
Code Example #23
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=1, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--dir', default='logs', metavar='L',
                        help='directory where summary logs are stored')
    if dist.is_available():
        parser.add_argument('--backend', type=str, help='Distributed backend',
                            choices=[dist.Backend.GLOO, dist.Backend.NCCL, dist.Backend.MPI],
                            default=dist.Backend.GLOO)
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    if use_cuda:
        print('Using CUDA')

    writer = SummaryWriter(args.dir)

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    if should_distribute():
        print('Using distributed PyTorch with {} backend'.format(args.backend))
        dist.init_process_group(backend=args.backend)

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=False, **kwargs)

    model = Net().to(device)

    if is_distributed():
        # Note: DistributedDataParallelCPU was removed in later PyTorch releases;
        # there, DistributedDataParallel also covers CPU (gloo) training.
        Distributor = nn.parallel.DistributedDataParallel if use_cuda \
            else nn.parallel.DistributedDataParallelCPU
        model = Distributor(model)

    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch, writer)
        test(args, model, device, test_loader, writer, epoch)

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
Code Example #24
def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True
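This guard is typically used before any call that requires an initialized process group, so single-process runs do not crash. A short usage sketch (the helper name is ours):

import torch.distributed as dist

def barrier_if_distributed():
    # Synchronize all workers only when a process group actually exists;
    # in a plain single-process run this becomes a no-op.
    if is_dist_avail_and_initialized():
        dist.barrier()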
Code Example #25
File: utils.py  Project: buddhisant/RetinaNet
def get_dist_info():
    assert dist.is_available(), "Distributed is not available on this device!"
    assert dist.is_initialized(), "Distributed has not been initialized yet!"
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    return rank, world_size
Code Example #26
import sys
from functools import wraps

import torch
import torch.distributed as dist

DEFAULT_TIMEOUT = 300
CUSTOMIZED_TIMEOUT = {"test_DistributedDataParallel": 500}

if INIT_METHOD.startswith("file://"):
    FOLDER = INIT_METHOD[7:]


def get_timeout(test_id):
    test_name = test_id.split(".")[-1]
    if test_name in CUSTOMIZED_TIMEOUT:
        return CUSTOMIZED_TIMEOUT[test_name]
    else:
        return DEFAULT_TIMEOUT


if not dist.is_available():
    print("Distributed not available, skipping tests")
    sys.exit(0)

SKIP_IF_NO_CUDA_EXIT_CODE = 75
SKIP_IF_NO_GPU_EXIT_CODE = 76
SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE = 77
SKIP_IF_BACKEND_UNAVAILABLE = 78


def skip_if_no_cuda_distributed(func):
    func.skip_if_no_cuda_distributed = True

    @wraps(func)
    def wrapper(*args, **kwargs):
        if not torch.cuda.is_available():
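The decorator body is cut off above. Given the exit codes defined just before it, decorators of this kind typically skip by exiting the worker process with the matching code; a hedged reconstruction (not the verbatim original) could look like:

import sys
from functools import wraps

import torch

SKIP_IF_NO_CUDA_EXIT_CODE = 75  # mirrors the constant defined above

def skip_if_no_cuda_distributed_sketch(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Each test runs in its own process, so a skip is signalled by
        # exiting with a reserved code that the parent process recognizes.
        if not torch.cuda.is_available():
            sys.exit(SKIP_IF_NO_CUDA_EXIT_CODE)
        return func(*args, **kwargs)
    return wrapper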
Code Example #27
File: common.py  Project: realathu/ignite
def setup_common_training_handlers(
    trainer,
    train_sampler=None,
    to_save=None,
    save_every_iters=1000,
    output_path=None,
    lr_scheduler=None,
    with_gpu_stats=False,
    output_names=None,
    with_pbars=True,
    with_pbar_on_iters=True,
    log_every_iters=100,
    device="cuda",
):
    """Helper method to setup trainer with common handlers (it also supports distributed configuration):
        - :class:`~ignite.handlers.TerminateOnNan`
        - handler to setup learning rate scheduling
        - :class:`~ignite.handlers.ModelCheckpoint`
        - :class:`~ignite.metrics.RunningAverage` on `update_function` output
        - Two progress bars on epochs and optionally on iterations

    Args:
        trainer (Engine): trainer engine. Output of the trainer's `update_function` should be a dictionary,
            a sequence, or a single tensor.
        train_sampler (torch.utils.data.DistributedSampler, optional): Optional distributed sampler used to call
            `set_epoch` method on epoch started event.
        to_save (dict, optional): dictionary with objects to save in the checkpoint. This is used with
            :class:`~ignite.handlers.ModelCheckpoint`.
        save_every_iters (int, optional): saving interval. By default, `to_save` objects are stored
            each 1000 iterations.
        output_path (str, optional): output path to indicate where `to_save` objects are stored.
        lr_scheduler (ParamScheduler or subclass of `torch.optim.lr_scheduler._LRScheduler`): learning rate scheduler
            as native torch LRScheduler or ignite's parameter scheduler.
        with_gpu_stats (bool, optional): if True, :class:`~ignite.contrib.metrics.handlers.GpuInfo` is attached to the
            trainer. This requires `pynvml` package to be installed.
        output_names (list/tuple): list of names associated with `update_function` output dictionary.
        with_pbars (bool, optional): if True, two progress bars on epochs and optionally on iterations are attached.
        with_pbar_on_iters (bool, optional): if True, a progress bar on iterations is attached to the trainer.
        log_every_iters (int, optional): logging interval for :class:`~ignite.contrib.metrics.handlers.GpuInfo` and for
            epoch-wise progress bar.
        device (str or torch.device, optional): Optional device specification in case of distributed computation usage.
    """
    kwargs = dict(
        to_save=to_save,
        save_every_iters=save_every_iters,
        output_path=output_path,
        lr_scheduler=lr_scheduler,
        with_gpu_stats=with_gpu_stats,
        output_names=output_names,
        with_pbars=with_pbars,
        with_pbar_on_iters=with_pbar_on_iters,
        log_every_iters=log_every_iters,
        device=device,
    )
    if dist.is_available() and dist.is_initialized():
        _setup_common_distrib_training_handlers(trainer,
                                                train_sampler=train_sampler,
                                                **kwargs)
    else:
        if train_sampler is not None:
            warnings.warn(
                "Argument train_sampler distributed sampler used to call `set_epoch` method on epoch "
                "started event, but no distributed setting detected",
                UserWarning,
            )
        _setup_common_training_handlers(trainer, **kwargs)
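A minimal usage sketch for the helper above, with placeholder model, optimizer, and loss purely for illustration (the paths and hyperparameters are ours, not from the original file):

import torch.nn as nn
import torch.optim as optim
from ignite.engine import create_supervised_trainer

model = nn.Linear(10, 2)
optimizer = optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

trainer = create_supervised_trainer(model, optimizer, criterion, device="cpu")
setup_common_training_handlers(
    trainer,
    to_save={"model": model, "optimizer": optimizer},
    save_every_iters=1000,
    output_path="/tmp/checkpoints",
    log_every_iters=100,
    device="cpu",
)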
Code Example #28
def get_world_size() -> int:
    if not dist.is_available():
        return 1
    if not dist.is_initialized():
        return 1
    return dist.get_world_size()
Code Example #29
def should_distribute():
    return dist.is_available() and WORLD_SIZE > 1
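The MNIST script in Code Example #23 pairs this with a WORLD_SIZE read from the environment and an is_distributed() check after init_process_group. Neither appears in the scraped snippets, so the following is a hedged sketch of what those companions usually look like:

import os
import torch.distributed as dist

# WORLD_SIZE is normally injected by the launcher (e.g. torchrun or the
# Kubeflow PyTorch operator); default to 1 for a single-process run.
WORLD_SIZE = int(os.environ.get("WORLD_SIZE", 1))

def is_distributed():
    # True only once init_process_group() has actually been called.
    return dist.is_available() and dist.is_initialized()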
Code Example #30
File: test_distributed.py  Project: athiwatp/pytorch
import os
import sys
import fcntl
import time
import unittest
from functools import wraps, reduce
from contextlib import contextmanager

import torch
import torch.distributed as dist
from common import TestCase

BACKEND = os.environ['BACKEND']
TEMP_DIR = os.environ['TEMP_DIR']
MASTER_PORT = '29500'
MASTER_ADDR = '127.0.0.1'


if not dist.is_available():
    print('Distributed not available, skipping tests')
    sys.exit(0)


@contextmanager
def _lock():
    lockfile = os.path.join(TEMP_DIR, 'lockfile')
    with open(lockfile, 'w') as lf:
        try:
            fcntl.flock(lf.fileno(), fcntl.LOCK_EX)
            yield
        finally:
            fcntl.flock(lf.fileno(), fcntl.LOCK_UN)
            lf.close()
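A short usage sketch for the lock above: it serializes a critical section across the test's worker processes, for example creating a shared file exactly once (the file name below is illustrative):

with _lock():
    shared_file = os.path.join(TEMP_DIR, 'shared_init_file')
    if not os.path.exists(shared_file):
        open(shared_file, 'w').close()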
Code Example #31
def main():
    args = parse_args()

    # Devices
    if args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend="nccl")
    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True
    logger.info(
        f"device: {device} n_gpu: {n_gpu}, distributed training: {bool(args.local_rank != -1)}"
    )

    # Load config
    config = BertConfig.from_json_file(args.config_file)

    # Load task config
    with open(args.tasks_config_file, "r") as f:
        task_cfg = edict(yaml.safe_load(f))
    task_id = args.task.strip()
    task = "TASK" + task_id

    # Output dirs
    if "/" in args.from_pretrained:
        timeStamp = args.from_pretrained.split("/")[1]
    else:
        timeStamp = args.from_pretrained
    savePath = os.path.join(args.output_dir, timeStamp)
    if default_gpu and not os.path.exists(savePath):
        os.makedirs(savePath)

    # Seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Dataset
    batch_size, task2num_iters, dset_val, dl_val = LoadDatasetEval(
        args, config, task_cfg, args.task)
    max_subiter_images = dset_val.max_num_images

    # Model
    if args.zero_shot:
        config.visual_target_weights = {}
        model = BertForVLPreTraining.from_pretrained(args.from_pretrained,
                                                     config=config)
    else:
        model = BertForVLTasks.from_pretrained(args.from_pretrained,
                                               config=config,
                                               task_cfg=task_cfg,
                                               task_ids=[task])

    # Move to GPU(s)
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)
    elif n_gpu > 1:
        # Multi-GPU DataParallel evaluation is not supported by this script.
        raise ValueError("Please run with a single GPU")

    # Print summary
    if default_gpu:
        print("***** Running evaluation *****")
        print("  Num Iters: ", task2num_iters)
        print("  Batch size: ", batch_size)

    # Evaluate
    model.eval()
    results = []
    others = []
    score_matrix = np.zeros(
        (args.num_images * args.captions_per_image, args.num_images))
    target_matrix = np.zeros(
        (args.num_images * args.captions_per_image, args.num_images))
    rank_vector = np.ones(
        args.num_images * args.captions_per_image) * args.num_images
    count = 0
    for i, batch in tqdm(enumerate(dl_val), total=task2num_iters[task]):
        batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch)
        features, spatials, image_mask, question, input_mask, segment_ids, target, caption_idx, image_idx = batch

        features = features.squeeze(0)
        spatials = spatials.squeeze(0)
        image_mask = image_mask.squeeze(0)
        question = question.repeat(features.size(0), 1)
        segment_ids = segment_ids.repeat(features.size(0), 1)
        input_mask = input_mask.repeat(features.size(0), 1)

        with torch.no_grad():
            if args.zero_shot:
                _, _, vil_logit, _, _, _ = model(question, features, spatials,
                                                 segment_ids, input_mask,
                                                 image_mask)

                score_matrix[caption_idx,
                             image_idx * max_subiter_images:(image_idx + 1) *
                             max_subiter_images] = (torch.softmax(
                                 vil_logit, dim=1)[:,
                                                   0].view(-1).cpu().numpy())
                target_matrix[caption_idx,
                              image_idx * max_subiter_images:(image_idx + 1) *
                              max_subiter_images] = (
                                  target.view(-1).float().cpu().numpy())

            else:
                vil_logit, _, _, _ = model(question, features, spatials, task,
                                           segment_ids, input_mask, image_mask)

                score_matrix[caption_idx,
                             image_idx * max_subiter_images:(image_idx + 1) *
                             max_subiter_images] = (
                                 vil_logit.view(-1).cpu().numpy())
                target_matrix[caption_idx,
                              image_idx * max_subiter_images:(image_idx + 1) *
                              max_subiter_images] = (
                                  target.view(-1).float().cpu().numpy())

            if image_idx.item() == args.num_subiters - 1:
                # Rank of the ground-truth image for this caption in the
                # descending score order (0 means it was retrieved first).
                rank = np.where(
                    (np.argsort(-score_matrix[caption_idx]) == np.where(
                        target_matrix[caption_idx] == 1)[0][0]) == 1)[0][0]
                rank_vector[caption_idx] = rank

                cur_rank_vector = rank_vector[:caption_idx + 1]
                r1 = 100.0 * np.sum(cur_rank_vector < 1) / len(cur_rank_vector)
                r5 = 100.0 * np.sum(cur_rank_vector < 5) / len(cur_rank_vector)
                r10 = 100.0 * np.sum(
                    cur_rank_vector < 10) / len(cur_rank_vector)

                medr = np.floor(np.median(cur_rank_vector) + 1)
                meanr = np.mean(cur_rank_vector) + 1
                print(
                    "%d Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f"
                    % (count, r1, r5, r10, medr, meanr))

                results.append(
                    np.argsort(-score_matrix[caption_idx]).tolist()[:20])
        count += 1

    r1 = 100.0 * np.sum(rank_vector < 1) / len(rank_vector)
    r5 = 100.0 * np.sum(rank_vector < 5) / len(rank_vector)
    r10 = 100.0 * np.sum(rank_vector < 10) / len(rank_vector)

    medr = np.floor(np.median(rank_vector) + 1)
    meanr = np.mean(rank_vector) + 1

    print("************************************************")
    print("****************Image Retrieval*****************")
    print("************************************************")
    print("Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f" %
          (r1, r5, r10, medr, meanr))
    print("************************************************")

    if args.split:
        json_path = os.path.join(savePath, args.split)
    else:
        json_path = os.path.join(savePath, task_cfg[task_id]["val_split"])
    json.dump(results, open(json_path + "_result.json", "w"))
    json.dump(others, open(json_path + "_others.json", "w"))

    # Text Retrieval
    rank_vector = np.zeros(args.num_images)
    for image_idx in range(args.num_images):
        ranks = []
        tgt_captions = np.where(target_matrix[:, image_idx] == 1)[0]
        sorted_scores = np.argsort(-score_matrix[:, image_idx])
        for tgt_caption in tgt_captions:
            ranks.append(np.where((sorted_scores == tgt_caption) == 1)[0][0])
        rank_vector[image_idx] = min(ranks)

    r1 = 100.0 * np.sum(rank_vector < 1) / len(rank_vector)
    r5 = 100.0 * np.sum(rank_vector < 5) / len(rank_vector)
    r10 = 100.0 * np.sum(rank_vector < 10) / len(rank_vector)

    medr = np.floor(np.median(rank_vector) + 1)
    meanr = np.mean(rank_vector) + 1

    print("************************************************")
    print("****************Text Retrieval******************")
    print("************************************************")
    print("Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f" %
          (r1, r5, r10, medr, meanr))
    print("************************************************")