def dist_is_initialized():
    if dist.is_available():
        if dist.is_initialized():
            return True
    return False
def get_rank() -> int:
    if not dist.is_available():
        return 0
    if not dist.is_initialized():
        return 0
    return dist.get_rank()
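# A minimal usage sketch (an assumption, not part of the original helpers):
# these guards are typically used to restrict side effects such as logging
# to rank 0 of a multi-process job. Safe in single-process runs too, since
# get_rank() falls back to 0.
def log_on_rank_zero(msg: str) -> None:
    if get_rank() == 0:
        print(msg)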
def get_selected_tests(options):
    # First make sure run-specified-test-cases options are processed.
    if options.run_specified_test_cases:
        if options.use_specified_test_cases_by == "include":
            options.include = list(SPECIFIED_TEST_CASES_DICT.keys())
        elif options.use_specified_test_cases_by == "bring-to-front":
            options.bring_to_front = list(SPECIFIED_TEST_CASES_DICT.keys())

    selected_tests = options.include

    # Filter if there are JIT-only or distributed-only test options.
    if options.jit:
        selected_tests = list(
            filter(lambda test_name: "jit" in test_name, selected_tests))

    if options.distributed_tests:
        selected_tests = list(
            filter(lambda test_name: test_name in DISTRIBUTED_TESTS,
                   selected_tests))

    # Filter to only run core tests when --core option is specified.
    if options.core:
        selected_tests = list(
            filter(lambda test_name: test_name in CORE_TEST_LIST,
                   selected_tests))

    # Process reordering.
    if options.bring_to_front:
        to_front = set(options.bring_to_front)
        selected_tests = options.bring_to_front + list(
            filter(lambda name: name not in to_front, selected_tests))

    if options.first:
        first_index = find_test_index(options.first, selected_tests)
        selected_tests = selected_tests[first_index:]

    if options.last:
        last_index = find_test_index(options.last, selected_tests,
                                     find_last_index=True)
        selected_tests = selected_tests[:last_index + 1]

    # Process exclusion.
    if options.exclude_jit_executor:
        options.exclude.extend(JIT_EXECUTOR_TESTS)

    if options.exclude_distributed_tests:
        options.exclude.extend(DISTRIBUTED_TESTS)

    selected_tests = exclude_tests(options.exclude, selected_tests)

    if sys.platform == "win32" and not options.ignore_win_blocklist:
        target_arch = os.environ.get("VSCMD_ARG_TGT_ARCH")
        if target_arch != "x64":
            WINDOWS_BLOCKLIST.append("cpp_extensions_aot_no_ninja")
            WINDOWS_BLOCKLIST.append("cpp_extensions_aot_ninja")
            WINDOWS_BLOCKLIST.append("cpp_extensions_jit")
            WINDOWS_BLOCKLIST.append("jit")
            WINDOWS_BLOCKLIST.append("jit_fuser")

        # This exclusion is caused by https://github.com/pytorch/pytorch/issues/69460
        # and the code below should be removed once that issue is solved.
        if torch.version.cuda is not None and LooseVersion(
                torch.version.cuda) >= "11.5":
            WINDOWS_BLOCKLIST.append("test_cpp_extensions_aot")
            WINDOWS_BLOCKLIST.append("test_cpp_extensions_aot_ninja")
            WINDOWS_BLOCKLIST.append("test_cpp_extensions_aot_no_ninja")

        selected_tests = exclude_tests(WINDOWS_BLOCKLIST, selected_tests,
                                       "on Windows")

    elif TEST_WITH_ROCM:
        selected_tests = exclude_tests(ROCM_BLOCKLIST, selected_tests,
                                       "on ROCm")

    # Sharding.
    if options.shard:
        assert len(options.shard) == 2, "Unexpected shard format"
        assert min(options.shard) > 0, "Shards must be positive numbers"
        which_shard, num_shards = options.shard
        assert (
            which_shard <= num_shards
        ), "Selected shard must be less than or equal to total number of shards"
        assert num_shards <= len(
            selected_tests
        ), f"Number of shards must be less than {len(selected_tests)}"
        # TODO: fix this to use test_times_filename, but currently this is
        # not working because setting the export arg immediately halts the
        # test execution.
        selected_tests = get_shard_based_on_S3(which_shard, num_shards,
                                               selected_tests,
                                               TEST_TIMES_FILE)

    # Skip all distributed tests if the distributed package is not available.
    if not dist.is_available():
        selected_tests = exclude_tests(
            DISTRIBUTED_TESTS, selected_tests,
            "PyTorch is built without distributed support.")

    # Skip tests that require LAPACK when it's not available.
    if not torch._C.has_lapack:
        selected_tests = exclude_tests(
            TESTS_REQUIRING_LAPACK, selected_tests,
            "PyTorch is built without LAPACK support.")

    return selected_tests
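# `get_shard_based_on_S3` is referenced above but defined elsewhere (it
# weights shards by recorded test times). As a rough illustration only — an
# assumption, not the real implementation — a naive round-robin shard split
# could look like this:
def naive_shard(which_shard: int, num_shards: int, tests: list) -> list:
    # which_shard is 1-indexed, matching the assertions above.
    return tests[which_shard - 1::num_shards]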
    'distributed/_pipeline/sync/test_deferred_batch_norm',
    'distributed/_pipeline/sync/test_dependency',
    'distributed/_pipeline/sync/test_inplace',
    'distributed/_pipeline/sync/test_microbatch',
    'distributed/_pipeline/sync/test_phony',
    'distributed/_pipeline/sync/test_pipe',
    'distributed/_pipeline/sync/test_pipeline',
    'distributed/_pipeline/sync/test_stream',
    'distributed/_pipeline/sync/test_transparency',
    'distributed/_pipeline/sync/test_worker',
]

_DEP_MODULES_CACHE: Dict[str, set] = {}

DISTRIBUTED_TESTS_CONFIG = {}

if dist.is_available():
    DISTRIBUTED_TESTS_CONFIG['test'] = {'WORLD_SIZE': '1'}
    if not TEST_WITH_ROCM and dist.is_mpi_available():
        DISTRIBUTED_TESTS_CONFIG['mpi'] = {
            'WORLD_SIZE': '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-mpi'
        }
    if dist.is_nccl_available():
        DISTRIBUTED_TESTS_CONFIG['nccl'] = {
            'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-nccl'
        }
    if dist.is_gloo_available():
        DISTRIBUTED_TESTS_CONFIG['gloo'] = {
            'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3',
            'TEST_REPORT_SOURCE_OVERRIDE': 'dist-gloo'
        }
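# A minimal sketch (assumed usage, not from the original file) of how a test
# runner might consume DISTRIBUTED_TESTS_CONFIG: copy each backend's settings
# into the environment before launching that backend's test process.
import os

def env_for_backend(backend: str) -> dict:
    env = dict(os.environ)
    env["BACKEND"] = backend  # hypothetical variable name
    env.update(DISTRIBUTED_TESTS_CONFIG[backend])
    return env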
def __init__(
    self,
    config: ConfigSchema,
    model: Optional[MultiRelationEmbedder] = None,
    trainer: Optional[AbstractBatchProcessor] = None,
    evaluator: Optional[AbstractBatchProcessor] = None,
    rank: Rank = RANK_ZERO,
    subprocess_init: Optional[Callable[[], None]] = None,
):
    """Each epoch/pass, for each partition pair, loads in embeddings and
    edgelist from disk, runs HOGWILD training on them, and writes partitions
    back to disk.
    """
    tag_logs_with_process_name(f"Trainer-{rank}")
    self.config = config
    if config.verbose > 0:
        import pprint
        pprint.PrettyPrinter().pprint(config.to_dict())

    logger.info("Loading entity counts...")
    entity_storage = ENTITY_STORAGES.make_instance(config.entity_path)
    entity_counts: Dict[str, List[int]] = {}
    for entity, econf in config.entities.items():
        entity_counts[entity] = []
        for part in range(econf.num_partitions):
            entity_counts[entity].append(
                entity_storage.load_count(entity, part))

    # Figure out how many lhs and rhs partitions we need.
    holder = self.holder = EmbeddingHolder(config)
    logger.debug(
        f"nparts {holder.nparts_lhs} {holder.nparts_rhs} "
        f"types {holder.lhs_partitioned_types} {holder.rhs_partitioned_types}"
    )

    # We know ahead of time that we will need 1-2 storages for each embedding
    # type, as well as the max size of this storage (num_entities x D).
    # We allocate these storages in advance in `embedding_storage_freelist`.
    # When we need storage for an entity type, we pop it from this free list,
    # and then add it back when we 'delete' the embedding table.
    embedding_storage_freelist: Dict[
        EntityName, Set[torch.FloatStorage]] = defaultdict(set)
    for entity_type, counts in entity_counts.items():
        max_count = max(counts)
        num_sides = (
            (1 if entity_type in holder.lhs_partitioned_types else 0)
            + (1 if entity_type in holder.rhs_partitioned_types else 0)
            + (1 if entity_type in (holder.lhs_unpartitioned_types
                                    | holder.rhs_unpartitioned_types) else 0))
        for _ in range(num_sides):
            embedding_storage_freelist[entity_type].add(
                allocate_shared_tensor((max_count, config.dimension),
                                       dtype=torch.float).storage())

    # Create the handlers, threads, etc. for distributed training.
    if config.num_machines > 1 or config.num_partition_servers > 0:
        if not 0 <= rank < config.num_machines:
            raise RuntimeError("Invalid rank for trainer")
        if not td.is_available():
            raise RuntimeError(
                "The installed PyTorch version doesn't provide "
                "distributed training capabilities.")
        ranks = ProcessRanks.from_num_invocations(
            config.num_machines, config.num_partition_servers)

        num_ps_groups = config.num_groups_for_partition_server
        groups: List[List[int]] = [ranks.trainers]  # barrier group
        groups += [ranks.trainers + ranks.partition_servers
                   ] * num_ps_groups  # ps groups
        group_idxs_for_partition_servers = range(1, len(groups))

        if rank == RANK_ZERO:
            logger.info("Setup lock server...")
            start_server(
                LockServer(
                    num_clients=len(ranks.trainers),
                    nparts_lhs=holder.nparts_lhs,
                    nparts_rhs=holder.nparts_rhs,
                    entities_lhs=holder.lhs_partitioned_types,
                    entities_rhs=holder.rhs_partitioned_types,
                    entity_counts=entity_counts,
                    init_tree=config.distributed_tree_init_order,
                ),
                process_name="LockServer",
                init_method=config.distributed_init_method,
                world_size=ranks.world_size,
                server_rank=ranks.lock_server,
                groups=groups,
                subprocess_init=subprocess_init,
            )

        self.bucket_scheduler = DistributedBucketScheduler(
            server_rank=ranks.lock_server,
            client_rank=ranks.trainers[rank],
        )

        logger.info("Setup param server...")
        start_server(
            ParameterServer(num_clients=len(ranks.trainers)),
            process_name=f"ParamS-{rank}",
            init_method=config.distributed_init_method,
            world_size=ranks.world_size,
            server_rank=ranks.parameter_servers[rank],
            groups=groups,
            subprocess_init=subprocess_init,
        )

        parameter_sharer = ParameterSharer(
            process_name=f"ParamC-{rank}",
            client_rank=ranks.parameter_clients[rank],
            all_server_ranks=ranks.parameter_servers,
            init_method=config.distributed_init_method,
            world_size=ranks.world_size,
            groups=groups,
            subprocess_init=subprocess_init,
        )

        if config.num_partition_servers == -1:
            start_server(
                ParameterServer(
                    num_clients=len(ranks.trainers),
                    group_idxs=group_idxs_for_partition_servers,
                    log_stats=True,
                ),
                process_name=f"PartS-{rank}",
                init_method=config.distributed_init_method,
                world_size=ranks.world_size,
                server_rank=ranks.partition_servers[rank],
                groups=groups,
                subprocess_init=subprocess_init,
            )

        groups = init_process_group(
            rank=ranks.trainers[rank],
            world_size=ranks.world_size,
            init_method=config.distributed_init_method,
            groups=groups,
        )
        trainer_group, *groups_for_partition_servers = groups
        self.barrier_group = trainer_group

        if len(ranks.partition_servers) > 0:
            partition_client = PartitionClient(
                ranks.partition_servers,
                groups=groups_for_partition_servers,
                log_stats=True,
            )
        else:
            partition_client = None
    else:
        self.barrier_group = None
        self.bucket_scheduler = SingleMachineBucketScheduler(
            holder.nparts_lhs, holder.nparts_rhs, config.bucket_order)
        parameter_sharer = None
        partition_client = None
        hide_distributed_logging()

    # fork early for HOGWILD threads
    logger.info("Creating workers...")
    self.num_workers = get_num_workers(config.workers)
    self.pool = create_pool(
        self.num_workers,
        subprocess_name=f"TWorker-{rank}",
        subprocess_init=subprocess_init,
    )

    checkpoint_manager = CheckpointManager(
        config.checkpoint_path,
        rank=rank,
        num_machines=config.num_machines,
        partition_client=partition_client,
        subprocess_name=f"BackgRW-{rank}",
        subprocess_init=subprocess_init,
    )
    self.checkpoint_manager = checkpoint_manager
    checkpoint_manager.register_metadata_provider(
        ConfigMetadataProvider(config))
    if rank == 0:
        checkpoint_manager.write_config(config)

    num_edge_chunks = get_num_edge_chunks(config)

    self.iteration_manager = IterationManager(
        config.num_epochs,
        config.edge_paths,
        num_edge_chunks,
        iteration_idx=checkpoint_manager.checkpoint_version)
    checkpoint_manager.register_metadata_provider(self.iteration_manager)

    logger.info("Initializing global model...")
    if model is None:
        model = make_model(config)
    model.share_memory()
    if trainer is None:
        trainer = Trainer(
            model_optimizer=make_optimizer(config, model.parameters(), False),
            loss_fn=config.loss_fn,
            margin=config.margin,
            relations=config.relations,
        )
    if evaluator is None:
        evaluator = TrainingRankingEvaluator(
            override_num_batch_negs=config.eval_num_batch_negs,
            override_num_uniform_negs=config.eval_num_uniform_negs,
        )
    if config.init_path is not None:
        self.loadpath_manager = CheckpointManager(config.init_path)
    else:
        self.loadpath_manager = None

    # Load model from checkpoint or loadpath, if available.
    state_dict, optim_state = checkpoint_manager.maybe_read_model()
    if state_dict is None and self.loadpath_manager is not None:
        state_dict, optim_state = self.loadpath_manager.maybe_read_model()
    if state_dict is not None:
        model.load_state_dict(state_dict, strict=False)
    if optim_state is not None:
        trainer.model_optimizer.load_state_dict(optim_state)

    logger.debug("Loading unpartitioned entities...")
    for entity in holder.lhs_unpartitioned_types | holder.rhs_unpartitioned_types:
        count = entity_counts[entity][0]
        s = embedding_storage_freelist[entity].pop()
        embs = torch.FloatTensor(s).view(-1, config.dimension)[:count]
        embs, optimizer = self._load_embeddings(entity, Partition(0), out=embs)
        holder.unpartitioned_embeddings[entity] = embs
        trainer.unpartitioned_optimizers[entity] = optimizer

    # Start communicating shared parameters with the parameter server.
    if parameter_sharer is not None:
        shared_parameters: Set[int] = set()
        for name, param in model.named_parameters():
            if id(param) in shared_parameters:
                continue
            shared_parameters.add(id(param))
            key = f"model.{name}"
            logger.info(
                f"Adding {key} ({param.numel()} params) to parameter server")
            parameter_sharer.set_param(key, param.data)
        for entity, embs in holder.unpartitioned_embeddings.items():
            key = f"entity.{entity}"
            logger.info(
                f"Adding {key} ({embs.numel()} params) to parameter server")
            parameter_sharer.set_param(key, embs.data)

    # Store everything in self.
    self.model = model
    self.trainer = trainer
    self.evaluator = evaluator
    self.rank = rank
    self.entity_counts = entity_counts
    self.embedding_storage_freelist = embedding_storage_freelist
    self.strict = False
def is_dist_initialized():
    return dist.is_available() and dist.is_initialized()
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--from_pretrained", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--output_dir", default="results", type=str, help="The output directory where the model checkpoints will be written.", ) parser.add_argument( "--config_file", default="config/bert_config.json", type=str, help="The config file which specified the model details.", ) parser.add_argument( "--no_cuda", action="store_true", help="Whether not to use CUDA when available" ) parser.add_argument( "--do_lower_case", default=True, type=bool, help="Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus", ) parser.add_argument( "--seed", type=int, default=42, help="random seed for initialization" ) parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--num_workers", type=int, default=16, help="Number of workers in the dataloader.", ) parser.add_argument( "--save_name", default="", type=str, help="save name for training." ) parser.add_argument( "--use_chunk", default=0, type=float, help="whether use chunck for parallel training.", ) parser.add_argument( "--tasks", default="", type=str, help="1-2-3... training task separate by -" ) parser.add_argument( "--in_memory", default=False, type=bool, help="whether use chunck for parallel training.", ) parser.add_argument( "--baseline", action="store_true", help="whether use single stream baseline." ) parser.add_argument( "--zero_shot", action="store_true", help="whether use single stream baseline." 
) parser.add_argument("--split", default="", type=str, help="which split to use.") parser.add_argument("--batch_size", default=1, type=int, help="which split to use.") parser.add_argument( "--clean_train_sets", default=True, type=bool, help="whether clean train sets for multitask data.", ) parser.add_argument( "--task_specific_tokens", action="store_true", help="whether to use task specific tokens for the multi-task learning.", ) args = parser.parse_args() with open("vilbert_tasks.yml", "r") as f: task_cfg = edict(yaml.safe_load(f)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if args.baseline: from pytorch_pretrained_bert.modeling import BertConfig else: from vilbert.vilbert import BertConfig task_names = [] for i, task_id in enumerate(args.tasks.split("-")): task = "TASK" + task_id name = task_cfg[task]["name"] task_names.append(name) # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0] if "/" in args.from_pretrained: timeStamp = args.from_pretrained.split("/")[1] else: timeStamp = args.from_pretrained savePath = os.path.join(args.output_dir, timeStamp) config = BertConfig.from_json_file(args.config_file) bert_weight_name = json.load( open("config/" + args.bert_model + "_weight_name.json", "r") ) if args.local_rank == -1 or args.no_cuda: device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu" ) n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16 ) ) default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu and not os.path.exists(savePath): os.makedirs(savePath) task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val = LoadDatasetEval( args, task_cfg, args.tasks.split("-") ) num_labels = max([dataset.num_labels for dataset in task_datasets_val.values()]) if args.task_specific_tokens: config.task_specific_tokens = True config.fast_mode = True if args.zero_shot: model = BertForMultiModalPreTraining.from_pretrained( args.from_pretrained, config ) else: model = VILBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) task_losses = LoadLosses(args, task_cfg, args.tasks.split("-")) model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model, deay_allreduce=True) elif n_gpu > 1: model = nn.DataParallel(model) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] print("***** Running training *****") print(" Num Iters: ", task_num_iters) print(" Batch size: ", task_batch_size) model.eval() # when run evaluate, we run each task sequentially. 
for task_id in task_ids: results = [] others = [] score_matrix = np.zeros((5000, 1000)) target_matrix = np.zeros((5000, 1000)) rank_matrix = np.ones((5000)) * 1000 count = 0 for i, batch in enumerate(task_dataloader_val[task_id]): batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch) features, spatials, image_mask, question, input_mask, segment_ids, target, caption_idx, image_idx = ( batch ) task_tokens = ( question.new().resize_(question.size(0), 1).fill_(int(task_id[4:])) ) if task_id in ["TASK7", "TASK8"]: batch_size = features.size(0) features = features.squeeze(0) spatials = spatials.squeeze(0) image_mask = image_mask.squeeze(0) with torch.no_grad(): if args.zero_shot: _, _, vil_logit, _ = model( question, features, spatials, segment_ids, input_mask, image_mask, task_ids=task_tokens, ) score_matrix[ caption_idx, image_idx * 500 : (image_idx + 1) * 500 ] = (torch.softmax(vil_logit, dim=1)[:, 0].view(-1).cpu().numpy()) target_matrix[ caption_idx, image_idx * 500 : (image_idx + 1) * 500 ] = (target.view(-1).float().cpu().numpy()) else: _, _, vil_logit, _, _, _, _, _, _ = model( question, features, spatials, segment_ids, input_mask, image_mask, task_ids=task_tokens, ) score_matrix[ caption_idx, image_idx * 500 : (image_idx + 1) * 500 ] = (vil_logit.view(-1).cpu().numpy()) target_matrix[ caption_idx, image_idx * 500 : (image_idx + 1) * 500 ] = (target.view(-1).float().cpu().numpy()) if image_idx.item() == 1: rank = np.where( ( np.argsort(-score_matrix[caption_idx]) == np.where(target_matrix[caption_idx] == 1)[0][0] ) == 1 )[0][0] rank_matrix[caption_idx] = rank rank_matrix_tmp = rank_matrix[: caption_idx + 1] r1 = 100.0 * np.sum(rank_matrix_tmp < 1) / len(rank_matrix_tmp) r5 = 100.0 * np.sum(rank_matrix_tmp < 5) / len(rank_matrix_tmp) r10 = 100.0 * np.sum(rank_matrix_tmp < 10) / len(rank_matrix_tmp) medr = np.floor(np.median(rank_matrix_tmp) + 1) meanr = np.mean(rank_matrix_tmp) + 1 print( "%d Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f" % (count, r1, r5, r10, medr, meanr) ) results.append(np.argsort(-score_matrix[caption_idx]).tolist()[:20]) count += 1 r1 = 100.0 * np.sum(rank_matrix < 1) / len(rank_matrix) r5 = 100.0 * np.sum(rank_matrix < 5) / len(rank_matrix) r10 = 100.0 * np.sum(rank_matrix < 10) / len(rank_matrix) medr = np.floor(np.median(rank_matrix) + 1) meanr = np.mean(rank_matrix) + 1 print("************************************************") print( "Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f" % (r1, r5, r10, medr, meanr) ) print("************************************************") if args.split: json_path = os.path.join(savePath, args.split) else: json_path = os.path.join(savePath, task_cfg[task_id]["val_split"]) json.dump(results, open(json_path + "_result.json", "w")) json.dump(others, open(json_path + "_others.json", "w"))
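# A standalone sketch (not part of the original script) of the recall@K
# computation performed above: given an array of 0-indexed ranks of the
# ground-truth image for each caption, recall@K is the percentage of captions
# whose ground truth ranks in the top K.
import numpy as np

def recall_at_k(ranks: np.ndarray, k: int) -> float:
    return 100.0 * float(np.sum(ranks < k)) / len(ranks)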
def print_all(msg):
    if not dist.is_available():
        print(msg)
    elif dist.get_rank() % 8 == 0:
        print(f'{dist.get_rank()//8}: {msg}')
def _is_torch_distributed_initialized() -> bool:
    """Checks if torch.distributed is available and initialized."""
    return dist.is_available() and dist.is_initialized()
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--from_pretrained", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--output_dir", default="results", type=str, help="The output directory where the model checkpoints will be written.", ) parser.add_argument( "--config_file", default="config/bert_config.json", type=str, help="The config file which specified the model details.", ) parser.add_argument( "--no_cuda", action="store_true", help="Whether not to use CUDA when available" ) parser.add_argument( "--do_lower_case", default=True, type=bool, help="Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus" ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--num_workers", type=int, default=10, help="Number of workers in the dataloader." ) parser.add_argument( "--save_name", default='', type=str, help="save name for training.", ) parser.add_argument( "--batch_size", default=1000, type=int, help="what is the batch size?" ) parser.add_argument( "--tasks", default='', type=str, help="1-2-3... training task separate by -" ) parser.add_argument( "--in_memory", default=False, type=bool, help="whether use chunck for parallel training." ) parser.add_argument( "--baseline", action="store_true", help="whether use single stream baseline." ) parser.add_argument( "--split", default="", type=str, help="which split to use." ) ''' Thil : for test split, change yml file's eval split and ann file. 
if cache file is outdated, erase and rerun ''' args = parser.parse_args() with open('vlbert_tasks.yml', 'r') as f: task_cfg = edict(yaml.safe_load(f)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if args.baseline: from pytorch_pretrained_bert.modeling import BertConfig from vilbert.basebert import BaseBertForVLTasks else: from vilbert.vilbert import BertConfig from vilbert.vilbert import VILBertForVLTasks task_names = [] for i, task_id in enumerate(args.tasks.split('-')): task = 'TASK' + task_id name = task_cfg[task]['name'] task_names.append(name) # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0] timeStamp = args.from_pretrained.split('/')[1] + '-' + args.save_name savePath = os.path.join(args.output_dir, timeStamp) config = BertConfig.from_json_file(args.config_file) bert_weight_name = json.load(open("config/" + args.bert_model + "_weight_name.json", "r")) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16 ) ) default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu and not os.path.exists(savePath): os.makedirs(savePath) task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val \ = LoadDatasetEval(args, task_cfg, args.tasks.split('-')) tbLogger = utils.tbLogger(timeStamp, savePath, task_names, task_ids, task_num_iters, 1, save_logger=False, txt_name='eval.txt') num_labels = max([dataset.num_labels for dataset in task_datasets_val.values()]) if args.baseline: model = BaseBertForVLTasks.from_pretrained( args.from_pretrained, config, num_labels=num_labels, default_gpu=default_gpu ) else: model = VILBertForVLTasks.from_pretrained( args.from_pretrained, config, num_labels=num_labels, default_gpu=default_gpu ) task_losses = LoadLosses(args, task_cfg, args.tasks.split('-')) model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model, delay_allreduce=True) elif n_gpu > 1: model = nn.DataParallel(model) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] print(" Num Iters: ", task_num_iters) print(" Batch size: ", task_batch_size) model.eval() for task_id in task_ids: results = [] others = [] predictions = [] for i, batch in enumerate(task_dataloader_val[task_id]): loss, score, batch_size, results, others, predictions = EvaluatingModel(args, task_cfg, device, \ task_id, batch, model, task_dataloader_val, task_losses, results, others, predictions) tbLogger.step_val(0, float(loss), float(score), task_id, batch_size, 'val') sys.stdout.write('%d/%d\r' % (i, len(task_dataloader_val[task_id]))) sys.stdout.flush() # save the result or evaluate the result. 
ave_score = tbLogger.showLossVal() if args.split: json_path = os.path.join(savePath, args.split) else: json_path = os.path.join(savePath, task_cfg[task_id]['val_split']) json.dump(results, open(json_path+ '_result.json', 'w')) json.dump(others, open(json_path+ '_others.json', 'w')) json.dump(predictions, open(json_path+ '_predictions.json', 'w'))
def init_env(is_cuda: Union[bool, int] = True,
             is_benchmark: bool = False,
             is_train: bool = True,
             config_path: Optional[str] = None,
             experiments_root: str = "experiment",
             rand_seed: Union[bool, str, int] = False,
             cv2_num_threads: int = -1,
             verbosity: bool = True,
             log_stdout: Union[bool, str] = False,
             local_rank: Optional[int] = None,
             silence_non_master_rank: Optional[bool] = False) \
        -> Tuple[torch.device, Optional[Dict]]:
    """Init torch training environment

    Args:
        is_cuda (Union[bool, int]): If False, always use CPU. If True, use
            GPU and set GPU:0 as default device. If int, set GPU:i as default
            device.
        is_benchmark (bool): If True, set torch.backends.cudnn.benchmark = True
        is_train (bool): If False, disable grad
        config_path (Optional[str]): The path of the yaml config
        experiments_root (str): The path where experiment results are stored
        rand_seed (Union[bool, str, int]): If True, fix the random state of
            torch, numpy and python's random module from config.RAND_SEED.
            If False, don't fix the random state. If rand_seed is int or str,
            fix the random state according to rand_seed. (Default: False)
        cv2_num_threads (int): Set cv2's thread count via
            cv2.setNumThreads(cv2_num_threads). If < 0, don't set.
            (Default: -1)
        verbosity (bool): If True, print detailed info
        log_stdout (Union[bool, str]): If True, stdout will be logged to the
            corresponding experiment dir. If False, stdout will not be logged.
            If log_stdout is a str, it is treated as a path and stdout will
            be logged to that path. (Default: False)
        local_rank (Optional[int]): If not None, init the distributed
            parallel env with rank = local_rank and init_method = "env://".
            The default device will also be set to "cuda:local_rank". Make
            sure the environment is pre-set.
        silence_non_master_rank (bool): If True, non-master ranks' (rank > 0)
            print will be silenced. (Default: False)

    Returns:
        Default device and config (if config_path is None, config is None)
    """
    # * Set distributed; verbosity is delayed.
    if local_rank is not None:
        # ** Check distributed env
        if not dist.is_available():
            raise ValueError(
                f"local_rank = {local_rank} while torch.distributed is not available"
            )
        if not torch.cuda.is_available():
            raise ValueError(
                f"init_env only supports cuda device distributed with nccl backend. "
                f"local_rank = {local_rank} while torch.cuda is not available")
        # ** Set cuda device id
        if is_cuda is False:
            raise ValueError(
                f"When local_rank is set, cuda is needed. However, is_cuda = {is_cuda}"
            )
        if isinstance(is_cuda, int) and (is_cuda != local_rank):
            raise ValueError(
                f"local_rank = {local_rank} must equal is_cuda = {is_cuda}")
        is_cuda = local_rank
        # ** Set device & init_process_group
        torch.cuda.set_device(local_rank)
        dist.init_process_group(backend='nccl',
                                init_method="env://",
                                rank=local_rank)

    # * Read CONFIG; verbosity is delayed.
    config = parse_config(
        config_path, experiments_root) if config_path is not None else None

    def get_stdout_log_dir_from_config():
        stdout_log_dir = osp.join(config.rslt_dir, 'stdout')
        return stdout_log_dir

    def get_stdout_log_file(stdout_log_dir):
        if local_rank is not None:
            prefix = '.' if dist.get_rank() > 0 else ''
            file_name = prefix + '-'.join(
                ['stdout', f"rank{dist.get_rank()}", get_local_time_str()]) + '.log'
        else:
            file_name = '-'.join(
                ['stdout', get_local_time_str(for_file_name=True)]) + '.log'
        return osp.join(stdout_log_dir, file_name)

    # * Log stdout
    # ** Get log file
    if isinstance(log_stdout, bool):
        stdout_log_file = get_stdout_log_file(
            get_stdout_log_dir_from_config()) if log_stdout else None
    elif isinstance(log_stdout, str):
        stdout_log_file = get_stdout_log_file(log_stdout)
    else:
        raise ValueError(
            f"log_stdout: {log_stdout} should be bool or path str")
    # ** Set logger
    if stdout_log_file is not None:
        if local_rank is not None:
            silence = silence_non_master_rank and (dist.get_rank() > 0)
            Logger(stdout_log_file, real_time=True, silence=silence)
            if verbosity:
                print(f"Log stdout at {stdout_log_file}. Silence = {silence}")
        else:
            Logger(stdout_log_file, real_time=True)
            if verbosity:
                print(f"Log stdout at {stdout_log_file}")

    # * Welcome & show system info
    if verbosity:
        welcome()
        print(f"Current working dir is {os.getcwd()}")
        print(f"Current python environment path is\n{sys.path}")
        print(f"Current Process Info: ")
        pprint(get_process_info())
        print("\n")

    # * Print distributed's delayed verbosity
    if verbosity and (local_rank is not None):
        print(
            f"Using torch.distributed. Current cuda device id is set to local_rank = {local_rank}. \n"
            f"    Process Group Rank: {dist.get_rank()}\n"
            f"    World Size: {dist.get_world_size()}\n"
            f"    Local Rank: {local_rank}")

    # * Print config's delayed verbosity
    if verbosity and config is not None:
        print(
            "\033[32m------------------------------- CONFIG -------------------------------\033[0m"
        )
        pprint(dict(config))
        print(
            "\033[32m----------------------------- CONFIG END -----------------------------\033[0m"
        )
        print("\n")

    if verbosity:
        print(
            "\033[32m-------------------------------- INIT --------------------------------\033[0m"
        )

    # * Get device
    if isinstance(is_cuda, bool):
        device = get_device(is_cuda, cuda_id=0, verbosity=verbosity)
    elif isinstance(is_cuda, int):
        device = get_device(is_cuda=True, cuda_id=is_cuda, verbosity=verbosity)
    else:
        raise ValueError(f"Parameter is_cuda = {is_cuda} must be bool or int")

    # * Set benchmark
    torch.backends.cudnn.benchmark = is_benchmark
    if verbosity:
        print(f"torch.backends.cudnn.benchmark = {is_benchmark}")

    # * Set cv2 threads num
    if cv2_num_threads >= 0:
        cv2.setNumThreads(cv2_num_threads)
        if verbosity:
            print(f"cv2.setNumThreads({cv2_num_threads})")

    # * If test, disable grad
    torch.set_grad_enabled(is_train)
    if verbosity:
        print(f"torch.set_grad_enabled({is_train})")

    def get_rand_seed_from_config():
        if 'rand_seed' not in config:
            raise ValueError(f"CONFIG didn't have key rand_seed")
        return config.rand_seed

    # * Set rand seed
    if isinstance(rand_seed, bool):
        rand_seed_ = get_rand_seed_from_config() if rand_seed else None
    elif isinstance(rand_seed, int) or isinstance(rand_seed, str):
        rand_seed_ = rand_seed
    else:
        raise ValueError(f"rand_seed should be bool, int or str.")
    if rand_seed_ is not None:
        set_rand_seed(rand_seed_)
        if verbosity:
            print(f"Set rand seed {rand_seed_}")

    # * End of init
    if verbosity:
        print(
            "\033[32m------------------------------ INIT END ------------------------------\033[0m"
        )
        print("\n")

    # * Return
    return device, config
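# A hypothetical usage sketch of init_env (the config path is made up, and
# this assumes a launcher such as torch.distributed.launch has pre-set the
# "env://" variables and passes --local_rank):
if __name__ == "__main__":
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument("--local_rank", type=int, default=None)
    cli = p.parse_args()
    device, config = init_env(is_cuda=True,
                              config_path="configs/example.yml",
                              local_rank=cli.local_rank)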
def __init__(self,
             model_creator,
             data_creator,
             optimizer_creator,
             loss_creator,
             train_function=None,
             validation_function=None,
             initialization_hook=None,
             config=None,
             num_replicas=1,
             use_gpu=False,
             batch_size=16,
             backend="auto"):
    """Sets up the PyTorch trainer.

    Args:
        model_creator (dict -> torch.nn.Module): creates the model using the
            config.
        data_creator (int, dict -> DataLoader, DataLoader): Function that
            takes in (batch_size, config) and returns two Torch DataLoader
            objects.
        optimizer_creator (torch.nn.Module, dict -> optimizer): creates the
            loss and optimizer using the model and the config.
        loss_creator (dict -> loss): Creates the loss function/criterion
            using the config.
        train_function: Trains a model for an epoch. This takes in (
            model, train_dataloader, criterion, optimizer, config), and
            returns a dict of training stats.
        validation_function: Runs validation. This takes in (
            model, val_dataloader, criterion, config) and returns a dict of
            validation stats.
        config (dict): configuration passed to "model_creator",
            "data_creator", "optimizer_creator", and "loss_creator".
        num_replicas (int): the number of workers used in distributed
            training.
        use_gpu (bool): Sets resource allocation for workers to 1 GPU if
            true.
        batch_size (int): batch size for an update.
        backend (string): backend used by distributed PyTorch.
    """
    # TODO: add support for mixed precision
    # TODO: add support for callbacks
    if num_replicas > 1 and not dist.is_available():
        raise ValueError(
            ("Distributed PyTorch is not supported on macOS. "
             "To run without distributed PyTorch, set 'num_replicas=1'. "
             "For more information, see "
             "https://github.com/pytorch/examples/issues/467."))

    self.model_creator = model_creator
    self.data_creator = data_creator
    self.train_function = train_function
    self.optimizer_creator = optimizer_creator
    self.loss_creator = loss_creator
    self.validation_function = validation_function
    self.initialization_hook = initialization_hook
    self.config = {} if config is None else config
    self.optimizer_timer = utils.TimerStat(window_size=1)

    if backend == "auto":
        backend = "nccl" if use_gpu else "gloo"

    logger.info("Using {} as backend.".format(backend))
    self.backend = backend
    self.use_gpu = use_gpu
    self.batch_size = batch_size
    self.max_replicas = num_replicas
    self.temp_dir = tempfile.mkdtemp(prefix="raysgd")
    self._num_failures = 0
    self._last_resize = float("-inf")
    self._start_workers(self.max_replicas)
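# A minimal sketch of the creator functions the docstring above describes.
# The linear model and synthetic dataset here are stand-ins, not part of the
# original code; only the signatures match the documented contracts.
import torch
from torch.utils.data import DataLoader, TensorDataset

def model_creator(config):
    return torch.nn.Linear(1, 1)

def data_creator(batch_size, config):
    x = torch.randn(256, 1)
    ds = TensorDataset(x, 3 * x + 0.5)
    # Returns (train_loader, validation_loader), as documented above.
    return DataLoader(ds, batch_size=batch_size), DataLoader(ds, batch_size=batch_size)

def optimizer_creator(model, config):
    return torch.optim.SGD(model.parameters(), lr=config.get("lr", 0.01))

def loss_creator(config):
    return torch.nn.MSELoss()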
def barrier():
    if dist.is_available() and dist.is_initialized():
        dist.barrier()
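# Sketch of a typical use (assumed, not from the original source): rank 0
# writes a checkpoint while the other ranks wait at the barrier, so no rank
# reads a half-written file. Assumes a get_rank() helper like the one above.
def save_checkpoint(model, path="checkpoint.pt"):
    if get_rank() == 0:
        torch.save(model.state_dict(), path)
    barrier()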
def __init__(
        self,
        *,
        training_operator_cls,
        initialization_hook=None,
        config=None,
        num_workers=1,
        num_cpus_per_worker=1,
        use_gpu="auto",
        backend="auto",
        wrap_ddp=True,
        timeout_s=1800,
        use_fp16=False,
        use_tqdm=False,
        add_dist_sampler=True,
        scheduler_step_freq=None,
        use_local=False,
        # Deprecated args.
        num_replicas=None,
        batch_size=None,
        model_creator=None,
        data_creator=None,
        optimizer_creator=None,
        scheduler_creator=None,
        loss_creator=None,
        serialize_data_creation=None,
        data_loader_args=None,
        apex_args=None,
):
    if (model_creator or data_creator or optimizer_creator
            or scheduler_creator or loss_creator):
        raise DeprecationWarning(
            "Creator functions are deprecated. You should create a "
            "custom TrainingOperator, override setup, and register all "
            "training state there. See TrainingOperator for more info. "
            "If you would still like to use creator functions, you can "
            "do CustomOperator = TrainingOperator.from_creators("
            "model_creator, ...) and pass in CustomOperator into "
            "TorchTrainer.")

    if use_local and log_once("use_local"):
        logger.warning("use_local is set to True. This could lead to "
                       "issues with Cuda devices. If you are seeing this "
                       "issue, try setting use_local to False. For more "
                       "information, see "
                       "https://github.com/ray-project/ray/issues/9202.")

    if num_workers > 1 and not dist.is_available():
        raise ValueError(
            ("Distributed PyTorch is not supported on macOS. "
             "To run without distributed PyTorch, set 'num_workers=1'. "
             "For more information, see "
             "https://github.com/pytorch/examples/issues/467."))

    if num_replicas is not None:
        raise DeprecationWarning(
            "num_replicas is deprecated. Use num_workers instead.")

    if batch_size is not None:
        raise DeprecationWarning(
            "batch_size is deprecated. Use config={'batch_size': N} to "
            "specify a batch size for each worker or "
            "config={ray.util.sgd.utils.BATCH_SIZE: N} to specify a "
            "batch size to be used across all workers.")

    if apex_args is not None:
        raise DeprecationWarning(
            "apex_args is deprecated. Pass in apex_args when calling "
            "`register` in the `setup` method of your `TrainingOperator` "
            "instead.")

    if serialize_data_creation is True:
        if log_once("serialize_data_creation"):
            logging.warning(
                "serialize_data_creation is deprecated and will be "
                "ignored. If you require serialized data loading you "
                "should implement this in TrainingOperator.setup. "
                "You may find FileLock useful here.")

    if data_loader_args:
        raise DeprecationWarning(
            "data_loader_args is deprecated. You can return a "
            "torch.utils.data.DataLoader in data_creator. Ray will "
            "automatically set a DistributedSampler if a DataLoader is "
            "returned and num_workers > 1.")

    self.training_operator_cls = training_operator_cls
    self.initialization_hook = initialization_hook
    self.config = {} if config is None else config

    if use_gpu == "auto":
        use_gpu = torch.cuda.is_available()

    _remind_gpu_usage(use_gpu)

    if backend == "auto":
        backend = "nccl" if use_gpu else "gloo"

    if backend == "nccl":
        timeout_s = NCCL_TIMEOUT_S

    logger.debug(f"Using {backend} as backend.")
    self.backend = backend
    self.num_cpus_per_worker = num_cpus_per_worker
    self.use_gpu = use_gpu
    self.max_replicas = num_workers

    self.serialize_data_creation = serialize_data_creation
    self.wrap_ddp = wrap_ddp
    self.timeout_s = timeout_s
    self.use_fp16 = use_fp16
    self.use_tqdm = use_tqdm
    self.add_dist_sampler = add_dist_sampler
    self.use_local = use_local

    self.temp_dir = tempfile.mkdtemp(prefix="raysgd")
    self._num_failures = 0
    self._last_resize = float("-inf")

    if scheduler_step_freq:
        _validate_scheduler_step_freq(scheduler_step_freq)

    self.scheduler_step_freq = scheduler_step_freq

    if not ray.is_initialized() and self.max_replicas > 1:
        logger.info("Automatically initializing single-node Ray. To use "
                    "multi-node training, be sure to run `ray.init("
                    "address='auto')` before instantiating the Trainer.")
        ray.init()

    startup_success = self._start_workers(self.max_replicas)
    if not startup_success:
        raise RuntimeError("Worker startup failed. "
                           "Are you sure you have enough resources to "
                           "start the specified number of workers?")
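# Sketch of the migration path named in the deprecation message above,
# left as comments since the exact argument order of from_creators is an
# assumption here, not confirmed by this file:
# CustomOperator = TrainingOperator.from_creators(
#     model_creator, optimizer_creator, data_creator, loss_creator=loss_creator)
# trainer = TorchTrainer(training_operator_cls=CustomOperator, num_workers=2)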
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--from_pretrained", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--output_dir", default="results", type=str, help= "The output directory where the model checkpoints will be written.", ) parser.add_argument( "--config_file", default="config/bert_config.json", type=str, help="The config file which specified the model details.", ) parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument( "--do_lower_case", default=True, type=bool, help= "Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus", ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--num_workers", type=int, default=16, help="Number of workers in the dataloader.", ) parser.add_argument("--save_name", default="", type=str, help="save name for training.") parser.add_argument( "--use_chunk", default=0, type=float, help="whether use chunck for parallel training.", ) parser.add_argument("--batch_size", default=30, type=int, help="what is the batch size?") parser.add_argument("--tasks", default="", type=str, help="1-2-3... training task separate by -") parser.add_argument( "--in_memory", default=False, type=bool, help="whether use chunck for parallel training.", ) parser.add_argument("--baseline", action="store_true", help="whether use single stream baseline.") parser.add_argument("--split", default="", type=str, help="which split to use.") parser.add_argument( "--dynamic_attention", action="store_true", help="whether use dynamic attention.", ) parser.add_argument( "--clean_train_sets", default=True, type=bool, help="whether clean train sets for multitask data.", ) parser.add_argument( "--visual_target", default=0, type=int, help="which target to use for visual branch. 
\ 0: soft label, \ 1: regress the feature, \ 2: NCE loss.", ) parser.add_argument( "--task_specific_tokens", action="store_true", help="whether to use task specific tokens for the multi-task learning.", ) args = parser.parse_args() with open("vilbert_tasks.yml", "r") as f: task_cfg = edict(yaml.safe_load(f)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if args.baseline: from pytorch_transformers.modeling_bert import BertConfig from vilbert.basebert import BaseBertForVLTasks else: from vilbert.vilbert import BertConfig from vilbert.vilbert import VILBertForVLTasks task_names = [] for i, task_id in enumerate(args.tasks.split("-")): task = "TASK" + task_id name = task_cfg[task]["name"] task_names.append(name) # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0] timeStamp = args.from_pretrained.split("/")[-1] + "-" + args.save_name savePath = os.path.join(args.output_dir, timeStamp) config = BertConfig.from_json_file(args.config_file) if args.task_specific_tokens: config.task_specific_tokens = True if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu and not os.path.exists(savePath): os.makedirs(savePath) task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val = LoadDatasetEval( args, task_cfg, args.tasks.split("-")) tbLogger = utils.tbLogger( timeStamp, savePath, task_names, task_ids, task_num_iters, 1, save_logger=False, txt_name="eval.txt", ) num_labels = max( [dataset.num_labels for dataset in task_datasets_val.values()]) if args.dynamic_attention: config.dynamic_attention = True if "roberta" in args.bert_model: config.model = "roberta" if args.visual_target == 0: config.v_target_size = 1601 config.visual_target = args.visual_target else: config.v_target_size = 2048 config.visual_target = args.visual_target if args.task_specific_tokens: config.task_specific_tokens = True if args.baseline: model = BaseBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) else: model = VILBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) task_losses = LoadLosses(args, task_cfg, args.tasks.split("-")) model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model, delay_allreduce=True) elif n_gpu > 1: model = nn.DataParallel(model) print("***** Running evaluation *****") print(" Num Iters: ", task_num_iters) print(" Batch size: ", task_batch_size) model.eval() # when run evaluate, we run each task sequentially. 
for task_id in task_ids: results = [] others = [] for i, batch in enumerate(task_dataloader_val[task_id]): loss, score, batch_size, results, others = EvaluatingModel( args, task_cfg, device, task_id, batch, model, task_dataloader_val, task_losses, results, others, ) tbLogger.step_val(0, float(loss), float(score), task_id, batch_size, "val") sys.stdout.write("%d/%d\r" % (i, len(task_dataloader_val[task_id]))) sys.stdout.flush() # save the result or evaluate the result. ave_score = tbLogger.showLossVal(task_id) if args.split: json_path = os.path.join(savePath, args.split) else: json_path = os.path.join(savePath, task_cfg[task_id]["val_split"]) json.dump(results, open(json_path + "_result.json", "w")) json.dump(others, open(json_path + "_others.json", "w"))
def main():
    parser = argparse.ArgumentParser(description='PyTorch BLERSSI Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=1, metavar='N',
                        help='number of epochs to train (default: 1)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--beta1', type=float, default=0.1,
                        help='Beta1 value')
    parser.add_argument('--beta2', type=float, default=0.5,
                        help='Beta2 value')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval', type=int, default=10, metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=True,
                        help='For Saving the current Model')
    parser.add_argument('--dir', default='logs', metavar='L',
                        help='directory where summary logs are stored')
    if dist.is_available():
        parser.add_argument(
            '--backend', type=str, help='Distributed backend',
            choices=[dist.Backend.GLOO, dist.Backend.NCCL, dist.Backend.MPI],
            default=dist.Backend.GLOO)
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    if use_cuda:
        print('Using CUDA')

    writer = SummaryWriter(args.dir)

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    if should_distribute():
        print('Using distributed PyTorch with {} backend'.format(args.backend))
        dist.init_process_group(backend=args.backend)

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    model = Net().to(device)

    if is_distributed():
        Distributor = nn.parallel.DistributedDataParallel if use_cuda \
            else nn.parallel.DistributedDataParallelCPU
        model = Distributor(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 betas=(args.beta1, args.beta2))

    for epoch in range(1, args.epochs + 1):
        train(args, model, optimizer, epoch, writer)

    if (args.save_model):
        torch.save(model.state_dict(), "/var/blerssi_cnn.pt")
        print("Model Saved")
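# `should_distribute()` and `is_distributed()` are called above but not shown
# here. In similar Kubeflow-style examples they are defined roughly like this
# — a sketch under that assumption, keying off the launcher-provided
# WORLD_SIZE env var:
import os

def should_distribute():
    return dist.is_available() and int(os.environ.get("WORLD_SIZE", 1)) > 1

def is_distributed():
    return dist.is_available() and dist.is_initialized()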
def train_and_report_stats(
    config: ConfigSchema,
    model: Optional[MultiRelationEmbedder] = None,
    trainer: Optional[AbstractBatchProcessor] = None,
    evaluator: Optional[AbstractBatchProcessor] = None,
    rank: Rank = RANK_ZERO,
    subprocess_init: Optional[Callable[[], None]] = None,
) -> Generator[Tuple[int, Optional[Stats], Stats, Optional[Stats]], None, None]:
    """Each epoch/pass, for each partition pair, loads in embeddings and
    edgelist from disk, runs HOGWILD training on them, and writes partitions
    back to disk.
    """
    tag_logs_with_process_name(f"Trainer-{rank}")

    if config.verbose > 0:
        import pprint
        pprint.PrettyPrinter().pprint(config.to_dict())

    logger.info("Loading entity counts...")
    entity_storage = ENTITY_STORAGES.make_instance(config.entity_path)
    entity_counts: Dict[str, List[int]] = {}
    for entity, econf in config.entities.items():
        entity_counts[entity] = []
        for part in range(econf.num_partitions):
            entity_counts[entity].append(
                entity_storage.load_count(entity, part))

    # Figure out how many lhs and rhs partitions we need
    nparts_lhs, lhs_partitioned_types = get_partitioned_types(config, Side.LHS)
    nparts_rhs, rhs_partitioned_types = get_partitioned_types(config, Side.RHS)
    logger.debug(f"nparts {nparts_lhs} {nparts_rhs} "
                 f"types {lhs_partitioned_types} {rhs_partitioned_types}")
    total_buckets = nparts_lhs * nparts_rhs

    sync: AbstractSynchronizer
    bucket_scheduler: AbstractBucketScheduler
    parameter_sharer: Optional[ParameterSharer]
    partition_client: Optional[PartitionClient]
    if config.num_machines > 1:
        if not 0 <= rank < config.num_machines:
            raise RuntimeError("Invalid rank for trainer")
        if not td.is_available():
            raise RuntimeError("The installed PyTorch version doesn't provide "
                               "distributed training capabilities.")
        ranks = ProcessRanks.from_num_invocations(config.num_machines,
                                                  config.num_partition_servers)

        if rank == RANK_ZERO:
            logger.info("Setup lock server...")
            start_server(
                LockServer(
                    num_clients=len(ranks.trainers),
                    nparts_lhs=nparts_lhs,
                    nparts_rhs=nparts_rhs,
                    lock_lhs=len(lhs_partitioned_types) > 0,
                    lock_rhs=len(rhs_partitioned_types) > 0,
                    init_tree=config.distributed_tree_init_order,
                ),
                process_name="LockServer",
                init_method=config.distributed_init_method,
                world_size=ranks.world_size,
                server_rank=ranks.lock_server,
                groups=[ranks.trainers],
                subprocess_init=subprocess_init,
            )

        bucket_scheduler = DistributedBucketScheduler(
            server_rank=ranks.lock_server,
            client_rank=ranks.trainers[rank],
        )

        logger.info("Setup param server...")
        start_server(
            ParameterServer(num_clients=len(ranks.trainers)),
            process_name=f"ParamS-{rank}",
            init_method=config.distributed_init_method,
            world_size=ranks.world_size,
            server_rank=ranks.parameter_servers[rank],
            groups=[ranks.trainers],
            subprocess_init=subprocess_init,
        )

        parameter_sharer = ParameterSharer(
            process_name=f"ParamC-{rank}",
            client_rank=ranks.parameter_clients[rank],
            all_server_ranks=ranks.parameter_servers,
            init_method=config.distributed_init_method,
            world_size=ranks.world_size,
            groups=[ranks.trainers],
            subprocess_init=subprocess_init,
        )

        if config.num_partition_servers == -1:
            start_server(
                ParameterServer(num_clients=len(ranks.trainers),
                                log_stats=True),
                process_name=f"PartS-{rank}",
                init_method=config.distributed_init_method,
                world_size=ranks.world_size,
                server_rank=ranks.partition_servers[rank],
                groups=[ranks.trainers],
                subprocess_init=subprocess_init,
            )

        if len(ranks.partition_servers) > 0:
            partition_client = PartitionClient(ranks.partition_servers,
                                               log_stats=True)
        else:
            partition_client = None

        groups = init_process_group(
            rank=ranks.trainers[rank],
            world_size=ranks.world_size,
            init_method=config.distributed_init_method,
            groups=[ranks.trainers],
        )
        trainer_group, = groups
        sync = DistributedSynchronizer(trainer_group)
    else:
        sync = DummySynchronizer()
        bucket_scheduler = SingleMachineBucketScheduler(
            nparts_lhs, nparts_rhs, config.bucket_order)
        parameter_sharer = None
        partition_client = None
        hide_distributed_logging()

    # fork early for HOGWILD threads
    logger.info("Creating workers...")
    num_workers = get_num_workers(config.workers)
    pool = create_pool(
        num_workers,
        subprocess_name=f"TWorker-{rank}",
        subprocess_init=subprocess_init,
    )

    def make_optimizer(params: Iterable[torch.nn.Parameter],
                       is_emb: bool) -> Optimizer:
        params = list(params)
        if len(params) == 0:
            optimizer = DummyOptimizer()
        elif is_emb:
            optimizer = RowAdagrad(params, lr=config.lr)
        else:
            if config.relation_lr is not None:
                lr = config.relation_lr
            else:
                lr = config.lr
            optimizer = Adagrad(params, lr=lr)
        optimizer.share_memory()
        return optimizer

    # background_io is only supported in single-machine mode
    background_io = config.background_io and config.num_machines == 1

    checkpoint_manager = CheckpointManager(
        config.checkpoint_path,
        background=background_io,
        rank=rank,
        num_machines=config.num_machines,
        partition_client=partition_client,
        subprocess_name=f"BackgRW-{rank}",
        subprocess_init=subprocess_init,
    )
    checkpoint_manager.register_metadata_provider(
        ConfigMetadataProvider(config))
    checkpoint_manager.write_config(config)

    iteration_manager = IterationManager(
        config.num_epochs,
        config.edge_paths,
        config.num_edge_chunks,
        iteration_idx=checkpoint_manager.checkpoint_version)
    checkpoint_manager.register_metadata_provider(iteration_manager)

    if config.init_path is not None:
        loadpath_manager = CheckpointManager(config.init_path)
    else:
        loadpath_manager = None

    def load_embeddings(
        entity: EntityName,
        part: Partition,
        strict: bool = False,
        force_dirty: bool = False,
    ) -> Tuple[torch.nn.Parameter, Optional[OptimizerStateDict]]:
        if strict:
            embs, optim_state = checkpoint_manager.read(
                entity, part, force_dirty=force_dirty)
        else:
            # Strict is only false during the first iteration, because in that
            # case the checkpoint may not contain any data (unless a previous
            # run was resumed) so we fall back on initial values.
            embs, optim_state = checkpoint_manager.maybe_read(
                entity, part, force_dirty=force_dirty)
            if embs is None and loadpath_manager is not None:
                embs, optim_state = loadpath_manager.maybe_read(entity, part)
            if embs is None:
                embs, optim_state = init_embs(entity,
                                              entity_counts[entity][part],
                                              config.dimension,
                                              config.init_scale)
        assert embs.is_shared()
        return torch.nn.Parameter(embs), optim_state

    logger.info("Initializing global model...")
    if model is None:
        model = make_model(config)
    model.share_memory()
    if trainer is None:
        trainer = Trainer(
            global_optimizer=make_optimizer(model.parameters(), False),
            loss_fn=config.loss_fn,
            margin=config.margin,
            relations=config.relations,
        )
    if evaluator is None:
        evaluator = TrainingRankingEvaluator(
            override_num_batch_negs=config.eval_num_batch_negs,
            override_num_uniform_negs=config.eval_num_uniform_negs,
        )
    eval_batch_size = round_up_to_nearest_multiple(config.batch_size,
                                                   config.eval_num_batch_negs)

    state_dict, optim_state = checkpoint_manager.maybe_read_model()
    if state_dict is None and loadpath_manager is not None:
        state_dict, optim_state = loadpath_manager.maybe_read_model()
    if state_dict is not None:
        model.load_state_dict(state_dict, strict=False)
    if optim_state is not None:
        trainer.global_optimizer.load_state_dict(optim_state)

    logger.debug("Loading unpartitioned entities...")
    for entity, econfig in config.entities.items():
        if econfig.num_partitions == 1:
            embs, optim_state = load_embeddings(entity, Partition(0))
            model.set_embeddings(entity, embs, Side.LHS)
            model.set_embeddings(entity, embs, Side.RHS)
            optimizer = make_optimizer([embs], True)
            if optim_state is not None:
                optimizer.load_state_dict(optim_state)
            trainer.entity_optimizers[(entity, Partition(0))] = optimizer

    # start communicating shared parameters with the parameter server
    if parameter_sharer is not None:
        parameter_sharer.share_model_params(model)

    strict = False

    def swap_partitioned_embeddings(
        old_b: Optional[Bucket],
        new_b: Optional[Bucket],
    ):
        # 0. given the old and new buckets, construct data structures to keep
        #    track of old and new embedding (entity, part) tuples
        io_bytes = 0
        logger.info(f"Swapping partitioned embeddings {old_b} {new_b}")

        types = ([(e, Side.LHS) for e in lhs_partitioned_types]
                 + [(e, Side.RHS) for e in rhs_partitioned_types])
        old_parts = {(e, old_b.get_partition(side)): side
                     for e, side in types if old_b is not None}
        new_parts = {(e, new_b.get_partition(side)): side
                     for e, side in types if new_b is not None}

        to_checkpoint = set(old_parts) - set(new_parts)
        preserved = set(old_parts) & set(new_parts)

        # 1. checkpoint embeddings that will not be used in the next pair
        if old_b is not None:  # there are previous embeddings to checkpoint
            logger.info("Writing partitioned embeddings")
            for entity, part in to_checkpoint:
                side = old_parts[(entity, part)]
                side_name = side.pick("lhs", "rhs")
                logger.debug(f"Checkpointing ({entity} {part} {side_name})")
                embs = model.get_embeddings(entity, side)
                optim_key = (entity, part)
                optim_state = OptimizerStateDict(
                    trainer.entity_optimizers[optim_key].state_dict())
                io_bytes += embs.numel() * embs.element_size()  # ignore optim state
                checkpoint_manager.write(entity, part, embs.detach(),
                                         optim_state)
                if optim_key in trainer.entity_optimizers:
                    del trainer.entity_optimizers[optim_key]
                # these variables are holding large objects; let them be freed
                del embs
                del optim_state

            bucket_scheduler.release_bucket(old_b)

        # 2. copy old embeddings that will be used in the next pair into a
        #    temporary dictionary
        tmp_emb = {
            x: model.get_embeddings(x[0], old_parts[x]) for x in preserved
        }

        for entity, _ in types:
            model.clear_embeddings(entity, Side.LHS)
            model.clear_embeddings(entity, Side.RHS)

        if new_b is None:  # there are no new embeddings to load
            return io_bytes

        bucket_logger = BucketLogger(logger, bucket=new_b)

        # 3. load new embeddings into the model/optimizer, either from disk
        #    or from the temporary dictionary
        bucket_logger.info("Loading entities")
        for entity, side in types:
            part = new_b.get_partition(side)
            part_key = (entity, part)
            if part_key in tmp_emb:
                bucket_logger.debug(
                    f"Loading ({entity}, {part}) from preserved")
                embs, optim_state = tmp_emb[part_key], None
            else:
                bucket_logger.debug(f"Loading ({entity}, {part})")
                force_dirty = bucket_scheduler.check_and_set_dirty(
                    entity, part)
                embs, optim_state = load_embeddings(entity,
                                                    part,
                                                    strict=strict,
                                                    force_dirty=force_dirty)
                io_bytes += embs.numel() * embs.element_size()  # ignore optim state

            model.set_embeddings(entity, embs, side)
            tmp_emb[part_key] = embs

            optim_key = (entity, part)
            if optim_key not in trainer.entity_optimizers:
                bucket_logger.debug(f"Resetting optimizer {optim_key}")
                optimizer = make_optimizer([embs], True)
                if optim_state is not None:
                    bucket_logger.debug("Setting optim state")
                    optimizer.load_state_dict(optim_state)

                trainer.entity_optimizers[optim_key] = optimizer

        return io_bytes

    if rank == RANK_ZERO:
        for stats in checkpoint_manager.maybe_read_stats():
            yield (
                stats["index"],
                Stats.from_dict(stats["eval_stats_before"]),
                Stats.from_dict(stats["stats"]),
                Stats.from_dict(stats["eval_stats_after"]),
            )

    # Start of the main training loop.
    for epoch_idx, edge_path_idx, edge_chunk_idx in iteration_manager:
        logger.info(
            f"Starting epoch {epoch_idx + 1} / {iteration_manager.num_epochs}, "
            f"edge path {edge_path_idx + 1} / {iteration_manager.num_edge_paths}, "
            f"edge chunk {edge_chunk_idx + 1} / {iteration_manager.num_edge_chunks}"
        )
        edge_storage = EDGE_STORAGES.make_instance(iteration_manager.edge_path)
        logger.info(f"Edge path: {iteration_manager.edge_path}")

        sync.barrier()
        dist_logger.info("Lock client new epoch...")
        bucket_scheduler.new_pass(
            is_first=iteration_manager.iteration_idx == 0)
        sync.barrier()

        remaining = total_buckets
        cur_b = None
        while remaining > 0:
            old_b = cur_b
            io_time = 0.
            io_bytes = 0
            cur_b, remaining = bucket_scheduler.acquire_bucket()
            logger.info(f"still in queue: {remaining}")
            if cur_b is None:
                if old_b is not None:
                    # if you couldn't get a new pair, release the lock
                    # to prevent a deadlock!
                    tic = time.time()
                    io_bytes += swap_partitioned_embeddings(old_b, None)
                    io_time += time.time() - tic
                time.sleep(1)  # don't hammer td
                continue

            bucket_logger = BucketLogger(logger, bucket=cur_b)

            tic = time.time()
            io_bytes += swap_partitioned_embeddings(old_b, cur_b)

            current_index = \
                (iteration_manager.iteration_idx + 1) * total_buckets - remaining

            next_b = bucket_scheduler.peek()
            if next_b is not None and background_io:
                # Ensure the previous bucket finished writing to disk.
                checkpoint_manager.wait_for_marker(current_index - 1)

                bucket_logger.debug("Prefetching")
                for entity in lhs_partitioned_types:
                    checkpoint_manager.prefetch(entity, next_b.lhs)
                for entity in rhs_partitioned_types:
                    checkpoint_manager.prefetch(entity, next_b.rhs)

                checkpoint_manager.record_marker(current_index)

            bucket_logger.debug("Loading edges")
            edges = edge_storage.load_chunk_of_edges(cur_b.lhs, cur_b.rhs,
                                                     edge_chunk_idx,
                                                     config.num_edge_chunks)
            num_edges = len(edges)
            # this might be off in the case of tensorlist or extra edge fields
            io_bytes += edges.lhs.tensor.numel() * edges.lhs.tensor.element_size()
            io_bytes += edges.rhs.tensor.numel() * edges.rhs.tensor.element_size()
            io_bytes += edges.rel.numel() * edges.rel.element_size()

            bucket_logger.debug("Shuffling edges")
            # Fix a seed to get the same permutation every time; have it
            # depend on all and only what affects the set of edges.
            g = torch.Generator()
            g.manual_seed(
                hash((edge_path_idx, edge_chunk_idx, cur_b.lhs, cur_b.rhs)))

            num_eval_edges = int(num_edges * config.eval_fraction)
            if num_eval_edges > 0:
                edge_perm = torch.randperm(num_edges, generator=g)
                eval_edge_perm = edge_perm[-num_eval_edges:]
                num_edges -= num_eval_edges
                edge_perm = edge_perm[torch.randperm(num_edges)]
            else:
                edge_perm = torch.randperm(num_edges)

            # HOGWILD evaluation before training
            eval_stats_before: Optional[Stats] = None
            if num_eval_edges > 0:
                bucket_logger.debug(
                    "Waiting for workers to perform evaluation")
                future_all_eval_stats_before = pool.map_async(
                    call, [
                        partial(
                            process_in_batches,
                            batch_size=eval_batch_size,
                            model=model,
                            batch_processor=evaluator,
                            edges=edges,
                            indices=eval_edge_perm[s],
                        ) for s in split_almost_equally(eval_edge_perm.size(0),
                                                        num_parts=num_workers)
                    ])
                all_eval_stats_before = \
                    get_async_result(future_all_eval_stats_before, pool)
                eval_stats_before = Stats.sum(all_eval_stats_before).average()
                bucket_logger.info(
                    f"Stats before training: {eval_stats_before}")

            io_time += time.time() - tic
            tic = time.time()
            # HOGWILD training
            bucket_logger.debug("Waiting for workers to perform training")
            # FIXME should we only delay if iteration_idx == 0?
future_all_stats = pool.map_async(call, [ partial( process_in_batches, batch_size=config.batch_size, model=model, batch_processor=trainer, edges=edges, indices=edge_perm[s], delay=config.hogwild_delay if epoch_idx == 0 and rank > 0 else 0, ) for rank, s in enumerate( split_almost_equally(edge_perm.size(0), num_parts=num_workers)) ]) all_stats = get_async_result(future_all_stats, pool) stats = Stats.sum(all_stats).average() compute_time = time.time() - tic bucket_logger.info( f"bucket {total_buckets - remaining} / {total_buckets} : " f"Processed {num_edges} edges in {compute_time:.2f} s " f"( {num_edges / compute_time / 1e6:.2g} M/sec ); " f"io: {io_time:.2f} s ( {io_bytes / io_time / 1e6:.2f} MB/sec )" ) bucket_logger.info(f"{stats}") # HOGWILD eval after training eval_stats_after: Optional[Stats] = None if num_eval_edges > 0: bucket_logger.debug( "Waiting for workers to perform evaluation") future_all_eval_stats_after = pool.map_async( call, [ partial( process_in_batches, batch_size=eval_batch_size, model=model, batch_processor=evaluator, edges=edges, indices=eval_edge_perm[s], ) for s in split_almost_equally(eval_edge_perm.size(0), num_parts=num_workers) ]) all_eval_stats_after = \ get_async_result(future_all_eval_stats_after, pool) eval_stats_after = Stats.sum(all_eval_stats_after).average() bucket_logger.info(f"Stats after training: {eval_stats_after}") # Add train/eval metrics to queue checkpoint_manager.append_stats({ "index": current_index, "eval_stats_before": eval_stats_before.to_dict(), "stats": stats.to_dict(), "eval_stats_after": eval_stats_after.to_dict(), }) yield current_index, eval_stats_before, stats, eval_stats_after swap_partitioned_embeddings(cur_b, None) # Distributed Processing: all machines can leave the barrier now. sync.barrier() # Preserving a checkpoint requires two steps: # - create a snapshot (w/ symlinks) after it's first written; # - don't delete it once the following one is written. # These two happen in two successive iterations of the main loop: the # one just before and the one just after the epoch boundary. 
preserve_old_checkpoint = should_preserve_old_checkpoint( iteration_manager, config.checkpoint_preservation_interval) preserve_new_checkpoint = should_preserve_old_checkpoint( iteration_manager + 1, config.checkpoint_preservation_interval) # Write metadata: for multiple machines, write from rank-0 logger.info( f"Finished epoch {epoch_idx + 1} / {iteration_manager.num_epochs}, " f"edge path {edge_path_idx + 1} / {iteration_manager.num_edge_paths}, " f"edge chunk {edge_chunk_idx + 1} / {iteration_manager.num_edge_chunks}" ) if rank == 0: for entity, econfig in config.entities.items(): if econfig.num_partitions == 1: embs = model.get_embeddings(entity, Side.LHS) optimizer = trainer.entity_optimizers[(entity, Partition(0))] checkpoint_manager.write( entity, Partition(0), embs.detach(), OptimizerStateDict(optimizer.state_dict())) sanitized_state_dict: ModuleStateDict = {} for k, v in ModuleStateDict(model.state_dict()).items(): if k.startswith('lhs_embs') or k.startswith('rhs_embs'): # skipping state that's an entity embedding continue sanitized_state_dict[k] = v logger.info("Writing the metadata") checkpoint_manager.write_model( sanitized_state_dict, OptimizerStateDict(trainer.global_optimizer.state_dict()), ) logger.info("Writing the checkpoint") checkpoint_manager.write_new_version(config) dist_logger.info( "Waiting for other workers to write their parts of the checkpoint") sync.barrier() dist_logger.info("All parts of the checkpoint have been written") logger.info("Switching to the new checkpoint version") checkpoint_manager.switch_to_new_version() dist_logger.info( "Waiting for other workers to switch to the new checkpoint version" ) sync.barrier() dist_logger.info( "All workers have switched to the new checkpoint version") # After all the machines have finished committing # checkpoints, we either remove the old checkpoint # or preserve it if preserve_new_checkpoint: # Add 1 so the index is a multiple of the interval; it looks nicer. checkpoint_manager.preserve_current_version(config, epoch_idx + 1) if not preserve_old_checkpoint: checkpoint_manager.remove_old_version(config) # now we're sure that all partition files exist, # so be strict about loading them strict = True # quiescence pool.close() pool.join() sync.barrier() checkpoint_manager.close() if loadpath_manager is not None: loadpath_manager.close() # FIXME join distributed workers (not really necessary) logger.info("Exiting")
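The deterministic train/eval split in the loop above hinges on seeding a private generator from the bucket identity, so every pass holds out the same eval edges. A minimal standalone sketch of that pattern; the function name and seed key are illustrative, not part of the code above:

import torch

def make_split(num_edges: int, eval_fraction: float, seed_key: tuple):
    # Seed a private generator from the bucket identity so the held-out
    # eval edges are identical on every epoch and every machine.
    g = torch.Generator()
    g.manual_seed(hash(seed_key))
    perm = torch.randperm(num_edges, generator=g)
    num_eval = int(num_edges * eval_fraction)
    if num_eval > 0:
        eval_idx = perm[-num_eval:]   # fixed eval holdout
        train_idx = perm[:-num_eval]
        # Re-shuffle only the training part; this order may vary per run.
        train_idx = train_idx[torch.randperm(len(train_idx))]
    else:
        eval_idx = perm[:0]
        train_idx = perm
    return train_idx, eval_idx

train_idx, eval_idx = make_split(1000, 0.05, (0, 0, 3, 7))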
def print_once(msg): if (not dist.is_available()) or (not dist.is_initialized()) or dist.get_rank() == 0: print(msg)
def is_available(): return dist.is_available() and cda.device_count() > 1
def distributed_is_initialized(): if distributed.is_available(): if distributed.is_initialized(): return True return False
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--file_path", default="data/conceptual_caption/", type=str, help="The input train corpus.", ) parser.add_argument( "--from_pretrained", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, roberta-base, roberta-large, ", ) parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, roberta-base", ) parser.add_argument( "--output_dir", default="save", type=str, # required=True, help="The output directory where the model checkpoints will be written.", ) parser.add_argument( "--config_file", type=str, default="config/bert_base_6layer_6conect.json", help="The config file which specifies the model details.", ) ## Other parameters parser.add_argument( "--max_seq_length", default=36, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.", ) parser.add_argument( "--train_batch_size", default=512, type=int, help="Total batch size for training.", ) parser.add_argument( "--learning_rate", default=1e-4, type=float, help="The initial learning rate for Adam.", ) parser.add_argument( "--num_train_epochs", default=10.0, type=float, help="Total number of training epochs to perform.", ) parser.add_argument( "--start_epoch", default=0, type=float, help="Epoch from which to resume training.", ) parser.add_argument( "--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.", ) parser.add_argument( "--img_weight", default=1, type=float, help="weight for image loss" ) parser.add_argument( "--no_cuda", action="store_true", help="Whether not to use CUDA when available" ) parser.add_argument( "--on_memory", action="store_true", help="Whether to load train samples into memory or use disk", ) parser.add_argument( "--do_lower_case", type=bool, default=True, help="Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus", ) parser.add_argument( "--seed", type=int, default=42, help="random seed for initialization" ) parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help="Number of update steps to accumulate before performing a backward/update pass.", ) parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--dynamic_attention", action="store_true", help="whether to use dynamic attention.", ) parser.add_argument( "--num_workers", type=int, default=25, help="Number of workers in the dataloader.", ) parser.add_argument( "--save_name", default="", type=str, help="save name for training."
) parser.add_argument( "--baseline", action="store_true", help="Whether to use the baseline model (single BERT).", ) parser.add_argument( "--freeze", default=-1, type=int, help="index of the last layer of the ViLBERT textual stream to keep frozen.", ) parser.add_argument( "--distributed", action="store_true", help="whether to use chunks for parallel training.", ) parser.add_argument( "--without_coattention", action="store_true", help="whether to train without co-attention." ) parser.add_argument( "--visual_target", default=0, type=int, help="which target to use for visual branch. \ 0: soft label, \ 1: regress the feature, \ 2: NCE loss.", ) parser.add_argument( "--objective", default=0, type=int, help="which objective to use \ 0: with ICA loss, \ 1: with ICA loss, for the not aligned pair, no masking objective, \ 2: without ICA loss, do not sample negative pair.", ) parser.add_argument( "--num_negative", default=255, type=int, help="number of negatives to use" ) parser.add_argument( "--resume_file", default="", type=str, help="Resume from checkpoint" ) parser.add_argument( "--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer." ) args = parser.parse_args() if args.baseline: from pytorch_pretrained_bert.modeling import BertConfig from vilbert.basebert import BertForMultiModalPreTraining else: from vilbert.vilbert import BertForMultiModalPreTraining, BertConfig if args.save_name: prefix = "-" + args.save_name else: prefix = "" timeStamp = args.config_file.split("/")[1].split(".")[0] + prefix savePath = os.path.join(args.output_dir, timeStamp) bert_weight_name = json.load( open("config/" + args.from_pretrained + "_weight_name.json", "r") ) if args.local_rank == -1 or args.no_cuda: device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu" ) n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16 ) ) default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu: if not os.path.exists(savePath): os.makedirs(savePath) config = BertConfig.from_json_file(args.config_file) if default_gpu: # save all the hidden parameters.
with open(os.path.join(savePath, "command.txt"), "w") as f: print(args, file=f) # Python 3.x print("\n", file=f) print(config, file=f) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps cache = 5000 if dist.is_available() and args.local_rank != -1: num_replicas = dist.get_world_size() args.train_batch_size = args.train_batch_size // num_replicas args.num_workers = args.num_workers // num_replicas cache = cache // num_replicas random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if "roberta" in args.bert_model: tokenizer = RobertaTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case ) else: tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case ) num_train_optimization_steps = None train_dataset = ConceptCapLoaderTrain( args.file_path, tokenizer, args.bert_model, seq_len=args.max_seq_length, batch_size=args.train_batch_size, visual_target=args.visual_target, num_workers=args.num_workers, local_rank=args.local_rank, objective=args.objective, cache=cache, ) validation_dataset = ConceptCapLoaderVal( args.file_path, tokenizer, args.bert_model, seq_len=args.max_seq_length, batch_size=args.train_batch_size, visual_target=args.visual_target, num_workers=2, objective=args.objective, ) num_train_optimization_steps = int( train_dataset.num_dataset / args.train_batch_size / args.gradient_accumulation_steps ) * (args.num_train_epochs - args.start_epoch) task_names = ["Conceptual_Caption"] task_ids = ["TASK0"] task_num_iters = {"TASK0": train_dataset.num_dataset / args.train_batch_size} logdir = os.path.join("logs", timeStamp) if default_gpu: tbLogger = utils.tbLogger( logdir, savePath, task_names, task_ids, task_num_iters, args.gradient_accumulation_steps, ) if args.visual_target == 0: config.v_target_size = 1601 config.visual_target = args.visual_target else: config.v_target_size = 2048 config.visual_target = args.visual_target if "roberta" in args.bert_model: config.model = "roberta" if args.freeze > config.t_biattention_id[0]: config.fixed_t_layer = config.t_biattention_id[0] if args.without_coattention: config.with_coattention = False if args.dynamic_attention: config.dynamic_attention = True if args.from_pretrained: model = BertForMultiModalPreTraining.from_pretrained( args.from_pretrained, config=config, default_gpu=default_gpu ) else: model = BertForMultiModalPreTraining(config) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] if args.freeze != -1: bert_weight_name_filtered = [] for name in bert_weight_name: if "embeddings" in name: bert_weight_name_filtered.append(name) elif "encoder" in name: layer_num = name.split(".")[2] if int(layer_num) <= args.freeze: bert_weight_name_filtered.append(name) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if key[12:] in bert_weight_name_filtered: value.requires_grad = False if default_gpu: print("filtered weight") print(bert_weight_name_filtered) if not args.from_pretrained: param_optimizer = list(model.named_parameters()) optimizer_grouped_parameters = [ { "params": [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], "weight_decay": 0.01, }, { "params": [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] else: optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if 
value.requires_grad: if key[12:] in bert_weight_name: lr = args.learning_rate * 0.1 else: lr = args.learning_rate if any(nd in key for nd in no_decay): optimizer_grouped_parameters += [ {"params": [value], "lr": lr, "weight_decay": 0.0} ] if not any(nd in key for nd in no_decay): optimizer_grouped_parameters += [ {"params": [value], "lr": lr, "weight_decay": 0.01} ] if default_gpu: print( len(list(model.named_parameters())), len(optimizer_grouped_parameters) ) # set different parameters for vision branch and language branch. if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam( optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0, ) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = AdamW( optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(0.9, 0.98), ) scheduler = WarmupLinearSchedule( optimizer, warmup_steps=args.warmup_proportion * num_train_optimization_steps, t_total=num_train_optimization_steps, ) startIterID = 0 global_step = 0 if args.resume_file != "" and os.path.exists(args.resume_file): checkpoint = torch.load(args.resume_file, map_location="cpu") new_dict = {} for attr in checkpoint["model_state_dict"]: if attr.startswith("module."): new_dict[attr.replace("module.", "", 1)] = checkpoint[ "model_state_dict" ][attr] else: new_dict[attr] = checkpoint["model_state_dict"][attr] model.load_state_dict(new_dict) scheduler.load_state_dict(checkpoint["scheduler_state_dict"]) optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) global_step = checkpoint["global_step"] del checkpoint model.cuda() for state in optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() if args.fp16: model.half() if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) if default_gpu: logger.info("***** Running training *****") logger.info(" Num examples = %d", train_dataset.num_dataset) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) for epochId in range(int(args.start_epoch), int(args.num_train_epochs)): model.train() for step, batch in enumerate(train_dataset): iterId = startIterID + step + (epochId * len(train_dataset)) image_ids = batch[-1] batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch[:-1]) input_ids, input_mask, segment_ids, lm_label_ids, is_next, image_feat, image_loc, image_target, image_label, image_mask = ( batch ) if args.objective == 1: image_label = image_label * (is_next == 0).long().unsqueeze(1) image_label[image_label == 0] = -1 lm_label_ids = lm_label_ids * (is_next == 0).long().unsqueeze(1) lm_label_ids[lm_label_ids == 0] = -1 masked_loss_t, masked_loss_v, next_sentence_loss = model( input_ids, image_feat, image_loc, segment_ids, input_mask, image_mask, lm_label_ids, image_label, image_target, is_next, ) if args.objective == 2: next_sentence_loss = next_sentence_loss * 0 masked_loss_v = masked_loss_v * args.img_weight loss = masked_loss_t + masked_loss_v + next_sentence_loss if n_gpu > 1: loss = loss.mean() masked_loss_t = masked_loss_t.mean() masked_loss_v = masked_loss_v.mean() next_sentence_loss = next_sentence_loss.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion, ) for param_group in optimizer.param_groups: param_group["lr"] = lr_this_step scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if default_gpu: tbLogger.step_train_CC( epochId, iterId, float(masked_loss_t), float(masked_loss_v), float(next_sentence_loss), optimizer.param_groups[0]["lr"], "TASK0", "train", ) if ( step % (20 * args.gradient_accumulation_steps) == 0 and step != 0 and default_gpu ): tbLogger.showLossTrainCC() # Do the evaluation torch.set_grad_enabled(False) numBatches = len(validation_dataset) model.eval() for step, batch in enumerate(validation_dataset): image_ids = batch[-1] batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch[:-1]) input_ids, input_mask, segment_ids, lm_label_ids, is_next, image_feat, image_loc, image_target, image_label, image_mask = ( batch ) batch_size = input_ids.size(0) masked_loss_t, masked_loss_v, next_sentence_loss = model( input_ids, image_feat, image_loc, segment_ids, input_mask, image_mask, lm_label_ids, image_label, image_target, is_next, ) masked_loss_v = masked_loss_v * args.img_weight loss = masked_loss_t + masked_loss_v + next_sentence_loss if n_gpu > 1: loss = loss.mean() masked_loss_t = masked_loss_t.mean() masked_loss_v = masked_loss_v.mean() next_sentence_loss = next_sentence_loss.mean() if default_gpu: tbLogger.step_val_CC( epochId, float(masked_loss_t), float(masked_loss_v), float(next_sentence_loss), "TASK0", batch_size, "val", ) sys.stdout.write("%d / %d \r" % (step, numBatches)) sys.stdout.flush() if default_gpu: ave_score = tbLogger.showLossValCC() torch.set_grad_enabled(True) if default_gpu: # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = ( model.module if hasattr(model, 
"module") else model ) # Only save the model it-self output_model_file = os.path.join( savePath, "pytorch_model_" + str(epochId) + ".bin" ) output_checkpoint = os.path.join( savePath, "pytorch_ckpt_" + str(epochId) + ".tar" ) torch.save(model_to_save.state_dict(), output_model_file) torch.save( { "model_state_dict": model_to_save.state_dict(), "optimizer_state_dict": optimizer.state_dict(), "scheduler_state_dict": scheduler.state_dict(), "global_step": global_step, }, output_checkpoint, ) if default_gpu: tbLogger.txt_close()
import os import sys import unittest from datetime import timedelta import torch import torch.distributed as c10d if not c10d.is_available(): print("c10d not available, skipping tests", file=sys.stderr) sys.exit(0) from torch.testing._internal.common_distributed import ( MultiProcessTestCase, requires_nccl, requires_gloo, skip_if_lt_x_gpu, with_dist_debug_levels, create_device, ) from torch.testing._internal.common_utils import ( run_tests, TEST_WITH_TSAN, ) from test_c10d_common import LOOPBACK class AbstractProcessGroupWrapperTest(MultiProcessTestCase): def setUp(self): super(AbstractProcessGroupWrapperTest, self).setUp() # For Windows platform, Python does not support fork, change it to spawn here. self._spawn_processes()
def main(): # Training settings parser = argparse.ArgumentParser(description='PyTorch MNIST Example') parser.add_argument('--batch-size', type=int, default=64, metavar='N', help='input batch size for training (default: 64)') parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', help='input batch size for testing (default: 1000)') parser.add_argument('--epochs', type=int, default=1, metavar='N', help='number of epochs to train (default: 1)') parser.add_argument('--lr', type=float, default=0.01, metavar='LR', help='learning rate (default: 0.01)') parser.add_argument('--momentum', type=float, default=0.5, metavar='M', help='SGD momentum (default: 0.5)') parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser.add_argument('--log-interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status') parser.add_argument('--save-model', action='store_true', default=False, help='For Saving the current Model') parser.add_argument('--dir', default='logs', metavar='L', help='directory where summary logs are stored') if dist.is_available(): parser.add_argument('--backend', type=str, help='Distributed backend', choices=[dist.Backend.GLOO, dist.Backend.NCCL, dist.Backend.MPI], default=dist.Backend.GLOO) args = parser.parse_args() use_cuda = not args.no_cuda and torch.cuda.is_available() if use_cuda: print('Using CUDA') writer = SummaryWriter(args.dir) torch.manual_seed(args.seed) device = torch.device("cuda" if use_cuda else "cpu") if should_distribute(): print('Using distributed PyTorch with {} backend'.format(args.backend)) dist.init_process_group(backend=args.backend) kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} train_loader = torch.utils.data.DataLoader( datasets.MNIST('../data', train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])), batch_size=args.batch_size, shuffle=True, **kwargs) test_loader = torch.utils.data.DataLoader( datasets.MNIST('../data', train=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])), batch_size=args.test_batch_size, shuffle=False, **kwargs) model = Net().to(device) if is_distributed(): Distributor = nn.parallel.DistributedDataParallel if use_cuda \ else nn.parallel.DistributedDataParallelCPU model = Distributor(model) optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) for epoch in range(1, args.epochs + 1): train(args, model, device, train_loader, optimizer, epoch, writer) test(args, model, device, test_loader, writer, epoch) if (args.save_model): torch.save(model.state_dict(),"mnist_cnn.pt")
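This example calls an is_distributed() helper that is not shown here; presumably it mirrors should_distribute() but additionally checks that the process group has actually been initialized. A plausible definition, stated as an assumption:

def is_distributed():
    # True only once dist.init_process_group() has completed.
    return dist.is_available() and dist.is_initialized()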
def is_dist_avail_and_initialized(): if not dist.is_available(): return False if not dist.is_initialized(): return False return True
def get_dist_info(): assert dist.is_available(), "Distributed is not available on this device!" assert dist.is_initialized(), "Distributed has not been initialized yet!" rank = dist.get_rank() world_size = dist.get_world_size() return rank, world_size
DEFAULT_TIMEOUT = 300 CUSTOMIZED_TIMEOUT = {"test_DistributedDataParallel": 500} if INIT_METHOD.startswith("file://"): FOLDER = INIT_METHOD[7:] def get_timeout(test_id): test_name = test_id.split(".")[-1] if test_name in CUSTOMIZED_TIMEOUT: return CUSTOMIZED_TIMEOUT[test_name] else: return DEFAULT_TIMEOUT if not dist.is_available(): print("Distributed not available, skipping tests") sys.exit(0) SKIP_IF_NO_CUDA_EXIT_CODE = 75 SKIP_IF_NO_GPU_EXIT_CODE = 76 SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE = 77 SKIP_IF_BACKEND_UNAVAILABLE = 78 def skip_if_no_cuda_distributed(func): func.skip_if_no_cuda_distributed = True @wraps(func) def wrapper(*args, **kwargs): if not torch.cuda.is_available():
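The wrapper above is cut off mid-branch. Given the sentinel exit codes defined just before it, the missing body presumably exits with SKIP_IF_NO_CUDA_EXIT_CODE so the parent process can record the skip; a sketch under that assumption, reusing the module's wraps/sys/torch imports:

def skip_if_no_cuda_distributed(func):
    func.skip_if_no_cuda_distributed = True

    @wraps(func)
    def wrapper(*args, **kwargs):
        if not torch.cuda.is_available():
            # Signal the orchestrating process to mark this test as skipped.
            sys.exit(SKIP_IF_NO_CUDA_EXIT_CODE)
        return func(*args, **kwargs)

    return wrapper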
def setup_common_training_handlers( trainer, train_sampler=None, to_save=None, save_every_iters=1000, output_path=None, lr_scheduler=None, with_gpu_stats=False, output_names=None, with_pbars=True, with_pbar_on_iters=True, log_every_iters=100, device="cuda", ): """Helper method to setup trainer with common handlers (it also supports distributed configuration): - :class:`~ignite.handlers.TerminateOnNan` - handler to setup learning rate scheduling - :class:`~ignite.handlers.ModelCheckpoint` - :class:`~ignite.metrics.RunningAverage` on `update_function` output - Two progress bars on epochs and optionally on iterations Args: trainer (Engine): trainer engine. Output of trainer's `update_function` should be a dictionary, a sequence, or a single tensor. train_sampler (torch.utils.data.DistributedSampler, optional): Optional distributed sampler used to call `set_epoch` method on epoch started event. to_save (dict, optional): dictionary with objects to save in the checkpoint. This is used with :class:`~ignite.handlers.ModelCheckpoint`. save_every_iters (int, optional): saving interval. By default, `to_save` objects are stored every 1000 iterations. output_path (str, optional): output path to indicate where `to_save` objects are stored. lr_scheduler (ParamScheduler or subclass of `torch.optim.lr_scheduler._LRScheduler`): learning rate scheduler as native torch LRScheduler or ignite's parameter scheduler. with_gpu_stats (bool, optional): if True, :class:`~ignite.contrib.metrics.handlers.GpuInfo` is attached to the trainer. This requires `pynvml` package to be installed. output_names (list/tuple): list of names associated with `update_function` output dictionary. with_pbars (bool, optional): if True, two progress bars on epochs and optionally on iterations are attached. with_pbar_on_iters (bool, optional): if True, a progress bar on iterations is attached to the trainer. log_every_iters (int, optional): logging interval for :class:`~ignite.contrib.metrics.handlers.GpuInfo` and for epoch-wise progress bar. device (str or torch.device, optional): Optional device specification in case of distributed computation usage. """ kwargs = dict( to_save=to_save, save_every_iters=save_every_iters, output_path=output_path, lr_scheduler=lr_scheduler, with_gpu_stats=with_gpu_stats, output_names=output_names, with_pbars=with_pbars, with_pbar_on_iters=with_pbar_on_iters, log_every_iters=log_every_iters, device=device, ) if dist.is_available() and dist.is_initialized(): _setup_common_distrib_training_handlers(trainer, train_sampler=train_sampler, **kwargs) else: if train_sampler is not None: warnings.warn( "Argument train_sampler is a distributed sampler used to call `set_epoch` method on epoch " "started event, but no distributed setting detected", UserWarning, ) _setup_common_training_handlers(trainer, **kwargs)
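A minimal usage sketch for the helper above, assuming an ignite Engine whose update function returns a dict of scalars; the model, optimizer, and output path are placeholders:

import torch
from ignite.engine import Engine

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

def update_fn(engine, batch):
    x, y = batch
    optimizer.zero_grad()
    loss = torch.nn.functional.cross_entropy(model(x), y)
    loss.backward()
    optimizer.step()
    # Keys here must match the names passed via output_names below.
    return {"loss": loss.item()}

trainer = Engine(update_fn)
setup_common_training_handlers(
    trainer,
    to_save={"model": model, "optimizer": optimizer},
    output_path="/tmp/checkpoints",  # illustrative path
    output_names=["loss"],
    device="cpu",
)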
def get_world_size() -> int: if not dist.is_available(): return 1 if not dist.is_initialized(): return 1 return dist.get_world_size()
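A typical use of such guarded accessors is strided sharding of work across ranks, so the same code runs unchanged in single-process mode. A small sketch using get_world_size() from above; the helper name shard_for_this_rank is illustrative:

import torch.distributed as dist

def shard_for_this_rank(items):
    world_size = get_world_size()
    rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
    # Round-robin assignment: rank r takes items r, r+W, r+2W, ...
    return items[rank::world_size]

print(shard_for_this_rank(list(range(10))))  # all 10 items when not distributed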
def should_distribute(): return dist.is_available() and WORLD_SIZE > 1
import fcntl import os import sys import time import unittest from functools import wraps, reduce from contextlib import contextmanager import torch import torch.distributed as dist from common import TestCase BACKEND = os.environ['BACKEND'] TEMP_DIR = os.environ['TEMP_DIR'] MASTER_PORT = '29500' MASTER_ADDR = '127.0.0.1' if not dist.is_available(): print('Distributed not available, skipping tests') sys.exit(0) @contextmanager def _lock(): lockfile = os.path.join(TEMP_DIR, 'lockfile') with open(lockfile, 'w') as lf: try: fcntl.flock(lf.fileno(), fcntl.LOCK_EX) yield finally: fcntl.flock(lf.fileno(), fcntl.LOCK_UN) lf.close()
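A brief usage sketch for the _lock() context manager above: it serializes access to files shared between test processes under TEMP_DIR. The counter file name here is illustrative:

def bump_shared_counter():
    path = os.path.join(TEMP_DIR, 'counter')
    with _lock():  # exclusive flock: only one process reads/writes at a time
        value = int(open(path).read()) if os.path.exists(path) else 0
        with open(path, 'w') as f:
            f.write(str(value + 1))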
def main(): args = parse_args() # Devices if args.local_rank == -1: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 torch.distributed.init_process_group(backend="nccl") default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True logger.info( f"device: {device} n_gpu: {n_gpu}, distributed training: {bool(args.local_rank != -1)}" ) # Load config config = BertConfig.from_json_file(args.config_file) # Load task config with open(args.tasks_config_file, "r") as f: task_cfg = edict(yaml.safe_load(f)) task_id = args.task.strip() task = "TASK" + task_id # Output dirs if "/" in args.from_pretrained: timeStamp = args.from_pretrained.split("/")[1] else: timeStamp = args.from_pretrained savePath = os.path.join(args.output_dir, timeStamp) if default_gpu and not os.path.exists(savePath): os.makedirs(savePath) # Seed random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) # Dataset batch_size, task2num_iters, dset_val, dl_val = LoadDatasetEval( args, config, task_cfg, args.task) max_subiter_images = dset_val.max_num_images # Model if args.zero_shot: config.visual_target_weights = {} model = BertForVLPreTraining.from_pretrained(args.from_pretrained, config=config) else: model = BertForVLTasks.from_pretrained(args.from_pretrained, config=config, task_cfg=task_cfg, task_ids=[task]) # Move to GPU(s) model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model, delay_allreduce=True) elif n_gpu > 1: raise ValueError("Please run with a single GPU") # Print summary if default_gpu: print("***** Running evaluation *****") print(" Num Iters: ", task2num_iters) print(" Batch size: ", batch_size) # Evaluate model.eval() results = [] others = [] score_matrix = np.zeros( (args.num_images * args.captions_per_image, args.num_images)) target_matrix = np.zeros( (args.num_images * args.captions_per_image, args.num_images)) rank_vector = np.ones( args.num_images * args.captions_per_image) * args.num_images count = 0 for i, batch in tqdm(enumerate(dl_val), total=task2num_iters[task]): batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch) features, spatials, image_mask, question, input_mask, segment_ids, target, caption_idx, image_idx = batch features = features.squeeze(0) spatials = spatials.squeeze(0) image_mask = image_mask.squeeze(0) question = question.repeat(features.size(0), 1) segment_ids = segment_ids.repeat(features.size(0), 1) input_mask = input_mask.repeat(features.size(0), 1) with torch.no_grad(): if args.zero_shot: _, _, vil_logit, _, _, _ = model(question, features, spatials, segment_ids, input_mask, image_mask) score_matrix[caption_idx, image_idx * max_subiter_images:(image_idx + 1) * max_subiter_images] = (torch.softmax( vil_logit, dim=1)[:, 0].view(-1).cpu().numpy()) target_matrix[caption_idx, image_idx * max_subiter_images:(image_idx + 1) * max_subiter_images] = ( target.view(-1).float().cpu().numpy()) else: vil_logit, _, _, _ = model(question, features, spatials, task, segment_ids, input_mask, image_mask) score_matrix[caption_idx, image_idx * max_subiter_images:(image_idx + 1) * max_subiter_images] = ( vil_logit.view(-1).cpu().numpy()) target_matrix[caption_idx, image_idx * max_subiter_images:(image_idx + 1) * max_subiter_images] = ( target.view(-1).float().cpu().numpy()) if image_idx.item() == args.num_subiters - 1: rank = np.where( (np.argsort(-score_matrix[caption_idx]) == np.where( target_matrix[caption_idx] == 1)[0][0]) == 1)[0][0] rank_vector[caption_idx] = rank cur_rank_vector = rank_vector[:caption_idx + 1] r1 = 100.0 * np.sum(cur_rank_vector < 1) / len(cur_rank_vector) r5 = 100.0 * np.sum(cur_rank_vector < 5) / len(cur_rank_vector) r10 = 100.0 * np.sum( cur_rank_vector < 10) / len(cur_rank_vector) medr = np.floor(np.median(cur_rank_vector) + 1) meanr = np.mean(cur_rank_vector) + 1 print( "%d Final r1:%.3f, r5:%.3f, r10:%.3f, medr:%.3f, meanr:%.3f" % (count, r1, r5, r10, medr, meanr)) results.append( np.argsort(-score_matrix[caption_idx]).tolist()[:20]) count += 1 r1 = 100.0 * np.sum(rank_vector < 1) / len(rank_vector) r5 = 100.0 * np.sum(rank_vector < 5) / len(rank_vector) r10 = 100.0 * np.sum(rank_vector < 10) / len(rank_vector) medr = np.floor(np.median(rank_vector) + 1) meanr = np.mean(rank_vector) + 1 print("************************************************") print("****************Image Retrieval*****************") print("************************************************") print("Final r1:%.3f, r5:%.3f, r10:%.3f, medr:%.3f, meanr:%.3f" % (r1, r5, r10, medr, meanr)) print("************************************************") if args.split: json_path = os.path.join(savePath, args.split) else: json_path = os.path.join(savePath, task_cfg[task_id]["val_split"]) json.dump(results, open(json_path + "_result.json", "w")) json.dump(others, open(json_path + "_others.json", "w")) # Text Retrieval rank_vector = np.zeros(args.num_images) for image_idx in range(args.num_images):
ranks = [] tgt_captions = np.where(target_matrix[:, image_idx] == 1)[0] sorted_scores = np.argsort(-score_matrix[:, image_idx]) for tgt_caption in tgt_captions: ranks.append(np.where((sorted_scores == tgt_caption) == 1)[0][0]) rank_vector[image_idx] = min(ranks) r1 = 100.0 * np.sum(rank_vector < 1) / len(rank_vector) r5 = 100.0 * np.sum(rank_vector < 5) / len(rank_vector) r10 = 100.0 * np.sum(rank_vector < 10) / len(rank_vector) medr = np.floor(np.median(rank_vector) + 1) meanr = np.mean(rank_vector) + 1 print("************************************************") print("****************Text Retrieval******************") print("************************************************") print("Final r1:%.3f, r5:%.3f, r10:%.3f, medr:%.3f, meanr:%.3f" % (r1, r5, r10, medr, meanr)) print("************************************************")
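The retrieval metrics computed above reduce to recall@k plus 1-based median and mean rank over a vector of 0-based ranks. A compact equivalent of that computation, with retrieval_metrics as a hypothetical helper:

import numpy as np

def retrieval_metrics(rank_vector: np.ndarray) -> dict:
    # rank_vector holds the 0-based rank of the first correct hit per query.
    n = len(rank_vector)
    return {
        "r1": 100.0 * np.sum(rank_vector < 1) / n,
        "r5": 100.0 * np.sum(rank_vector < 5) / n,
        "r10": 100.0 * np.sum(rank_vector < 10) / n,
        "medr": np.floor(np.median(rank_vector) + 1),  # 1-based median rank
        "meanr": np.mean(rank_vector) + 1,             # 1-based mean rank
    }

print(retrieval_metrics(np.array([0, 3, 12, 1])))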