def _get_sampler(self, epoch) -> "DistributedSampler":
    if self.split == "train":
        # For video model training, we don't necessarily want to use all possible
        # clips in the video in one training epoch. More often, we randomly
        # sample at most N clips per training video. In practice, N is often 1
        clip_sampler = RandomClipSampler(self.video_clips, self.clips_per_video)
    else:
        # For video model testing, we sample N evenly spaced clips per test
        # video. We will simply average predictions over them
        clip_sampler = UniformClipSampler(self.video_clips, self.clips_per_video)
    clip_sampler = MaxLengthClipSampler(clip_sampler, num_samples=self.num_samples)
    world_size = get_world_size()
    rank = get_rank()
    sampler = DistributedSampler(
        clip_sampler,
        num_replicas=world_size,
        rank=rank,
        shuffle=self.shuffle,
        group_size=self.clips_per_video,
    )
    sampler.set_epoch(epoch)
    return sampler
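# Hedged usage sketch: how a sampler returned by _get_sampler above is typically
# plugged into a DataLoader. `build_video_loader`, `dataset`, and the batch_size /
# num_workers values are hypothetical names chosen for illustration, not part of
# this codebase.
import torch

def build_video_loader(dataset, epoch: int, batch_size: int = 8, num_workers: int = 4):
    # A fresh sampler is requested per epoch so that set_epoch(epoch) re-seeds the shuffle.
    sampler = dataset._get_sampler(epoch)
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=num_workers,
        pin_memory=True,
    )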
def _log_performance_metrics(self, task: "tasks.ClassyTask") -> None:
    """
    Compute and log performance metrics.
    """
    phase_type = task.phase_type
    batches = len(task.losses)

    if self.start_time is None:
        logging.warning("start_time not initialized")
    else:
        # Average batch time calculation
        total_batch_time = time.time() - self.start_time
        average_batch_time = total_batch_time / batches
        logging.info(
            "Average %s batch time (ms) for %d batches: %d"
            % (phase_type, batches, 1000.0 * average_batch_time)
        )

    # Train step time breakdown
    if not hasattr(task, "perf_stats") or task.perf_stats is None:
        logging.warning('"perf_stats" not set in task')
    elif task.train:
        logging.info(
            "Train step time breakdown (rank {}):\n{}".format(
                get_rank(), task.perf_stats.report_str()
            )
        )
def _log_loss_meters(self, task: "tasks.ClassyTask", local_variables: Dict[str, Any]):
    """
    Compute and log the loss and meters.
    """
    phase_type = task.phase_type
    phase_type_idx = task.train_phase_idx if task.train else task.eval_phase_idx
    batches = len(task.losses)

    # Loss for the phase
    loss = sum(task.losses) / (batches * task.get_batchsize_per_replica())

    log_strs = [
        "Rank: {}, {} phase: {}, processed batches: {}".format(
            get_rank(), phase_type, phase_type_idx, batches
        ),
        "{} loss: {}".format(phase_type, loss),
        "Meters:",
    ]
    acc = []
    for meter in task.meters:
        log_strs.append("{}".format(meter))
        acc.append(meter)
    logging.info("\n".join(log_strs))
    return acc
def _get_sampler(self, epoch):
    world_size = get_world_size()
    rank = get_rank()
    sampler = DistributedSampler(
        self, num_replicas=world_size, rank=rank, shuffle=self.shuffle
    )
    # set_epoch() re-seeds the shuffle so that every epoch uses a different ordering
    sampler.set_epoch(epoch)
    return sampler
def __init__(self, buffer_params, temperature: float):
    super(SimclrInfoNCECriterion, self).__init__()

    self.use_gpu = get_cuda_device_index() > -1
    self.temperature = temperature
    self.num_pos = 2
    self.buffer_params = buffer_params
    self.criterion = nn.CrossEntropyLoss()
    self.dist_rank = get_rank()
    self.pos_mask = None
    self.neg_mask = None
    self.precompute_pos_neg_mask()
    logging.info(f"Creating Info-NCE loss on Rank: {self.dist_rank}")
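# Hedged, single-process sketch of what the positive/negative masks referenced by
# precompute_pos_neg_mask() can look like when num_pos = 2 (two augmented views per
# image, assumed here to be stacked as [view1 batch; view2 batch]). The actual
# criterion also accounts for the distributed buffer layout; `simple_pos_neg_mask`
# is a hypothetical helper for illustration only.
import torch

def simple_pos_neg_mask(batch_size: int):
    total = 2 * batch_size                       # two views per image in one stacked batch
    pos_mask = torch.zeros(total, total)
    for i in range(batch_size):
        pos_mask[i, i + batch_size] = 1.0        # view 1 -> view 2 of the same image
        pos_mask[i + batch_size, i] = 1.0        # view 2 -> view 1 of the same image
    neg_mask = 1.0 - pos_mask
    neg_mask.fill_diagonal_(0.0)                 # an example is never its own negative
    return pos_mask, neg_mask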
def compute_partition_function(self, out):
    num_items = self.memory.size(0)
    with torch.no_grad():
        batch_mean = out.mean()
        # NOTE: this relies on the "mean" computation being stable and deterministic
        # across all nodes. Could be replaced with a smarter estimate.
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            batch_mean_gathered = gather_from_all(batch_mean)
            all_batch_mean = batch_mean_gathered.mean().squeeze().item()
        else:
            all_batch_mean = batch_mean.item()
    self.params[2] = all_batch_mean * num_items
    Z = self.params[2].clone().detach().item()
    rank = get_rank()
    logging.info(f"Rank: {rank}; Normalization constant Z is set to {Z}")
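# Hedged numeric sketch of the Z estimate performed above (single-process case,
# made-up values): Z is approximated as the batch mean of the exponentiated
# similarity scores multiplied by the number of items in the memory bank.
import torch

out = torch.tensor([0.8, 1.1, 0.9, 1.2])  # hypothetical exp(score / T) values for one batch
num_items = 1_281_167                      # illustrative memory bank size
Z = out.mean().item() * num_items
print(f"estimated Z = {Z:.1f}")            # mean(out) = 1.0, so Z ~= num_items here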
def __init__(
    self, cfg: AttrDict, data_source: str, path: str, split: str, dataset_name: str
):
    super(AirstoreDataset, self).__init__(
        queue_size=cfg["DATA"][split]["BATCHSIZE_PER_REPLICA"]
    )
    self.pathmanager = create_path_manager()
    self.cfg = cfg
    self.batch_size = cfg["DATA"][split]["BATCHSIZE_PER_REPLICA"]
    self.airstore_uri = path
    self.split = split
    self.epoch = 0
    self.start_iter = 0
    self.enable_queue_dataset = cfg["DATA"][self.split]["ENABLE_QUEUE_DATASET"]
    self.global_rank = get_rank()
    self.global_world_size = get_world_size()
    self._iterator = None
def _get_sampler(self, epoch: int):
    """
    Return a :class:`torch.utils.data.sampler.Sampler` to sample the data.

    This is used to distribute the data across the replicas. If shuffling
    is enabled, every epoch will have a different shuffle.

    Args:
        epoch: The epoch being fetched.

    Returns:
        A sampler which tells the data loader which sample to load next.
    """
    world_size = get_world_size()
    rank = get_rank()
    sampler = DistributedSampler(
        self, num_replicas=world_size, rank=rank, shuffle=self.shuffle
    )
    sampler.set_epoch(epoch)
    return sampler
def __init__(
    self,
    temperature: float,
    crops_for_assign: List[int],
    num_crops: int,
    num_iters: int,
    epsilon: float,
    use_double_prec: bool,
    num_prototypes: List[int],
    local_queue_length: int,
    embedding_dim: int,
    temp_hard_assignment_iters: int,
    output_dir: str,
):
    super(SwAVCriterion, self).__init__()

    self.use_gpu = get_cuda_device_index() > -1
    self.temperature = temperature
    self.crops_for_assign = crops_for_assign
    self.num_crops = num_crops
    self.nmb_sinkhornknopp_iters = num_iters
    self.epsilon = epsilon
    self.use_double_prec = use_double_prec
    self.num_prototypes = num_prototypes
    self.nmb_heads = len(self.num_prototypes)
    self.embedding_dim = embedding_dim
    self.temp_hard_assignment_iters = temp_hard_assignment_iters
    self.local_queue_length = local_queue_length
    self.dist_rank = get_rank()
    self.world_size = get_world_size()
    self.log_softmax = nn.LogSoftmax(dim=1).cuda()
    self.softmax = nn.Softmax(dim=1).cuda()
    self.register_buffer("num_iteration", torch.zeros(1, dtype=int))
    self.use_queue = False
    if local_queue_length > 0:
        self.initialize_queue()
    self.output_dir = output_dir
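# Hedged sketch of the Sinkhorn-Knopp style normalization that SwAV-type criteria
# run for `num_iters` (nmb_sinkhornknopp_iters) iterations to turn prototype scores
# into balanced soft assignments. Single-process and illustrative only; the criterion
# above additionally handles the distributed, double-precision and queue cases.
import torch

def sinkhorn(scores: torch.Tensor, epsilon: float = 0.05, num_iters: int = 3) -> torch.Tensor:
    # scores: (batch, num_prototypes) raw similarities to the prototypes
    Q = torch.exp(scores / epsilon).t()      # (num_prototypes, batch)
    Q /= Q.sum()
    K, B = Q.shape
    for _ in range(num_iters):
        Q /= Q.sum(dim=1, keepdim=True)      # normalize rows: equal total mass per prototype
        Q /= K
        Q /= Q.sum(dim=0, keepdim=True)      # normalize columns: equal total mass per sample
        Q /= B
    Q *= B                                   # each column now sums to 1: a soft assignment
    return Q.t()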
def on_update(self, task: "tasks.ClassyTask") -> None:
    """
    Executed after a parameter update. If the current phase is training,
    and it's a logging iteration, we compute and log several helpful training
    stats to keep track of ongoing training.

    For monitoring the batch time (average training iteration time), we allow
    monitoring the stats (optionally) every N iterations to get a better idea
    about the batch time and training ETA. Set the btime_freq input using
    cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY=N and ensure that
    cfg.HOOKS.PERF_STATS.MONITOR_PERF_STATS = True.
    """
    phase_type = "train" if task.train else "test"
    if is_primary() and phase_type == "train":
        train_phase_idx = task.train_phase_idx
        log_freq = task.config["LOG_FREQUENCY"]
        iteration = task.iteration

        if torch.cuda.is_available():
            peak_mem_used = int(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0)
        else:
            peak_mem_used = -1

        if (
            (iteration == 1)
            or (iteration % log_freq == 0)
            or (iteration <= 100 and iteration % 5 == 0)
        ):
            loss_val = round(task.last_batch.loss.data.cpu().item(), 5)
            if len(task.batch_time) > 0:
                batch_times = task.batch_time
            else:
                batch_times = [0]
            avg_time = sum(batch_times) / len(batch_times)
            eta_secs = avg_time * (task.max_iteration - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_secs)))
            if isinstance(task.optimizer.options_view.lr, set):
                lr_val = list(task.optimizer.options_view.lr)
            else:
                lr_val = round(task.optimizer.options_view.lr, 5)
            batch_time = int(1000.0 * avg_time)
            rank = get_rank()
            log_data = {
                "Rank": rank,
                "ep": train_phase_idx,
                "iter": iteration,
                "lr": lr_val,
                "loss": loss_val,
                "btime(ms)": batch_time,
                "eta": eta_string,
                "peak_mem(M)": peak_mem_used,
            }

            if iteration == 1:
                # Set max iterations. Currently used in benchmark_suite_scheduler.py
                log_data["max_iterations"] = task.max_iteration

            if self.btime_freq and len(batch_times) >= self.btime_freq:
                rolling_avg_time = sum(batch_times[-self.btime_freq :]) / self.btime_freq
                rolling_eta_secs = int(rolling_avg_time * (task.max_iteration - iteration))
                rolling_eta_str = str(datetime.timedelta(seconds=int(rolling_eta_secs)))
                rolling_btime = int(1000.0 * rolling_avg_time)
                log_data[f"btime({self.btime_freq}iters)(ms)"] = rolling_btime
                log_data["rolling_eta"] = rolling_eta_str

            # To maintain backwards compatibility with the log.txt logs, we convert
            # the json to the previous format. The stdout.json file can be used to
            # consume the json format of the logs.
            stdout_data = ""
            for key, value in log_data.items():
                stdout_data = (
                    f"{stdout_data}[{key}: {value}] "
                    if key == "ep"
                    else f"{stdout_data}{key}: {value}; "
                )
            logging.info(stdout_data.strip())
            self.json_stdout_logger.write(json.dumps(log_data) + "\n")
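# Hedged illustration of the log_data -> stdout conversion performed above. The
# values are made up; only the formatting rule (the "ep" key keeps its bracketed
# legacy form, every other key becomes "key: value; ") comes from the hook.
log_data = {"Rank": 0, "ep": 3, "iter": 1200, "lr": 0.05, "loss": 2.31415}
stdout_data = ""
for key, value in log_data.items():
    stdout_data = (
        f"{stdout_data}[{key}: {value}] " if key == "ep" else f"{stdout_data}{key}: {value}; "
    )
print(stdout_data.strip())
# -> Rank: 0; [ep: 3] iter: 1200; lr: 0.05; loss: 2.31415;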
def _log_training_epoch(self, task):
    train_phase_idx = task.train_phase_idx
    log_freq = task.config["LOG_FREQUENCY"]
    iteration = task.iteration

    if torch.cuda.is_available():
        peak_mem_used = int(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0)
    else:
        peak_mem_used = -1

    if (
        (iteration == 1)
        or (iteration % log_freq == 0)
        or (iteration <= 100 and iteration % 5 == 0)
    ):
        loss_val = round(task.last_batch.loss.data.cpu().item(), 5)
        if len(task.batch_time) > 0:
            batch_times = task.batch_time
        else:
            batch_times = [0]
        avg_time = sum(batch_times) / len(batch_times)
        eta_secs = avg_time * (task.max_iteration - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_secs)))
        if isinstance(task.optimizer.options_view.lr, (set, list)):
            lr_val = list(task.optimizer.options_view.lr)
        else:
            lr_val = round(task.optimizer.options_view.lr, 5)
        if isinstance(task.optimizer.options_view.weight_decay, (set, list)):
            wd_val = list(task.optimizer.options_view.weight_decay)
        else:
            wd_val = round(task.optimizer.options_view.weight_decay, 5)
        batch_time = int(1000.0 * avg_time)
        rank = get_rank()
        log_data = {
            "Rank": rank,
            "ep": train_phase_idx,
            "iter": iteration,
            "lr": lr_val,
            "loss": loss_val,
            "btime(ms)": batch_time,
            "eta": eta_string,
            "peak_mem(M)": peak_mem_used,
            "weight_decay": wd_val,
        }
        # Add customized data registered by other hooks
        log_data.update(task.additional_log_data)

        if iteration == 1:
            # Set max iterations. Currently used in benchmark_suite_scheduler.py
            log_data["max_iterations"] = task.max_iteration

        if self.btime_freq and len(batch_times) >= self.btime_freq:
            rolling_avg_time = sum(batch_times[-self.btime_freq :]) / self.btime_freq
            rolling_eta_secs = int(rolling_avg_time * (task.max_iteration - iteration))
            rolling_eta_str = str(datetime.timedelta(seconds=int(rolling_eta_secs)))
            rolling_btime = int(1000.0 * rolling_avg_time)
            log_data[f"btime({self.btime_freq}iters)(ms)"] = rolling_btime
            log_data["rolling_eta"] = rolling_eta_str

        # To maintain backwards compatibility with the log.txt logs, we convert
        # the json to the previous format. The stdout.json file can be used to
        # consume the json format of the logs.
        stdout_data = ""
        for key, value in log_data.items():
            stdout_data = (
                f"{stdout_data}[{key}: {value}] "
                if key == "ep"
                else f"{stdout_data}{key}: {value}; "
            )
        logging.info(stdout_data.strip())
        self.json_stdout_logger.write(json.dumps(log_data) + "\n")
def on_update(self, task: "tasks.ClassyTask") -> None:
    """
    Executed after a parameter update. If the current phase is training,
    and it's a logging iteration, we compute and log several helpful training
    stats to keep track of ongoing training.

    For monitoring the batch time (average training iteration time), we allow
    monitoring the stats (optionally) every N iterations to get a better idea
    about the batch time and training ETA. Set the btime_freq input using
    cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY=N and ensure that
    cfg.HOOKS.PERF_STATS.MONITOR_PERF_STATS = True.
    """
    phase_type = "train" if task.train else "test"
    if is_primary() and phase_type == "train":
        train_phase_idx = task.train_phase_idx
        log_freq = task.config["LOG_FREQUENCY"]
        iteration = task.iteration

        if torch.cuda.is_available():
            peak_mem_used = int(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0)
        else:
            peak_mem_used = -1

        if (
            (iteration == 1)
            or (iteration % log_freq == 0)
            or (iteration <= 100 and iteration % 5 == 0)
        ):
            loss_val = round(task.last_batch.loss.data.cpu().item(), 5)
            if len(task.batch_time) > 0:
                batch_times = task.batch_time
            else:
                batch_times = [0]
            avg_time = sum(batch_times) / len(batch_times)
            eta_secs = avg_time * (task.max_iteration - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_secs)))
            if isinstance(task.optimizer.options_view.lr, set):
                lr_val = list(task.optimizer.options_view.lr)
            else:
                lr_val = round(task.optimizer.options_view.lr, 5)
            batch_time = int(1000.0 * avg_time)
            rank = get_rank()
            log_str = (
                f"Rank: {rank}; "
                f"[ep: {train_phase_idx}] "
                f"iter: {iteration}; "
                f"lr: {lr_val}; "
                f"loss: {loss_val}; "
                f"btime(ms): {batch_time}; "
                f"eta: {eta_string}; "
                f"peak_mem: {peak_mem_used}M"
            )
            if self.btime_freq and len(batch_times) >= self.btime_freq:
                rolling_avg_time = sum(batch_times[-self.btime_freq :]) / self.btime_freq
                rolling_eta_secs = int(rolling_avg_time * (task.max_iteration - iteration))
                rolling_eta_str = str(datetime.timedelta(seconds=int(rolling_eta_secs)))
                rolling_btime = int(1000.0 * rolling_avg_time)
                log_str = (
                    f"{log_str}; "
                    f"btime({self.btime_freq}iters): {rolling_btime} ms; "
                    f"rolling_eta: {rolling_eta_str}"
                )
            logging.info(log_str)
def cluster_memory(self):
    self.start_idx = 0
    j = 0
    with torch.no_grad():
        for i_K, K in enumerate(self.num_clusters):
            # run distributed k-means

            # init centroids with elements from memory bank of rank 0
            centroids = torch.empty(K, self.embedding_dim).cuda(non_blocking=True)
            if get_rank() == 0:
                random_idx = torch.randperm(len(self.local_memory_embeddings[j]))[:K]
                assert len(random_idx) >= K, "please reduce the number of centroids"
                centroids = self.local_memory_embeddings[j][random_idx]
            dist.broadcast(centroids, 0)

            for n_iter in range(self.nmb_kmeans_iters + 1):
                # E step
                dot_products = torch.mm(self.local_memory_embeddings[j], centroids.t())
                _, assignments = dot_products.max(dim=1)

                # finish
                if n_iter == self.nmb_kmeans_iters:
                    break

                # M step
                where_helper = get_indices_sparse(assignments.cpu().numpy())
                counts = torch.zeros(K).cuda(non_blocking=True).int()
                emb_sums = torch.zeros(K, self.embedding_dim).cuda(non_blocking=True)
                for k in range(len(where_helper)):
                    if len(where_helper[k][0]) > 0:
                        emb_sums[k] = torch.sum(
                            self.local_memory_embeddings[j][where_helper[k][0]],
                            dim=0,
                        )
                        counts[k] = len(where_helper[k][0])
                all_reduce_sum(counts)
                mask = counts > 0
                all_reduce_sum(emb_sums)
                centroids[mask] = emb_sums[mask] / counts[mask].unsqueeze(1)

                # normalize centroids
                centroids = nn.functional.normalize(centroids, dim=1, p=2)

            getattr(self, "centroids" + str(i_K)).copy_(centroids)
            # gather the assignments
            assignments_all = gather_from_all(assignments)
            indexes_all = gather_from_all(self.local_memory_index)
            self.assignments[i_K] = -100
            self.assignments[i_K][indexes_all] = assignments_all

            j = (j + 1) % self.nmb_mbs

    logging.info(f"Rank: {get_rank()}, clustering of the memory bank done")
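# Hedged sketch of a helper with the same contract that get_indices_sparse is used
# for above: for each cluster id k, return the indices of the samples assigned to k,
# accessed as result[k][0]. `get_indices_per_cluster` is a hypothetical, naive numpy
# version for illustration; the repository's own helper may use a sparse matrix for speed.
import numpy as np

def get_indices_per_cluster(assignments: np.ndarray, num_clusters: int):
    # np.where returns a tuple of index arrays, so result[k][0] yields the sample
    # indices assigned to cluster k, matching the access pattern in the M step above.
    return [np.where(assignments == k) for k in range(num_clusters)]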