def _construct_exemplar_set(self, task_data: Dataset, dist_args=None, **kwargs) -> None:
    """
    Fill the buffer with exemplars of the classes of the current task.

    For every class in ``task_data.cur_task`` a set of image indices is requested from the dataset and each
    of those images is stored in the buffer through ``self.add_sample``.

    Args:
        task_data (Dataset): The new task data
        dist_args (Optional[Dict]): a dictionary of the distributed processing values in case of multiple gpu
            (ex: rank of the device). Anything other than None is not supported by this implementation
            (default: None)

    Raises:
        NotImplementedError: if ``dist_args`` is given and the task contains at least one class
    """
    multi_gpu = dist_args is not None

    for class_label in task_data.cur_task:
        # Never ask for more samples than either the per-class quota or the pool size allows.
        quota = min(self.n_mems_per_cla, self.max_mems_pool_size)
        if multi_gpu:
            raise NotImplementedError
        sampled_indices = task_data.get_image_indices_by_cla(class_label, quota)

        for image_index in sampled_indices:
            image, label1, label2 = task_data.get_item(image_index)
            labels = (label1, label2)
            if label2 != NO_LABEL_PLACEHOLDER:
                # A second label is not expected here; report it but store the sample anyway.
                warnings.warn(
                    f"Sample is being added to the buffer with labels {label1} and {label2}"
                )
            self.add_sample(class_label, image, labels)
def test_Dataset_divide_data_across_tasks_CIL(expected_task_data_size):
    """Each task of a CIL-setup dataset must be assigned the expected number of samples."""
    cil_dataset = Dataset(dataset,
                          tasks,
                          essential_transforms_fn=lambda x: x,
                          setup=CIL_SETUP)
    for task_id, _ in enumerate(tasks):
        task_sample_indices = cil_dataset.task_id_to_data_idx[task_id]
        assert len(task_sample_indices) == expected_task_data_size[task_id]
def _construct_exemplar_set(self, task_data: Dataset, dist_args: Optional[Dict] = None, **kwargs) -> None:
    """
    Fill the buffer with exemplars of the classes of the current task.

    For every class in ``task_data.cur_task`` a set of image indices is requested from the dataset. When
    running distributed, the indices are broadcast from process 0 so that all processes store the same
    images. Each selected image is then stored through ``self.add_sample``.

    Args:
        task_data (Dataset): The new task data
        dist_args (Optional[Dict]): a dictionary of the distributed processing values in case of multiple gpu
            (ex: rank of the device); the keys ``'rank'`` and ``'gpu'`` are read (default: None)
    """
    distributed = dist_args is not None
    rank = dist_args['rank'] if distributed else 0

    for class_label in task_data.cur_task:
        quota = min(self.n_mems_per_cla, self.max_mems_pool_size)
        selected_indices = task_data.get_image_indices_by_cla(class_label, quota)

        if distributed:
            # Overwrite the local selection with the one made by process 0.
            device = torch.device(f"cuda:{dist_args['gpu']}")
            shared_indices = torch.from_numpy(selected_indices).to(device)
            torch.distributed.broadcast(shared_indices, 0)
            selected_indices = shared_indices.cpu().numpy()

        for image_index in selected_indices:
            image, label1, label2 = task_data.get_item(image_index)
            labels = (label1, label2)
            if label2 != NO_LABEL_PLACEHOLDER:
                # A second label is not expected here; report it but store the sample anyway.
                warnings.warn(
                    f"Sample is being added to the buffer with labels {label1} and {label2}"
                )
            self.add_sample(class_label, image, labels, rank=rank)
def test_Dataset_divide_data_across_tasks_IIRC_Train(expected_task_data_size):
    """Each task of an IIRC-setup training dataset must be assigned the expected number of samples."""
    dataset_options = dict(
        essential_transforms_fn=lambda x: x,
        setup=IIRC_SETUP,
        test_mode=False,
        superclass_data_pct=superclass_data_pct,
        subclass_data_pct=subclass_data_pct,
    )
    iirc_train_dataset = Dataset(dataset, tasks, **dataset_options)

    for task_id, _ in enumerate(tasks):
        task_sample_indices = iirc_train_dataset.task_id_to_data_idx[task_id]
        assert len(task_sample_indices) == expected_task_data_size[task_id]
def _prepare_model_for_new_task(self, task_data: Dataset, dist_args: Optional[dict] = None, **kwargs) -> None:
    """
    A method specific function that takes place before the starting epoch of each new task (runs from the
    prepare_model_for_task function).

    It keeps a frozen copy of the current network in ``self.old_net``. For every task after the first one
    it also replaces the output layer with a split cosine layer sized for the old and the new classes,
    imprints the weights of the new part, and restricts training to the parameters outside
    ``output_layer.fc1``. Finally it resets the optimizer and scheduler.

    Args:
        task_data (Dataset): The new task dataset
        dist_args (Optional[Dict]): a dictionary of the distributed processing values in case of multiple gpu
            (ex: rank of the device) (default: None)

    Raises:
        ValueError: if the current task id is negative
    """
    # Keep the network as it was before this task, with gradients frozen.
    self.old_net = copy_freeze(self.net)
    self.old_net.eval()

    task_id = self.cur_task_id
    n_old = int(sum(self.n_cla_per_tsk[:task_id]))
    n_new = self.n_cla_per_tsk[task_id]
    device = next(self.net.parameters()).device

    # Grow the classifier: fc1 is sized for the old classes, fc2 for the new ones.
    if task_id > 0:
        previous_head = self.net.model.output_layer
        new_head = cosine_linear.SplitCosineLinear(in_features=self.latent_dim,
                                                   out_features1=n_old,
                                                   out_features2=n_new,
                                                   sigma=self.sigma).to(device)
        if task_id == 1:
            # The previous head is still a single (non-split) layer.
            new_head.fc1.weight.data = previous_head.weight.data
        else:
            # Merge both halves of the previous split head into the new fc1.
            split_at = previous_head.fc1.out_features
            new_head.fc1.weight.data[:split_at] = previous_head.fc1.weight.data
            new_head.fc1.weight.data[split_at:] = previous_head.fc2.weight.data
        new_head.sigma.data = previous_head.sigma.data
        self.net.model.output_layer = new_head

        self.lambda_cur = self.lambda_base * math.sqrt(n_old * 1.0 / n_new)
        print_msg(f"Lambda for less forget is set to {self.lambda_cur}")
    elif task_id != 0:
        raise ValueError("task id cannot be negative")

    # Initialise the new classes' weights from their features, with augmentations turned off.
    with task_data.disable_augmentations():
        if task_id > 0:
            print_msg("Imprinting weights")
            self.net = self._imprint_weights(task_data, self.net, dist_args)

    # From the second task on, the fc1 weights are left out of the optimizer.
    if task_id > 0:
        trainable_parameters = [
            param
            for name, param in self.net.named_parameters()
            if "output_layer.fc1" not in name
        ]
    else:
        trainable_parameters = self.net.parameters()
    self.reset_optimizer_and_scheduler(trainable_parameters)
def test_choose_task(expected_task_data_size):
    """Selecting a task must expose only that task's samples, each carrying a single in-task label."""
    lifelong_dataset = Dataset(dataset,
                               tasks,
                               setup=IIRC_SETUP,
                               essential_transforms_fn=lambda x: x,
                               superclass_data_pct=superclass_data_pct,
                               subclass_data_pct=subclass_data_pct)

    for task_id, task_classes in enumerate(tasks):
        lifelong_dataset.choose_task(task_id)
        assert len(lifelong_dataset) == expected_task_data_size[task_id]

        classes_up_to_now = {cla for task in tasks[:task_id + 1] for cla in task}
        assert set(lifelong_dataset.seen_classes) == classes_up_to_now

        # No label from outside the current task may be given (complete information mode is off here).
        assert lifelong_dataset.cur_task == task_classes
        for sample_idx in range(len(lifelong_dataset)):
            _, label_1, label_2 = lifelong_dataset[sample_idx]
            # Outside of complete information mode only one label is provided.
            assert label_2 == NO_LABEL_PLACEHOLDER
            assert label_1 in task_classes
def test_complete_information_mode(expected_task_data_size, expected_data_up_to_size):
    """In complete information mode every sample carries all of its labels that have been seen so far."""
    lifelong_dataset = Dataset(dataset,
                               tasks,
                               setup=IIRC_SETUP,
                               essential_transforms_fn=lambda x: x,
                               superclass_data_pct=superclass_data_pct,
                               subclass_data_pct=subclass_data_pct)
    lifelong_dataset.enable_complete_information_mode()

    def check_labels_of_loaded_samples():
        # Every label is either the placeholder or a seen class, and a subclass label always comes
        # together with its superclass.
        for sample_idx in range(len(lifelong_dataset)):
            _, label_1, label_2 = lifelong_dataset[sample_idx]
            assert label_1 == NO_LABEL_PLACEHOLDER or label_1 in lifelong_dataset.seen_classes
            assert label_2 == NO_LABEL_PLACEHOLDER or label_2 in lifelong_dataset.seen_classes
            if label_1 in subclasses_superclasses:
                assert label_2 == subclasses_superclasses[label_1]
            elif label_2 in subclasses_superclasses:
                assert label_1 == subclasses_superclasses[label_2]

    for task_id, task_classes in enumerate(tasks):
        classes_up_to_now = {cla for task in tasks[:task_id + 1] for cla in task}

        # Only the current task loaded.
        lifelong_dataset.choose_task(task_id)
        assert len(lifelong_dataset) == expected_task_data_size[task_id]
        assert set(lifelong_dataset.seen_classes) == classes_up_to_now
        assert lifelong_dataset.cur_task == task_classes
        check_labels_of_loaded_samples()

        # All tasks up to and including the current one loaded.
        lifelong_dataset.load_tasks_up_to(task_id)
        assert len(lifelong_dataset) == expected_data_up_to_size[task_id]
        assert set(lifelong_dataset.cur_task) == classes_up_to_now
        check_labels_of_loaded_samples()
def _imprint_weights(self, task_data: Dataset, model: Union[ResNet, ResNetCIFAR],
                     dist_args: Optional[dict] = None) -> Union[ResNet, ResNetCIFAR]:
    """
    Initialise the weights of the new classes in ``model.model.output_layer.fc2`` from their latent features.

    For each class of the current task whose index is not below ``fc1.out_features``, the latent features of
    the class images are L2-normalised and averaged. That mean is normalised again and scaled by the average
    row norm of the ``fc1`` weights, then written to the row of ``fc2`` belonging to that class. Rows of
    classes that are skipped stay zero.

    The model is switched to eval mode and is not switched back.

    Args:
        task_data (Dataset): The new task dataset
        model (Union[ResNet, ResNetCIFAR]): The network whose split output layer is imprinted; calling it
            must return a pair ``(output, latent_features)``
        dist_args (Optional[Dict]): a dictionary of the distributed processing values in case of multiple gpu
            (ex: rank of the device); the key ``'gpu'`` is read (default: None)

    Returns:
        Union[ResNet, ResNetCIFAR]: the same model object, with the ``fc2`` weights replaced
    """
    distributed = dist_args is not None
    if distributed:
        device = torch.device(f"cuda:{dist_args['gpu']}")
    else:
        # Feed the inputs to wherever the model actually lives instead of guessing from CUDA
        # availability, which breaks for a CPU model on a CUDA machine or a model on a non-default GPU.
        device = next(model.parameters()).device

    class_names = task_data.cur_task
    class_names_2_idx = self.class_names_to_idx
    model.eval()

    output_layer = model.model.output_layer
    num_old_classes = output_layer.fc1.out_features
    old_weights_norm = output_layer.fc1.weight.data.norm(dim=1, keepdim=True)
    average_old_weights_norm = torch.mean(old_weights_norm, dim=0)
    new_weights = torch.zeros_like(output_layer.fc2.weight.data)

    # Number of images requested per class for estimating the class mean.
    num_samples = 1000

    for cla in class_names:
        cla_id = class_names_2_idx[cla]
        if cla_id < num_old_classes:
            # This class already has a row in fc1; nothing to imprint.
            continue

        class_indices = task_data.get_image_indices_by_cla(cla, num_samples=num_samples, shuffle=False)
        if distributed:
            # Make sure all the gpus use the indices chosen by process 0.
            class_data_indices_to_broadcast = torch.from_numpy(class_indices).to(device)
            torch.distributed.broadcast(class_data_indices_to_broadcast, 0)
            class_indices = class_data_indices_to_broadcast.cpu().numpy()

        sampler = SubsetSampler(class_indices)
        class_loader = DataLoader(task_data, batch_size=self.batch_size, sampler=sampler)

        normalized_latent_feat = []
        with torch.no_grad():
            for minibatch in class_loader:
                inputs = minibatch[0].to(device)
                _, latent_features = model(inputs)
                latent_features = latent_features.detach()
                latent_features = F.normalize(latent_features, p=2, dim=-1)
                normalized_latent_feat.append(latent_features)
            normalized_latent_feat = torch.cat(normalized_latent_feat, dim=0)
            mean_latent_feat = torch.mean(normalized_latent_feat, dim=0)
            normalized_mean_latent = F.normalize(mean_latent_feat, p=2, dim=0)
            # Give the new row the same magnitude as an average old-class row.
            new_weights[cla_id - num_old_classes] = normalized_mean_latent * average_old_weights_norm

    model.model.output_layer.fc2.weight.data = new_weights
    return model
def _construct_exemplar_set(self, task_data: Dataset, dist_args: Optional[dict] = None,
                            model: torch.nn.Module = None, batch_size=1, **kwargs):
    """
    Update the buffer with the new task samples using herding

    For each class of the current task, a pool of candidate images is requested from the dataset and their
    L2-normalised latent vectors are computed. Exemplars are then picked one at a time: at every step the
    candidate whose inclusion brings the mean of the chosen exemplars closest to the class mean is taken.

    Args:
        task_data (Dataset): The new task data
        dist_args (Optional[Dict]): a dictionary of the distributed processing values in case of multiple gpu
            (ex: rank of the device); the keys ``'gpu'`` and ``'rank'`` are read (default: None)
        model (BaseMethod): The current method object to calculate the latent variables; its ``forward_net``
            must return a pair ``(output, latent)``
        batch_size (int): The minibatch size
    """
    distributed = dist_args is not None
    if distributed:
        device = torch.device(f"cuda:{dist_args['gpu']}")
        rank = dist_args['rank']
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        rank = 0

    new_class_labels = task_data.cur_task
    model.eval()

    # Augmentations are switched off for the duration of the block; no gradients are needed either.
    with task_data.disable_augmentations(), torch.no_grad():
        for class_label in new_class_labels:
            pool_indices = task_data.get_image_indices_by_cla(class_label, self.max_mems_pool_size)
            if distributed:
                # Overwrite the local candidate pool with the one drawn by process 0.
                shared_indices = torch.from_numpy(pool_indices).to(device)
                dist.broadcast(shared_indices, 0)
                pool_indices = shared_indices.cpu().numpy()

            pool_loader = DataLoader(task_data,
                                     batch_size=batch_size,
                                     sampler=SubsetSampler(pool_indices))

            # One normalised latent vector per candidate, in the order of pool_indices.
            feature_batches = []
            for minibatch in pool_loader:
                images = minibatch[0].to(device)
                _, batch_latent = model.forward_net(images)
                feature_batches.append(F.normalize(batch_latent.detach(), p=2, dim=-1))
            features = torch.cat(feature_batches, dim=0)
            class_mean = torch.mean(features, dim=0)

            selected_indices = []
            running_mean = torch.zeros_like(class_mean)
            num_to_select = min(self.n_mems_per_cla, len(pool_indices))
            while len(selected_indices) < num_to_select:
                num_selected = len(selected_indices)
                # Mean of the chosen exemplars if each candidate in turn were added next.
                candidate_means = (running_mean.unsqueeze(0) * num_selected + features) \
                    / (num_selected + 1)
                gaps = (class_mean.unsqueeze(0) - candidate_means).norm(dim=-1)
                best = torch.argmin(gaps).item()
                running_mean = candidate_means[best, :].clone()
                selected_indices.append(pool_indices[best])
                # An infinite feature makes the distance infinite, so this candidate is never picked again.
                features[best, :] = float("inf")

            for image_index in selected_indices:
                image, label1, label2 = task_data.get_item(image_index)
                if label2 != NO_LABEL_PLACEHOLDER:
                    warnings.warn(
                        f"Sample is being added to the buffer with labels {label1} and {label2}"
                    )
                self.add_sample(class_label, image, (label1, label2), rank=rank)
def test_get_shuffled_image_indices(class1, class2, class1_size0, class1_size01, class2_size1, class2_size01):
    """Indices returned per class must point to samples of that class, with and without complete information."""
    lifelong_dataset = Dataset(dataset,
                               tasks,
                               setup=IIRC_SETUP,
                               essential_transforms_fn=lambda x: x,
                               superclass_data_pct=superclass_data_pct,
                               subclass_data_pct=subclass_data_pct)
    assert class1 in tasks[0]
    assert class2 in tasks[1]

    def check_single_label(indices, expected_class):
        # With a single task chosen, each sample carries exactly the requested class label.
        for idx in indices:
            _, label_1, label_2 = lifelong_dataset[idx]
            assert label_1 == expected_class
            assert label_2 == NO_LABEL_PLACEHOLDER

    lifelong_dataset.choose_task(0)
    class1_indices = lifelong_dataset.get_image_indices_by_cla(class1)
    assert len(class1_indices) == class1_size0
    check_single_label(class1_indices, class1)

    lifelong_dataset.choose_task(1)
    class2_indices = lifelong_dataset.get_image_indices_by_cla(class2)
    assert len(class2_indices) == class2_size1
    check_single_label(class2_indices, class2)

    # Both tasks loaded together, with all observed labels attached to each sample.
    lifelong_dataset.enable_complete_information_mode()
    lifelong_dataset.load_tasks_up_to(1)
    class1_indices = lifelong_dataset.get_image_indices_by_cla(class1)
    class2_indices = lifelong_dataset.get_image_indices_by_cla(class2)
    assert len(class1_indices) == class1_size01
    assert len(class2_indices) == class2_size01

    class2_or_nothing = (class2, NO_LABEL_PLACEHOLDER)
    for idx in class1_indices:
        _, label_1, label_2 = lifelong_dataset[idx]
        if label_1 == class1:
            assert label_2 in class2_or_nothing
        elif label_2 == class1:
            assert label_1 in class2_or_nothing
        else:
            raise ValueError(f"{class1} is not there")

    for idx in class2_indices:
        _, label_1, label_2 = lifelong_dataset[idx]
        if label_1 == class2:
            assert label_2 == class1
        elif label_2 == class2:
            assert label_1 == class1
        else:
            raise ValueError(f"{class2} is not there")