def create_model(self): """ Initializing the model to optimize. """ # creating the model and sending it to the proper device self.model = MOVEModel(emb_size=self.cfg['emb_size']) self.model.to(self.device) # computing and printing the total number of parameters of the model self.num_params = 0 for param in self.model.parameters(): self.num_params += np.prod(param.size()) print('Total number of parameters for the model: {:.0f}'.format( self.num_params))
def create_model(self): """ Initializing the model to optimize. """ # creating the student model and sending it to the proper device self.model = MOVEModel(emb_size=self.cfg['emb_size'], sum_method=4, final_activation=3) self.model.to(self.device) # initializing necessary models/data for KD training self.teacher = None self.lp_layer = None self.centroids = None # creating the teacher model and sending it to the proper device # this step is for the distance-based KD training if self.cfg['kd_loss'] == 'distance': self.teacher = MOVEModel(emb_size=16000, sum_method=4, final_activation=3) self.teacher.load_state_dict( torch.load(os.path.join(self.cfg['main_path'], 'saved_models/model_move.pt'), map_location='cpu')) self.teacher.to(self.device) self.teacher.eval() # creating the linear projection layer and loading the class centroids # this step is for the cluster-based KD training elif self.cfg['kd_loss'] == 'cluster': self.lp_layer = nn.Linear(in_features=16000, out_features=self.cfg['emb_size'], bias=False) self.lp_layer.to(self.device) self.centroids = torch.load( os.path.join(self.cfg['main_path'], 'data/centroids.pt')) # computing and printing the total number of parameters of the new model self.num_params = 0 for param in self.model.parameters(): self.num_params += np.prod(param.size()) print('Total number of parameters for the model: {:.0f}'.format( self.num_params))
def create_model(self): """ Initializing the model to optimize. """ # creating and loading the learned parameters of the MOVE model # this model stands as our base model self.model = MOVEModel(emb_size=16000, sum_method=4, final_activation=3) self.model.load_state_dict( torch.load(os.path.join(self.cfg['main_path'], 'saved_models/model_move.pt'), map_location='cpu')) # freezing the parameters of all the parameters of the base model for param in self.model.parameters(): param.requires_grad = False # creating a new linear layer and a new batch normalization layer self.model.lin1 = torch.nn.Linear(in_features=256, out_features=self.cfg['emb_size'], bias=False) self.model.lin_bn = torch.nn.BatchNorm1d(self.cfg['emb_size'], affine=False) # setting the embedding size of the model self.model.fin_emb_size = self.cfg['emb_size'] # sending the model to the proper device self.model.to(self.device) # computing and printing the total number of parameters of the new model self.num_params = 0 for param in self.model.parameters(): self.num_params += np.prod(param.size()) print('Total number of parameters for the model: {:.0f}'.format( self.num_params))
def evaluate(save_name, model_type, emb_size, sum_method, final_activation):
    print("Prepare random dataset")
    num_examples = 1000
    test_data = []
    for i in tqdm(range(num_examples)):
        t_length = np.random.randint(1000, 2000)
        cremaPCP = np.random.rand(t_length, 12)
        cremaPCP_tensor = torch.from_numpy(cremaPCP).t()
        cremaPCP_reshaped = torch.cat(
            (cremaPCP_tensor, cremaPCP_tensor))[:23].unsqueeze(0)
        test_data.append(cremaPCP_reshaped)

    test_map_set = MOVEDatasetFull(data=test_data)
    test_map_loader = DataLoader(test_map_set, batch_size=1, shuffle=False)

    print("Initialize model")

    # initializing the model
    if model_type == 0:
        move_model = MOVEModel(emb_size=emb_size, sum_method=sum_method,
                               final_activation=final_activation)
    elif model_type == 1:
        move_model = MOVEModelNT(emb_size=emb_size, sum_method=sum_method,
                                 final_activation=final_activation)

    # loading a pre-trained model
    model_name = 'saved_models/model_{}.pt'.format(save_name)
    print("Load model")
    move_model.load_state_dict(torch.load(model_name, map_location='cpu'))
    move_model.eval()

    # sending the model to gpu, if available
    if torch.cuda.is_available():
        move_model.cuda()
        device = 'cuda:0'
    else:
        device = 'cpu'

    print("Run extract feature")
    with torch.no_grad():  # deactivating gradient tracking for testing
        move_model.eval()  # setting the model to evaluation mode

        # tensor for storing all the embeddings obtained from the test set
        embed_all = torch.tensor([], device=device)

        for batch_idx, item in tqdm(enumerate(test_map_loader)):
            # sending the pcp features to cuda, if available
            if torch.cuda.is_available():
                item = item.cuda()

            # obtaining the embedding of each song in the mini-batch
            res_1 = move_model(item)

            # adding the embedding of the current song to the others
            embed_all = torch.cat((embed_all, res_1))

    return embed_all.cpu()
class KDTrainer(BaseTrainer):
    """
    Trainer object for Knowledge Distillation experiments.
    """

    def __init__(self, cfg, experiment_name):
        """
        Initializing the trainer
        :param cfg: dictionary that holds the config hyper-parameters
        :param experiment_name: name of the experiment
        """
        # initializing the parent Trainer object
        super().__init__(cfg, experiment_name)

    def handle_training_batches(self):
        """
        Training loop for one mini-epoch.
        :return: training loss for the current mini-epoch
        """
        # setting the model to training mode
        self.model.train()

        # initializing a list object to hold losses from each iteration
        epoch_loss = []

        # training loop
        for batch_idx, batch in enumerate(self.data_loader):
            # if overfit_batch == 1, only the same batch is trained.
            # this helps to see whether there are any issues with optimization.
            # a fast over-fitting behaviour is expected.
            if self.cfg['overfit_batch'] == 1:
                if batch_idx == 0:
                    overfit_batch = batch
                else:
                    batch = overfit_batch

            # making sure the data and labels are on the correct device and in float32 type
            items, labels = batch
            items = handle_device(items, self.device)
            labels = handle_device(labels, self.device)

            # forward pass of the student model
            # obtaining the embeddings of each item in the batch
            embs_s = self.model(items)

            # if the distance-based KD loss is chosen,
            # we obtain the embeddings of each item from the teacher model
            with torch.no_grad():
                embs_t = self.teacher(items) if self.cfg['kd_loss'] == 'distance' else None

            # calculating the KD loss for the iteration
            kd_loss = KD_LOSS_DICT[self.cfg['kd_loss']](
                embs_s=embs_s,
                embs_t=embs_t,
                emb_size=self.cfg['emb_size'],
                lp_layer=self.lp_layer,
                labels=labels,
                centroids=self.centroids)

            # calculating the triplet loss for the iteration
            main_loss = triplet_loss(
                data=embs_s,
                labels=labels,
                emb_size=self.cfg['emb_size'],
                margin=self.cfg['margin'],
                mining_strategy=self.cfg['mining_strategy'])

            # summing the KD and triplet loss values
            loss = kd_loss + main_loss

            # setting gradients of the optimizer to zero
            self.optimizer.zero_grad()

            # calculating gradients with backpropagation
            loss.backward()

            # updating the weights
            self.optimizer.step()

            # logging the loss value of the current batch
            epoch_loss.append(loss.detach().item())

        # logging the loss value of the current mini-epoch
        return np.mean(epoch_loss)

    def create_model(self):
        """
        Initializing the model to optimize.
        """
        # creating the student model and sending it to the proper device
        self.model = MOVEModel(emb_size=self.cfg['emb_size'],
                               sum_method=4,
                               final_activation=3)
        self.model.to(self.device)

        # initializing the models/data needed for KD training
        self.teacher = None
        self.lp_layer = None
        self.centroids = None

        # creating the teacher model and sending it to the proper device
        # this step is for the distance-based KD training
        if self.cfg['kd_loss'] == 'distance':
            self.teacher = MOVEModel(emb_size=16000,
                                     sum_method=4,
                                     final_activation=3)
            self.teacher.load_state_dict(
                torch.load(os.path.join(self.cfg['main_path'],
                                        'saved_models/model_move.pt'),
                           map_location='cpu'))
            self.teacher.to(self.device)
            self.teacher.eval()

        # creating the linear projection layer and loading the class centroids
        # this step is for the cluster-based KD training
        elif self.cfg['kd_loss'] == 'cluster':
            self.lp_layer = nn.Linear(in_features=16000,
                                      out_features=self.cfg['emb_size'],
                                      bias=False)
            self.lp_layer.to(self.device)
            self.centroids = torch.load(
                os.path.join(self.cfg['main_path'], 'data/centroids.pt'))

        # computing and printing the total number of parameters of the new model
        self.num_params = 0
        for param in self.model.parameters():
            self.num_params += np.prod(param.size())
        print('Total number of parameters for the model: {:.0f}'.format(
            self.num_params))

    def create_optimizer(self):
        """
        Initializing the optimizer.
        In the case of distance-based KD training, no additional parameters
        are given to the optimizer.
        In the case of cluster-based KD training, the parameters of the linear
        projection layer are updated, as well as the parameters of the student model.
        """
        # getting the parameters of the student model
        opt_params = list(self.model.parameters())

        # for the cluster-based KD training, appending the parameters of
        # the linear projection layer for the optimizer
        if self.cfg['kd_loss'] == 'cluster':
            opt_params += list(self.lp_layer.parameters())

        if self.cfg['optimizer'] == 0:
            self.optimizer = torch.optim.SGD(opt_params,
                                             lr=self.cfg['learning_rate'],
                                             momentum=self.cfg['momentum'])
        elif self.cfg['optimizer'] == 1:
            self.optimizer = Ranger(opt_params, lr=self.cfg['learning_rate'])
        else:
            self.optimizer = None
for audio_file in tqdm(audio_files):
    crema_feature = compute_features(audio_path=audio_file,
                                     params=params,
                                     feature=feature)
    idxs = np.arange(0, crema_feature.shape[0], 8)
    temp_tensor = torch.from_numpy(crema_feature[idxs].T)
    crema_feature_list.append(
        torch.cat((temp_tensor, temp_tensor))[:23].unsqueeze(0))

test_set = FullSizeInstanceDataset(data=crema_feature_list)
test_loader = DataLoader(test_set, batch_size=1, shuffle=False)

print("Initializing model")

# initializing the model
model = MOVEModel(emb_size=args.emb_size)

# loading a pre-trained model
model_name = os.path.join(args.main_path, 'saved_models',
                          '{}_models'.format(args.exp_type),
                          'model_{}.pt'.format(experiment_name))
model.load_state_dict(torch.load(model_name, map_location='cpu'))

# sending the model to gpu, if available
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model.to(device)

remove_items = []
with torch.no_grad():  # disabling gradient tracking
    model.eval()  # setting the model to evaluation mode
def train(save_name, train_path, chunks, val_path, save_model, save_summary,
          seed, num_of_epochs, model_type, emb_size, sum_method,
          final_activation, lr, lrsch, lrsch_factor, momentum, patch_len,
          num_of_labels, ytc, data_aug, norm_dist, mining_strategy, margin):
    """
    Main training function of MOVE.
    For a detailed explanation of the parameters, please check 'python move_main.py --help'
    :param save_name: name to save the model and the experiment summary
    :param train_path: path of the training data
    :param chunks: how many chunks to use for the training data
    :param val_path: path of the validation data
    :param save_model: whether to save the model (1) or not (0)
    :param save_summary: whether to save the experiment summary (1) or not (0)
    :param seed: random seed
    :param num_of_epochs: number of epochs for training
    :param model_type: which model to use: MOVE (0) or MOVE without transposition invariance (1)
    :param emb_size: the size of the final embeddings produced by the model
    :param sum_method: the summarization method for the model
    :param final_activation: final activation to use for the model
    :param lr: value of the learning rate
    :param lrsch: which learning rate scheduler to use
    :param lrsch_factor: the decrease rate of the learning rate
    :param momentum: momentum for the optimizer
    :param patch_len: number of frames of each song to be used in training
    :param num_of_labels: number of labels per mini-batch
    :param ytc: whether to exclude the songs overlapping with ytc from training
    :param data_aug: whether to use data augmentation
    :param norm_dist: whether to normalize squared euclidean distances with the embedding size
    :param mining_strategy: which mining strategy to use
    :param margin: the margin for the triplet loss
    """
    summary = dict()  # initializing the summary dict

    # initiating the necessary random seeds
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        torch.cuda.manual_seed(seed)

    # initializing the model
    if model_type == 0:
        move_model = MOVEModel(emb_size=emb_size, sum_method=sum_method,
                               final_activation=final_activation)
    elif model_type == 1:
        move_model = MOVEModelNT(emb_size=emb_size, sum_method=sum_method,
                                 final_activation=final_activation)
    else:
        raise Exception('Invalid number for the model parameter.')

    # sending the model to gpu, if available
    if torch.cuda.is_available():
        move_model.cuda()

    # initiating the optimizer
    optimizer = SGD(move_model.parameters(), lr=lr, momentum=momentum)

    # initializing the lists for tracking losses
    train_loss_log = []
    val_loss_log = []
    val_map_log = []

    # loading the training and validation data
    if chunks == 1:  # hack for handling 1 chunk of training data
        train_path = '{}_1.pt'.format(train_path)

    train_data, train_labels = import_dataset_from_pt(
        'data/{}'.format(train_path), chunks=chunks, model_type=model_type)
    print('Train data has been loaded!')

    val_data, val_labels = import_dataset_from_pt(
        'data/{}'.format(val_path), chunks=1, model_type=model_type)
    print('Validation data has been loaded!')

    # selecting the H dimension of the input data;
    # different models handle different input sizes
    if model_type == 0:
        h = 23
    else:
        h = 12

    # initializing the MOVE dataset objects and data loaders
    # we use the validation set to track two things: (1) triplet loss, (2) mean average precision
    # to check mean average precision on the full songs,
    # we need to define another dataset object and data loader for it
    train_set = MOVEDatasetFixed(train_data, train_labels, h=h, w=patch_len,
                                 data_aug=data_aug, ytc=ytc)
    train_loader = DataLoader(train_set, batch_size=num_of_labels, shuffle=True,
                              collate_fn=triplet_mining_collate, drop_last=True)

    val_set = MOVEDatasetFixed(val_data, val_labels, h=h, w=patch_len,
                               data_aug=0)
    val_loader = DataLoader(val_set, batch_size=num_of_labels, shuffle=True,
                            collate_fn=triplet_mining_collate, drop_last=True)

    val_map_set = MOVEDatasetFull(val_data, val_labels)
    val_map_loader = DataLoader(val_map_set, batch_size=1, shuffle=False)

    # initializing the learning rate scheduler
    if lrsch == 0:
        pass
    else:
        if lrsch == 1:
            milestones = [80]
        else:
            milestones = [80, 100]
        lr_schedule = lr_scheduler.MultiStepLR(optimizer,
                                               milestones=milestones,
                                               gamma=lrsch_factor)

    # calculating the number of parameters of the model
    tmp = 0
    for p in move_model.parameters():
        tmp += np.prod(p.size())
    print('Num of parameters = {}'.format(int(tmp)))

    print('--- Training starts ---')
    print('Model name: {}'.format(save_name))

    start_time = time.monotonic()  # start time for tracking the duration of the entire training

    # main training loop
    for epoch in range(num_of_epochs):
        last_epoch = epoch  # tracking the last epoch to make sure that the model didn't quit early

        start = time.monotonic()  # start time for the training loop
        train_loss = train_triplet_mining(move_model=move_model,
                                          optimizer=optimizer,
                                          train_loader=train_loader,
                                          margin=margin,
                                          norm_dist=norm_dist,
                                          mining_strategy=mining_strategy)
        print('Training loop: Epoch {} - Duration {:.2f} mins'.format(
            epoch, (time.monotonic() - start) / 60))

        start = time.monotonic()  # start time for the validation loop
        val_loss = validate_triplet_mining(move_model=move_model,
                                           val_loader=val_loader,
                                           margin=margin,
                                           norm_dist=norm_dist,
                                           mining_strategy=mining_strategy)
        print('Validation loop: Epoch {} - Duration {:.2f} mins'.format(
            epoch, (time.monotonic() - start) / 60))

        start = time.monotonic()  # start time for the mean average precision calculation

        # calculating the pairwise distances on the validation set
        dist_map_matrix = test(move_model=move_model,
                               test_loader=val_map_loader).cpu()

        # calculating the performance metrics
        # the average_precision function uses similarities, not distances;
        # we multiply the distances by -1, and set the diagonal (self-similarity) to -inf
        val_map_score = average_precision(
            -1 * dist_map_matrix.float().clone() +
            torch.diag(torch.ones(len(val_data)) * float('-inf')),
            dataset=0)
        print('Test loop: Epoch {} - Duration {:.2f} mins'.format(
            epoch, (time.monotonic() - start) / 60))

        # saving the loss values for the summary
        train_loss_log.append(train_loss)
        val_loss_log.append(val_loss)
        val_map_log.append(val_map_score.item())

        # saving the model, if needed
        if save_model == 1:
            if not os.path.exists('saved_models/'):
                os.mkdir('saved_models/')
            torch.save(move_model.state_dict(),
                       'saved_models/model_{}.pt'.format(save_name))

        # printing the losses
        print('training_loss: {}'.format(train_loss))
        print('val_loss: {}'.format(val_loss))

        # activating the learning rate scheduler, if needed
        if lrsch != 0:
            lr_schedule.step()

        # dumping the current loss values to the summary
        summary['train_loss_log'] = train_loss_log
        summary['val_loss_log'] = val_loss_log
        summary['val_map_log'] = val_map_log

        # saving the summary, if needed, after each epoch
        if save_summary == 1:
            if not os.path.exists('experiment_summaries/'):
                os.mkdir('experiment_summaries/')
            with open('experiment_summaries/summary_{}.json'.format(save_name), 'w') as log:
                json.dump(summary, log, indent='\t')

    end_time = time.monotonic()  # end time of the entire training loop

    # logging all code parameters in the summary file
    summary['save_name'] = save_name
    summary['train_path'] = train_path
    summary['chunks'] = chunks
    summary['val_path'] = val_path
    summary['save_model'] = save_model
    summary['save_summary'] = save_summary
    summary['random_seed'] = seed
    summary['num_of_epochs'] = num_of_epochs
    summary['model_type'] = model_type
    summary['emb_size'] = emb_size
    summary['sum_method'] = sum_method
    summary['final_activation'] = final_activation
    summary['learning_rate'] = lr
    summary['lr_schedule'] = lrsch
    summary['lrsch_factor'] = lrsch_factor
    summary['momentum'] = momentum
    summary['patch_len'] = patch_len
    summary['num_of_labels'] = num_of_labels
    summary['ytc_labels'] = ytc
    summary['data_aug'] = data_aug
    summary['norm_dist'] = norm_dist
    summary['mining_strategy'] = mining_strategy
    summary['margin'] = margin
    summary['last_epoch'] = last_epoch
    summary['training_time'] = end_time - start_time
    summary['train_loss_log'] = train_loss_log
    summary['val_loss_log'] = val_loss_log
    summary['val_map_log'] = val_map_log

    # saving the last version of the summary
    if save_summary == 1:
        if not os.path.exists('experiment_summaries/'):
            os.mkdir('experiment_summaries/')
        with open('experiment_summaries/summary_{}.json'.format(save_name), 'w') as log:
            json.dump(summary, log, indent='\t')

    # saving the last version of the model
    if save_model == 1:
        if not os.path.exists('saved_models/'):
            os.mkdir('saved_models/')
        torch.save(move_model.state_dict(),
                   'saved_models/model_{}.pt'.format(save_name))
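# Illustrative only: a tiny standalone example (not part of the project) of the
# transformation applied before average_precision above. Distances are negated to
# obtain similarities, and the diagonal (self-similarity) is set to -inf so that
# a track can never retrieve itself.
import torch

dist = torch.rand(4, 4)
sim = -1 * dist + torch.diag(torch.ones(4) * float('-inf'))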
def evaluate(save_name, model_type, emb_size, sum_method, final_activation,
             dataset, dataset_name):
    """
    Main evaluation function of MOVE.
    For a detailed explanation of the parameters, please check 'python move_main.py --help'
    :param save_name: name to save the model and the experiment summary
    :param model_type: which model to use: MOVE (0) or MOVE without transposition invariance (1)
    :param emb_size: the size of the final embeddings produced by the model
    :param sum_method: the summarization method for the model
    :param final_activation: final activation to use for the model
    :param dataset: which dataset to evaluate the model on:
                    (0) validation set, (1) da-tacos, (2) ytc
    :param dataset_name: name of the file to evaluate
    """
    # indicating which dataset to use for evaluation
    # val_subset_crema is the name of our validation set
    if dataset_name == '':
        if dataset == 0:
            dataset_name = 'data/val_subset_crema.pt'
        elif dataset == 1:
            dataset_name = 'data/benchmark_crema.pt'
        else:
            dataset_name = 'data/ytc_crema.h5'
    else:
        dataset_name = 'data/{}'.format(dataset_name)

    print('Evaluating model {} on dataset {}.'.format(save_name, dataset_name))

    # initializing the model
    if model_type == 0:
        move_model = MOVEModel(emb_size=emb_size, sum_method=sum_method,
                               final_activation=final_activation)
    elif model_type == 1:
        move_model = MOVEModelNT(emb_size=emb_size, sum_method=sum_method,
                                 final_activation=final_activation)

    # loading a pre-trained model
    model_name = 'saved_models/model_{}.pt'.format(save_name)
    move_model.load_state_dict(torch.load(model_name, map_location='cpu'))
    move_model.eval()

    # sending the model to gpu, if available
    if torch.cuda.is_available():
        move_model.cuda()

    # loading the test data, initializing the dataset object and the data loader
    test_data, test_labels = import_dataset_from_pt(filename=dataset_name)
    test_map_set = MOVEDatasetFull(data=test_data, labels=test_labels)
    test_map_loader = DataLoader(test_map_set, batch_size=1, shuffle=False)

    # calculating the pairwise distances
    dist_map_matrix = test(move_model=move_model,
                           test_loader=test_map_loader).cpu()

    # calculating the performance metrics
    average_precision(
        -1 * dist_map_matrix.clone() +
        torch.diag(torch.ones(len(test_data)) * float('-inf')),
        dataset=dataset)
def evaluate(exp_name, exp_type, main_path, emb_size, loss, data_dir):
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    print('Evaluating model {}.'.format(exp_name))

    # collecting and sorting the feature files
    file_list = enumerate_h5_files(data_dir)
    file_list.sort(key=lambda x: os.path.splitext(os.path.basename(x))[0])
    print("Number of feature files: {}".format(len(file_list)))

    data = []
    name = list(map(lambda x: os.path.splitext(os.path.relpath(x, data_dir))[0],
                    file_list))
    print("name: {}".format(name))

    # loading the crema features and preparing the input tensors
    for file in tqdm(file_list):
        temp_crema = dd.io.load(file)["crema"]
        idxs = np.arange(0, temp_crema.shape[0], 8)
        temp_tensor = torch.from_numpy(temp_crema[idxs].T)
        data.append(torch.cat((temp_tensor, temp_tensor))[:23].unsqueeze(0))

    test_set = FullSizeInstanceDataset(data=data)
    test_loader = DataLoader(test_set, batch_size=1, shuffle=False)

    print("Initializing model")

    # initializing the model
    model = MOVEModel(emb_size=emb_size)

    # loading a pre-trained model
    model_name = os.path.join(main_path, 'saved_models',
                              '{}_models'.format(exp_type),
                              'model_{}.pt'.format(exp_name))
    model.load_state_dict(torch.load(model_name, map_location='cpu'))

    # sending the model to gpu, if available
    model.to(device)

    remove_items = []
    with torch.no_grad():  # disabling gradient tracking
        model.eval()  # setting the model to evaluation mode

        # initializing an empty tensor for storing the embeddings
        embed_all = torch.tensor([], device=device)

        # iterating through the data loader
        for batch_idx, item in tqdm(enumerate(test_loader)):
            try:
                # sending the items to the proper device
                item = handle_device(item, device)

                # forward pass of the model
                # obtaining the embedding of each item in the batch
                emb = model(item)

                # appending the current embedding to the collection of embeddings
                embed_all = torch.cat((embed_all, emb))
            except Exception as e:
                print("Error: {}, input shape: {}, index: {}".format(
                    e, item.shape, batch_idx))
                remove_items.append(name[batch_idx])
                continue

    # dropping the names of the items that failed during the forward pass
    for re_item in remove_items:
        name.remove(re_item)
    print("name length: {}".format(len(name)))

    image_with_index_list = dict(zip(name, range(len(name))))

    # L2-normalizing the embeddings
    embed_all = F.normalize(embed_all, p=2, dim=1)

    return embed_all.cpu(), image_with_index_list
class LSRTrainer(BaseTrainer):
    """
    Trainer object for Latent Space Reconfiguration experiments.
    """

    def __init__(self, cfg, experiment_name):
        """
        Initializing the trainer
        :param cfg: dictionary that holds the config hyper-parameters
        :param experiment_name: name of the experiment
        """
        # initializing the parent Trainer object
        super().__init__(cfg, experiment_name)

    def handle_training_batches(self):
        """
        Training loop for one mini-epoch.
        :return: training loss for the current mini-epoch
        """
        # setting the model to training mode
        self.model.train()

        # initializing a list object to hold losses from each iteration
        epoch_loss = []

        # for the first epoch, only the linear layer is trained.
        # starting from the second epoch, all the parameters of the model are trained.
        if self.current_epoch == 1:
            for param in self.model.parameters():
                param.requires_grad = True

        # training loop
        for batch_idx, batch in enumerate(self.data_loader):
            # if overfit_batch == 1, only the same batch is trained.
            # this helps to see whether there are any issues with optimization.
            # a fast over-fitting behaviour is expected.
            if self.cfg['overfit_batch'] == 1:
                if batch_idx == 0:
                    overfit_batch = batch
                else:
                    batch = overfit_batch

            # making sure the data and labels are on the correct device and in float32 type
            items, labels = batch
            items = handle_device(items, self.device)
            labels = handle_device(labels, self.device)

            # forward pass of the model
            # obtaining the embeddings of each item in the batch
            embs = self.model(items)

            # calculating the loss value for the iteration
            loss = LOSS_DICT[self.cfg['loss']](
                data=embs,
                labels=labels,
                emb_size=self.model.fin_emb_size,
                proxies=self.proxies,
                margin=self.cfg['margin'],
                mining_strategy=self.cfg['mining_strategy'])

            # setting gradients of the optimizer to zero
            self.optimizer.zero_grad()

            # calculating gradients with backpropagation
            loss.backward()

            # updating the weights
            self.optimizer.step()

            # logging the loss value of the current batch
            epoch_loss.append(loss.detach().item())

        # logging the loss value of the current mini-epoch
        return np.mean(epoch_loss)

    def create_model(self):
        """
        Initializing the model to optimize.
        """
        # creating the MOVE model and loading its learned parameters;
        # this model stands as our base model
        self.model = MOVEModel(emb_size=16000, sum_method=4, final_activation=3)
        self.model.load_state_dict(
            torch.load(os.path.join(self.cfg['main_path'],
                                    'saved_models/model_move.pt'),
                       map_location='cpu'))

        # freezing all the parameters of the base model
        for param in self.model.parameters():
            param.requires_grad = False

        # creating a new linear layer and a new batch normalization layer
        self.model.lin1 = torch.nn.Linear(in_features=256,
                                          out_features=self.cfg['emb_size'],
                                          bias=False)
        self.model.lin_bn = torch.nn.BatchNorm1d(self.cfg['emb_size'],
                                                 affine=False)

        # setting the embedding size of the model
        self.model.fin_emb_size = self.cfg['emb_size']

        # sending the model to the proper device
        self.model.to(self.device)

        # computing and printing the total number of parameters of the new model
        self.num_params = 0
        for param in self.model.parameters():
            self.num_params += np.prod(param.size())
        print('Total number of parameters for the model: {:.0f}'.format(
            self.num_params))

    def create_optimizer(self):
        """
        Initializing the optimizer.
        For LSR training, we have two types of parameters:
        'new_param' are the ones from the new linear layer,
        and 'finetune_param' are the ones from the 'feature extractor' part of the MOVE model.
        By distinguishing them, we can set different learning rates for each parameter group.
        """
        # getting the parameter groups as explained above
        param_list = ['lin1.weight', 'lin1.bias']
        new_param = [par[1] for par in self.model.named_parameters()
                     if par[0] in param_list]
        finetune_param = [par[1] for par in self.model.named_parameters()
                          if par[0] not in param_list]

        # initializing the proxies if a proxy-based loss is used
        self.proxies = None
        if self.cfg['loss'] in [1, 2, 3]:
            self.proxies = torch.nn.Parameter(
                torch.randn(14499,
                            self.cfg['emb_size'],
                            requires_grad=True,
                            device=self.device))
            new_param.append(self.proxies)

        # setting the proper learning rates and initializing the optimizer
        opt_params = [{'params': finetune_param,
                       'lr': self.cfg['finetune_learning_rate']},
                      {'params': new_param}]

        if self.cfg['optimizer'] == 0:
            self.optimizer = torch.optim.SGD(opt_params,
                                             lr=self.cfg['learning_rate'],
                                             momentum=self.cfg['momentum'])
        elif self.cfg['optimizer'] == 1:
            self.optimizer = Ranger(opt_params, lr=self.cfg['learning_rate'])
        else:
            self.optimizer = None
def evaluate(exp_name, exp_type, main_path, emb_size, loss):
    """
    Main evaluation function of MOVE.
    For a detailed explanation of the parameters, please check 'python move_main.py --help'
    :param main_path: main working directory
    :param exp_name: name to save the model and the experiment summary
    :param exp_type: type of experiment
    :param emb_size: the size of the final embeddings produced by the model
    :param loss: the loss used for training the model
    """
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    eval_dataset = os.path.join(main_path, 'data/benchmark_crema.pt')

    print('Evaluating model {} on dataset {}.'.format(exp_name, eval_dataset))

    # initializing the model
    model = MOVEModel(emb_size=emb_size)

    # loading a pre-trained model
    model_name = os.path.join(main_path, 'saved_models',
                              '{}_models'.format(exp_type),
                              'model_{}.pt'.format(exp_name))
    model.load_state_dict(torch.load(model_name, map_location='cpu'))

    # sending the model to gpu, if available
    model.to(device)

    # loading the test data, initializing the dataset object and the data loader
    test_data, test_labels = import_dataset_from_pt(filename=eval_dataset,
                                                    suffix=False)
    test_set = FullSizeInstanceDataset(data=test_data)
    test_loader = DataLoader(test_set, batch_size=1, shuffle=False)

    start_time = time.monotonic()

    with torch.no_grad():  # disabling gradient tracking
        model.eval()  # setting the model to evaluation mode

        # initializing an empty tensor for storing the embeddings
        embed_all = torch.tensor([], device=device)

        # iterating through the data loader
        for batch_idx, item in enumerate(test_loader):
            # sending the items to the proper device
            item = handle_device(item, device)

            # forward pass of the model
            # obtaining the embedding of each item in the batch
            emb = model(item)

            # appending the current embedding to the collection of embeddings
            embed_all = torch.cat((embed_all, emb))

        # if the Triplet or ProxyNCA loss is used, the distance function is the Euclidean distance
        if loss in [0, 1]:
            dist_all = pairwise_euclidean_distance(embed_all)
            dist_all /= model.fin_emb_size
        # if the NormalizedSoftmax loss is used, the distance function is the cosine distance
        elif loss == 2:
            dist_all = -1 * pairwise_cosine_similarity(embed_all)
        # if the Group loss is used, the distance function is the Pearson correlation coefficient
        else:
            dist_all = -1 * pairwise_pearson_coef(embed_all)

    # computing the evaluation metrics from the obtained distances
    average_precision(
        -1 * dist_all.cpu().float().clone() +
        torch.diag(torch.ones(len(test_data)) * float('-inf')),
        dataset=1)

    test_time = time.monotonic() - start_time

    print('Total time: {:.0f}m{:.0f}s.'.format(test_time // 60, test_time % 60))
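# Illustrative only: minimal sketches of the pairwise distance/similarity helpers
# referenced above. The project's own pairwise_euclidean_distance,
# pairwise_cosine_similarity and pairwise_pearson_coef may be implemented differently;
# these function names with a '_sketch' suffix are assumptions added for clarity.
import torch
import torch.nn.functional as F


def pairwise_euclidean_distance_sketch(x):
    # squared Euclidean distance between every pair of rows
    return torch.cdist(x, x, p=2) ** 2


def pairwise_cosine_similarity_sketch(x):
    # cosine similarity between every pair of rows
    x = F.normalize(x, p=2, dim=1)
    return x @ x.t()


def pairwise_pearson_coef_sketch(x):
    # Pearson correlation = cosine similarity of the mean-centered rows
    x = x - x.mean(dim=1, keepdim=True)
    return pairwise_cosine_similarity_sketch(x)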
class MOVETrainer(BaseTrainer):
    """
    Trainer object for baseline experiments with MOVE.
    """

    def __init__(self, cfg, experiment_name):
        """
        Initializing the trainer
        :param cfg: dictionary that holds the config hyper-parameters
        :param experiment_name: name of the experiment
        """
        # initializing the parent Trainer object
        super().__init__(cfg, experiment_name)

    def handle_training_batches(self):
        """
        Training loop for one mini-epoch.
        :return: training loss for the current mini-epoch
        """
        # setting the model to training mode
        self.model.train()

        # initializing a list object to hold losses from each iteration
        epoch_loss = []

        # training loop
        for batch_idx, batch in enumerate(self.data_loader):
            # if overfit_batch == 1, only the same batch is trained.
            # this helps to see whether there are any issues with optimization.
            # a fast over-fitting behaviour is expected.
            if self.cfg['overfit_batch'] == 1:
                if batch_idx == 0:
                    overfit_batch = batch
                else:
                    batch = overfit_batch

            # making sure the data and labels are on the correct device and in float32 type
            items, labels = batch
            items = handle_device(items, self.device)
            labels = handle_device(labels, self.device)

            # forward pass of the model
            # obtaining the embeddings of each item in the batch
            embs = self.model(items)

            # calculating the loss value for the iteration
            loss = LOSS_DICT[self.cfg['loss']](
                data=embs,
                labels=labels,
                emb_size=self.model.fin_emb_size,
                proxies=self.proxies,
                margin=self.cfg['margin'],
                mining_strategy=self.cfg['mining_strategy'])

            # setting gradients of the optimizer to zero
            self.optimizer.zero_grad()

            # calculating gradients with backpropagation
            loss.backward()

            # updating the weights
            self.optimizer.step()

            # logging the loss value of the current batch
            epoch_loss.append(loss.detach().item())

        # logging the loss value of the current mini-epoch
        return np.mean(epoch_loss)

    def create_model(self):
        """
        Initializing the model to optimize.
        """
        # creating the model and sending it to the proper device
        self.model = MOVEModel(emb_size=self.cfg['emb_size'])
        self.model.to(self.device)

        # computing and printing the total number of parameters of the model
        self.num_params = 0
        for param in self.model.parameters():
            self.num_params += np.prod(param.size())
        print('Total number of parameters for the model: {:.0f}'.format(
            self.num_params))

    def create_optimizer(self):
        """
        Initializing the optimizer.
        """
        # parameters to train
        opt_params = list(self.model.parameters())

        # initializing the proxies if a proxy-based loss is used
        self.proxies = None
        if self.cfg['loss'] in [1, 2, 3]:
            self.proxies = torch.nn.Parameter(
                torch.randn(14499,
                            self.cfg['emb_size'],
                            requires_grad=True,
                            device=self.device))
            opt_params.append(self.proxies)

        if self.cfg['optimizer'] == 0:
            self.optimizer = torch.optim.SGD(opt_params,
                                             lr=self.cfg['learning_rate'],
                                             momentum=self.cfg['momentum'])
        elif self.cfg['optimizer'] == 1:
            self.optimizer = Ranger(opt_params, lr=self.cfg['learning_rate'])
        else:
            self.optimizer = None
class PruningTrainer(BaseTrainer):
    """
    Trainer object for Pruning experiments.
    """

    def __init__(self, cfg, experiment_name):
        """
        Initializing the trainer
        :param cfg: dictionary that holds the config hyper-parameters
        :param experiment_name: name of the experiment
        """
        # initializing the parent Trainer object
        super().__init__(cfg, experiment_name)

    def train(self, save_logs=True):
        """
        Main training function for Pruning experiments.
        It overrides the training function of the BaseTrainer to add pruning-related functionality.
        :param save_logs: whether to save the training and validation loss logs
        """
        # saving the initial parameters of the model for the other pruning iterations
        torch.save(
            self.model.state_dict(),
            os.path.join(self.cfg['main_path'], 'saved_models', 'pruning_models',
                         'model_{}_initial.pt'.format(self.experiment_name)))

        # iterating over full training cycles for pruning
        for prune_iteration in range(self.cfg['pruning_iterations'] + 1):
            self.prune_iteration = prune_iteration

            # loading the initial parameters of the model
            if prune_iteration > 0:
                self.model.load_state_dict(
                    torch.load(
                        os.path.join(
                            self.cfg['main_path'], 'saved_models',
                            'pruning_models',
                            'model_{}_initial.pt'.format(self.experiment_name))))

                # resetting the learning rate
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.cfg['learning_rate']

                # re-creating the learning rate schedule for the new training cycle
                self.create_lr_scheduler()

            # executing a full training cycle
            super().train(save_logs=False)

            # selecting which indices of the linear layer to prune,
            # based on the trained model
            self.select_indices_to_prune()

        if save_logs:
            with open('./experiment_logs/{}_logs/{}.json'.format(
                    self.cfg['exp_type'], self.experiment_name), 'w') as f:
                json.dump(
                    {
                        'train_loss_log': self.train_loss_log,
                        'val_loss_log': self.val_loss_log
                    }, f)

    def handle_training_batches(self):
        """
        Training loop for one mini-epoch.
        :return: training loss for the current mini-epoch
        """
        # setting the model to training mode
        self.model.train()

        # initializing a list object to hold losses from each iteration
        epoch_loss = []

        # training loop
        for batch_idx, batch in enumerate(self.data_loader):
            # if overfit_batch == 1, only the same batch is trained.
            # this helps to see whether there are any issues with optimization.
            # a fast over-fitting behaviour is expected.
            if self.cfg['overfit_batch'] == 1:
                if batch_idx == 0:
                    overfit_batch = batch
                else:
                    batch = overfit_batch

            # making sure the data and labels are on the correct device and in float32 type
            items, labels = batch
            items = handle_device(items, self.device)
            labels = handle_device(labels, self.device)

            # forward pass of the model
            # obtaining the embeddings of each item in the batch
            embs = self.model(items)

            # calculating the loss value for the iteration
            loss = triplet_loss(data=embs,
                                labels=labels,
                                emb_size=self.cfg['emb_size'],
                                margin=self.cfg['margin'],
                                mining_strategy=self.cfg['mining_strategy'])

            # setting gradients of the optimizer to zero
            self.optimizer.zero_grad()

            # calculating gradients with backpropagation
            loss.backward()

            # updating the weights
            self.optimizer.step()

            # applying the zero-mask to the selected indices
            if self.prune_iteration > 0:
                self.apply_mask()

            # logging the loss value of the current batch
            epoch_loss.append(loss.detach().item())

        # logging the loss value of the current mini-epoch
        return np.mean(epoch_loss)

    def apply_mask(self):
        """
        Applying the mask tensor to the linear layer to 'prune' weights.
        """
        self.model.lin1.weight.data = self.model.lin1.weight.data * self.mask
        self.model.fin_emb_size = (self.model.lin1.weight.shape[0] -
                                   NUM_OF_ROWS_TO_PRUNE[self.prune_iteration])

    def select_indices_to_prune(self):
        """
        Selecting which indices to prune based on the trained model.
        """
        # choosing the rows of the linear layer with the smallest mean absolute weight
        self.indices_to_prune = torch.topk(
            torch.abs(self.model.lin1.weight).mean(dim=1),
            k=NUM_OF_ROWS_TO_PRUNE[self.prune_iteration],
            largest=False).indices

        # creating a mask of ones and zeros
        mask = torch.ones(self.model.lin1.weight.shape)
        zero_row = torch.zeros(1, self.model.lin1.weight.shape[1])

        # sending the tensors to the proper device
        mask = handle_device(mask, self.device)
        zero_row = handle_device(zero_row, self.device)

        # finalizing the mask based on the selected indices
        mask[self.indices_to_prune] = zero_row

        self.mask = mask

    def create_model(self):
        """
        Initializing the model to optimize.
        """
        # creating the model and sending it to the proper device
        self.model = MOVEModel(emb_size=16000)
        self.model.to(self.device)

        # computing and printing the total number of parameters of the new model
        self.num_params = 0
        for param in self.model.parameters():
            self.num_params += np.prod(param.size())
        print('Total number of parameters for the model: {:.0f}'.format(
            self.num_params))

    def create_optimizer(self):
        """
        Initializing the optimizer.
        """
        if self.cfg['optimizer'] == 0:
            self.optimizer = torch.optim.SGD(self.model.parameters(),
                                             lr=self.cfg['learning_rate'],
                                             momentum=self.cfg['momentum'])
        elif self.cfg['optimizer'] == 1:
            self.optimizer = Ranger(self.model.parameters(),
                                    lr=self.cfg['learning_rate'])
        else:
            self.optimizer = None