def forward(self, x):
    x = self.features(x)
    x = torch.flatten(x, 1)
    x = self.fc(x)
    return x
def forward(self, x):
    x = self.features(x)
    x = self.avgpool(x)
    x = torch.flatten(x, 1)
    x = self.classifier(x)
    return x
def forward(self, x):
    x = self.pool(F.relu(self.conv(x)))
    x = torch.flatten(x, start_dim=1)
    h = F.relu(self.linear_1(x))
    y = F.softmax(self.linear_2(h), dim=1)
    return y
def forward(self, x):
    x = self.pool(x)
    x = torch.flatten(x, 1)
    x = self.fc(x)
    return x
else:
    numbOfBatches = batchSize
    leftIndex = batchID * batchSize
    rightIndex = leftIndex + numbOfBatches
    courseid_LR = course_id_LR[leftIndex:rightIndex].clone().long()
    videoid = video_id[leftIndex:rightIndex].clone().long()
    continuesfeature1 = continues_feature1[leftIndex:rightIndex].clone()
    courseid_CNN = course_id_CNN[leftIndex:rightIndex].clone().long()
    continuesfeature2 = continues_feature2[leftIndex:rightIndex].clone()

    predictions, LR_result, GRU_result = model(courseid_LR, courseid_CNN, continuesfeature1,
                                               continuesfeature2, videoid, numbOfBatches)
    predictions = torch.flatten(predictions)
    LR_result = torch.flatten(LR_result)
    GRU_result = torch.flatten(GRU_result)

    loss_final = MSELoss(predictions, y[leftIndex:rightIndex].float())
    loss_lr = MSELoss(LR_result, y[leftIndex:rightIndex].float())
    loss_gru = MSELoss(GRU_result, y[leftIndex:rightIndex].float())
    # print('loss: ', loss)
    loss = loss_final + loss_lr + loss_gru

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    loss_value.append(loss.item())
    # testing
def fit(self, insample_dataloader, outsample_dataloader):
    # Instantiate optimization tools
    loss = nn.CrossEntropyLoss()
    optimizer = optim.SGD([{'params': self.model.input_layer.parameters()},
                           {'params': self.model.hidden_layers.parameters()},
                           {'params': self.model.output_layer.parameters(),
                            'weight_decay': self.params['output_l2_decay']}],
                          lr=self.params['initial_lr'],
                          momentum=self.params['initial_momentum'])
    constrainer = WeightNormConstrainer(norm=self.params['weight_norm'])

    # Initialize counters and trajectories
    step = 0
    epoch = 0
    metric_trajectories = {'step': [], 'epoch': [],
                           'insample_accuracy': [], 'outsample_accuracy': [],
                           'insample_cross_entropy': [], 'outsample_cross_entropy': []}

    print('\n' + '=' * 36 + ' Fitting DCLF ' + '=' * 36)
    while step <= self.params['iterations']:
        # Train
        epoch += 1
        self.model.train()
        for batch in iter(insample_dataloader):
            step += 1
            if step > self.params['iterations']:
                continue
            batch_x = t.flatten(batch[0].to(self.device), start_dim=1)
            batch_y = batch[1].to(self.device)

            optimizer.zero_grad()
            # TODO: make predictions, compute the cross entropy loss and perform backward propagation
            logits = None
            t.nn.utils.clip_grad_norm_(self.model.parameters(), 20)
            optimizer.step()

            # Evaluate metrics
            if step % self.params['display_step'] == 0:
                in_cross_entropy = self.evaluate_cross_entropy(insample_dataloader)
                out_cross_entropy = self.evaluate_cross_entropy(outsample_dataloader)
                in_accuracy = self.evaluate_accuracy(insample_dataloader)
                out_accuracy = self.evaluate_accuracy(outsample_dataloader)

                print('Epoch:', '%d,' % epoch,
                      'Step:', '%d,' % step,
                      'In Loss: {:.7f},'.format(in_cross_entropy),
                      'Out Loss: {:.7f},'.format(out_cross_entropy),
                      'In Acc: {:03.3f},'.format(in_accuracy),
                      'Out Acc: {:03.3f}'.format(out_accuracy))

                metric_trajectories['insample_cross_entropy'].append(in_cross_entropy)
                metric_trajectories['outsample_cross_entropy'].append(out_cross_entropy)
                metric_trajectories['insample_accuracy'].append(in_accuracy)
                metric_trajectories['outsample_accuracy'].append(out_accuracy)

            # Update optimizer learning rate
            if step % self.params['adjust_lr_step'] == 0:
                self.adjust_lr(optimizer=optimizer, lr_decay=self.params['lr_decay'])

            # Update optimizer momentum
            if step % self.params['adjust_momentum_step'] == 0 and \
                    step < self.params['momentum_change_steps']:
                self.adjust_momentum(optimizer=optimizer, step=step,
                                     momentum_change_steps=self.params['momentum_change_steps'],
                                     initial_momentum=self.params['initial_momentum'],
                                     final_momentum=self.params['final_momentum'])

            # Constrain max_norm of weights
            if self.params['apply_weight_norm'] and \
                    (step % self.params['adjust_norm_step'] == 0):
                self.model.apply(constrainer)

    # Store trajectories
    print('\n' + '=' * 35 + ' Finished Train ' + '=' * 35)
    self.trajectories = metric_trajectories
def _post_backward_hook(self, param: Parameter, *unused: Any) -> None:
    """
    At the start of :func:`_post_backward_hook`, ``param.grad`` contains the
    full gradient for the local batch. The reduce-scatter op will replace
    ``param.grad`` with a single shard of the summed gradient across all
    GPUs. This shard will align with the current GPU rank. For example::

        before reduce_scatter:
            param.grad (GPU #0): [1, 2, 3, 4]
            param.grad (GPU #1): [5, 6, 7, 8]

        after reduce_scatter:
            param.grad (GPU #0): [6, 8]    # 1+5, 2+6
            param.grad (GPU #1): [10, 12]  # 3+7, 4+8

    The local GPU's ``optim.step`` is responsible for updating a single
    shard of params, also corresponding to the current GPU's rank. This
    alignment is created by :func:`_shard_parameters`, which ensures that
    the local optimizer only sees the relevant parameter shard.
    """
    # First hook callback will see PRE state. If we have multiple params,
    # then subsequent hook callbacks will see POST state.
    self._assert_state(
        [TrainingState_.BACKWARD_PRE, TrainingState_.BACKWARD_POST])
    self.training_state = TrainingState_.BACKWARD_POST
    if param.grad is None:
        return
    if param.grad.requires_grad:
        raise RuntimeError(
            "FSDP only works with gradients that don't require gradients")

    self._free_full_params([param])
    # Switch to local shard after backward.
    self._use_param_local_shard([param])

    # Wait for all work in the current stream to finish, then start the
    # reductions in post_backward stream.
    self._streams["post_backward"].wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(self._streams["post_backward"]):
        orig_grad_data = param.grad.data

        if self.gradient_predivide_factor > 1:
            # Average grad by world_size for consistency with PyTorch DDP.
            param.grad.div_(self.gradient_predivide_factor)

        if param._is_sharded:  # type: ignore[attr-defined]
            grad_flatten = torch.flatten(param.grad)
            chunks = list(grad_flatten.chunk(self.world_size))
            num_pad = self.world_size * chunks[0].numel() - param.grad.numel()
            input_flattened = F.pad(grad_flatten, [0, num_pad])
            output = torch.zeros_like(chunks[0])
            dist._reduce_scatter_base(
                output, input_flattened, group=self.process_group)
            if self.gradient_postdivide_factor > 1:
                # Average grad by world_size for consistency with PyTorch DDP.
                output.div_(self.gradient_postdivide_factor)
            param.grad.data = output
        else:
            # Currently the only way for _is_sharded to be False is if
            # world_size == 1. This could be relaxed in the future, e.g,
            # no sharding like PyTorch DDP, in which case grads should be
            # all-reduced here.
            assert (
                self.world_size == 1
            ), "Currently the only way for _is_sharded to be False is \
                world_size == 1"

        # After _post_backward_hook returns, orig_grad_data will eventually
        # go out of scope, at which point it could otherwise be freed for
        # further reuse by the main stream while the div/reduce_scatter/copy
        # are underway in the post_backward stream. See:
        # github.com/NVIDIA/apex/blob/master/apex/parallel/distributed.py
        orig_grad_data.record_stream(self._streams["post_backward"])
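# A minimal, illustrative sketch (not the FSDP source) of the pad-then-shard arithmetic
# used in the hook above: the flattened gradient is padded so it splits evenly into
# world_size chunks, and each rank keeps the chunk matching its rank after the
# reduce-scatter. The helper name, world_size, and tensor values are assumptions for
# demonstration; the real cross-rank summation is done by dist._reduce_scatter_base.
import torch
import torch.nn.functional as F

def shard_for_rank(grad: torch.Tensor, world_size: int, rank: int) -> torch.Tensor:
    grad_flatten = torch.flatten(grad)
    chunks = list(grad_flatten.chunk(world_size))
    num_pad = world_size * chunks[0].numel() - grad_flatten.numel()
    input_flattened = F.pad(grad_flatten, [0, num_pad])
    # On a real process group the corresponding chunks would also be summed across ranks here.
    return input_flattened.chunk(world_size)[rank]

print(shard_for_rank(torch.tensor([1., 2., 3., 4., 5.]), world_size=2, rank=1))  # tensor([4., 5., 0.])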
def forward(self, batch_size, encoded_hidden, encoded_outputs, encoded_lens, story, max_res_len,
            target_batches, use_teacher_forcing, slot_temp):
    all_point_outputs = torch.zeros(len(slot_temp), batch_size, max_res_len, self.vocab_size)
    all_gate_outputs = torch.zeros(len(slot_temp), batch_size, self.nb_gate)
    if USE_CUDA:
        all_point_outputs = all_point_outputs.cuda()
        all_gate_outputs = all_gate_outputs.cuda()

    # Get the slot embedding
    slot_emb_dict = {}
    for i, slot in enumerate(slot_temp):
        # Domain embedding
        if slot.split("-")[0] in self.slot_w2i.keys():
            domain_w2idx = [self.slot_w2i[slot.split("-")[0]]]
            domain_w2idx = torch.tensor(domain_w2idx)
            if USE_CUDA:
                domain_w2idx = domain_w2idx.cuda()
            domain_emb = self.Slot_emb(domain_w2idx)
        # Slot embedding
        if slot.split("-")[1] in self.slot_w2i.keys():
            slot_w2idx = [self.slot_w2i[slot.split("-")[1]]]
            slot_w2idx = torch.tensor(slot_w2idx)
            if USE_CUDA:
                slot_w2idx = slot_w2idx.cuda()
            slot_emb = self.Slot_emb(slot_w2idx)

        # Combine two embeddings as one query
        combined_emb = domain_emb + slot_emb
        slot_emb_dict[slot] = combined_emb
        slot_emb_exp = combined_emb.expand_as(encoded_hidden)
        if i == 0:
            slot_emb_arr = slot_emb_exp.clone()
        else:
            slot_emb_arr = torch.cat((slot_emb_arr, slot_emb_exp), dim=0)

    if args["parallel_decode"]:
        # Compute pointer-generator output, putting all (domain, slot) in one batch
        decoder_input = self.dropout_layer(slot_emb_arr).view(-1, self.hidden_size)  # (batch*|slot|) * emb
        hidden = encoded_hidden.repeat(1, len(slot_temp), 1)  # 1 * (batch*|slot|) * emb
        words_point_out = [[] for i in range(len(slot_temp))]
        words_class_out = []

        for wi in range(max_res_len):
            dec_state, hidden = self.gru(decoder_input.expand_as(hidden), hidden)
            enc_out = encoded_outputs.repeat(len(slot_temp), 1, 1)
            enc_len = encoded_lens * len(slot_temp)
            context_vec, logits, prob = self.attend(enc_out, hidden.squeeze(0), enc_len)

            if wi == 0:
                all_gate_outputs = torch.reshape(self.W_gate(context_vec), all_gate_outputs.size())

            p_vocab = self.attend_vocab(self.embedding.weight, hidden.squeeze(0))
            p_gen_vec = torch.cat([dec_state.squeeze(0), context_vec, decoder_input], -1)
            vocab_pointer_switches = self.sigmoid(self.W_ratio(p_gen_vec))
            p_context_ptr = torch.zeros(p_vocab.size())
            if USE_CUDA:
                p_context_ptr = p_context_ptr.cuda()

            p_context_ptr.scatter_add_(1, story.repeat(len(slot_temp), 1), prob)

            final_p_vocab = (1 - vocab_pointer_switches).expand_as(p_context_ptr) * p_context_ptr + \
                            vocab_pointer_switches.expand_as(p_context_ptr) * p_vocab
            pred_word = torch.argmax(final_p_vocab, dim=1)
            words = [self.lang.index2word[w_idx.item()] for w_idx in pred_word]

            for si in range(len(slot_temp)):
                words_point_out[si].append(words[si * batch_size:(si + 1) * batch_size])

            all_point_outputs[:, :, wi, :] = torch.reshape(final_p_vocab,
                                                           (len(slot_temp), batch_size, self.vocab_size))

            if use_teacher_forcing:
                decoder_input = self.embedding(torch.flatten(target_batches[:, :, wi].transpose(1, 0)))
            else:
                decoder_input = self.embedding(pred_word)

            if USE_CUDA:
                decoder_input = decoder_input.cuda()
    else:
        # Compute pointer-generator output, decoding each (domain, slot) one-by-one
        words_point_out = []
        counter = 0
        for slot in slot_temp:
            hidden = encoded_hidden
            words = []
            slot_emb = slot_emb_dict[slot]
            decoder_input = self.dropout_layer(slot_emb).expand(batch_size, self.hidden_size)
            for wi in range(max_res_len):
                dec_state, hidden = self.gru(decoder_input.expand_as(hidden), hidden)
                context_vec, logits, prob = self.attend(encoded_outputs, hidden.squeeze(0), encoded_lens)
                if wi == 0:
                    all_gate_outputs[counter] = self.W_gate(context_vec)
                p_vocab = self.attend_vocab(self.embedding.weight, hidden.squeeze(0))
                p_gen_vec = torch.cat([dec_state.squeeze(0), context_vec, decoder_input], -1)
                vocab_pointer_switches = self.sigmoid(self.W_ratio(p_gen_vec))
                p_context_ptr = torch.zeros(p_vocab.size())
                if USE_CUDA:
                    p_context_ptr = p_context_ptr.cuda()
                p_context_ptr.scatter_add_(1, story, prob)
                final_p_vocab = (1 - vocab_pointer_switches).expand_as(p_context_ptr) * p_context_ptr + \
                                vocab_pointer_switches.expand_as(p_context_ptr) * p_vocab
                pred_word = torch.argmax(final_p_vocab, dim=1)
                words.append([self.lang.index2word[w_idx.item()] for w_idx in pred_word])
                all_point_outputs[counter, :, wi, :] = final_p_vocab
                if use_teacher_forcing:
                    decoder_input = self.embedding(target_batches[:, counter, wi])  # Chosen word is next input
                else:
                    decoder_input = self.embedding(pred_word)
                if USE_CUDA:
                    decoder_input = decoder_input.cuda()
            counter += 1
            words_point_out.append(words)

    return all_point_outputs, all_gate_outputs, words_point_out, []
def forward(self, x):
    return torch.flatten(x, 1)
def cluster_initializer_online_update(layer, new_train_x, graph):
    def assign_cluster_idx(node, cluster_idx):
        if hasattr(node, 'new_cluster_idx'):
            node.new_cluster_idx = list(
                dict.fromkeys(node.new_cluster_idx + cluster_idx))
        else:
            node.new_cluster_idx = cluster_idx

    if type(layer) == FactorizedLeafLayer.FactorizedLeafLayer:
        if not leafs_oa_bool:
            return
        ef_array = layer.ef_array
        num_var = ef_array.num_var
        array_shape = ef_array.array_shape
        # lookup array consisting of nodes for certain replica_idx
        replica_lookup = [[
            n for n in layer.nodes if n.einet_address.replica_idx == replica_idx
        ] for replica_idx in range(array_shape[1])]
        with torch.no_grad():
            # Construct (num_var, num_replica, num_stats) matrix
            params = ef_array.params.permute(1, 0, 2, 3)[0]
            for sc in range(num_var):
                # construct a replica array for every scope
                for replica_idx in range(array_shape[1]):
                    # first lookup nodes with correct replica_idx
                    for n in replica_lookup[replica_idx]:
                        # when the current scope is in the scope of the node
                        if sc in n.scope and hasattr(n, 'new_cluster_idx') and len(n.new_cluster_idx) > 0:
                            params[sc][replica_idx] *= layer.total_samples_matrix[sc][replica_idx]
                            for n_c_idx in n.new_cluster_idx:
                                params[sc][replica_idx] += new_train_x[n_c_idx][sc]
                            layer.total_samples_matrix[sc][replica_idx] += len(n.new_cluster_idx)
                            params[sc][replica_idx] /= layer.total_samples_matrix[sc][replica_idx]
            # construct correct weight matrix representation
            ef_array.params = torch.nn.Parameter(
                params.repeat(array_shape[0], 1, 1, 1).permute(1, 0, 2, 3))

    elif type(layer) == SumLayer.EinsumLayer:
        """ Einsum layer """
        for product in layer.products:
            # direct successors of current product node
            sum_successors = list(graph.succ[product])
            for sum_node in sum_successors:
                # successors of successor of current product node
                product_successors = list(graph.succ[sum_node])
                if len(product_successors) > 1:
                    # next layer is EinsumMixingLayer
                    # Hence only append cluster_idx to the successor of current product node (sum node)
                    assign_cluster_idx(sum_node, product.new_cluster_idx)
                elif len(product_successors) == 1:
                    # next layer is EinsumLayer
                    # hence append cluster_idx to the successor of the successor (product node)
                    for p in product_successors:
                        assign_cluster_idx(p, product.new_cluster_idx)
                else:
                    # next layer is FactorizedLeafLayer
                    # Hence only append cluster_idx to the successor of current product node (leaf node)
                    assign_cluster_idx(sum_node, product.new_cluster_idx)

    elif type(layer) == SumLayer.EinsumMixingLayer:
        with torch.no_grad():
            params = layer.params[0]
            for si, sum_node in enumerate(layer.nodes):
                if hasattr(sum_node, 'new_cluster_idx'):
                    new_train_x = new_train_x[sum_node.new_cluster_idx]
                else:
                    sum_node.new_cluster_idx = [i for i in range(len(new_train_x))]
                if hasattr(sum_node, 'cl_centers'):
                    # calculate the cluster to which the new sample should be added
                    nearest_index = [-1 for x in new_train_x]
                    nearest_dist = [-1 for x in new_train_x]
                    for i, c in enumerate(sum_node.cl_centers):
                        c_dists = []
                        for x in new_train_x:
                            c_dists.append(distance.euclidean(c, torch.flatten(x).cpu()))
                        for j, d in enumerate(c_dists):
                            if d < nearest_dist[j] or nearest_index[j] < 0:
                                nearest_index[j] = i
                                nearest_dist[j] = c_dists[j]
                    # retrieve, update and store weights
                    if weights_oa_bool:
                        params[si] *= torch.sum(
                            torch.tensor(sum_node.un_weights)).to(torch.device(cuda_device))
                        for i in nearest_index:
                            params[si][i] += 1
                            sum_node.un_weights[i] += 1
                        params[si] /= torch.sum(
                            torch.tensor(sum_node.un_weights)).to(torch.device(cuda_device))
                    # weights = sum_node.un_weights
                    # if not layer._use_em:
                    #     weights = weights.astype(float)
                    #     weights -= np.mean(weights)
                    #     weights /= np.std(weights)
                    #     params[si] *= torch.tensor(weights).to(torch.device(cuda_device))

                    # calculate division of data points
                    cluster_idx = [[] for i in range(layer.max_components)]
                    for i, id in enumerate(nearest_index):
                        cluster_idx[id].append(i)
                else:
                    cluster_idx = [sum_node.new_cluster_idx for i in range(layer.max_components)]
                # store datapoint idx in layer structure
                for i, product in enumerate(list(graph.succ[sum_node])):
                    assign_cluster_idx(product, cluster_idx[i])

            # construct correct weight matrix from params
            params = params.repeat(layer.num_sums, 1, 1).float()
            params = params.to(torch.device(cuda_device))
            # normalize
            if layer._use_em:
                with torch.no_grad():
                    if layer.params_mask is not None:
                        layer.params_mask = layer.params_mask.to(torch.device(cuda_device))
                        params.data *= layer.params_mask
                    params.data = params.data / (params.data.sum(layer.normalization_dims, keepdim=True))
            layer.params = torch.nn.Parameter(params)
def einsum_cluster_initializer_online_update(layer, new_train_x, graph):
    def assign_cluster_idx(node, cluster_idx):
        if hasattr(node, 'new_cluster_idx'):
            new_cluster_idx = []
            if len(cluster_idx) != len(node.new_cluster_idx):
                raise AssertionError("Should not happen")
            for i in range(len(cluster_idx)):
                new_cluster_idx.append(
                    list(dict.fromkeys(node.new_cluster_idx[i] + cluster_idx[i])))
            if len(cluster_idx) != len(new_cluster_idx):
                raise AssertionError("Should not happen")
            node.new_cluster_idx = new_cluster_idx
        else:
            node.new_cluster_idx = cluster_idx

    if type(layer) == FactorizedLeafLayer.FactorizedLeafLayer:
        if not leafs_oa_bool:
            return
        ef_array = layer.ef_array
        num_var = ef_array.num_var
        array_shape = ef_array.array_shape
        # lookup array consisting of nodes for certain replica_idx
        replica_lookup = [[
            n for n in layer.nodes if n.einet_address.replica_idx == replica_idx
        ] for replica_idx in range(array_shape[1])]
        with torch.no_grad():
            # Construct (num_var, num_replica, num_stats) matrix
            params = ef_array.params.permute(1, 0, 2, 3)
            for i in range(array_shape[0]):
                for sc in range(num_var):
                    # construct a replica array for every scope
                    for replica_idx in range(array_shape[1]):
                        # first lookup nodes with correct replica_idx
                        for n in replica_lookup[replica_idx]:
                            # when the current scope is in the scope of the node
                            if sc in n.scope and hasattr(n, 'new_cluster_idx') and len(n.new_cluster_idx) > 0:
                                params[i][sc][replica_idx] *= layer.total_samples_matrix[i][sc][replica_idx]
                                for n_c_idx in n.new_cluster_idx[i]:
                                    params[i][sc][replica_idx] += new_train_x[n_c_idx][sc]
                                layer.total_samples_matrix[i][sc][replica_idx] += len(n.new_cluster_idx[i])
                                params[i][sc][replica_idx] /= layer.total_samples_matrix[i][sc][replica_idx]
            # construct correct weight matrix representation
            ef_array.params = torch.nn.Parameter(params.permute(1, 0, 2, 3))

    elif type(layer) == SumLayer.EinsumLayer:
        """ Einsum layer """
        with torch.no_grad():
            params = layer.params
            for pi, product_node in enumerate(layer.products):
                for ki, k_idx in enumerate(product_node.new_cluster_idx):
                    if product_node.cl_centers[ki] is not None and len(k_idx) > 0:
                        new_train_x_k = new_train_x[k_idx].cpu()
                        new_train_x_k = torch.index_select(
                            new_train_x_k, 1, torch.tensor(product_node.scope))
                        # calculate the cluster to which the new sample should be added
                        nearest_index = [-1 for x in new_train_x_k]
                        nearest_dist = [-1 for x in new_train_x_k]
                        for i, c in enumerate(product_node.cl_centers[ki]):
                            c_dists = []
                            for x in new_train_x_k:
                                c_dists.append(distance.euclidean(c, torch.flatten(x)))
                            for j, d in enumerate(c_dists):
                                if d < nearest_dist[j] or nearest_index[j] < 0:
                                    nearest_index[j] = i
                                    nearest_dist[j] = c_dists[j]
                        # retrieve, update and store weights
                        if weights_oa_bool:
                            params = params.permute(2, 3, 0, 1)
                            weights = torch.flatten(params[ki][pi])
                            weights *= torch.sum(
                                torch.tensor(product_node.un_weights[ki])).to(torch.device(cuda_device))
                            for i in nearest_index:
                                weights[i] += 1
                                product_node.un_weights[ki][i] += 1
                            weights /= torch.sum(
                                torch.tensor(product_node.un_weights[ki])).to(torch.device(cuda_device))
                            params[ki][pi] = torch.reshape(
                                weights, (layer.num_input_dist, layer.num_input_dist))
                            params = params.permute(2, 3, 0, 1)
                        # weights = product_node.un_weights[ki]
                        # if not layer._use_em:
                        #     weights = weights.astype(float)
                        #     weights -= np.mean(weights)
                        #     weights /= np.std(weights)
                        #     weights = np.resize(weights, (layer.num_input_dist, layer.num_input_dist))
                        #     params = params.permute(2, 3, 0, 1)
                        #     params[ki][pi] = torch.tensor(weights)
                        #     params = params.permute(2, 3, 0, 1)

                        # calculate division of data points
                        cluster_idx = [[] for i in range(pow(layer.num_input_dist, 2))]
                        for i, id in enumerate(nearest_index):
                            cluster_idx[id].append(i)
                        cluster_idx = np.resize(
                            cluster_idx, (layer.num_input_dist, layer.num_input_dist))
                        cluster_idx_left = np.sum(cluster_idx, 1)
                        cluster_idx_right = np.sum(cluster_idx, 0)
                    else:
                        cluster_idx_left = [k_idx for i in range(layer.num_input_dist)]
                        cluster_idx_right = [k_idx for i in range(layer.num_input_dist)]

                    # direct successors of current product node, this is always 2 sum nodes in Einsum Networks
                    sum_successors = list(graph.succ[product_node])
                    if len(list(graph.succ[list(graph.succ[product_node])[0]])) == 1:
                        for p in list(graph.succ[list(graph.succ[product_node])[0]]):
                            assign_cluster_idx(p, cluster_idx_left)
                    else:
                        assign_cluster_idx(sum_successors[0], cluster_idx_left)
                    if len(list(graph.succ[list(graph.succ[product_node])[1]])) == 1:
                        for p in list(graph.succ[list(graph.succ[product_node])[1]]):
                            assign_cluster_idx(p, cluster_idx_right)
                    else:
                        assign_cluster_idx(sum_successors[1], cluster_idx_right)

            params = params.to(torch.device(cuda_device))
            # normalize
            if layer._use_em:
                with torch.no_grad():
                    if layer.params_mask is not None:
                        layer.params_mask = layer.params_mask.to(torch.device(cuda_device))
                        params.data *= layer.params_mask
                    params.data = params.data / (params.data.sum(layer.normalization_dims, keepdim=True))
            layer.params = torch.nn.Parameter(params)

    elif type(layer) == SumLayer.EinsumMixingLayer:
        with torch.no_grad():
            params = layer.params
            for si, sum_node in enumerate(layer.nodes):
                cluster_idx_temp = [[] for i in range(layer.max_components)]
                if hasattr(sum_node, 'new_cluster_idx'):
                    k_idxs = sum_node.new_cluster_idx
                else:
                    # here we make the assumption that this only happens in the root node
                    k_idxs = [[i for i in range(len(new_train_x))]]
                for ki, k_idx in enumerate(k_idxs):
                    if sum_node.cl_centers[ki] is not None and len(k_idx) > 0:
                        new_train_x_k = new_train_x[k_idx].cpu()
                        new_train_x_k = torch.index_select(
                            new_train_x_k, 1, torch.tensor(sum_node.scope))
                        # calculate the cluster to which the new sample should be added
                        nearest_index = [-1 for x in new_train_x_k]
                        nearest_dist = [-1 for x in new_train_x_k]
                        for i, c in enumerate(sum_node.cl_centers[ki]):
                            c_dists = []
                            for x in new_train_x_k:
                                c_dists.append(distance.euclidean(c, torch.flatten(x)))
                            for j, d in enumerate(c_dists):
                                if d < nearest_dist[j] or nearest_index[j] < 0:
                                    nearest_index[j] = i
                                    nearest_dist[j] = c_dists[j]
                        # retrieve, update and store weights
                        if weights_oa_bool:
                            params[ki][si] *= torch.sum(
                                torch.tensor(sum_node.un_weights[ki])).to(torch.device(cuda_device))
                            for i in nearest_index:
                                params[ki][si][i] += 1
                                sum_node.un_weights[ki][i] += 1
                            params[ki][si] /= torch.sum(
                                torch.tensor(sum_node.un_weights[ki])).to(torch.device(cuda_device))
                        # weights = sum_node.un_weights[ki]
                        # if not layer._use_em:
                        #     weights = weights.astype(float)
                        #     weights -= np.mean(weights)
                        #     weights /= np.std(weights)
                        #     params[ki][si] = torch.tensor(weights)

                        # calculate division of data points
                        cluster_idx = [[] for i in range(layer.max_components)]
                        for i, id in enumerate(nearest_index):
                            cluster_idx[id].append(i)
                    else:
                        cluster_idx = [k_idx for i in range(layer.max_components)]
                    for i in range(len(cluster_idx)):
                        cluster_idx_temp[i].append(cluster_idx[i])
                # store datapoint idx in layer structure
                for i, product in enumerate(list(graph.succ[sum_node])):
                    assign_cluster_idx(product, cluster_idx_temp[i])

            # construct correct weight matrix from params
            params = params.to(torch.device(cuda_device))
            # normalize
            if layer._use_em:
                with torch.no_grad():
                    if layer.params_mask is not None:
                        layer.params_mask = layer.params_mask.to(torch.device(cuda_device))
                        params.data *= layer.params_mask
                    params.data = params.data / (params.data.sum(layer.normalization_dims, keepdim=True))
            layer.params = torch.nn.Parameter(params)
print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {maxloss:.8f}, Test Loss: {test_maxloss:.8f}')
maxloss = 0.0
test_maxloss = 0.0

with torch.no_grad():
    ff_model.eval()
    target_output = ff_model(eval_input)

STACKCOUNT = 1
curr_input_mat = torch.hstack(
    (target_output[:STACKCOUNT, 6:15], target_output[:STACKCOUNT, 21:27]))
vels_and_accels = torch.hstack(
    (eval_input[STACKCOUNT - 1, 0:6], eval_input[STACKCOUNT - 1, -18:]))
new_input_feet = torch.flatten(curr_input_mat)
lower_body_poses = None

with torch.no_grad():
    for curr_eval_idx in range(STACKCOUNT, eval_input.shape[0]):
        model_output = vae_model(new_input_feet, vels_and_accels)
        if lower_body_poses is None:
            lower_body_poses = model_output
        else:
            lower_body_poses = torch.vstack((lower_body_poses, model_output))
        curr_input_mat = torch.roll(curr_input_mat, -1, 0)
        vels_and_accels = torch.hstack(
            (eval_input[curr_eval_idx, 0:6], eval_input[curr_eval_idx, -18:]))
        curr_input_mat[-1] = model_output
def test_explicit_hessian():
    """Check computation of hessian of loss(B'WA) from
    https://github.com/yaroslavvb/kfac_pytorch/blob/master/derivation.pdf
    """

    torch.set_default_dtype(torch.float64)
    A = torch.tensor([[-1., 4], [3, 0]])
    B = torch.tensor([[-4., 3], [2, 6]])
    X = torch.tensor([[-5., 0], [-2, -6]], requires_grad=True)

    Y = B.t() @ X @ A
    u.check_equal(Y, [[-52, 64], [-81, -108]])
    loss = torch.sum(Y * Y) / 2
    hess0 = u.hessian(loss, X).reshape([4, 4])
    hess1 = u.Kron(A @ A.t(), B @ B.t())
    u.check_equal(loss, 12512.5)

    # PyTorch autograd computes Hessian with respect to row-vectorized parameters, whereas
    # autograd_lib uses math convention and does column-vectorized.
    # Commuting order of Kronecker product switches between two representations
    u.check_equal(hess1.commute(), hess0)

    # Do a test using Linear layers instead of matrix multiplies
    model: u.SimpleFullyConnected2 = u.SimpleFullyConnected2([2, 2, 2], bias=False)
    model.layers[0].weight.data.copy_(X)

    # Transpose to match previous results, layers treat dim0 as batch dimension
    u.check_equal(model.layers[0](A.t()).t(), [[5, -20], [-16, -8]])  # XA = (A'X0)'

    model.layers[1].weight.data.copy_(B.t())
    u.check_equal(model(A.t()).t(), Y)

    Y = model(A.t()).t()    # transpose to data-dimension=columns
    loss = torch.sum(Y * Y) / 2
    loss.backward()

    u.check_equal(model.layers[0].weight.grad, [[-2285, -105], [-1490, -1770]])
    G = B @ Y @ A.t()
    u.check_equal(model.layers[0].weight.grad, G)

    u.check_equal(hess0, u.Kron(B @ B.t(), A @ A.t()))

    # compute newton step
    u.check_equal(u.Kron(A @ A.t(), B @ B.t()).pinv() @ u.vec(G), u.v2c([-5, -2, 0, -6]))

    # compute Newton step using factored representation
    autograd_lib.add_hooks(model)

    Y = model(A.t())
    n = 2
    loss = torch.sum(Y * Y) / 2
    autograd_lib.backprop_hess(Y, hess_type='LeastSquares')
    autograd_lib.compute_hess(model, method='kron', attr_name='hess_kron', vecr_order=False,
                              loss_aggregation='sum')
    param = model.layers[0].weight

    hess2 = param.hess_kron
    print(hess2)

    u.check_equal(hess2, [[425, 170, -75, -30], [170, 680, -30, -120],
                          [-75, -30, 225, 90], [-30, -120, 90, 360]])

    # Gradient test
    model.zero_grad()
    loss.backward()
    u.check_close(u.vec(G).flatten(), u.Vec(param.grad))

    # Newton step test
    # Method 0: PyTorch native autograd
    newton_step0 = param.grad.flatten() @ torch.pinverse(hess0)
    newton_step0 = newton_step0.reshape(param.shape)
    u.check_equal(newton_step0, [[-5, 0], [-2, -6]])

    # Method 1: column major order
    ihess2 = hess2.pinv()
    u.check_equal(ihess2.LL, [[1/16, 1/48], [1/48, 17/144]])
    u.check_equal(ihess2.RR, [[2/45, -(1/90)], [-(1/90), 1/36]])
    u.check_equal(torch.flatten(hess2.pinv() @ u.vec(G)), [-5, -2, 0, -6])
    newton_step1 = (ihess2 @ u.Vec(param.grad)).matrix_form()

    # Method 2: row major order
    ihess2_rowmajor = ihess2.commute()
    newton_step2 = ihess2_rowmajor @ u.Vecr(param.grad)
    newton_step2 = newton_step2.matrix_form()

    u.check_equal(newton_step0, newton_step1)
    u.check_equal(newton_step0, newton_step2)
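# Illustrative numeric check (not part of the test file) of the Kronecker-factored
# Hessian identity the test above relies on: for loss(X) = 0.5 * ||B' X A||_F^2, the
# Hessian with respect to the row-major flattening of X equals kron(B B', A A').
# It reuses only the constants already defined in the test.
import torch

A = torch.tensor([[-1., 4], [3, 0]])
B = torch.tensor([[-4., 3], [2, 6]])
X = torch.tensor([[-5., 0], [-2, -6]])

def loss_fn(X):
    Y = B.t() @ X @ A
    return torch.sum(Y * Y) / 2

H = torch.autograd.functional.hessian(loss_fn, X).reshape(4, 4)  # row-major vec order
assert torch.allclose(H, torch.kron(B @ B.t(), A @ A.t()))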
def _test_explicit_hessian_refactored():
    """Check computation of hessian of loss(B'WA) from
    https://github.com/yaroslavvb/kfac_pytorch/blob/master/derivation.pdf
    """

    torch.set_default_dtype(torch.float64)
    A = torch.tensor([[-1., 4], [3, 0]])
    B = torch.tensor([[-4., 3], [2, 6]])
    X = torch.tensor([[-5., 0], [-2, -6]], requires_grad=True)

    Y = B.t() @ X @ A
    u.check_equal(Y, [[-52, 64], [-81, -108]])
    loss = torch.sum(Y * Y) / 2
    hess0 = u.hessian(loss, X).reshape([4, 4])
    hess1 = u.Kron(A @ A.t(), B @ B.t())
    u.check_equal(loss, 12512.5)

    # Do a test using Linear layers instead of matrix multiplies
    model: u.SimpleFullyConnected2 = u.SimpleFullyConnected2([2, 2, 2], bias=False)
    model.layers[0].weight.data.copy_(X)

    # Transpose to match previous results, layers treat dim0 as batch dimension
    u.check_equal(model.layers[0](A.t()).t(), [[5, -20], [-16, -8]])  # XA = (A'X0)'

    model.layers[1].weight.data.copy_(B.t())
    u.check_equal(model(A.t()).t(), Y)

    Y = model(A.t()).t()    # transpose to data-dimension=columns
    loss = torch.sum(Y * Y) / 2
    loss.backward()

    u.check_equal(model.layers[0].weight.grad, [[-2285, -105], [-1490, -1770]])
    G = B @ Y @ A.t()
    u.check_equal(model.layers[0].weight.grad, G)

    autograd_lib.register(model)

    activations_dict = autograd_lib.ModuleDict()  # todo(y): make save_activations ctx manager automatically create A
    with autograd_lib.save_activations(activations_dict):
        Y = model(A.t())

    Acov = autograd_lib.ModuleDict(autograd_lib.SecondOrderCov)
    for layer, activations in activations_dict.items():
        print(layer, activations)
        Acov[layer].accumulate(activations, activations)
    autograd_lib.set_default_activations(activations_dict)
    autograd_lib.set_default_Acov(Acov)

    B = autograd_lib.ModuleDict(autograd_lib.SymmetricFourthOrderCov)
    autograd_lib.backward_accum(Y, "identity", B, retain_graph=False)

    print(B[model.layers[0]])

    autograd_lib.backprop_hess(Y, hess_type='LeastSquares')
    autograd_lib.compute_hess(model, method='kron', attr_name='hess_kron', vecr_order=False,
                              loss_aggregation='sum')
    param = model.layers[0].weight

    hess2 = param.hess_kron
    print(hess2)

    u.check_equal(hess2, [[425, 170, -75, -30], [170, 680, -30, -120],
                          [-75, -30, 225, 90], [-30, -120, 90, 360]])

    # Gradient test
    model.zero_grad()
    loss.backward()
    u.check_close(u.vec(G).flatten(), u.Vec(param.grad))

    # Newton step test
    # Method 0: PyTorch native autograd
    newton_step0 = param.grad.flatten() @ torch.pinverse(hess0)
    newton_step0 = newton_step0.reshape(param.shape)
    u.check_equal(newton_step0, [[-5, 0], [-2, -6]])

    # Method 1: column major order
    ihess2 = hess2.pinv()
    u.check_equal(ihess2.LL, [[1/16, 1/48], [1/48, 17/144]])
    u.check_equal(ihess2.RR, [[2/45, -(1/90)], [-(1/90), 1/36]])
    u.check_equal(torch.flatten(hess2.pinv() @ u.vec(G)), [-5, -2, 0, -6])
    newton_step1 = (ihess2 @ u.Vec(param.grad)).matrix_form()

    # Method 2: row major order
    ihess2_rowmajor = ihess2.commute()
    newton_step2 = ihess2_rowmajor @ u.Vecr(param.grad)
    newton_step2 = newton_step2.matrix_form()

    u.check_equal(newton_step0, newton_step1)
    u.check_equal(newton_step0, newton_step2)
def decode(self, x):
    x = F.relu(self.dec1(x))
    x = F.relu(self.dec2(x))
    x = F.relu(self.dec3(x))
    x = F.relu(self.dec4(x))
    x = F.relu(self.dec5(x))
    x = F.relu(self.dec6(x))
    return x

def forward(self, x):
    x = self.encode(x)
    representation = x
    x = self.decode(x)
    return x, representation


if __name__ == "__main__":
    random_data = torch.rand((1, 1, 28, 28))
    print(random_data.shape)
    flat_data = torch.flatten(random_data)
    print(flat_data.shape)
    my_nn = AutoEncoder(input_shape=784, output_shape=8)
    my_nn.eval()
    print(my_nn(flat_data))
def quantAwareTrainingForward(model, x, stats, vis=False, axs=None, sym=False, num_bits=8, act_quant=False):
    conv1weight = model.conv1.weight.data
    model.conv1.weight.data = FakeQuantOp.apply(model.conv1.weight.data, num_bits)
    x = F.relu(model.conv1(x))
    x = model.bn1(x)
    with torch.no_grad():
        stats = updateStats(x.clone().view(x.shape[0], -1), stats, 'conv1')
    if act_quant:
        x = FakeQuantOp.apply(x, num_bits, stats['conv1']['ema_min'], stats['conv1']['ema_max'])
    x = F.max_pool2d(x, 3, 2)

    conv2weight = model.conv2.weight.data
    model.conv2.weight.data = FakeQuantOp.apply(model.conv2.weight.data, num_bits)
    x = F.relu(model.conv2(x))
    x = model.bn2(x)
    with torch.no_grad():
        stats = updateStats(x.clone().view(x.shape[0], -1), stats, 'conv2')
    if act_quant:
        x = FakeQuantOp.apply(x, num_bits, stats['conv2']['ema_min'], stats['conv2']['ema_max'])
    x = F.max_pool2d(x, 3, 2)

    conv3weight = model.conv3.weight.data
    model.conv3.weight.data = FakeQuantOp.apply(model.conv3.weight.data, num_bits)
    x = F.relu(model.conv3(x))
    x = model.bn3(x)
    with torch.no_grad():
        stats = updateStats(x.clone().view(x.shape[0], -1), stats, 'conv3')
    if act_quant:
        x = FakeQuantOp.apply(x, num_bits, stats['conv3']['ema_min'], stats['conv3']['ema_max'])

    conv4weight = model.conv4.weight.data
    model.conv4.weight.data = FakeQuantOp.apply(model.conv4.weight.data, num_bits)
    x = F.relu(model.conv4(x))
    x = model.bn4(x)
    with torch.no_grad():
        stats = updateStats(x.clone().view(x.shape[0], -1), stats, 'conv4')
    if act_quant:
        x = FakeQuantOp.apply(x, num_bits, stats['conv4']['ema_min'], stats['conv4']['ema_max'])

    conv5weight = model.conv5.weight.data
    model.conv5.weight.data = FakeQuantOp.apply(model.conv5.weight.data, num_bits)
    x = F.relu(model.conv5(x))
    x = model.bn5(x)
    with torch.no_grad():
        stats = updateStats(x.clone().view(x.shape[0], -1), stats, 'conv5')
    if act_quant:
        x = FakeQuantOp.apply(x, num_bits, stats['conv5']['ema_min'], stats['conv5']['ema_max'])
    x = F.max_pool2d(x, 3, 2)

    x = F.adaptive_avg_pool2d(x, (6, 6))
    x = torch.flatten(x, 1)
    # x = x.view(-1, 1250)  # CIFAR
    x = model.dropout(x)

    fc1weight = model.fc1.weight.data
    model.fc1.weight.data = FakeQuantOp.apply(model.fc1.weight.data, num_bits)
    x = F.relu(model.fc1(x))
    x = model.dropout(x)
    with torch.no_grad():
        stats = updateStats(x.clone().view(x.shape[0], -1), stats, 'fc1')
    if act_quant:
        x = FakeQuantOp.apply(x, num_bits, stats['fc1']['ema_min'], stats['fc1']['ema_max'])

    fc2weight = model.fc2.weight.data
    model.fc2.weight.data = FakeQuantOp.apply(model.fc2.weight.data, num_bits)
    x = F.relu(model.fc2(x))
    with torch.no_grad():
        stats = updateStats(x.clone().view(x.shape[0], -1), stats, 'fc2')
    if act_quant:
        x = FakeQuantOp.apply(x, num_bits, stats['fc2']['ema_min'], stats['fc2']['ema_max'])

    x = model.fc3(x)
    with torch.no_grad():
        stats = updateStats(x.clone().view(x.shape[0], -1), stats, 'fc2')

    return x, \
        conv1weight, conv2weight, conv3weight, conv4weight, conv5weight, \
        fc1weight, fc2weight, stats
def handle_model(
    index: int,
    data: List[DataObj],
    torch_obj: Union[str, List],
    master_url: str = '127.0.0.1',
    iters: int = 1000,
    world_size: int = 2,
    early_stop_patience: int = -1,
    verbose: int = 1,
    mini_batch: int = -1,
    validation_pct: float = 0,
    device: str = 'cpu'
) -> List[Dict]:
    """
    Runs the training of pytorch model, utilizing the distributed package.

    :param index: Partition index. Used for registering.
    :param data: The data from the partition
    :param torch_obj: The torch object string. Needs to be serialized.
    :param master_url: The master url for the service.
    :param iters: The iterations for training
    :param world_size: The amount of partitions. Typically partitions + 1 for the driver
    :param verbose: whether to log the loss or not.
    :param mini_batch: Mini batch for training
    :param validation_pct: Validation percentage.
    :param device: The pytorch device to use for training. cpu/cuda
    :param early_stop_patience: Amount of patience for early stopping. -1 means don't use early stopping.

    :return: A list of the model state dictionary.
    """
    # If a process has already been set up on the machine, kill it.
    if dist.is_initialized():
        dist.destroy_process_group()

    # Set up the distributed server.
    os.environ['MASTER_ADDR'] = master_url
    os.environ['MASTER_PORT'] = '3333'
    dist.init_process_group('gloo', rank=index + 1, world_size=world_size)

    # Load model
    if index == -1:
        process_generic_model(torch_obj, iters, early_stop_patience > 0)
        return []
    else:
        torch_obj = load_torch_model(torch_obj)

    # Loaded the model
    model = torch_obj.model.to(device)
    model.train()
    criterion = torch_obj.criterion
    optimizer = torch_obj.optimizer

    # Set up early stopping
    es = EarlyStopping(patience=early_stop_patience)
    should_stop = torch.zeros(1)
    has_early_stop = early_stop_patience > 0

    partition_id = str(uuid4())

    # Process the data. Converts to x_train, y_train, x_val, y_val
    data_obj = handle_features(data, validation_pct)

    # Check if data is none. We will still need to register.
    if data_obj is None or data_obj.x_train is None:
        process_generic_model([list(p.shape) for p in model.parameters()], iters, early_stop_patience > 0)
        return []

    # Passes all of the data
    x_train = data_obj.x_train.to(device)
    y_train = data_obj.y_train.to(device) if data_obj.y_train is not None else x_train
    x_val = data_obj.x_val.to(device) if data_obj.x_val is not None else None
    y_val = data_obj.y_val.to(device) if data_obj.y_val is not None else x_val

    for i in range(iters):
        optimizer.zero_grad()

        # Utilize minibatch
        if 0 < mini_batch < len(data_obj.x_train):
            idxs = np.random.choice(len(data_obj.x_train), mini_batch, replace=False).tolist()
            x_train = data_obj.x_train[idxs]
            y_train = data_obj.y_train[idxs]

        y_pred = model(x_train)

        try:
            loss = criterion(y_pred, y_train)
        except RuntimeError as e:
            # Utilized when the loss needs a long label
            y_train = torch.flatten(y_train.long())
            loss = criterion(y_pred, y_train)

        loss_v = loss.item()

        # Process validation loss
        val_loss = None
        if x_val is not None:
            pred_val = model(x_val)
            try:
                val_loss = criterion(pred_val, y_val)
            except RuntimeError as e:
                y_val = torch.flatten(y_val.long())
                val_loss = criterion(pred_val, y_val)
            val_loss = val_loss.item()

        # Calculate gradients
        loss.backward()

        # Distributed part of training: average gradients across workers.
        for param in model.parameters():
            dist.all_reduce(param.grad.data, op=torch.distributed.ReduceOp.SUM)
            param.grad.data /= (world_size - 1)

        # Processes the early stop work
        if has_early_stop:
            loss_to_use = val_loss if val_loss is not None else loss_v
            stop = es.step(loss_to_use)
            if stop:
                should_stop = should_stop + 1.0

            dist.all_reduce(should_stop, op=torch.distributed.ReduceOp.SUM)
            if should_stop.item() > 0:
                break

        optimizer.step()

        if verbose:
            print(f"Partition: {partition_id}. Iteration: {i}. Loss: {loss_v}, Val Loss: {val_loss}")

    return [model.state_dict()]
def _flatten_tensor_optim_state(
    state_name: str,
    pos_dim_tensors: List[torch.Tensor],
    unflat_param_names: List[str],
    unflat_param_shapes: List[torch.Size],
    flat_param: FlatParameter,
) -> torch.Tensor:
    """
    Flattens the positive-dimension tensor optimizer state given by the values
    ``tensors`` for the state ``state_name`` for a single flattened parameter
    ``flat_param`` corresponding to the unflattened parameter names
    ``unflat_param_names`` and unflattened parameter shapes
    ``unflat_param_shapes``. This flattens each unflattened parameter's tensor
    state into one tensor.

    NOTE: We use zero tensors for any unflattened parameters without state
    since some value is required to fill those entries. This assumes that the
    zero tensor is mathematically equivalent to having no state, which is true
    for Adam's ``exp_avg`` and ``exp_avg_sq`` but may not be true for all
    optimizers.

    Args:
        state_name (str): Optimizer state name.
        pos_dim_tensors (List[torch.Tensor]): Positive-dimension tensor
            optimizer state values for the unflattened parameters corresponding
            to the single flattened parameter.
        unflat_param_names (List[str]): A :class:`list` of unflattened
            parameter names corresponding to the single flattened parameter.
        unflat_param_shapes (List[torch.Size]): Unflattened parameter shapes
            corresponding to the single flattened parameter.
        flat_param (FlatParameter): The flattened parameter.

    Returns:
        flat_tensor (torch.Tensor): A flattened tensor containing the optimizer
            state corresponding to ``state_name`` constructed by concatenating
            the unflattened parameter tensor states in ``pos_dim_tensors``
            (using zero tensors for any unflattened parameters without the
            state).
    """
    non_none_tensors = [t for t in pos_dim_tensors if t is not None]
    # Check that all are tensors with the same dtype
    dtypes = set(t.dtype for t in non_none_tensors)
    if len(dtypes) != 1:
        raise ValueError(
            "All unflattened parameters comprising a single flattened "
            "parameter must have positive-dimension tensor state with the "
            f"same dtype but got dtypes {dtypes} for state {state_name} and "
            f"unflattened parameter names {unflat_param_names}"
        )
    dtype = next(iter(dtypes))
    # Check that each tensor state matches its parameter's shape
    for tensor, shape in zip(pos_dim_tensors, unflat_param_shapes):
        if tensor is None and len(shape) == 0:
            raise ValueError(
                "Flattening a zero-dimension parameter is not supported"
            )
        elif tensor is not None and tensor.shape != shape:
            raise ValueError(
                "Tensor optimizer state does not have same shape as its "
                f"parameter: {tensor.shape} {shape}"
            )
    # Flatten the tensor states
    cpu_device = torch.device("cpu")
    tensors = [
        torch.flatten(state_value.to(cpu_device)) if state_value is not None
        else torch.flatten(torch.zeros(
            size=shape, dtype=dtype, device=cpu_device,
        ))
        for state_value, shape in zip(pos_dim_tensors, unflat_param_shapes)
    ]
    padding = flat_param.num_padded
    if padding > 0:
        tensors.append(torch.zeros(padding, dtype=dtype, device=cpu_device))
    flat_tensor = torch.cat(tensors)
    # `flat_tensor`'s shape should be 1D and less than or equal to the
    # flattened parameter's shape (where the inequality is strict for positive
    # padding)
    if not flat_param._is_sharded:  # currently, only when world size is 1
        # If the parameter is not sharded, then `_full_param_padded` is not
        # used, so we skip the shape check
        return flat_tensor
    full_padded_dim = flat_param._full_param_padded.dim()  # type: ignore[attr-defined]
    full_padded_shape = flat_param._full_param_padded.shape  # type: ignore[attr-defined]
    assert flat_tensor.dim() == 1, \
        f"`flat_tensor` should be 1D but got {flat_tensor.dim()} dims"
    assert full_padded_dim == 1, \
        f"`_full_param_padded` should be 1D but got {full_padded_dim} dims"
    assert flat_tensor.shape[0] <= full_padded_shape[0], \
        f"tensor optim state: {flat_tensor.shape} " \
        f"parameter: {full_padded_shape}"
    return flat_tensor
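# A minimal sketch (not the FSDP API) of the zero-fill-and-concatenate idea from
# _flatten_tensor_optim_state above: per-parameter state tensors are flattened, missing
# state is replaced by zeros of the parameter's shape, and trailing padding is appended
# so the result lines up with a padded flat parameter. The helper name, shapes, and
# padding value below are illustrative assumptions.
import torch

def flatten_state(tensors, shapes, padding):
    dtype = next(t.dtype for t in tensors if t is not None)
    pieces = [
        torch.flatten(t) if t is not None else torch.zeros(shape, dtype=dtype).flatten()
        for t, shape in zip(tensors, shapes)
    ]
    if padding > 0:
        pieces.append(torch.zeros(padding, dtype=dtype))
    return torch.cat(pieces)

exp_avg = [torch.ones(2, 3), None]              # second parameter has no state yet
shapes = [torch.Size([2, 3]), torch.Size([4])]
print(flatten_state(exp_avg, shapes, padding=2).shape)  # torch.Size([12])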
def forward(self, x):
    out = self.conv(x)
    out = torch.flatten(out, 1)
    return self.linear(out)
def train(model, input_channel, optimizers, criterion, components, train_loader, val_loader, epoch,
          writer, args, use_CUDA=True, clamp=False, num_classes=10):
    model.train()
    accs = []
    losses_w1 = []
    losses_w2 = []
    iter_val_loader = iter(val_loader)
    meta_criterion = nn.CrossEntropyLoss(reduce=False)
    index = 0
    noisy_labels = []
    true_labels = []

    w = defaultdict()
    w_logger = defaultdict()
    losses_logger = defaultdict()
    accuracy_logger = ScalarLogger(prefix='accuracy')
    for c in components:
        w[c] = None
        w_logger[c] = WLogger()
        losses_logger[c] = ScalarLogger(prefix='loss')

    for (input, label, real) in train_loader:
        noisy_labels.append(label)
        true_labels.append(real)
        meta_model = get_model(args, num_classes=num_classes, input_channel=input_channel)
        meta_model.load_state_dict(model.state_dict())
        if use_CUDA:
            meta_model = meta_model.cuda()

        val_input, val_label, iter_val_loader = get_val_samples(iter_val_loader, val_loader)

        input = to_var(input, requires_grad=False)
        label = to_var(label, requires_grad=False).long()
        val_input = to_var(val_input, requires_grad=False)
        val_label = to_var(val_label, requires_grad=False).long()

        meta_output = meta_model(input)
        cost = meta_criterion(meta_output, label)
        eps = to_var(torch.zeros(cost.size()))
        meta_loss = (cost * eps).sum()
        meta_model.zero_grad()

        if 'all' in components:
            grads = torch.autograd.grad(meta_loss, (meta_model.parameters()), create_graph=True)
            meta_model.update_params(0.001, source_params=grads)

            meta_val_output = meta_model(val_input)
            meta_val_loss = meta_criterion(meta_val_output, val_label).sum()
            grad_eps = torch.autograd.grad(meta_val_loss, eps, only_inputs=True)[0]

            if clamp:
                w['all'] = torch.clamp(-grad_eps, min=0)
            else:
                w['all'] = -grad_eps
            norm = torch.sum(abs(w['all']))
            assert (clamp and len(components) == 1) or (len(components) > 1), "Error combination"
            w['all'] = w['all'] / norm

            if ('fc' in components):
                w['fc'] = copy.deepcopy(w['all'])
                w['fc'] = torch.clamp(w['fc'], max=0)
                w['all'] = torch.clamp(w['all'], min=0)
            elif ('backbone' in components):
                w['backbone'] = copy.deepcopy(w['all'])
                w['backbone'] = torch.clamp(w['backbone'], max=0)
                w['all'] = torch.clamp(w['all'], min=0)
        else:
            assert ('backbone' in components) and ('fc' in components)
            grads_backbone = torch.autograd.grad(meta_loss, (meta_model.backbone.parameters()),
                                                 create_graph=True, retain_graph=True)
            grads_fc = torch.autograd.grad(meta_loss, (meta_model.fc.parameters()), create_graph=True)

            # Backbone grads
            meta_model.backbone.update_params(0.001, source_params=grads_backbone)
            meta_val_feature = torch.flatten(meta_model.backbone(val_input), 1)
            meta_val_output = meta_model.fc(val_input)
            meta_val_loss = meta_criterion(meta_val_output, val_label).sum()
            if args.with_kl and args.reg_start <= epoch:
                train_feature = torch.flatten(meta_model.backbone(input), 1)
                meta_val_loss -= sample_wise_kl(train_feature, meta_val_feature)
            grad_eps = torch.autograd.grad(meta_val_loss, eps, only_inputs=True, retain_graph=True)[0]
            if clamp:
                w['backbone'] = torch.clamp(-grad_eps, min=0)
            else:
                w['backbone'] = -grad_eps
            norm = torch.sum(abs(w['backbone']))
            w['backbone'] = w['backbone'] / norm

            # FC backward
            meta_model.load_state_dict(model.state_dict())
            meta_model.fc.update_params(0.001, source_params=grads_fc)
            meta_val_output = meta_model(val_input)
            meta_val_loss = meta_criterion(meta_val_output, val_label).sum()
            grad_eps = torch.autograd.grad(meta_val_loss, eps, only_inputs=True, retain_graph=True)[0]
            if clamp:
                w['fc'] = torch.clamp(-grad_eps, min=0)
            else:
                w['fc'] = -grad_eps
            norm = torch.sum(abs(w['fc']))
            w['fc'] = w['fc'] / norm

        index += 1
        output = model(input)
        loss = defaultdict()
        prediction = torch.softmax(output, 1)

        for c in components:
            w_logger[c].update(w[c])
            loss[c] = (meta_criterion(output, label) * w[c]).sum()
            optimizers[c].zero_grad()
            loss[c].backward(retain_graph=True)
            optimizers[c].step()
            losses_logger[c].update(loss[c])

        top1 = accuracy(prediction, label)
        accuracy_logger.update(top1)

    noisy_labels = torch.cat(noisy_labels)
    true_labels = torch.cat(true_labels)
    mask = (noisy_labels != true_labels).cpu().numpy()

    for c in components:
        w_logger[c].write(writer, c, epoch)
        w_logger[c].mask_write(writer, c, epoch, mask)
        losses_logger[c].write(writer, c, epoch)

    accuracy_logger.write(writer, 'train', epoch)
    print("Training Epoch: {}, Accuracy: {}".format(epoch, accuracy_logger.avg()))
    return accuracy_logger.avg()
def evaluate(self, epoch_i):
    # --- load model ---
    self.load_model(epoch_i)
    self.model.eval()

    for r_id, doc_features in enumerate(tqdm(
            self.test_loader, desc='Test', dynamic_ncols=True, ascii=True)):
        _, d_t = doc_features
        try:
            if torch.sum(d_t) == 0:
                continue

            # --- Gen 1st step ---
            with torch.no_grad():
                hc_list = (
                    torch.zeros(self.config.num_layers, 1, self.config.hidden_size),
                    torch.zeros(self.config.num_layers, 1, self.config.hidden_size)
                )
                b = (0.0, [self.i2w[1]], [1], hc_list, d_t)
                _prob, hc_list, d_t = self.gen_one_step(b[2], b[3], b[4],
                                                        self.sc_rnn_fw, self.w_hr_fw, self.w_ho_fw)
                top_indices = self.get_top_index(_prob)

                beam_candidates = []
                for i in range(self.config.beam_size):
                    wordix = top_indices[i]
                    beam_candidates.append((b[0] + _prob[wordix], b[1] + [self.i2w[wordix]],
                                            [wordix], hc_list, d_t))

                # --- Gen the whole sentence ---
                beams = beam_candidates[:self.config.beam_size]
                for t in range(self.config.gen_size - 1):
                    beam_candidates = []
                    for b in beams:
                        _prob, hc_list, d_t = self.gen_one_step(b[2], b[3], b[4],
                                                                self.sc_rnn_fw, self.w_hr_fw, self.w_ho_fw)
                        top_indices = self.get_top_index(_prob)
                        for i in range(self.config.beam_size):
                            # --- already EOS ---
                            if b[2] == [2]:
                                beam_candidates.append(b)
                                break
                            wordix = top_indices[i]
                            beam_candidates.append((b[0] + _prob[wordix], b[1] + [self.i2w[wordix]],
                                                    [wordix], hc_list, d_t))
                    beam_candidates.sort(key=lambda x: x[0] / (len(x[1]) - 1), reverse=True)  # decreasing order
                    beams = beam_candidates[:self.config.beam_size]  # truncate to get new beams

                # --- RERANK beams ---
                beams = self.rerank(beams, doc_features[1])
                beams.sort(key=lambda x: x[0], reverse=True)

                res = "[*]EP_{}_KW_[{}]_SENT_[{}]\n".format(
                    epoch_i,
                    ' '.join([self.i2k[int(j)] for j in torch.flatten(torch.nonzero(doc_features[1][0])).numpy()]),
                    ' '.join(beams[0][1])
                )
                print(res)
                self.out_file.write(res)
                self.out_file.flush()
        except Exception as e:
            print('Exception: ', str(e))
            pass

    # self.out_file.close()
    self.model.train()
def forward(self, x):
    x = self.features(x)
    x = torch.flatten(x, 1)
    logits = self.classifier(x)
    probas = torch.nn.functional.softmax(logits, dim=1)
    return logits, probas
def high_level_update_q_func_with_goal(self, batch):
    """
    Compute loss for a given Q-function, or critics, for the high level controller.
    """
    batch_next_state = batch["next_state"]
    batch_rewards = batch["reward"]
    batch_terminal = batch["is_state_terminal"]
    batch_state = batch["state"]
    batch_goal = batch["goal"]
    batch_actions = batch["action"]
    batch_discount = batch["discount"]

    with torch.no_grad(), pfrl.utils.evaluating(self.target_policy), \
            pfrl.utils.evaluating(self.policy), \
            pfrl.utils.evaluating(self.target_q_func1), \
            pfrl.utils.evaluating(self.target_q_func2):
        if self.add_entropy:
            next_action_distrib = self.policy(torch.cat([batch_next_state, batch_goal], -1))
            next_actions_normalized = next_action_distrib.sample()
            next_actions = self.scale * next_actions_normalized
        else:
            next_action_distrib = self.target_policy(torch.cat([batch_next_state, batch_goal], -1))
            next_actions_normalized = next_action_distrib.sample()
            next_actions = self.target_policy_smoothing_func(self.scale * next_actions_normalized)

        entropy_term = 0
        if self.add_entropy:
            next_log_prob = next_action_distrib.log_prob(next_actions_normalized)
            entropy_term = self.temperature * next_log_prob[..., None]

        next_q1 = self.target_q_func1((torch.cat([batch_next_state, batch_goal], -1), next_actions))
        next_q2 = self.target_q_func2((torch.cat([batch_next_state, batch_goal], -1), next_actions))
        next_q = torch.min(next_q1, next_q2)

        target_q = batch_rewards + batch_discount * (
            1.0 - batch_terminal
        ) * torch.flatten(next_q - entropy_term)

    predict_q1 = torch.flatten(self.q_func1((torch.cat([batch_state, batch_goal], -1), batch_actions)))
    predict_q2 = torch.flatten(self.q_func2((torch.cat([batch_state, batch_goal], -1), batch_actions)))

    loss1 = F.smooth_l1_loss(target_q, predict_q1)
    loss2 = F.smooth_l1_loss(target_q, predict_q2)

    # Update stats
    self.q1_record.extend(predict_q1.detach().cpu().numpy())
    self.q2_record.extend(predict_q2.detach().cpu().numpy())
    self.q_func1_loss_record.append(float(loss1))
    self.q_func2_loss_record.append(float(loss2))

    q1_recent_variance = np.var(list(self.q1_record)[-self.recent_variance_size:])
    q2_recent_variance = np.var(list(self.q2_record)[-self.recent_variance_size:])
    self.q_func1_variance_record.append(q1_recent_variance)
    self.q_func2_variance_record.append(q2_recent_variance)

    self.q_func1_optimizer.zero_grad()
    loss1.backward()
    if self.max_grad_norm is not None:
        clip_l2_grad_norm_(self.q_func1.parameters(), self.max_grad_norm)
    self.q_func1_optimizer.step()

    self.q_func2_optimizer.zero_grad()
    loss2.backward()
    if self.max_grad_norm is not None:
        clip_l2_grad_norm_(self.q_func2.parameters(), self.max_grad_norm)
    self.q_func2_optimizer.step()

    self.q_func_n_updates += 1
def forward(self, x):
    # Input dimensions: 490x326x3
    # Output dimensions: 488x324x15
    x = self.conv1(x)
    x = self.conv1_bn(x)
    x = F.relu(x)
    x = self.dropout1(x)

    # Input dimensions: 488x324x15
    # Output dimensions: 486x322x15
    x = self.conv2(x)
    x = self.conv2_bn(x)
    x = F.relu(x)
    x = self.dropout1(x)

    # Input dimensions: 486x322x15
    # Output dimensions: 243x161x15
    x = F.max_pool2d(x, 2)

    # Input dimensions: 243x161x15
    # Output dimensions: 241x159x30
    x = self.conv3(x)
    x = self.conv3_bn(x)
    x = F.relu(x)
    x = self.dropout1(x)

    # Input dimensions: 241x159x30
    # Output dimensions: 239x157x30
    x = self.conv4(x)
    x = self.conv4_bn(x)
    x = F.relu(x)
    x = self.dropout1(x)

    # Input dimensions: 239x157x30
    # Output dimensions: 120x79x30
    x = F.max_pool2d(x, 2, ceil_mode=True)

    # Input dimensions: 120x79x30
    # Output dimensions: 118x77x60
    x = self.conv5(x)
    x = self.conv5_bn(x)
    x = F.relu(x)
    x = self.dropout1(x)

    # Input dimensions: 118x77x60
    # Output dimensions: 116x75x60
    x = self.conv6(x)
    x = self.conv6_bn(x)
    x = F.relu(x)
    x = self.dropout1(x)

    # Input dimensions: 116x75x60
    # Output dimensions: 58x38x60
    x = F.max_pool2d(x, 2, ceil_mode=True)

    # Input dimensions: 58x38x60
    # Output dimensions: 56x36x120
    x = self.conv7(x)
    x = self.conv7_bn(x)
    x = F.relu(x)
    x = self.dropout1(x)

    # Input dimensions: 56x36x120
    # Output dimensions: 54x34x120
    x = self.conv8(x)
    x = self.conv8_bn(x)
    x = F.relu(x)
    x = self.dropout1(x)

    # Input dimensions: 54x34x120
    # Output dimensions: 27x17x120
    x = F.max_pool2d(x, 2, ceil_mode=True)

    # Input dimensions: 27x17x120
    # Output dimensions: 55080x1
    x = torch.flatten(x, 1)

    # Fully connected layers for x label prediction
    # Input dimensions: 55080x1
    # Output dimensions: 256x1
    x_label = self.fc1x(x)
    x_label = self.fc1x_bn(x_label)
    x_label = F.relu(x_label)
    x_label = self.dropout2(x_label)

    # Input dimensions: 256x1
    # Output dimensions: 20x1
    x_label = self.fc2x(x_label)

    # Fully connected layers for y label prediction
    # Input dimensions: 55080x1
    # Output dimensions: 256x1
    y_label = self.fc1y(x)
    y_label = self.fc1y_bn(y_label)
    y_label = F.relu(y_label)
    y_label = self.dropout2(y_label)

    # Input dimensions: 256x1
    # Output dimensions: 20x1
    y_label = self.fc2y(y_label)

    # Use log softmax to get probabilities for each class. We
    # can then get the class prediction by simply taking the index
    # with the maximum value.
    output_x = F.log_softmax(x_label, dim=1)
    output_y = F.log_softmax(y_label, dim=1)

    return output_x, output_y
def forward(self, input):
    return torch.flatten(input, start_dim=self.start_dim, end_dim=self.end_dim)
def forward_once(self, x):
    x = self.model(x)
    x = self.avgpool(x)
    x = torch.flatten(x, start_dim=1, end_dim=-1)
    return self.classifier(x)
def forward(self, x: torch.Tensor) -> torch.Tensor:
    x = self.features(x)
    x = self.classifier(x)
    return torch.flatten(x, 1)
def forward(self, x):
    x = torch.flatten(x, start_dim=1)
    x = self.do(x)
    x = self.act(self.bn(self.fc1(x)))
    x = self.act_final(self.fc2(x))
    return x
def repeat(input, repeats, dim):
    # return th.repeat_interleave(input, repeats, dim)  # PyTorch 1.1
    if dim < 0:
        dim += input.dim()
    return th.flatten(th.stack([input] * repeats, dim=dim + 1), dim, dim + 1)
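# Quick illustrative check that the stack-then-flatten trick above matches
# torch.repeat_interleave along a given dim (the tensor values are arbitrary).
import torch as th

x = th.tensor([[1, 2], [3, 4]])
manual = th.flatten(th.stack([x] * 3, dim=2), 1, 2)   # repeat(x, repeats=3, dim=1)
builtin = th.repeat_interleave(x, 3, dim=1)
assert th.equal(manual, builtin)
print(manual)  # tensor([[1, 1, 1, 2, 2, 2], [3, 3, 3, 4, 4, 4]])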
def forward(self, x: torch.Tensor) -> torch.Tensor:
    return torch.flatten(x, start_dim=self.start_dim, end_dim=self.end_dim)
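# Small illustrative shape demo of the start_dim/end_dim arguments used by the
# flatten-based forward methods above: start_dim=1 keeps the batch dimension intact.
import torch

x = torch.randn(8, 3, 4, 4)
print(torch.flatten(x, start_dim=1).shape)              # torch.Size([8, 48])
print(torch.flatten(x, start_dim=1, end_dim=2).shape)   # torch.Size([8, 12, 4])
print(torch.flatten(x, 1).shape)                        # positional form, same as start_dim=1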