Example #1
 def forward(self, x):
     x = self.features(x)
     x = flatten(x, 1)
     x = self.fc(x)
     return x
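Most of the examples on this page flatten convolutional features to 2-D before a fully connected layer; torch.flatten(x, 1) keeps the batch dimension and collapses everything after it. A minimal shape check (tensor sizes are arbitrary):

import torch

x = torch.randn(8, 512, 7, 7)          # batch of conv feature maps (N, C, H, W)
flat = torch.flatten(x, 1)             # keep dim 0 (batch), flatten the rest
print(flat.shape)                      # torch.Size([8, 25088])
assert flat.shape == (8, 512 * 7 * 7)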
Example #2
 def forward(self, x):
     x = self.features(x)
     x = self.avgpool(x)
     x = torch.flatten(x, 1)
     x = self.classifier(x)
     return x
Example #3
 def forward(self, x):
     x = self.pool(F.relu(self.conv(x)))
     x = torch.flatten(x, start_dim=1)
     h = F.relu(self.linear_1(x))
     y = F.softmax(self.linear_2(h), dim=1)
     return y
Example #4
    def forward(self, x):
        x = self.pool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x
Example #5
        else:
            numbOfBatches = batchSize
        leftIndex = batchID * batchSize
        rightIndex = leftIndex + numbOfBatches
        
        
        courseid_LR = course_id_LR[leftIndex: rightIndex].clone().long()
        videoid = video_id[leftIndex: rightIndex].clone().long()
        continuesfeature1 = continues_feature1[leftIndex: rightIndex].clone()
        courseid_CNN = course_id_CNN[leftIndex: rightIndex].clone().long()
        continuesfeature2 = continues_feature2[leftIndex: rightIndex].clone()

        predictions, LR_result, GRU_result = model(courseid_LR, courseid_CNN, continuesfeature1,
                                                   continuesfeature2, videoid, numbOfBatches)
        

        predictions = torch.flatten(predictions)
        LR_result = torch.flatten(LR_result)
        GRU_result = torch.flatten(GRU_result)

        loss_final = MSELoss(predictions, y[leftIndex: rightIndex].float())
        loss_lr = MSELoss(LR_result, y[leftIndex: rightIndex].float())
        loss_gru = MSELoss(GRU_result, y[leftIndex: rightIndex].float())
#         print('loss: ',loss)
        loss = loss_final + loss_lr + loss_gru
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_value.append(loss.item())
        
        
        #testing
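In this fragment the model outputs are flattened before the MSE loss so that their shape matches the 1-D label slice y[leftIndex: rightIndex]. A small sketch of why that matters (the tensors here are dummies, not the snippet's variables):

import torch
import torch.nn as nn

mse = nn.MSELoss()
pred = torch.randn(4, 1)                # e.g. raw model output of shape (batch, 1)
target = torch.randn(4)                 # 1-D labels

# Shapes (4, 1) vs (4,) broadcast to (4, 4) and give a misleading loss value;
# flattening the prediction makes both sides (4,) and the loss element-wise.
loss = mse(torch.flatten(pred), target)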
Example #6
    def fit(self, insample_dataloader, outsample_dataloader):        
        # Instantiate optimization tools
        loss = nn.CrossEntropyLoss()        
        optimizer = optim.SGD([{'params': self.model.input_layer.parameters()},
                               {'params': self.model.hidden_layers.parameters()},
                               {'params': self.model.output_layer.parameters(),
                                'weight_decay': self.params['output_l2_decay']}],
                               lr=self.params['initial_lr'],
                               momentum=self.params['initial_momentum'])
        
        constrainer = WeightNormConstrainer(norm=self.params['weight_norm'])

        # Initialize counters and trajectories
        step = 0
        epoch = 0
        metric_trajectories = {'step':  [],
                               'epoch':  [],
                               'insample_accuracy': [],
                               'outsample_accuracy': [],
                               'insample_cross_entropy': [],
                               'outsample_cross_entropy': []
                               }

        print('\n'+'='*36+' Fitting DCLF '+'='*36)
        while step <= self.params['iterations']:

            # Train
            epoch += 1
            self.model.train()
            for batch in iter(insample_dataloader):
                step += 1
                if step > self.params['iterations']:
                    continue
                                
                batch_x = t.flatten(batch[0].to(self.device), start_dim=1)
                batch_y = batch[1].to(self.device)
                
                optimizer.zero_grad()

                # Make predictions, compute the cross entropy loss and backpropagate
                logits = self.model(batch_x)
                batch_loss = loss(logits, batch_y)
                batch_loss.backward()

                t.nn.utils.clip_grad_norm_(self.model.parameters(), 20)
                optimizer.step()

                # Evaluate metrics
                if (step % self.params['display_step'] == 0):
                    in_cross_entropy   = self.evaluate_cross_entropy(insample_dataloader)
                    out_cross_entropy  = self.evaluate_cross_entropy(outsample_dataloader)
                    in_accuracy        = self.evaluate_accuracy(insample_dataloader)
                    out_accuracy       = self.evaluate_accuracy(outsample_dataloader)

                    print('Epoch:', '%d,' % epoch,
                          'Step:', '%d,' % step,
                          'In Loss: {:.7f},'.format(in_cross_entropy),
                          'Out Loss: {:.7f},'.format(out_cross_entropy),
                          'In Acc: {:03.3f},'.format(in_accuracy),
                          'Out Acc: {:03.3f}'.format(out_accuracy))
                    
                    metric_trajectories['insample_cross_entropy'].append(in_cross_entropy)
                    metric_trajectories['outsample_cross_entropy'].append(out_cross_entropy)
                    metric_trajectories['insample_accuracy'].append(in_accuracy)
                    metric_trajectories['outsample_accuracy'].append(out_accuracy)

                # Update optimizer learning rate
                if step % self.params['adjust_lr_step'] == 0:
                    self.adjust_lr(optimizer=optimizer, lr_decay=self.params['lr_decay'])
                
                # Update optimizer momentum
                if step % self.params['adjust_momentum_step'] == 0 and \
                    step < self.params['momentum_change_steps']:
                    self.adjust_momentum(optimizer=optimizer, step=step,
                                         momentum_change_steps=self.params['momentum_change_steps'],
                                         initial_momentum=self.params['initial_momentum'],
                                         final_momentum=self.params['final_momentum'])
              
                # Constraint max_norm of weights
                if self.params['apply_weight_norm'] and \
                  (step % self.params['adjust_norm_step'] == 0):
                    self.model.apply(constrainer)

        # Store trajectories
        print('\n'+'='*35+' Finished Train '+'='*35)
        self.trajectories = metric_trajectories
Example #7
    def _post_backward_hook(self, param: Parameter, *unused: Any) -> None:
        """
        At the start of :func:`_post_backward_hook`, ``param.grad`` contains the
        full gradient for the local batch. The reduce-scatter op will replace
        ``param.grad`` with a single shard of the summed gradient across all
        GPUs. This shard will align with the current GPU rank. For example::
            before reduce_scatter:
                param.grad (GPU #0): [1, 2, 3, 4]
                param.grad (GPU #1): [5, 6, 7, 8]
            after reduce_scatter:
                param.grad (GPU #0): [6, 8]    # 1+5, 2+6
                param.grad (GPU #1): [10, 12]  # 3+7, 4+8
        The local GPU's ``optim.step`` is responsible for updating a single
        shard of params, also corresponding to the current GPU's rank. This
        alignment is created by :func:`_shard_parameters`, which ensures that
        the local optimizer only sees the relevant parameter shard.
        """
        # First hook callback will see PRE state. If we have multiple params,
        # then subsequent hook callbacks will see POST state.
        self._assert_state(
            [TrainingState_.BACKWARD_PRE, TrainingState_.BACKWARD_POST])
        self.training_state = TrainingState_.BACKWARD_POST
        if param.grad is None:
            return

        if param.grad.requires_grad:
            raise RuntimeError(
                "FSDP only works with gradients that don't require gradients")

        self._free_full_params([param])
        # Switch to local shard after backward.
        self._use_param_local_shard([param])

        # Wait for all work in the current stream to finish, then start the
        # reductions in post_backward stream.
        self._streams["post_backward"].wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(self._streams["post_backward"]):
            orig_grad_data = param.grad.data

            if self.gradient_predivide_factor > 1:
                # Average grad by world_size for consistency with PyTorch DDP.
                param.grad.div_(self.gradient_predivide_factor)

            if param._is_sharded:  # type: ignore[attr-defined]
                grad_flatten = torch.flatten(param.grad)
                chunks = list(grad_flatten.chunk(self.world_size))
                num_pad = self.world_size * chunks[0].numel() - param.grad.numel()
                input_flattened = F.pad(grad_flatten, [0, num_pad])
                output = torch.zeros_like(chunks[0])
                dist._reduce_scatter_base(output,
                                          input_flattened,
                                          group=self.process_group)
                if self.gradient_postdivide_factor > 1:
                    # Average grad by world_size for consistency with PyTorch DDP.
                    output.div_(self.gradient_postdivide_factor)
                param.grad.data = output
            else:
                # Currently the only way for _is_sharded to be False is if
                # world_size == 1. This could be relaxed in the future, e.g,
                # no sharding like PyTorch DDP, in which case grads should be
                # all-reduced here.
                assert (
                    self.world_size == 1
                ), "Currently the only way for _is_sharded to be False is \
                    world_size == 1"

            # After _post_backward_hook returns, orig_grad_data will eventually
            # go out of scope, at which point it could otherwise be freed for
            # further reuse by the main stream while the div/reduce_scatter/copy
            # are underway in the post_backward stream. See:
            # github.com/NVIDIA/apex/blob/master/apex/parallel/distributed.py
            orig_grad_data.record_stream(self._streams["post_backward"])
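The hook above pads the flattened gradient so that it splits into world_size equal shards before the reduce-scatter. The padding and chunking arithmetic can be checked without any process group; this sketch only reproduces the shape bookkeeping (world_size and the gradient are made up, no communication happens):

import torch
import torch.nn.functional as F

world_size = 4
grad = torch.arange(10, dtype=torch.float32)        # pretend full local gradient

grad_flat = torch.flatten(grad)
chunks = list(grad_flat.chunk(world_size))          # chunk() may produce unequal pieces
num_pad = world_size * chunks[0].numel() - grad_flat.numel()
padded = F.pad(grad_flat, [0, num_pad])             # pad so every rank gets an equal shard

# After dist._reduce_scatter_base, rank r would hold the element-wise sum of
# everyone's r-th shard; here we just check the shard geometry.
shards = padded.chunk(world_size)
assert all(s.numel() == chunks[0].numel() for s in shards)
print([s.tolist() for s in shards])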
Example #8
    def forward(self, batch_size, encoded_hidden, encoded_outputs, encoded_lens, story, max_res_len, target_batches, use_teacher_forcing, slot_temp):
        all_point_outputs = torch.zeros(len(slot_temp), batch_size, max_res_len, self.vocab_size)
        all_gate_outputs = torch.zeros(len(slot_temp), batch_size, self.nb_gate)
        if USE_CUDA: 
            all_point_outputs = all_point_outputs.cuda()
            all_gate_outputs = all_gate_outputs.cuda()
        
        # Get the slot embedding 
        slot_emb_dict = {}
        for i, slot in enumerate(slot_temp):
            # Domain embedding
            if slot.split("-")[0] in self.slot_w2i.keys():
                domain_w2idx = [self.slot_w2i[slot.split("-")[0]]]
                domain_w2idx = torch.tensor(domain_w2idx)
                if USE_CUDA: domain_w2idx = domain_w2idx.cuda()
                domain_emb = self.Slot_emb(domain_w2idx)
            # Slot embedding
            if slot.split("-")[1] in self.slot_w2i.keys():
                slot_w2idx = [self.slot_w2i[slot.split("-")[1]]]
                slot_w2idx = torch.tensor(slot_w2idx)
                if USE_CUDA: slot_w2idx = slot_w2idx.cuda()
                slot_emb = self.Slot_emb(slot_w2idx)

            # Combine two embeddings as one query
            combined_emb = domain_emb + slot_emb
            slot_emb_dict[slot] = combined_emb
            slot_emb_exp = combined_emb.expand_as(encoded_hidden)
            if i == 0:
                slot_emb_arr = slot_emb_exp.clone()
            else:
                slot_emb_arr = torch.cat((slot_emb_arr, slot_emb_exp), dim=0)

        if args["parallel_decode"]:
            # Compute pointer-generator output, putting all (domain, slot) in one batch
            decoder_input = self.dropout_layer(slot_emb_arr).view(-1, self.hidden_size) # (batch*|slot|) * emb
            hidden = encoded_hidden.repeat(1, len(slot_temp), 1) # 1 * (batch*|slot|) * emb
            words_point_out = [[] for i in range(len(slot_temp))]
            words_class_out = []
            
            for wi in range(max_res_len):
                dec_state, hidden = self.gru(decoder_input.expand_as(hidden), hidden)

                enc_out = encoded_outputs.repeat(len(slot_temp), 1, 1)
                enc_len = encoded_lens * len(slot_temp)
                context_vec, logits, prob = self.attend(enc_out, hidden.squeeze(0), enc_len)

                if wi == 0: 
                    all_gate_outputs = torch.reshape(self.W_gate(context_vec), all_gate_outputs.size())

                p_vocab = self.attend_vocab(self.embedding.weight, hidden.squeeze(0))
                p_gen_vec = torch.cat([dec_state.squeeze(0), context_vec, decoder_input], -1)
                vocab_pointer_switches = self.sigmoid(self.W_ratio(p_gen_vec))
                p_context_ptr = torch.zeros(p_vocab.size())
                if USE_CUDA: p_context_ptr = p_context_ptr.cuda()
                
                p_context_ptr.scatter_add_(1, story.repeat(len(slot_temp), 1), prob)

                final_p_vocab = (1 - vocab_pointer_switches).expand_as(p_context_ptr) * p_context_ptr + \
                                vocab_pointer_switches.expand_as(p_context_ptr) * p_vocab
                pred_word = torch.argmax(final_p_vocab, dim=1)
                words = [self.lang.index2word[w_idx.item()] for w_idx in pred_word]
                
                for si in range(len(slot_temp)):
                    words_point_out[si].append(words[si*batch_size:(si+1)*batch_size])
                
                all_point_outputs[:, :, wi, :] = torch.reshape(final_p_vocab, (len(slot_temp), batch_size, self.vocab_size))
                
                if use_teacher_forcing:
                    decoder_input = self.embedding(torch.flatten(target_batches[:, :, wi].transpose(1,0)))
                else:
                    decoder_input = self.embedding(pred_word)   
                
                if USE_CUDA: decoder_input = decoder_input.cuda()
        else:
            # Compute pointer-generator output, decoding each (domain, slot) one-by-one
            words_point_out = []
            counter = 0
            for slot in slot_temp:
                hidden = encoded_hidden
                words = []
                slot_emb = slot_emb_dict[slot]
                decoder_input = self.dropout_layer(slot_emb).expand(batch_size, self.hidden_size)
                for wi in range(max_res_len):
                    dec_state, hidden = self.gru(decoder_input.expand_as(hidden), hidden)
                    context_vec, logits, prob = self.attend(encoded_outputs, hidden.squeeze(0), encoded_lens)
                    if wi == 0: 
                        all_gate_outputs[counter] = self.W_gate(context_vec)
                    p_vocab = self.attend_vocab(self.embedding.weight, hidden.squeeze(0))
                    p_gen_vec = torch.cat([dec_state.squeeze(0), context_vec, decoder_input], -1)
                    vocab_pointer_switches = self.sigmoid(self.W_ratio(p_gen_vec))
                    p_context_ptr = torch.zeros(p_vocab.size())
                    if USE_CUDA: p_context_ptr = p_context_ptr.cuda()
                    p_context_ptr.scatter_add_(1, story, prob)
                    final_p_vocab = (1 - vocab_pointer_switches).expand_as(p_context_ptr) * p_context_ptr + \
                                    vocab_pointer_switches.expand_as(p_context_ptr) * p_vocab
                    pred_word = torch.argmax(final_p_vocab, dim=1)
                    words.append([self.lang.index2word[w_idx.item()] for w_idx in pred_word])
                    all_point_outputs[counter, :, wi, :] = final_p_vocab
                    if use_teacher_forcing:
                        decoder_input = self.embedding(target_batches[:, counter, wi]) # Chosen word is next input
                    else:
                        decoder_input = self.embedding(pred_word)   
                    if USE_CUDA: decoder_input = decoder_input.cuda()
                counter += 1
                words_point_out.append(words)
        
        return all_point_outputs, all_gate_outputs, words_point_out, []
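The pointer part of this decoder spreads the attention distribution prob over vocabulary ids with scatter_add_, indexed by the source tokens in story. A small, self-contained sketch of just that operation (the vocabulary size and token ids are invented):

import torch

vocab_size, batch, src_len = 6, 2, 3
story = torch.tensor([[1, 4, 1],                  # source token ids per batch element
                      [0, 2, 5]])
prob = torch.softmax(torch.randn(batch, src_len), dim=1)   # attention over source positions

p_context_ptr = torch.zeros(batch, vocab_size)
p_context_ptr.scatter_add_(1, story, prob)        # copy probabilities onto vocab ids

# Repeated ids (token 1 in row 0) accumulate, so each row still sums to 1.
print(p_context_ptr.sum(dim=1))                   # tensor([1., 1.])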
Example #9
 def forward(self, x):
     return torch.flatten(x, 1)
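A module whose forward only calls torch.flatten(x, 1), as above, behaves the same as the built-in nn.Flatten layer, whose default start_dim is 1:

import torch
import torch.nn as nn

x = torch.randn(2, 3, 4, 4)
flatten = nn.Flatten()                  # start_dim=1, end_dim=-1 by default
assert torch.equal(flatten(x), torch.flatten(x, 1))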
Example #10
def cluster_initializer_online_update(layer, new_train_x, graph):
    def assign_cluster_idx(node, cluster_idx):
        if hasattr(node, 'new_cluster_idx'):
            node.new_cluster_idx = list(
                dict.fromkeys(node.new_cluster_idx + cluster_idx))
        else:
            node.new_cluster_idx = cluster_idx

    if type(layer) == FactorizedLeafLayer.FactorizedLeafLayer:
        if not leafs_oa_bool:
            return
        ef_array = layer.ef_array

        num_var = ef_array.num_var
        array_shape = ef_array.array_shape

        # lookup array consisting of nodes for certain replica_idx
        replica_lookup = [[
            n for n in layer.nodes
            if n.einet_address.replica_idx == replica_idx
        ] for replica_idx in range(array_shape[1])]

        with torch.no_grad():
            # Construct (num_var, num_replica, num_stats) matrix
            params = ef_array.params.permute(1, 0, 2, 3)[0]
            for sc in range(num_var):
                # construct a replica array for every scope
                for replica_idx in range(array_shape[1]):
                    # first lookup nodes with correct replica_idx
                    for n in replica_lookup[replica_idx]:
                        # when the current scope is in the scope of the node
                        if sc in n.scope and hasattr(
                                n, 'new_cluster_idx') and len(
                                    n.new_cluster_idx) > 0:
                            params[sc][
                                replica_idx] *= layer.total_samples_matrix[sc][
                                    replica_idx]
                            for n_c_idx in n.new_cluster_idx:
                                params[sc][replica_idx] += new_train_x[
                                    n_c_idx][sc]
                            layer.total_samples_matrix[sc][replica_idx] += len(
                                n.new_cluster_idx)
                            params[sc][
                                replica_idx] /= layer.total_samples_matrix[sc][
                                    replica_idx]
                # construct correct weight matrix representation
            ef_array.params = torch.nn.Parameter(
                params.repeat(array_shape[0], 1, 1, 1).permute(1, 0, 2, 3))

    elif type(layer) == SumLayer.EinsumLayer:
        """ Einsum layer """
        for product in layer.products:
            # direct successors of current product node
            sum_successors = list(graph.succ[product])
            for sum_node in sum_successors:
                # successors of successor of current product node
                product_successors = list(graph.succ[sum_node])

                if len(product_successors) > 1:
                    # next layer is EinsumMixingLayer
                    # Hence only append cluster_idx to the successor of current product node (sum node)
                    assign_cluster_idx(sum_node, product.new_cluster_idx)
                elif len(product_successors) == 1:
                    # next layer is EinsumLayer
                    # hence append cluster_idx to the successor of the successor (product node)
                    for p in product_successors:
                        assign_cluster_idx(p, product.new_cluster_idx)
                else:
                    # next layer is FactorizedLeafLayer
                    # Hence only append cluster_idx to the successor of current product node (leaf node)
                    assign_cluster_idx(sum_node, product.new_cluster_idx)

    elif type(layer) == SumLayer.EinsumMixingLayer:
        with torch.no_grad():
            params = layer.params[0]
            for si, sum_node in enumerate(layer.nodes):
                if hasattr(sum_node, 'new_cluster_idx'):
                    new_train_x = new_train_x[sum_node.new_cluster_idx]
                else:
                    sum_node.new_cluster_idx = [
                        i for i in range(len(new_train_x))
                    ]

                if hasattr(sum_node, 'cl_centers'):
                    # calculate the cluster to which the new sample should be added
                    nearest_index = [-1 for x in new_train_x]
                    nearest_dist = [-1 for x in new_train_x]
                    for i, c in enumerate(sum_node.cl_centers):
                        c_dists = []
                        for x in new_train_x:
                            c_dists.append(
                                distance.euclidean(c,
                                                   torch.flatten(x).cpu()))
                        for j, d in enumerate(c_dists):
                            if d < nearest_dist[j] or nearest_index[j] < 0:
                                nearest_index[j] = i
                                nearest_dist[j] = c_dists[j]
                    # retrieve, update and store weights

                    if weights_oa_bool:
                        params[si] *= torch.sum(
                            torch.tensor(sum_node.un_weights)).to(
                                torch.device(cuda_device))
                        for i in nearest_index:
                            params[si][i] += 1
                            sum_node.un_weights[i] += 1
                        params[si] /= torch.sum(
                            torch.tensor(sum_node.un_weights)).to(
                                torch.device(cuda_device))

                    # weights = sum_node.un_weights
                    # if not layer._use_em:
                    #     weights = weights.astype(float)
                    #     weights -= np.mean(weights)
                    #     weights /= np.std(weights)
                    # params[si] *= torch.tensor(weights).to(torch.device(cuda_device))

                    # calculate division of data points
                    cluster_idx = [[] for i in range(layer.max_components)]
                    for i, id in enumerate(nearest_index):
                        cluster_idx[id].append(i)
                else:
                    cluster_idx = [
                        sum_node.new_cluster_idx
                        for i in range(layer.max_components)
                    ]

                # store datapoint idx in layer structure
                for i, product in enumerate(list(graph.succ[sum_node])):
                    assign_cluster_idx(product, cluster_idx[i])

            # construct correct weight matrix from params
            params = params.repeat(layer.num_sums, 1, 1).float()
            params = params.to(torch.device(cuda_device))
            # normalize
            if layer._use_em:
                with torch.no_grad():
                    if layer.params_mask is not None:
                        layer.params_mask = layer.params_mask.to(
                            torch.device(cuda_device))
                        params.data *= layer.params_mask
                params.data = params.data / (params.data.sum(
                    layer.normalization_dims, keepdim=True))
            layer.params = torch.nn.Parameter(params)
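The nearest-center search above calls scipy's distance.euclidean once per (center, sample) pair. One possible vectorized alternative keeps everything in torch with torch.cdist and an argmin; the data below is dummy data, not the layer's actual cluster state:

import torch

centers = torch.randn(5, 16)                   # cluster centers, one row per center
samples = torch.randn(8, 16)                   # flattened new samples

dists = torch.cdist(samples, centers)          # pairwise Euclidean distances, shape (8, 5)
nearest_index = torch.argmin(dists, dim=1)     # index of the closest center per sample
print(nearest_index.tolist())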
Example #11
def einsum_cluster_initializer_online_update(layer, new_train_x, graph):
    def assign_cluster_idx(node, cluster_idx):
        if hasattr(node, 'new_cluster_idx'):
            new_cluster_idx = []
            if len(cluster_idx) != len(node.new_cluster_idx):
                raise AssertionError("Should not happen")
            for i in range(len(cluster_idx)):
                new_cluster_idx.append(
                    list(
                        dict.fromkeys(node.new_cluster_idx[i] +
                                      cluster_idx[i])))

            if len(cluster_idx) != len(new_cluster_idx):
                raise AssertionError("Should not happen")

            node.new_cluster_idx = new_cluster_idx
        else:
            node.new_cluster_idx = cluster_idx

    if type(layer) == FactorizedLeafLayer.FactorizedLeafLayer:
        if not leafs_oa_bool:
            return
        ef_array = layer.ef_array

        num_var = ef_array.num_var
        array_shape = ef_array.array_shape

        # lookup array consisting of nodes for certain replica_idx
        replica_lookup = [[
            n for n in layer.nodes
            if n.einet_address.replica_idx == replica_idx
        ] for replica_idx in range(array_shape[1])]

        with torch.no_grad():
            # Construct (num_var, num_replica, num_stats) matrix
            params = ef_array.params.permute(1, 0, 2, 3)
            for i in range(array_shape[0]):
                for sc in range(num_var):
                    # construct a replica array for every scope
                    for replica_idx in range(array_shape[1]):
                        # first lookup nodes with correct replica_idx
                        for n in replica_lookup[replica_idx]:
                            # when the current scope is in the scope of the node
                            if sc in n.scope and hasattr(
                                    n, 'new_cluster_idx') and len(
                                        n.new_cluster_idx) > 0:
                                params[i][sc][
                                    replica_idx] *= layer.total_samples_matrix[
                                        i][sc][replica_idx]
                                for n_c_idx in n.new_cluster_idx[i]:
                                    params[i][sc][replica_idx] += new_train_x[
                                        n_c_idx][sc]
                                layer.total_samples_matrix[i][sc][
                                    replica_idx] += len(n.new_cluster_idx[i])
                                params[i][sc][
                                    replica_idx] /= layer.total_samples_matrix[
                                        i][sc][replica_idx]
            # construct correct weight matrix representation
            ef_array.params = torch.nn.Parameter(params.permute(1, 0, 2, 3))

    elif type(layer) == SumLayer.EinsumLayer:
        """ Einsum layer """
        with torch.no_grad():
            params = layer.params
            for pi, product_node in enumerate(layer.products):
                for ki, k_idx in enumerate(product_node.new_cluster_idx):

                    if product_node.cl_centers[ki] is not None and len(
                            k_idx) > 0:
                        new_train_x_k = new_train_x[k_idx].cpu()
                        new_train_x_k = torch.index_select(
                            new_train_x_k, 1, torch.tensor(product_node.scope))

                        # calculate the cluster to which the new sample should be added
                        nearest_index = [-1 for x in new_train_x_k]
                        nearest_dist = [-1 for x in new_train_x_k]
                        for i, c in enumerate(product_node.cl_centers[ki]):
                            c_dists = []
                            for x in new_train_x_k:
                                c_dists.append(
                                    distance.euclidean(c, torch.flatten(x)))
                            for j, d in enumerate(c_dists):
                                if d < nearest_dist[j] or nearest_index[j] < 0:
                                    nearest_index[j] = i
                                    nearest_dist[j] = c_dists[j]

                        # retrieve, update and store weights
                        if weights_oa_bool:
                            params = params.permute(2, 3, 0, 1)
                            weights = torch.flatten(params[ki][pi])
                            weights *= torch.sum(
                                torch.tensor(product_node.un_weights[ki])).to(
                                    torch.device(cuda_device))
                            for i in nearest_index:
                                weights[i] += 1
                                product_node.un_weights[ki][i] += 1
                            weights /= torch.sum(
                                torch.tensor(product_node.un_weights[ki])).to(
                                    torch.device(cuda_device))
                            params[ki][pi] = torch.reshape(
                                weights,
                                (layer.num_input_dist, layer.num_input_dist))
                            params = params.permute(2, 3, 0, 1)

                        # weights = product_node.un_weights[ki]
                        # if not layer._use_em:
                        #     weights = weights.astype(float)
                        #     weights -= np.mean(weights)
                        #     weights /= np.std(weights)
                        # weights = np.resize(weights, (layer.num_input_dist, layer.num_input_dist))
                        # params = params.permute(2, 3, 0, 1)
                        # params[ki][pi] = torch.tensor(weights)
                        # params = params.permute(2, 3, 0, 1)

                        # calculate division of data points
                        cluster_idx = [
                            [] for i in range(pow(layer.num_input_dist, 2))
                        ]
                        for i, id in enumerate(nearest_index):
                            cluster_idx[id].append(i)

                        cluster_idx = np.resize(
                            cluster_idx,
                            (layer.num_input_dist, layer.num_input_dist))

                        cluster_idx_left = np.sum(cluster_idx, 1)
                        cluster_idx_right = np.sum(cluster_idx, 0)
                    else:
                        cluster_idx_left = [
                            k_idx for i in range(layer.num_input_dist)
                        ]
                        cluster_idx_right = [
                            k_idx for i in range(layer.num_input_dist)
                        ]

                    # direct successors of current product node, this is always 2 sum nodes in Einsum Networks
                    sum_successors = list(graph.succ[product_node])

                    if len(list(graph.succ[list(
                            graph.succ[product_node])[0]])) == 1:
                        for p in list(graph.succ[list(
                                graph.succ[product_node])[0]]):
                            assign_cluster_idx(p, cluster_idx_left)
                    else:
                        assign_cluster_idx(sum_successors[0], cluster_idx_left)

                    if len(list(graph.succ[list(
                            graph.succ[product_node])[1]])) == 1:
                        for p in list(graph.succ[list(
                                graph.succ[product_node])[1]]):
                            assign_cluster_idx(p, cluster_idx_right)
                    else:
                        assign_cluster_idx(sum_successors[1],
                                           cluster_idx_right)

            params = params.to(torch.device(cuda_device))
            # normalize
            if layer._use_em:
                with torch.no_grad():
                    if layer.params_mask is not None:
                        layer.params_mask = layer.params_mask.to(
                            torch.device(cuda_device))
                        params.data *= layer.params_mask
                params.data = params.data / (params.data.sum(
                    layer.normalization_dims, keepdim=True))
            layer.params = torch.nn.Parameter(params)

    elif type(layer) == SumLayer.EinsumMixingLayer:
        with torch.no_grad():
            params = layer.params
            for si, sum_node in enumerate(layer.nodes):
                cluster_idx_temp = [[] for i in range(layer.max_components)]

                if hasattr(sum_node, 'new_cluster_idx'):
                    k_idxs = sum_node.new_cluster_idx
                else:
                    # here we make the assumption that this only happens in the root node
                    k_idxs = [[i for i in range(len(new_train_x))]]

                for ki, k_idx in enumerate(k_idxs):

                    if sum_node.cl_centers[ki] is not None and len(k_idx) > 0:
                        new_train_x_k = new_train_x[k_idx].cpu()
                        new_train_x_k = torch.index_select(
                            new_train_x_k, 1, torch.tensor(sum_node.scope))

                        # calculate the cluster to which the new sample should be added
                        nearest_index = [-1 for x in new_train_x_k]
                        nearest_dist = [-1 for x in new_train_x_k]
                        for i, c in enumerate(sum_node.cl_centers[ki]):
                            c_dists = []
                            for x in new_train_x_k:
                                c_dists.append(
                                    distance.euclidean(c, torch.flatten(x)))
                            for j, d in enumerate(c_dists):
                                if d < nearest_dist[j] or nearest_index[j] < 0:
                                    nearest_index[j] = i
                                    nearest_dist[j] = c_dists[j]

                        # retrieve, update and store weights
                        if weights_oa_bool:
                            params[ki][si] *= torch.sum(
                                torch.tensor(sum_node.un_weights[ki])).to(
                                    torch.device(cuda_device))
                            for i in nearest_index:
                                params[ki][si][i] += 1
                                sum_node.un_weights[ki][i] += 1
                            params[ki][si] /= torch.sum(
                                torch.tensor(sum_node.un_weights[ki])).to(
                                    torch.device(cuda_device))

                        # weights = sum_node.un_weights[ki]
                        # if not layer._use_em:
                        #     weights = weights.astype(float)
                        #     weights -= np.mean(weights)
                        #     weights /= np.std(weights)
                        # params[ki][si] = torch.tensor(weights)

                        # calculate division of data points
                        cluster_idx = [[] for i in range(layer.max_components)]
                        for i, id in enumerate(nearest_index):
                            cluster_idx[id].append(i)
                    else:
                        cluster_idx = [
                            k_idx for i in range(layer.max_components)
                        ]

                    for i in range(len(cluster_idx)):
                        cluster_idx_temp[i].append(cluster_idx[i])

                # store datapoint idx in layer structure
                for i, product in enumerate(list(graph.succ[sum_node])):
                    assign_cluster_idx(product, cluster_idx_temp[i])

            # construct correct weight matrix from params
            params = params.to(torch.device(cuda_device))
            # normalize
            if layer._use_em:
                with torch.no_grad():
                    if layer.params_mask is not None:
                        layer.params_mask = layer.params_mask.to(
                            torch.device(cuda_device))
                        params.data *= layer.params_mask
                params.data = params.data / (params.data.sum(
                    layer.normalization_dims, keepdim=True))
            layer.params = torch.nn.Parameter(params)
Example #12
        print(
            f'Epoch [{epoch + 1}/{num_epochs}], Loss: {maxloss:.8f}, Test Loss: {test_maxloss:.8f}'
        )
        maxloss = 0.0
        test_maxloss = 0.0

with torch.no_grad():
    ff_model.eval()
    target_output = ff_model(eval_input)

STACKCOUNT = 1
curr_input_mat = torch.hstack(
    (target_output[:STACKCOUNT, 6:15], target_output[:STACKCOUNT, 21:27]))
vels_and_accels = torch.hstack(
    (eval_input[STACKCOUNT - 1, 0:6], eval_input[STACKCOUNT - 1, -18:]))
new_input_feet = torch.flatten(curr_input_mat)

lower_body_poses = None

with torch.no_grad():
    for curr_eval_idx in range(STACKCOUNT, eval_input.shape[0]):
        model_output = vae_model(new_input_feet, vels_and_accels)
        if lower_body_poses is None:
            lower_body_poses = model_output
        else:
            lower_body_poses = torch.vstack((lower_body_poses, model_output))

        curr_input_mat = torch.roll(curr_input_mat, -1, 0)
        vels_and_accels = torch.hstack(
            (eval_input[curr_eval_idx, 0:6], eval_input[curr_eval_idx, -18:]))
        curr_input_mat[-1] = model_output
Example #13
def test_explicit_hessian():
    """Check computation of hessian of loss(B'WA) from https://github.com/yaroslavvb/kfac_pytorch/blob/master/derivation.pdf


    """

    torch.set_default_dtype(torch.float64)
    A = torch.tensor([[-1., 4], [3, 0]])
    B = torch.tensor([[-4., 3], [2, 6]])
    X = torch.tensor([[-5., 0], [-2, -6]], requires_grad=True)

    Y = B.t() @ X @ A
    u.check_equal(Y, [[-52, 64], [-81, -108]])
    loss = torch.sum(Y * Y) / 2
    hess0 = u.hessian(loss, X).reshape([4, 4])
    hess1 = u.Kron(A @ A.t(), B @ B.t())

    u.check_equal(loss, 12512.5)

    # PyTorch autograd computes Hessian with respect to row-vectorized parameters, whereas
    # autograd_lib uses math convention and does column-vectorized.
    # Commuting order of Kronecker product switches between two representations
    u.check_equal(hess1.commute(), hess0)

    # Do a test using Linear layers instead of matrix multiplies
    model: u.SimpleFullyConnected2 = u.SimpleFullyConnected2([2, 2, 2], bias=False)
    model.layers[0].weight.data.copy_(X)

    # Transpose to match previous results, layers treat dim0 as batch dimension
    u.check_equal(model.layers[0](A.t()).t(), [[5, -20], [-16, -8]])  # XA = (A'X0)'

    model.layers[1].weight.data.copy_(B.t())
    u.check_equal(model(A.t()).t(), Y)

    Y = model(A.t()).t()    # transpose to data-dimension=columns
    loss = torch.sum(Y * Y) / 2
    loss.backward()

    u.check_equal(model.layers[0].weight.grad, [[-2285, -105], [-1490, -1770]])
    G = B @ Y @ A.t()
    u.check_equal(model.layers[0].weight.grad, G)

    u.check_equal(hess0, u.Kron(B @ B.t(), A @ A.t()))

    # compute newton step
    u.check_equal(u.Kron(A @ A.t(), B @ B.t()).pinv() @ u.vec(G), u.v2c([-5, -2, 0, -6]))

    # compute Newton step using factored representation
    autograd_lib.add_hooks(model)

    Y = model(A.t())
    n = 2
    loss = torch.sum(Y * Y) / 2
    autograd_lib.backprop_hess(Y, hess_type='LeastSquares')
    autograd_lib.compute_hess(model, method='kron', attr_name='hess_kron', vecr_order=False, loss_aggregation='sum')
    param = model.layers[0].weight

    hess2 = param.hess_kron
    print(hess2)

    u.check_equal(hess2, [[425, 170, -75, -30], [170, 680, -30, -120], [-75, -30, 225, 90], [-30, -120, 90, 360]])

    # Gradient test
    model.zero_grad()
    loss.backward()
    u.check_close(u.vec(G).flatten(), u.Vec(param.grad))

    # Newton step test
    # Method 0: PyTorch native autograd
    newton_step0 = param.grad.flatten() @ torch.pinverse(hess0)
    newton_step0 = newton_step0.reshape(param.shape)
    u.check_equal(newton_step0, [[-5, 0], [-2, -6]])

    # Method 1: column-major order
    ihess2 = hess2.pinv()
    u.check_equal(ihess2.LL, [[1/16, 1/48], [1/48, 17/144]])
    u.check_equal(ihess2.RR, [[2/45, -(1/90)], [-(1/90), 1/36]])
    u.check_equal(torch.flatten(hess2.pinv() @ u.vec(G)), [-5, -2, 0, -6])
    newton_step1 = (ihess2 @ u.Vec(param.grad)).matrix_form()

    # Method 2: row-major order
    ihess2_rowmajor = ihess2.commute()
    newton_step2 = ihess2_rowmajor @ u.Vecr(param.grad)
    newton_step2 = newton_step2.matrix_form()

    u.check_equal(newton_step0, newton_step1)
    u.check_equal(newton_step0, newton_step2)
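The helpers u.hessian, u.Kron, and u.check_equal come from the repository's own utility module, so this test is not runnable on its own. The identity it checks, that the Hessian of 0.5·||BᵀXA||² with respect to the row-major flattening of X equals (BBᵀ) ⊗ (AAᵀ), can be verified with plain PyTorch; the sketch below uses torch.autograd.functional.hessian and torch.kron instead of those helpers:

import torch

A = torch.tensor([[-1., 4.], [3., 0.]])
B = torch.tensor([[-4., 3.], [2., 6.]])

def loss_fn(X):
    Y = B.t() @ X @ A
    return 0.5 * torch.sum(Y * Y)

X = torch.tensor([[-5., 0.], [-2., -6.]])
H = torch.autograd.functional.hessian(loss_fn, X).reshape(4, 4)

# Row-major (torch.flatten) vectorization pairs with kron(B Bᵀ, A Aᵀ);
# the column-major "vec" convention used in the derivation swaps the factors.
assert torch.allclose(H, torch.kron(B @ B.t(), A @ A.t()))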
Example #14
def _test_explicit_hessian_refactored():

    """Check computation of hessian of loss(B'WA) from https://github.com/yaroslavvb/kfac_pytorch/blob/master/derivation.pdf


    """

    torch.set_default_dtype(torch.float64)
    A = torch.tensor([[-1., 4], [3, 0]])
    B = torch.tensor([[-4., 3], [2, 6]])
    X = torch.tensor([[-5., 0], [-2, -6]], requires_grad=True)

    Y = B.t() @ X @ A
    u.check_equal(Y, [[-52, 64], [-81, -108]])
    loss = torch.sum(Y * Y) / 2
    hess0 = u.hessian(loss, X).reshape([4, 4])
    hess1 = u.Kron(A @ A.t(), B @ B.t())

    u.check_equal(loss, 12512.5)

    # Do a test using Linear layers instead of matrix multiplies
    model: u.SimpleFullyConnected2 = u.SimpleFullyConnected2([2, 2, 2], bias=False)
    model.layers[0].weight.data.copy_(X)

    # Transpose to match previous results, layers treat dim0 as batch dimension
    u.check_equal(model.layers[0](A.t()).t(), [[5, -20], [-16, -8]])  # XA = (A'X0)'

    model.layers[1].weight.data.copy_(B.t())
    u.check_equal(model(A.t()).t(), Y)

    Y = model(A.t()).t()    # transpose to data-dimension=columns
    loss = torch.sum(Y * Y) / 2
    loss.backward()

    u.check_equal(model.layers[0].weight.grad, [[-2285, -105], [-1490, -1770]])
    G = B @ Y @ A.t()
    u.check_equal(model.layers[0].weight.grad, G)

    autograd_lib.register(model)
    activations_dict = autograd_lib.ModuleDict()  # todo(y): make save_activations ctx manager automatically create A
    with autograd_lib.save_activations(activations_dict):
        Y = model(A.t())

    Acov = autograd_lib.ModuleDict(autograd_lib.SecondOrderCov)
    for layer, activations in activations_dict.items():
        print(layer, activations)
        Acov[layer].accumulate(activations, activations)
    autograd_lib.set_default_activations(activations_dict)
    autograd_lib.set_default_Acov(Acov)

    B = autograd_lib.ModuleDict(autograd_lib.SymmetricFourthOrderCov)
    autograd_lib.backward_accum(Y, "identity", B, retain_graph=False)

    print(B[model.layers[0]])

    autograd_lib.backprop_hess(Y, hess_type='LeastSquares')
    autograd_lib.compute_hess(model, method='kron', attr_name='hess_kron', vecr_order=False, loss_aggregation='sum')
    param = model.layers[0].weight

    hess2 = param.hess_kron
    print(hess2)

    u.check_equal(hess2, [[425, 170, -75, -30], [170, 680, -30, -120], [-75, -30, 225, 90], [-30, -120, 90, 360]])

    # Gradient test
    model.zero_grad()
    loss.backward()
    u.check_close(u.vec(G).flatten(), u.Vec(param.grad))

    # Newton step test
    # Method 0: PyTorch native autograd
    newton_step0 = param.grad.flatten() @ torch.pinverse(hess0)
    newton_step0 = newton_step0.reshape(param.shape)
    u.check_equal(newton_step0, [[-5, 0], [-2, -6]])

    # Method 1: column-major order
    ihess2 = hess2.pinv()
    u.check_equal(ihess2.LL, [[1/16, 1/48], [1/48, 17/144]])
    u.check_equal(ihess2.RR, [[2/45, -(1/90)], [-(1/90), 1/36]])
    u.check_equal(torch.flatten(hess2.pinv() @ u.vec(G)), [-5, -2, 0, -6])
    newton_step1 = (ihess2 @ u.Vec(param.grad)).matrix_form()

    # Method 2: row-major order
    ihess2_rowmajor = ihess2.commute()
    newton_step2 = ihess2_rowmajor @ u.Vecr(param.grad)
    newton_step2 = newton_step2.matrix_form()

    u.check_equal(newton_step0, newton_step1)
    u.check_equal(newton_step0, newton_step2)
Example #15
    def decode(self, x):
        x = F.relu(self.dec1(x))
        x = F.relu(self.dec2(x))
        x = F.relu(self.dec3(x))
        x = F.relu(self.dec4(x))
        x = F.relu(self.dec5(x))
        x = F.relu(self.dec6(x))

        return x

    def forward(self, x):

        x = self.encode(x)
        representation = x
        x = self.decode(x)

        return x, representation


if __name__ == "__main__":
    random_data = torch.rand((1, 1, 28, 28))
    print(random_data.shape)
    flat_data = torch.flatten(random_data)
    print(flat_data.shape)

    my_nn = AutoEncoder(input_shape=784, output_shape=8)
    my_nn.eval()

    print(my_nn(flat_data))
Example #16
def quantAwareTrainingForward(model,
                              x,
                              stats,
                              vis=False,
                              axs=None,
                              sym=False,
                              num_bits=8,
                              act_quant=False):

    conv1weight = model.conv1.weight.data
    model.conv1.weight.data = FakeQuantOp.apply(model.conv1.weight.data,
                                                num_bits)
    x = F.relu(model.conv1(x))
    x = model.bn1(x)

    with torch.no_grad():
        stats = updateStats(x.clone().view(x.shape[0], -1), stats, 'conv1')

    if act_quant:
        x = FakeQuantOp.apply(x, num_bits, stats['conv1']['ema_min'],
                              stats['conv1']['ema_max'])

    x = F.max_pool2d(x, 3, 2)

    conv2weight = model.conv2.weight.data
    model.conv2.weight.data = FakeQuantOp.apply(model.conv2.weight.data,
                                                num_bits)
    x = F.relu(model.conv2(x))
    x = model.bn2(x)

    with torch.no_grad():
        stats = updateStats(x.clone().view(x.shape[0], -1), stats, 'conv2')

    if act_quant:
        x = FakeQuantOp.apply(x, num_bits, stats['conv2']['ema_min'],
                              stats['conv2']['ema_max'])

    x = F.max_pool2d(x, 3, 2)

    conv3weight = model.conv3.weight.data
    model.conv3.weight.data = FakeQuantOp.apply(model.conv3.weight.data,
                                                num_bits)
    x = F.relu(model.conv3(x))
    x = model.bn3(x)

    with torch.no_grad():
        stats = updateStats(x.clone().view(x.shape[0], -1), stats, 'conv3')

    if act_quant:
        x = FakeQuantOp.apply(x, num_bits, stats['conv3']['ema_min'],
                              stats['conv3']['ema_max'])

    conv4weight = model.conv4.weight.data
    model.conv4.weight.data = FakeQuantOp.apply(model.conv4.weight.data,
                                                num_bits)
    x = F.relu(model.conv4(x))
    x = model.bn4(x)

    with torch.no_grad():
        stats = updateStats(x.clone().view(x.shape[0], -1), stats, 'conv4')

    if act_quant:
        x = FakeQuantOp.apply(x, num_bits, stats['conv4']['ema_min'],
                              stats['conv4']['ema_max'])

    conv5weight = model.conv5.weight.data
    model.conv5.weight.data = FakeQuantOp.apply(model.conv5.weight.data,
                                                num_bits)
    x = F.relu(model.conv5(x))
    x = model.bn5(x)

    with torch.no_grad():
        stats = updateStats(x.clone().view(x.shape[0], -1), stats, 'conv5')

    if act_quant:
        x = FakeQuantOp.apply(x, num_bits, stats['conv5']['ema_min'],
                              stats['conv5']['ema_max'])

    x = F.max_pool2d(x, 3, 2)
    x = F.adaptive_avg_pool2d(x, (6, 6))
    x = torch.flatten(x, 1)
    # x = x.view(-1, 1250)  # CIFAR
    x = model.dropout(x)

    fc1weight = model.fc1.weight.data
    model.fc1.weight.data = FakeQuantOp.apply(model.fc1.weight.data, num_bits)
    x = F.relu(model.fc1(x))
    x = model.dropout(x)

    with torch.no_grad():
        stats = updateStats(x.clone().view(x.shape[0], -1), stats, 'fc1')

    if act_quant:
        x = FakeQuantOp.apply(x, num_bits, stats['fc1']['ema_min'],
                              stats['fc1']['ema_max'])

    fc2weight = model.fc2.weight.data
    model.fc2.weight.data = FakeQuantOp.apply(model.fc2.weight.data, num_bits)
    x = F.relu(model.fc2(x))

    with torch.no_grad():
        stats = updateStats(x.clone().view(x.shape[0], -1), stats, 'fc2')

    if act_quant:
        x = FakeQuantOp.apply(x, num_bits, stats['fc2']['ema_min'],
                              stats['fc2']['ema_max'])

    x = model.fc3(x)

    with torch.no_grad():
        stats = updateStats(x.clone().view(x.shape[0], -1), stats, 'fc2')


    return x, \
           conv1weight, conv2weight, conv3weight, conv4weight,conv5weight,\
           fc1weight, fc2weight, stats
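FakeQuantOp and updateStats are defined elsewhere in that project and are not shown here. The quantize-then-dequantize step that a fake-quantization op of this kind typically performs can be sketched with plain tensor ops; this is an illustration of the idea under assumed conventions, not the project's implementation:

import torch

def fake_quantize(x, num_bits=8, min_val=None, max_val=None):
    """Uniform affine quantize-dequantize, keeping the tensor in float."""
    min_val = x.min().item() if min_val is None else min_val
    max_val = x.max().item() if max_val is None else max_val
    qmin, qmax = 0, 2 ** num_bits - 1
    scale = (max_val - min_val) / (qmax - qmin) or 1.0
    zero_point = qmin - min_val / scale
    q = torch.clamp((x / scale + zero_point).round(), qmin, qmax)
    return (q - zero_point) * scale

w = torch.randn(16, 16)
print((w - fake_quantize(w, num_bits=8)).abs().max())   # small quantization error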
Example #17
def handle_model(
    index: int,
    data: List[DataObj],
    torch_obj: Union[str, List],
    master_url: str = '127.0.0.1',
    iters: int = 1000,
    world_size: int = 2,
    early_stop_patience: int = -1,
    verbose: int = 1,
    mini_batch: int = -1,
    validation_pct: float = 0,
    device: str = 'cpu'
) -> List[Dict]:
    """
    Runs the training of pytorch model, utilizing the distributed package.

    :param index: Partition index. Used for registering.
    :param data: The data from the partition
    :param torch_obj: The torch object string. Needs to be serialized.
    :param master_url: The master url for the service.
    :param iters: The iterations for training
    :param world_size: The number of partitions. Typically partitions + 1 for the driver.
    :param verbose: whether to log the loss or not.
    :param mini_batch: Mini batch for training
    :param validation_pct: Validation percentage.
    :param device: The pytorch device to use for training. cpu/cuda
    :param early_stop_patience: Patience for early stopping. -1 means don't use early stopping.

    :return: A list of the model state dictionary.
    """

    # If a process has already been setup on the machine, kill it.
    if dist.is_initialized():
        dist.destroy_process_group()

    # Set up the distributed server.
    os.environ['MASTER_ADDR'] = master_url
    os.environ['MASTER_PORT'] = '3333'

    dist.init_process_group('gloo', rank=index + 1, world_size=world_size)

    # Def Load model
    if index == -1:
        process_generic_model(torch_obj, iters, early_stop_patience > 0)
        return []
    else:
        torch_obj = load_torch_model(torch_obj)

    # Loaded the model
    model = torch_obj.model.to(device)
    model.train()
    criterion = torch_obj.criterion
    optimizer = torch_obj.optimizer

    # Set up early stopping
    es = EarlyStopping(patience=early_stop_patience)
    should_stop = torch.zeros(1)
    has_early_stop = early_stop_patience > 0

    partition_id = str(uuid4())

    # Process the data. Converts to x_train, y_train, x_val, y_val
    data_obj = handle_features(data, validation_pct)

    # check if data is none. We will still need to register.
    if data_obj is None or data_obj.x_train is None:
        process_generic_model([list(p.shape) for p in model.parameters()], iters, early_stop_patience > 0)
        return []

    # Passes all of the data
    x_train = data_obj.x_train.to(device)
    y_train = data_obj.y_train.to(device) if data_obj.y_train is not None else x_train
    x_val = data_obj.x_val.to(device) if data_obj.x_val is not None else None
    y_val = data_obj.y_val.to(device) if data_obj.y_val is not None else x_val

    for i in range(iters):

        optimizer.zero_grad()

        # utilize minibatch
        if 0 < mini_batch < len(data_obj.x_train):
            idxs = np.random.choice(len(data_obj.x_train), mini_batch, replace=False).tolist()
            x_train = data_obj.x_train[idxs]
            y_train = data_obj.y_train[idxs]

        y_pred = model(x_train)

        try:
            loss = criterion(y_pred, y_train)
        except RuntimeError as e:
            # utilized when loss need a long label
            y_train = torch.flatten(y_train.long())
            loss = criterion(y_pred, y_train)

        loss_v = loss.item()

        # Process validation loss
        val_loss = None
        if x_val is not None:
            pred_val = model(x_val)

            try:
                val_loss = criterion(pred_val, y_val)
            except RuntimeError as e:
                y_val = torch.flatten(y_val.long())
                val_loss = criterion(pred_val, y_val)

            val_loss = val_loss.item()

        # Calculate gradients
        loss.backward()

        # Distributed part of training.
        for param in model.parameters():
            dist.all_reduce(param.grad.data, op=torch.distributed.ReduceOp.SUM)
            param.grad.data /= (world_size-1)

        # Processes the early stop work
        if has_early_stop:
            loss_to_use = val_loss if val_loss is not None else loss_v
            stop = es.step(loss_to_use)
            if stop:
                should_stop = should_stop + 1.0

            dist.all_reduce(should_stop, op=torch.distributed.ReduceOp.SUM)
            if should_stop.item() > 0:
                break

        optimizer.step()

        if verbose:
            print(f"Partition: {partition_id}. Iteration: {i}. Loss: {loss_v}, Val Loss: {val_loss}")

    return [model.state_dict()]
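The try/except around the loss call exists because nn.CrossEntropyLoss wants class-index targets as a 1-D LongTensor, which is what torch.flatten(y_train.long()) produces. A minimal illustration of that conversion (shapes and labels are made up):

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
logits = torch.randn(4, 3)                  # (batch, num_classes)
y = torch.tensor([[0.], [2.], [1.], [2.]])  # labels stored as a float column

# criterion(logits, y) would raise: targets must be 1-D class indices of dtype long
targets = torch.flatten(y.long())           # shape (4,), dtype int64
loss = criterion(logits, targets)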
Example #18
def _flatten_tensor_optim_state(
    state_name: str,
    pos_dim_tensors: List[torch.Tensor],
    unflat_param_names: List[str],
    unflat_param_shapes: List[torch.Size],
    flat_param: FlatParameter,
) -> torch.Tensor:
    """
    Flattens the positive-dimension tensor optimizer state given by the values
    ``pos_dim_tensors`` for the state ``state_name`` for a single flattened
    parameter ``flat_param`` corresponding to the unflattened parameter names
    ``unflat_param_names`` and unflattened parameter shapes
    ``unflat_param_shapes``. This flattens each unflattened parameter's tensor
    state into one tensor.

    NOTE: We use zero tensors for any unflattened parameters without state
    since some value is required to fill those entries. This assumes that the
    zero tensor is mathematically equivalent to having no state, which is true
    for Adam's ``exp_avg`` and ``exp_avg_sq`` but may not be true for all
    optimizers.

    Args:
        state_name (str): Optimizer state name.
        pos_dim_tensors (List[torch.Tensor]): Positive-dimension tensor
            optimizer state values for the unflattened parameters corresponding
            to the single flattened parameter.
        unflat_param_names (List[str]): A :class:`list` of unflattened
            parameter names corresponding to the single flattened parameter.
        unflat_param_shapes (List[torch.Size]): Unflattened parameter shapes
            corresponding to the single flattened parameter.
        flat_param (FlatParameter): The flattened parameter.

    Returns:
        flat_tensor (torch.Tensor): A flattened tensor containing the optimizer
            state corresponding to ``state_name`` constructed by concatenating
            the unflattened parameter tensor states in ``pos_dim_tensors``
            (using zero tensors for any unflattened parameters without the
            state).
    """
    non_none_tensors = [t for t in pos_dim_tensors if t is not None]
    # Check that all are tensors with the same dtype
    dtypes = set(t.dtype for t in non_none_tensors)
    if len(dtypes) != 1:
        raise ValueError(
            "All unflattened parameters comprising a single flattened "
            "parameter must have positive-dimension tensor state with the "
            f"same dtype but got dtypes {dtypes} for state {state_name} and "
            f"unflattened parameter names {unflat_param_names}"
        )
    dtype = next(iter(dtypes))
    # Check that each tensor state matches its parameter's shape
    for tensor, shape in zip(pos_dim_tensors, unflat_param_shapes):
        if tensor is None and len(shape) == 0:
            raise ValueError(
                "Flattening a zero-dimension parameter is not supported"
            )
        elif tensor is not None and tensor.shape != shape:
            raise ValueError(
                "Tensor optimizer state does not have same shape as its "
                f"parameter: {tensor.shape} {shape}"
            )
    # Flatten the tensor states
    cpu_device = torch.device("cpu")
    tensors = [
        torch.flatten(state_value.to(cpu_device)) if state_value is not None
        else torch.flatten(torch.zeros(
            size=shape, dtype=dtype, device=cpu_device,
        ))
        for state_value, shape
        in zip(pos_dim_tensors, unflat_param_shapes)
    ]
    padding = flat_param.num_padded
    if padding > 0:
        tensors.append(torch.zeros(padding, dtype=dtype, device=cpu_device))
    flat_tensor = torch.cat(tensors)
    # `flat_tensor`'s shape should be 1D and less than or equal to the
    # flattened parameter's shape (where the inequality is strict for positive
    # padding)
    if not flat_param._is_sharded:  # currently, only when world size is 1
        # If the parameter is not sharded, then `_full_param_padded` is not
        # used, so we skip the shape check
        return flat_tensor
    full_padded_dim = flat_param._full_param_padded.dim()  # type: ignore[attr-defined]
    full_padded_shape = flat_param._full_param_padded.shape  # type: ignore[attr-defined]
    assert flat_tensor.dim() == 1, \
        f"`flat_tensor` should be 1D but got {flat_tensor.dim()} dims"
    assert full_padded_dim == 1, \
        f"`_full_param_padded` should be 1D but got {full_padded_dim} dims"
    assert flat_tensor.shape[0] <= full_padded_shape[0], \
        f"tensor optim state: {flat_tensor.shape} " \
        f"parameter: {full_padded_shape}"
    return flat_tensor
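The zero-filling behaviour documented above can be reproduced in isolation. The following is a minimal sketch (not the FSDP implementation) of concatenating per-parameter exp_avg-style state into a single 1D tensor; flatten_state_sketch and its inputs are hypothetical names used only for illustration.

import torch

def flatten_state_sketch(state_tensors, shapes, num_padded):
    """Minimal sketch: concatenate per-parameter optimizer state into one 1D
    tensor, substituting zeros for parameters that have no state yet."""
    pieces = []
    for tensor, shape in zip(state_tensors, shapes):
        if tensor is None:
            # No state recorded for this parameter: fill its slot with zeros.
            pieces.append(torch.zeros(shape).flatten())
        else:
            pieces.append(tensor.flatten())
    if num_padded > 0:
        # Mirror the trailing padding of the flattened parameter.
        pieces.append(torch.zeros(num_padded))
    return torch.cat(pieces)

# Hypothetical inputs: two parameters, the second without any recorded state.
flat = flatten_state_sketch(
    [torch.ones(2, 3), None],
    [torch.Size([2, 3]), torch.Size([4])],
    num_padded=2,
)
print(flat.shape)  # torch.Size([12]) == 6 + 4 + 2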
Ejemplo n.º 19
0
 def forward(self, x):
     out = self.conv(x)
     out = torch.flatten(out, 1)
     return self.linear(out)
Ejemplo n.º 20
0
def train(model,
          input_channel,
          optimizers,
          criterion,
          components,
          train_loader,
          val_loader,
          epoch,
          writer,
          args,
          use_CUDA=True,
          clamp=False,
          num_classes=10):
    model.train()
    accs = []
    losses_w1 = []
    losses_w2 = []
    iter_val_loader = iter(val_loader)
    meta_criterion = nn.CrossEntropyLoss(reduction='none')  # per-sample losses; `reduce=False` is deprecated
    index = 0
    noisy_labels = []
    true_labels = []

    w = defaultdict()
    w_logger = defaultdict()
    losses_logger = defaultdict()
    accuracy_logger = ScalarLogger(prefix='accuracy')
    for c in components:
        w[c] = None
        w_logger[c] = WLogger()
        losses_logger[c] = ScalarLogger(prefix='loss')

    for (input, label, real) in train_loader:
        noisy_labels.append(label)
        true_labels.append(real)

        meta_model = get_model(args,
                               num_classes=num_classes,
                               input_channel=input_channel)
        meta_model.load_state_dict(model.state_dict())
        if use_CUDA:
            meta_model = meta_model.cuda()

        val_input, val_label, iter_val_loader = get_val_samples(
            iter_val_loader, val_loader)
        input = to_var(input, requires_grad=False)
        label = to_var(label, requires_grad=False).long()
        val_input = to_var(val_input, requires_grad=False)
        val_label = to_var(val_label, requires_grad=False).long()

        meta_output = meta_model(input)
        cost = meta_criterion(meta_output, label)
        eps = to_var(torch.zeros(cost.size()))
        meta_loss = (cost * eps).sum()
        meta_model.zero_grad()

        if 'all' in components:
            grads = torch.autograd.grad(meta_loss, (meta_model.parameters()),
                                        create_graph=True)
            meta_model.update_params(0.001, source_params=grads)

            meta_val_output = meta_model(val_input)
            meta_val_loss = meta_criterion(meta_val_output, val_label).sum()
            grad_eps = torch.autograd.grad(meta_val_loss,
                                           eps,
                                           only_inputs=True)[0]
            if clamp:
                w['all'] = torch.clamp(-grad_eps, min=0)
            else:
                w['all'] = -grad_eps

            norm = torch.sum(abs(w['all']))
            assert (clamp and len(components) == 1) or len(components) > 1, \
                "invalid combination of clamp and components"
            w['all'] = w['all'] / norm
            if ('fc' in components):
                w['fc'] = copy.deepcopy(w['all'])
                w['fc'] = torch.clamp(w['fc'], max=0)
                w['all'] = torch.clamp(w['all'], min=0)
            elif ('backbone' in components):
                w['backbone'] = copy.deepcopy(w['all'])
                w['backbone'] = torch.clamp(w['backbone'], max=0)
                w['all'] = torch.clamp(w['all'], min=0)

        else:
            assert ('backbone' in components) and ('fc' in components)

            grads_backbone = torch.autograd.grad(
                meta_loss, (meta_model.backbone.parameters()),
                create_graph=True,
                retain_graph=True)
            grads_fc = torch.autograd.grad(meta_loss,
                                           (meta_model.fc.parameters()),
                                           create_graph=True)

            # Backbone Grads
            meta_model.backbone.update_params(0.001,
                                              source_params=grads_backbone)
            meta_val_feature = torch.flatten(meta_model.backbone(val_input), 1)
            meta_val_output = meta_model.fc(meta_val_feature)  # classify the flattened features, not the raw input
            meta_val_loss = meta_criterion(meta_val_output, val_label).sum()

            if args.with_kl and args.reg_start <= epoch:
                train_feature = torch.flatten(meta_model.backbone(input), 1)
                meta_val_loss -= sample_wise_kl(train_feature,
                                                meta_val_feature)

            grad_eps = torch.autograd.grad(meta_val_loss,
                                           eps,
                                           only_inputs=True,
                                           retain_graph=True)[0]
            if clamp:
                w['backbone'] = torch.clamp(-grad_eps, min=0)
            else:
                w['backbone'] = -grad_eps
            norm = torch.sum(abs(w['backbone']))
            w['backbone'] = w['backbone'] / norm

            # FC backward
            meta_model.load_state_dict(model.state_dict())
            meta_model.fc.update_params(0.001, source_params=grads_fc)
            meta_val_output = meta_model(val_input)
            meta_val_loss = meta_criterion(meta_val_output, val_label).sum()
            grad_eps = torch.autograd.grad(meta_val_loss,
                                           eps,
                                           only_inputs=True,
                                           retain_graph=True)[0]

            if clamp:
                w['fc'] = torch.clamp(-grad_eps, min=0)
            else:
                w['fc'] = -grad_eps
            norm = torch.sum(abs(w['fc']))
            w['fc'] = w['fc'] / norm

        index += 1
        output = model(input)
        loss = defaultdict()
        prediction = torch.softmax(output, 1)
        for c in components:
            w_logger[c].update(w[c])
            loss[c] = (meta_criterion(output, label) * w[c]).sum()
            optimizers[c].zero_grad()
            loss[c].backward(retain_graph=True)
            optimizers[c].step()
            losses_logger[c].update(loss[c])

        top1 = accuracy(prediction, label)
        accuracy_logger.update(top1)

    noisy_labels = torch.cat(noisy_labels)
    true_labels = torch.cat(true_labels)
    mask = (noisy_labels != true_labels).cpu().numpy()
    for c in components:
        w_logger[c].write(writer, c, epoch)
        w_logger[c].mask_write(writer, c, epoch, mask)
        losses_logger[c].write(writer, c, epoch)

    accuracy_logger.write(writer, 'train', epoch)

    print("Training Epoch: {}, Accuracy: {}".format(epoch,
                                                    accuracy_logger.avg()))
    return accuracy_logger.avg()
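The heart of the training loop above is the learning-to-reweight step: per-example weights are obtained from the gradient of a held-out validation loss with respect to the per-example loss coefficients eps. The sketch below isolates that step for a single hypothetical nn.Linear model; names such as example_weights_sketch and inner_lr are illustrative and not from the original code.

import torch
import torch.nn as nn
import torch.nn.functional as F

def example_weights_sketch(model, x, y, x_val, y_val, inner_lr=0.001):
    # Per-example training losses weighted by eps (initialised to zero).
    cost = F.cross_entropy(model(x), y, reduction='none')
    eps = torch.zeros_like(cost, requires_grad=True)
    meta_loss = (cost * eps).sum()

    # One differentiable SGD step on copies of the parameters.
    params = tuple(model.parameters())
    grads = torch.autograd.grad(meta_loss, params, create_graph=True)
    w_adapt, b_adapt = [p - inner_lr * g for p, g in zip(params, grads)]

    # Validation loss of the adapted (functional) linear model.
    val_loss = F.cross_entropy(x_val @ w_adapt.t() + b_adapt, y_val)

    # Example weights: negative gradient of the validation loss w.r.t. eps,
    # clamped to be non-negative and normalised, as in the loop above.
    grad_eps = torch.autograd.grad(val_loss, eps)[0]
    weights = torch.clamp(-grad_eps, min=0)
    return weights / (weights.sum() + 1e-8)

model = nn.Linear(10, 3)
weights = example_weights_sketch(
    model, torch.randn(8, 10), torch.randint(0, 3, (8,)),
    torch.randn(4, 10), torch.randint(0, 3, (4,)))
print(weights.shape)  # torch.Size([8])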
Ejemplo n.º 21
0
    def evaluate(self, epoch_i):
        #--- load model ---
        self.load_model(epoch_i)
        self.model.eval()
        for r_id, doc_features in enumerate(tqdm(
                self.test_loader, desc='Test', dynamic_ncols=True, ascii=True)):
            _, d_t = doc_features 
            try:
                if torch.sum(d_t)==0:
                    continue
                #--- Gen 1st step ---
                with torch.no_grad():
                    hc_list = (
                        torch.zeros(self.config.num_layers, 1, self.config.hidden_size),
                        torch.zeros(self.config.num_layers, 1, self.config.hidden_size)
                    )

                b = (0.0, [self.i2w[1]], [1], hc_list, d_t)
                _prob, hc_list, d_t = self.gen_one_step(b[2], b[3], b[4], self.sc_rnn_fw, self.w_hr_fw, self.w_ho_fw)
                top_indices = self.get_top_index(_prob)
                beam_candidates = []        
                for i in range(self.config.beam_size):
                    wordix = top_indices[i]
                    beam_candidates.append((b[0] + _prob[wordix], b[1] + [self.i2w[wordix]], [wordix], hc_list, d_t))

                #--- Gen the whole sentence ---
                beams = beam_candidates[:self.config.beam_size]
                for t in range(self.config.gen_size - 1):
                    beam_candidates = []
                    for b in beams:
                        _prob, hc_list, d_t = self.gen_one_step(b[2], b[3], b[4], self.sc_rnn_fw, self.w_hr_fw, self.w_ho_fw)
                        top_indices = self.get_top_index(_prob)

                        for i in range(self.config.beam_size):
                            #--- already EOS ---
                            if b[2]==[2]: 
                                beam_candidates.append(b)
                                break
                            wordix = top_indices[i]
                            beam_candidates.append((b[0] + _prob[wordix], b[1] + [self.i2w[wordix]], [wordix], hc_list, d_t))

                    beam_candidates.sort(key=lambda x:x[0]/(len(x[1])-1), reverse = True) # decreasing order
                    beams = beam_candidates[:self.config.beam_size] # truncate to get new beams

                #--- RERANK beams ---
                beams = self.rerank(beams, doc_features[1])
                beams.sort(key=lambda x: x[0], reverse=True)

                res = "[*]EP_{}_KW_[{}]_SENT_[{}]\n".format(
                    epoch_i,
                    ' '.join([self.i2k[int(j)] for j in torch.flatten(torch.nonzero(doc_features[1][0])).numpy()]),
                    ' '.join(beams[0][1])
                )
                print(res)
                self.out_file.write(res)
                self.out_file.flush()
            except Exception as e:
                print('Exception: ', str(e))
#         self.out_file.close()
        
        self.model.train()
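Inside the beam loop above, candidates are ranked by cumulative log-probability divided by the number of generated tokens (excluding the start token), a standard length-normalization heuristic. A self-contained sketch of that scoring with made-up candidates:

# Minimal sketch of length-normalized beam ranking; candidates are
# hypothetical (cumulative_log_prob, tokens) pairs for illustration only.
beam_size = 2
candidates = [
    (-1.2, ['<s>', 'a', 'cat']),
    (-1.5, ['<s>', 'a', 'small', 'cat']),
    (-0.9, ['<s>', 'hello']),
]
# Divide by generated length, then keep the top `beam_size`, as in the loop above.
candidates.sort(key=lambda c: c[0] / (len(c[1]) - 1), reverse=True)
beams = candidates[:beam_size]
print([tokens for _, tokens in beams])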
Ejemplo n.º 22
0
 def forward(self, x):
     x = self.features(x)
     x = torch.flatten(x, 1)
     logits = self.classifier(x)
     probas = torch.nn.functional.softmax(logits, dim=1)
     return logits, probas
Ejemplo n.º 23
0
    def high_level_update_q_func_with_goal(self, batch):
        """
        Compute and apply the losses for both critics (Q-functions) of the
        high-level controller on a goal-conditioned batch.
        """
        batch_next_state = batch["next_state"]
        batch_rewards = batch["reward"]
        batch_terminal = batch["is_state_terminal"]
        batch_state = batch["state"]
        batch_goal = batch["goal"]
        batch_actions = batch["action"]
        batch_discount = batch["discount"]

        with torch.no_grad(), \
                pfrl.utils.evaluating(self.target_policy), \
                pfrl.utils.evaluating(self.policy), \
                pfrl.utils.evaluating(self.target_q_func1), \
                pfrl.utils.evaluating(self.target_q_func2):
            if self.add_entropy:
                next_action_distrib = self.policy(torch.cat([batch_next_state, batch_goal], -1))
                next_actions_normalized = next_action_distrib.sample()
                next_actions = self.scale * next_actions_normalized
            else:
                next_action_distrib = self.target_policy(torch.cat([batch_next_state, batch_goal], -1))
                next_actions_normalized = next_action_distrib.sample()
                next_actions = self.target_policy_smoothing_func(
                    self.scale * next_actions_normalized
                )

            entropy_term = 0
            if self.add_entropy:
                next_log_prob = next_action_distrib.log_prob(next_actions_normalized)
                entropy_term = self.temperature * next_log_prob[..., None]

            next_q1 = self.target_q_func1((torch.cat([batch_next_state, batch_goal], -1), next_actions))
            next_q2 = self.target_q_func2((torch.cat([batch_next_state, batch_goal], -1), next_actions))
            next_q = torch.min(next_q1, next_q2)

            target_q = batch_rewards + batch_discount * (
                1.0 - batch_terminal
            ) * torch.flatten(next_q - entropy_term)

        predict_q1 = torch.flatten(self.q_func1((torch.cat([batch_state, batch_goal], -1), batch_actions)))
        predict_q2 = torch.flatten(self.q_func2((torch.cat([batch_state, batch_goal], -1), batch_actions)))

        loss1 = F.smooth_l1_loss(target_q, predict_q1)
        loss2 = F.smooth_l1_loss(target_q, predict_q2)

        # Update stats
        self.q1_record.extend(predict_q1.detach().cpu().numpy())
        self.q2_record.extend(predict_q2.detach().cpu().numpy())
        self.q_func1_loss_record.append(float(loss1))
        self.q_func2_loss_record.append(float(loss2))

        q1_recent_variance = np.var(list(self.q1_record)[-self.recent_variance_size:])
        q2_recent_variance = np.var(list(self.q2_record)[-self.recent_variance_size:])
        self.q_func1_variance_record.append(q1_recent_variance)
        self.q_func2_variance_record.append(q2_recent_variance)

        self.q_func1_optimizer.zero_grad()
        loss1.backward()
        if self.max_grad_norm is not None:
            clip_l2_grad_norm_(self.q_func1.parameters(), self.max_grad_norm)
        self.q_func1_optimizer.step()

        self.q_func2_optimizer.zero_grad()
        loss2.backward()
        if self.max_grad_norm is not None:
            clip_l2_grad_norm_(self.q_func2.parameters(), self.max_grad_norm)
        self.q_func2_optimizer.step()

        self.q_func_n_updates += 1
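The target computed under torch.no_grad() above is the clipped double-Q bootstrap: the minimum of the two target critics, optionally reduced by an entropy term, discounted and masked by the terminal flag. A minimal sketch of just that target, with hypothetical tensors standing in for the networks:

import torch

def td_target_sketch(rewards, discount, terminal, next_q1, next_q2, entropy_term=0.0):
    """Hypothetical sketch of the clipped double-Q bootstrap target."""
    next_q = torch.min(next_q1, next_q2)  # pessimistic value estimate
    return rewards + discount * (1.0 - terminal) * torch.flatten(next_q - entropy_term)

# Batch of 4 transitions with made-up values.
target = td_target_sketch(
    rewards=torch.tensor([1.0, 0.0, 0.5, 1.0]),
    discount=torch.full((4,), 0.99),
    terminal=torch.tensor([0.0, 0.0, 1.0, 0.0]),
    next_q1=torch.randn(4, 1),
    next_q2=torch.randn(4, 1),
)
print(target.shape)  # torch.Size([4])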
Ejemplo n.º 24
0
    def forward(self, x):
        # Input dimensions: 490x326x3
        # Output dimensions: 488x324x15
        x = self.conv1(x)
        x = self.conv1_bn(x)
        x = F.relu(x)
        x = self.dropout1(x)
        # Input dimensions: 488x324x15
        # Output dimensions: 486x322x15
        x = self.conv2(x)
        x = self.conv2_bn(x)
        x = F.relu(x)
        x = self.dropout1(x)
        # Input dimensions: 486x322x15
        # Output dimensions: 243x161x15
        x = F.max_pool2d(x, 2)

        # Input dimensions: 243x161x15
        # Output dimensions: 241x159x30
        x = self.conv3(x)
        x = self.conv3_bn(x)
        x = F.relu(x)
        x = self.dropout1(x)
        # Input dimensions: 241x159x30
        # Output dimensions: 239x157x30
        x = self.conv4(x)
        x = self.conv4_bn(x)
        x = F.relu(x)
        x = self.dropout1(x)
        # Input dimensions: 239x157x30
        # Output dimensions: 120x79x30
        x = F.max_pool2d(x, 2, ceil_mode=True)

        # Input dimensions: 120x79x30
        # Output dimensions: 118x77x60
        x = self.conv5(x)
        x = self.conv5_bn(x)
        x = F.relu(x)
        x = self.dropout1(x)
        # Input dimensions: 118x77x60
        # Output dimensions: 116x75x60
        x = self.conv6(x)
        x = self.conv6_bn(x)
        x = F.relu(x)
        x = self.dropout1(x)
        # Input dimensions: 116x75x60
        # Output dimensions: 58x38x60
        x = F.max_pool2d(x, 2, ceil_mode=True)

        # Input dimensions: 58x38x60
        # Output dimensions: 56x36x120
        x = self.conv7(x)
        x = self.conv7_bn(x)
        x = F.relu(x)
        x = self.dropout1(x)
        # Input dimensions: 56x36x120
        # Output dimensions: 54x34x120
        x = self.conv8(x)
        x = self.conv8_bn(x)
        x = F.relu(x)
        x = self.dropout1(x)
        # Input dimensions: 54x34x120
        # Output dimensions: 27x17x120
        x = F.max_pool2d(x, 2, ceil_mode=True)

        # Input dimensions: 27x17x120
        # Output dimensions: 55080x1
        x = torch.flatten(x, 1)

        # Fully connected layers for x label prediction
        # Input dimensions: 55080x1
        # Output dimensions: 256x1
        x_label = self.fc1x(x)
        x_label = self.fc1x_bn(x_label)
        x_label = F.relu(x_label)
        x_label = self.dropout2(x_label)
        # Input dimensions: 256x1
        # Output dimensions: 20x1
        x_label = self.fc2x(x_label)

        # Fully connected layers for y label prediction
        # Input dimensions: 55080x1
        # Output dimensions: 256x1
        y_label = self.fc1y(x)
        y_label = self.fc1y_bn(y_label)
        y_label = F.relu(y_label)
        y_label = self.dropout2(y_label)
        # Input dimensions: 256x1
        # Output dimensions: 20x1
        y_label = self.fc2y(y_label)

        # Use log softmax to get probabilities for each class. We
        # can then get the class prediction by simply taking the index
        # with the maximum value.
        output_x = F.log_softmax(x_label, dim=1)
        output_y = F.log_softmax(y_label, dim=1)
        return output_x, output_y
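The flattening step above can be sanity-checked in isolation: a 120-channel feature map at 27x17 spatial resolution flattens to 27 * 17 * 120 = 55080 features per sample. A quick check with a dummy tensor whose shape is assumed from the comments:

import torch

# Dummy feature map: batch of 2, 120 channels, 27x17 spatial grid.
feat = torch.randn(2, 120, 27, 17)
flat = torch.flatten(feat, 1)
print(flat.shape)  # torch.Size([2, 55080]) since 120 * 27 * 17 == 55080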
Ejemplo n.º 25
0
 def forward(self, input):
     return torch.flatten(input,
                          start_dim=self.start_dim,
                          end_dim=self.end_dim)
Ejemplo n.º 26
0
 def forward_once(self, x):
     x = self.model(x)
     x = self.avgpool(x)
     x = torch.flatten(x, start_dim=1, end_dim=-1)
     return self.classifier(x)
Ejemplo n.º 27
0
 def forward(self, x: torch.Tensor) -> torch.Tensor:
     x = self.features(x)
     x = self.classifier(x)
     return torch.flatten(x, 1)
Ejemplo n.º 28
0
 def forward(self, x):
     x = torch.flatten(x, start_dim=1)
     x = self.do(x)
     x = self.act(self.bn(self.fc1(x)))
     x = self.act_final(self.fc2(x))
     return x
Ejemplo n.º 29
0
def repeat(input, repeats, dim):
    # return th.repeat_interleave(input, repeats, dim) # PyTorch 1.1
    if dim < 0:
        dim += input.dim()
    return th.flatten(th.stack([input] * repeats, dim=dim + 1), dim, dim + 1)
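This helper emulates torch.repeat_interleave for PyTorch versions that predate it, by stacking `repeats` copies along a new dimension and flattening it back in. Assuming `th` is `import torch as th`, the two calls agree:

import torch as th

x = th.tensor([[1, 2], [3, 4]])
# Both calls tile each row 3 times along dim 0.
a = repeat(x, repeats=3, dim=0)
b = th.repeat_interleave(x, repeats=3, dim=0)
print(th.equal(a, b))  # True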
Ejemplo n.º 30
0
 def forward(self, x: torch.Tensor) -> torch.Tensor:
     return torch.flatten(x, start_dim=self.start_dim, end_dim=self.end_dim)