Example #1
    def run_x_times(self, number_of_feedforward_steps, taskenv):

        # the task never really terminates on its own: one feedforward step
        # corresponds to one trial, so instead we define an episode as a
        # fixed number of draws:

        # number_of_feedforward_steps ~ our batch size / episode length

        oh_prev_action = F.one_hot(ts(0), self.num_actions)
        oh_prev_reached_state = F.one_hot(ts(0), self.num_states)
        prev_receivd_rewrd = ts([0])

        # initialize the hidden and cell states / recurrent input to zeros
        cx = torch.zeros(1, self.num_rnn_units).view(1, 1, -1)
        hx = torch.zeros(1, self.num_rnn_units).view(1, 1, -1)

        #one batch ~ one episode of 200 trials, will be saved internally
        self.epbuffer = []

        taskenv.reset()

        for i in range(number_of_feedforward_steps):
            # i also serves as our timestep variable

            cinput = torch.cat(
                (oh_prev_action, oh_prev_reached_state, prev_receivd_rewrd,
                 ts([i])), 0).float().view(1, 1, self.input_size)

            # run the lstm on a single trial
            out, (hx, cx) = self.lstm(cinput, (hx, cx))

            # the LSTM output is fed into two different heads:
            # the estimated value of the current state
            # ~ roughly a cumulative estimate of the future reward
            value = self.value_outp_layer(out)
            policy_out = self.action_outp_layer(out)

            # draw action from the last and only action distribution
            policy_distrib = self.act_smx(policy_out).contiguous()
            act_distr = torch.distributions.Categorical(
                policy_distrib.view(-1, self.num_actions)[-1])
            act = act_distr.sample()

            # mean entropy of our action distribution (not needed here)
            # acc_entropy = acc_entropy+act_distr.entropy().mean()

            # execute action in the task_environment
            [reached_state, reward] = taskenv.conduct_action(act.item())
            # out: reached state is either 0 or 1; reward also either 0 or 1

            # do not keep any gradient-relevant info, i.e. do not save any tensors
            # save as: action taken -> state reached by that action -> reward received in that state -> timestep i -> value predicted for that action
            self.epbuffer.append(
                [act.item(), reached_state, reward, i,
                 value.item()])

            #self.a2c_ts_out_buffer.append(SavedAction(act_distr.log_prob(act), value))

            # prepare vars for the next trial
            oh_prev_reached_state = F.one_hot(ts(reached_state),
                                              self.num_states)
            oh_prev_action = F.one_hot(act, self.num_actions)
            prev_receivd_rewrd = ts([reward])

        # end of for number of feedforward steps

        return self.epbuffer
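# The method above assumes a taskenv object exposing reset() and
# conduct_action(action) -> [reached_state, reward]. A minimal hypothetical
# stand-in (a two-armed bandit whose state and reward are both 0 or 1), just to
# make the snippet runnable, could look like this:

import random


class TwoArmedBanditEnv:

    def __init__(self, p_reward=0.8):
        self.p_reward = p_reward
        self.good_arm = 0

    def reset(self):
        # randomly pick which arm pays out this episode
        self.good_arm = random.randint(0, 1)

    def conduct_action(self, action):
        # the reached state simply mirrors the chosen action here
        reached_state = action
        reward = 1 if (action == self.good_arm
                       and random.random() < self.p_reward) else 0
        return [reached_state, reward]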
Example #2
    def calc_loss_and_update_weights(self, epbuffer=None, t=0):

        # run the entire set of 200 trials again, this time as one batch
        # instead of one by one, using exactly the same data & actions as before
        # this gives a cleaner way to backpropagate, because the whole episode
        # can be fed to the network as a single input matrix

        # by default use the internal buffer; it can also be passed in
        # explicitly to make the code more readable
        if epbuffer is None: epbuffer = self.epbuffer
        epbuffer = np.array(epbuffer)  # make sure it is a numpy array

        ## prepare the input

        actions = epbuffer[:, 0]  # based on the policy head output of the A2C
        reached_states = epbuffer[:, 1]
        # the casts may be necessary because the array can end up with dtype
        # object, which cannot be converted to tensors directly
        # (otherwise they could be left out)
        rewards = epbuffer[:, 2].astype(np.int64)
        timesteps = epbuffer[:, 3].astype(np.int64)
        pred_values = epbuffer[:, 4]  # based on the value head output of the A2C

        prev_actions = [0] + actions[:-1].tolist()  # previously conducted actions
        prev_reached_states = [0] + reached_states[:-1].tolist()  # states reached by those actions
        prev_rewards = [0] + rewards[:-1].tolist()  # rewards received in those previous states

        # network needs tensors as input
        ohprev_actions = F.one_hot(ts(prev_actions).long(),
                                   self.num_actions).long()
        ohprev_reached_states = F.one_hot(
            ts(prev_reached_states).long(), self.num_states).long()
        prev_rewards = ts(prev_rewards).view(len(epbuffer), 1)
        timesteps_ts = ts(timesteps.tolist()).view(len(epbuffer), 1)

        #prev_reached_states = ts(prev_reached_states.tolist()).view(len(epbuffer),2)

        # merge them all horizontally (i.e. one array row contains one trial)
        cinput = torch.cat((ohprev_actions, ohprev_reached_states,
                            prev_rewards, timesteps_ts), 1)

        # reshape into the expected input shape, i.e. add another dimension for
        # the episode id; we only process a single episode of 200 trials
        # [trials per episode ~200, number of episodes ~1, input size ~ action+state+reward+timestep]
        cinput = cinput.float().view(len(epbuffer), 1, self.input_size)

        ## run the network

        # initialize the recurrent state of the LSTM; start from zero, as at the beginning of each episode (~200 trials)
        cx = torch.zeros(1, self.num_rnn_units).view(1, 1, -1)
        hx = torch.zeros(1, self.num_rnn_units).view(1, 1, -1)

        # feed the input into the LSTM nodes and get the output
        out, (hx, cx) = self.lstm(cinput, (hx, cx))

        # two heads for the A2C algorithm (actor critic);
        # feed in the output gathered from the hidden nodes of the LSTM
        values = self.value_outp_layer(out)
        policy_out1 = self.action_outp_layer(out)
        policy_out = self.act_smx(policy_out1)

        ## do the loss calculation

        # calculate the policy loss (has the biggest influence)

        ohactions = F.one_hot(ts(actions.tolist()).long(), self.num_actions)
        resp_outps = torch.sum(policy_out.squeeze() * ohactions, dim=1)

        value_plus = np.asarray(pred_values.tolist() + [0.0])
        #value_plus = np.asarray(values.squeeze().tolist() + [0.0])
        und_adv = rewards + self.gamma * value_plus[1:] - value_plus[:-1]
        advantages = discount(und_adv, self.gamma)

        policy_loss = -torch.sum(
            torch.log(resp_outps + 1e-7) * ts(advantages.copy()))

        # calculate the value loss
        # compute the targets for the value head
        rewards_plus = np.asarray(rewards.tolist() + [0.0])
        # equals target_v; these are our targets, because they are the only ground-truth values we have
        disc_cumm_future_rewards = discount(rewards_plus, self.gamma)[:-1]

        # we have to copy the numpy array so that consecutive items also sit in
        # consecutive memory locations, which is required for the conversion into a tensor
        diff = ts(disc_cumm_future_rewards.copy()) - values.squeeze()
        value_loss = 0.5 * torch.sum(diff * diff)

        # calculate the entropy loss
        # how certain is the network of its own decision
        entropy_loss = -torch.sum(policy_out * torch.log(policy_out + 1e-7))

        # combine it all into one loss
        loss = 0.05 * value_loss + policy_loss - 0.05 * entropy_loss

        # reset the gradient
        self.optimizer.zero_grad()

        # calculate the gradient
        #loss.backward(retain_graph=True);
        loss.backward()

        # make sure the gradient is not too big
        torch.nn.utils.clip_grad_norm_(self.parameters(), 999.0)

        ### Here do all the bookkeeping
        # gradient will be applied afterwards
        self.wr.add_scalars(
            'losses', {
                'loss': loss,
                'val_loss': value_loss,
                'pol_loss': policy_loss,
                'ent_loss': entropy_loss
            }, t)

        self.wr.add_scalar('sum_rewards', rewards.sum(), t)

        # plot the parameters before the gradients have been applied
        self.wr.add_scalars(
            'ValueLayerParams',
            get_tb_dir_for_tensor_param_stats(self.value_outp_layer), t)
        self.wr.add_scalars(
            'PolcyLayerParams',
            get_tb_dir_for_tensor_param_stats(self.action_outp_layer), t)
        self.wr.add_scalars('LSTMNLayerParams',
                            get_tb_dir_for_tensor_param_stats(self.lstm), t)

        self.wr.add_scalars(
            'ValueLayerCGrads',
            get_tb_dir_for_tensor_param_stats(self.value_outp_layer,
                                              grad=True), t)
        self.wr.add_scalars(
            'PolcyLayerCGrads',
            get_tb_dir_for_tensor_param_stats(self.action_outp_layer,
                                              grad=True), t)
        self.wr.add_scalars(
            'LSTMNLayerCGrads',
            get_tb_dir_for_tensor_param_stats(self.lstm, grad=True), t)

        # apply the gradient
        self.optimizer.step()

        return {'loss': loss.item(), 'acc_ep_reward': rewards.sum().item()}
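# The loss above relies on a discount() helper that is not part of the snippet.
# A minimal sketch, assuming it computes the usual discounted cumulative sum
# (the author's version is likely scipy.signal.lfilter based, as in common A3C
# reference code, which returns a reversed view and would explain the .copy()
# calls before converting to a tensor):

import numpy as np


def discount(x, gamma):
    # out[i] = sum_k gamma**k * x[i + k]
    out = np.zeros(len(x))
    running = 0.0
    for i in reversed(range(len(x))):
        running = x[i] + gamma * running
        out[i] = running
    return out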
# t_data3 = t_data[int(2*n/3):]
# x_data3 = 6                   + np.multiply(t_data3**2/5, np.random.standard_normal(t_data3.shape))
# y_data3 = 4 * t_data3         + np.multiply(t_data3**2/5, np.random.standard_normal(t_data3.shape))
# label3 = np.concatenate((x_data3, y_data3), axis=1)

x_data = np.concatenate((x_data1,x_data2), axis=0)
y_data = np.concatenate((y_data1,y_data2), axis=0)
labels = np.concatenate((label1,label2), axis=0)
# print(labels)
# plt.scatter(x_data1, y_data1, marker='.')
# plt.scatter(x_data2, y_data2, marker='.')
# plt.scatter(x_data3, y_data3, marker='.')
# plt.show()
# sys.exit()

train_set = list(zip(ts(t_data), ts(labels)))
random.shuffle(train_set)
# train_set = train_set[0]
# print(train_set)
# for minibatch, label in train_set:
#     print(label)
#     sys.exit()
print("Data prepared.")


# initialize the model
model = nn.Sequential(
    nn.Linear(1, 6),
    nn.ReLU(),
    nn.Linear(6, 32),
    nn.ReLU(),
Example #4
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@Time :    2021/1/28 9:48
@Author:  user
'''

import torch
from torch import tensor as ts  # torch.tensor is a function; "import torch.tensor as ts" relied on a since-removed submodule

# a scalar

a = ts(42.)
print(a.dim(), a.item(), 2 * a)

# v vector

v = ts([1, 2, 3, 4, 3, 2, 1])
print(v.dim(), v.size(), v)

# m matrix

m = ts([[1, 2, 3], [2, 3, 4], [3, 4, 5], [5, 6, 7]])
print(m.dim(), m.size(), m)
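# The RL snippets above build their per-trial network input with the same ts
# alias plus F.one_hot. A small illustration (num_actions=2, num_states=2 are
# assumptions here, giving input_size = 2 + 2 + 1 + 1 = 6):

import torch
import torch.nn.functional as F

num_actions, num_states = 2, 2
prev_action, prev_state, prev_reward, timestep = 1, 0, 1, 7

cinput = torch.cat((F.one_hot(ts(prev_action), num_actions),
                    F.one_hot(ts(prev_state), num_states), ts([prev_reward]),
                    ts([timestep])), 0).float()
print(cinput)  # tensor([0., 1., 1., 0., 1., 7.])
print(cinput.view(1, 1, -1).shape)  # (seq_len=1, batch=1, input_size=6)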
    def calc_loss_and_update_weights(self, epbuffer=None, t=0):

        # run the entire set of 200 trials again, this time as one batch
        # instead of one by one, using exactly the same data & actions as before
        # this gives a cleaner way to backpropagate, because the whole episode
        # can be fed to the network as a single input matrix

        # by default use the internal buffer; it can also be passed in
        # explicitly to make the code more readable
        if epbuffer is None: epbuffer = self.epbuffer

        epbuffer = np.array(epbuffer)

        ## prepare the input

        actions = epbuffer[:, 0]  # based on the policy head output of the A2C
        reached_states = epbuffer[:, 1]
        # the casts may be necessary because the array can end up with dtype
        # object, which cannot be converted to tensors directly
        # (otherwise they could be left out)
        rewards = epbuffer[:, 2].astype(np.int64)
        timesteps = epbuffer[:, 3].astype(np.int64)
        pred_values = epbuffer[:, 4]  # based on the value head output of the A2C

        prev_actions = [0] + actions[:-1].tolist()  # previously conducted actions
        prev_reached_states = [0] + reached_states[:-1].tolist()  # states reached by those actions
        prev_rewards = [0] + rewards[:-1].tolist()  # rewards received in those previous states

        # network needs tensors as input
        ohprev_actions = F.one_hot(ts(prev_actions).long(),
                                   self.num_actions).long()
        ohprev_reached_states = F.one_hot(
            ts(prev_reached_states).long(), self.num_states).long()
        prev_rewards = ts(prev_rewards).view(len(epbuffer), 1)
        timesteps_ts = ts(timesteps.tolist()).view(len(epbuffer), 1)

        #prev_reached_states = ts(prev_reached_states.tolist()).view(len(epbuffer),2)

        # merge them all horizontally (i.e. one array row contains one trial)
        cinput = torch.cat((ohprev_actions, ohprev_reached_states,
                            prev_rewards, timesteps_ts), 1)

        # reshape into the expected input shape, i.e. add another dimension for
        # the episode id; we only process a single episode of 200 trials
        # [trials per episode ~200, number of episodes ~1, input size ~ action+state+reward+timestep]
        cinput = cinput.float().view(len(epbuffer), 1, self.input_size)

        ## run the network

        # initialize the recurrent state of the LSTM; start from zero, as at the beginning of each episode (~200 trials)
        cx = torch.zeros(1, self.num_rnn_units).view(1, 1, -1)
        hx = torch.zeros(1, self.num_rnn_units).view(1, 1, -1)
        # the batched forward pass from the previous version, disabled here:
        '''
    # feed the input into the LSTM nodes and get the output
    out, (hx, cx) = self.lstm(cinput, (hx, cx))
    
    # two heads for the A2C algorithm (actor critic); 
    # feed in the output gathered from the hidden nodes of the LSTM
    values = self.value_outp_layer(out)
    policy_out1 = self.action_outp_layer(out)
    policy_out = self.act_smx(policy_out1)
    '''

        ## do the loss calculation

        ### 2nd, newer version, following:
        # https://github.com/ikostrikov/pytorch-a3c/blob/master/train.py

        values = self.epb_values  # already 201 elements
        log_probs = self.epb_logprb
        entropies = self.epb_entrop

        #R = torch.zeros(1, 1) # or the last value, just in case
        R = torch.tensor(0)  # or the last value, just in case
        self.epb_values[200].copy_(
            R)  # override the last buffered (bootstrap) value with zero (the 201st element)
        # ... or alternatively override R with that value instead

        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)

        self.gae_lambda = 1
        self.entropy_coef = 0.05  # could also be set to 0.01 as in the original
        self.value_loss_coef = 0.05  # 0.5

        #self.gamma = 0.99?

        for i in reversed(range(len(rewards))):  # from i = 199 -> 0
            R = self.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + self.gamma * values[i + 1] - values[
                i]  # here is why we need 201 elements in values
            gae = gae * self.gamma * self.gae_lambda + delta_t

            policy_loss = policy_loss - log_probs[i] * gae.detach(
            ) - self.entropy_coef * entropies[i]

        # reset the gradient
        self.optimizer.zero_grad()

        loss = policy_loss + self.value_loss_coef * value_loss

        # calculate the gradient
        #loss.backward(retain_graph=True);
        loss.backward()

        # make sure the gradient is not too big
        torch.nn.utils.clip_grad_norm_(self.parameters(), 999.0)

        ### Here do all the bookkeeping
        # gradient will be applied afterwards
        self.wr.add_scalars(
            'losses', {
                'loss': loss,
                'val_loss': value_loss,
                'pol_loss': policy_loss,
                'ep_entropy': entropies.sum()
            }, t)

        self.wr.add_scalar('sum_rewards', rewards.sum(), t)

        # plot the parameters before the gradients have been applied
        self.wr.add_scalars(
            'ValueLayerParams',
            get_tb_dir_for_tensor_param_stats(self.value_outp_layer), t)
        self.wr.add_scalars(
            'PolcyLayerParams',
            get_tb_dir_for_tensor_param_stats(self.action_outp_layer), t)
        self.wr.add_scalars('LSTMNLayerParams',
                            get_tb_dir_for_tensor_param_stats(self.lstm), t)

        self.wr.add_scalars(
            'ValueLayerCGrads',
            get_tb_dir_for_tensor_param_stats(self.value_outp_layer,
                                              grad=True), t)
        self.wr.add_scalars(
            'PolcyLayerCGrads',
            get_tb_dir_for_tensor_param_stats(self.action_outp_layer,
                                              grad=True), t)
        self.wr.add_scalars(
            'LSTMNLayerCGrads',
            get_tb_dir_for_tensor_param_stats(self.lstm, grad=True), t)

        # apply the gradient
        self.optimizer.step()

        return {'loss': loss.item(), 'acc_ep_reward': rewards.sum().item()}
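# The reversed loop above follows the GAE recursion from the linked pytorch-a3c
# train.py: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) and
# gae_t = gamma * lambda * gae_{t+1} + delta_t. A standalone sketch on plain
# Python lists (the gamma / gae_lambda defaults here are only illustrative):


def gae_advantages(rewards, values, gamma=0.9, gae_lambda=1.0):
    # values must hold one more entry than rewards: the bootstrap value V(s_N)
    assert len(values) == len(rewards) + 1
    advantages = [0.0] * len(rewards)
    gae = 0.0
    for i in reversed(range(len(rewards))):
        delta_t = rewards[i] + gamma * values[i + 1] - values[i]
        gae = gamma * gae_lambda * gae + delta_t
        advantages[i] = gae
    return advantages


# e.g. gae_advantages([0, 1, 0], [0.2, 0.5, 0.4, 0.0])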
    def run_x_times(self, number_of_feedforward_steps, taskenv):

        # the task never really terminates on its own: one feedforward step
        # corresponds to one trial, so instead we define an episode as a
        # fixed number of draws:

        # number_of_feedforward_steps ~ our batch size / episode length

        oh_prev_action = F.one_hot(ts(0), self.num_actions)
        oh_prev_reached_state = F.one_hot(ts(0), self.num_states)
        prev_receivd_rewrd = ts([0])

        # initialize the hidden and cell states / recurrent input to zeros
        cx = torch.zeros(1, self.num_rnn_units).view(1, 1, -1)
        hx = torch.zeros(1, self.num_rnn_units).view(1, 1, -1)

        #one batch ~ one episode of 200 trials, will be saved internally
        self.epb_values = torch.zeros(number_of_feedforward_steps + 1)
        self.epb_entrop = torch.zeros(number_of_feedforward_steps)
        self.epb_logprb = torch.zeros(number_of_feedforward_steps)

        self.epbuffer = []

        taskenv.reset()

        for i in range(number_of_feedforward_steps):
            # i also serves as our timestep variable

            cinput = torch.cat(
                (oh_prev_action, oh_prev_reached_state, prev_receivd_rewrd,
                 ts([i])), 0).float().view(1, 1, self.input_size)

            # run the lstm on a single trial
            out, (hx, cx) = self.lstm(cinput, (hx, cx))

            # the LSTM output is fed into two different heads:
            # the estimated value of the current state
            # ~ roughly a cumulative estimate of the future reward
            value = self.value_outp_layer(out)
            policy_out = self.action_outp_layer(out)

            # draw action from the last and only action distribution
            policy_smx = self.act_smx(policy_out)
            policy_distrib = policy_smx.contiguous()
            act_distr = torch.distributions.Categorical(
                policy_distrib.view(-1, self.num_actions)[-1])
            act = act_distr.sample()

            # mean entropy of our action distribution (not needed here)
            # acc_entropy = acc_entropy+act_distr.entropy().mean()

            # execute action in the task_environment
            [reached_state, reward] = taskenv.conduct_action(act.item())
            # out: reached state is either 0 or 1; reward also either 0 or 1

            # do not keep any gradient-relevant info, i.e. do not save any tensors
            # save as: action taken -> state reached by that action -> reward received in that state -> timestep i -> value predicted for that action
            #self.epbuffer.append([ act.item(), reached_state, reward, i, value.squeeze().clone().detach(), act_distr.entropy().mean().clone().detach(), act_distr.log_prob(act).clone().detach()])
            self.epbuffer.append(
                [act.item(), reached_state, reward, i,
                 value.item()])

            #prob = F.softmax(policy_out, dim=-1)
            #print("Pobs: ", prob, "\t", policy_smx, "\t", policy_distrib)

            log_policy_smx = F.log_softmax(
                policy_out, dim=-1)  # log of the softmax probabilities
            #log_prob_a = log_policy_smx.squeeze().gather(0, act)
            #print("log_probn: ", log_prob_a, "old: ", act_distr.log_prob(act))

            entropy = -(log_policy_smx * policy_smx).squeeze().sum()
            #print("e: ", log_probn * prob)
            #print("e: ", entropy)

            self.epb_values[i] = value.squeeze()
            self.epb_entrop[i].copy_(entropy)
            self.epb_logprb[i].copy_(act_distr.log_prob(act))

            # alternative: (as in the original)
            # self.epb_entrop = []
            # self.epb_entrop.append(entropy)

            #print("Is that fine?")
            #dadaeee = input("lala: ")

            #self.a2c_ts_out_buffer.append(SavedAction(act_distr.log_prob(act), value))

            # prepare vars for the next trial
            oh_prev_reached_state = F.one_hot(ts(reached_state),
                                              self.num_states)
            oh_prev_action = F.one_hot(act, self.num_actions)
            prev_receivd_rewrd = ts([reward])

        # end of for number of feedforward steps

        # as in the reference implementation: since the episode is never "done",
        # run the network once more and take its value estimate as a bootstrap
        # (reference: value, _, _ = model((state.unsqueeze(0), (hx, cx))); R = value.detach())
        # here it is stored as the extra (N+1)-th element of self.epb_values (a values_plus)

        cinput = torch.cat(
            (oh_prev_action, oh_prev_reached_state, prev_receivd_rewrd,
             ts([number_of_feedforward_steps])),
            0).float().view(1, 1, self.input_size)
        out, (hx, cx) = self.lstm(cinput, (hx, cx))

        self.final_value = self.value_outp_layer(out).squeeze().detach()
        # just in case
        self.epb_values[number_of_feedforward_steps] = self.final_value

        return self.epbuffer
    def calc_loss_and_update_weights(self, epbuffer=None, t=0):

        # run the entire set of 200 trials again, this time as one batch
        # instead of one by one, using exactly the same data & actions as before
        # this gives a cleaner way to backpropagate, because the whole episode
        # can be fed to the network as a single input matrix

        # by default use the internal buffer; it can also be passed in
        # explicitly to make the code more readable
        if epbuffer is None: epbuffer = self.epbuffer

        epbuffer = np.array(epbuffer)

        ## prepare the input

        actions = epbuffer[:, 0]  # based on the policy head output of the A2C
        reached_states = epbuffer[:, 1]
        # the casts may be necessary because the array can end up with dtype
        # object, which cannot be converted to tensors directly
        # (otherwise they could be left out)
        rewards = epbuffer[:, 2].astype(np.int64)
        timesteps = epbuffer[:, 3].astype(np.int64)
        pred_values = epbuffer[:, 4]  # based on the value head output of the A2C

        # get the buffered network outputs with the gradient info preserved
        # (in contrast to epbuffer, which only stores detached values)
        values = self.epb_values
        policy_out = self.epb_policy

        ## do the loss calculation

        # calculate the policy loss (has the biggest influence)
        ohactions = F.one_hot(ts(actions.tolist()).long(), self.num_actions)
        resp_outps = torch.sum(policy_out.squeeze() * ohactions, dim=1)

        value_plus = np.asarray(pred_values.tolist() + [0.0])
        #value_plus = np.asarray(values.squeeze().tolist() + [0.0])
        und_adv = rewards + self.gamma * value_plus[1:] - value_plus[:-1]
        advantages = discount(und_adv, self.gamma)

        policy_loss = -torch.sum(
            torch.log(resp_outps + 1e-7) * ts(advantages.copy()))

        # calculate the value loss
        # compute the targets for the value head
        rewards_plus = np.asarray(rewards.tolist() + [0.0])
        # equals target_v; these are our targets, because they are the only ground-truth values we have
        disc_cumm_future_rewards = discount(rewards_plus, self.gamma)[:-1]

        # we have to copy the numpy array so that consecutive items also sit in
        # consecutive memory locations, which is required for the conversion into a tensor
        diff = ts(disc_cumm_future_rewards.copy()) - values.squeeze()
        value_loss = 0.5 * torch.sum(diff * diff)

        # calculate the entropy loss
        # how certain is the network of its own decision
        entropy_loss = -torch.sum(policy_out * torch.log(policy_out + 1e-7))

        # combine it all into one loss
        loss = 0.05 * value_loss + policy_loss - 0.05 * entropy_loss

        # reset the gradient
        self.optimizer.zero_grad()

        # calculate the gradient
        #loss.backward(retain_graph=True);
        loss.backward()

        # make sure the gradient is not too big
        torch.nn.utils.clip_grad_norm_(self.parameters(), 999.0)

        ### Here do all the bookkeeping
        # gradient will be applied afterwards
        self.wr.add_scalars(
            'losses', {
                'loss': loss,
                'val_loss': value_loss,
                'pol_loss': policy_loss,
                'ent_loss': entropy_loss
            }, t)

        self.wr.add_scalar('sum_rewards', rewards.sum(), t)

        # plot the parameters before the gradients have been applied
        self.wr.add_scalars(
            'ValueLayerParams',
            get_tb_dir_for_tensor_param_stats(self.value_outp_layer), t)
        self.wr.add_scalars(
            'PolcyLayerParams',
            get_tb_dir_for_tensor_param_stats(self.action_outp_layer), t)
        self.wr.add_scalars('LSTMNLayerParams',
                            get_tb_dir_for_tensor_param_stats(self.lstm), t)

        self.wr.add_scalars(
            'ValueLayerCGrads',
            get_tb_dir_for_tensor_param_stats(self.value_outp_layer,
                                              grad=True), t)
        self.wr.add_scalars(
            'PolcyLayerCGrads',
            get_tb_dir_for_tensor_param_stats(self.action_outp_layer,
                                              grad=True), t)
        self.wr.add_scalars(
            'LSTMNLayerCGrads',
            get_tb_dir_for_tensor_param_stats(self.lstm, grad=True), t)

        # apply the gradient
        self.optimizer.step()

        return {'loss': loss.item(), 'acc_ep_reward': rewards.sum().item()}
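# A rough sketch of how the two methods above might be driven from an outer
# training loop. The agent class name, its constructor and the episode count
# are assumptions and not part of the snippets:
#
#   agent = A2CLstmAgent(...)      # whatever class the methods above belong to
#   env = TwoArmedBanditEnv()      # any object with reset() / conduct_action()
#
#   for episode in range(10000):
#       epbuffer = agent.run_x_times(200, env)   # collect one episode of 200 trials
#       stats = agent.calc_loss_and_update_weights(epbuffer, t=episode)
#       if episode % 100 == 0:
#           print(episode, stats['loss'], stats['acc_ep_reward'])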