Example #1
0
            print "State: {}; Q: {}".format(state[0], Q)


# --- Run configuration ------------------------------------------------------
# Number of steps between fresh draws from model.sample() in the step loop.
sample_period = 5
# Number of episodes the outer training loop iterates over.
num_episodes = 50000

# --- Bookkeeping containers -------------------------------------------------
# Presumably collects per-episode scores; the code that fills it is not in
# this section -- TODO confirm.
total_score_l = []
# Maps a key to a list that is created on first access.
# NOTE(review): keys look like they are meant to be the labels in
# `components` below -- confirm against the code that populates it.
sigma_average_dict = defaultdict(list)

# Whatever model.get_sigma_l() reports before any episode has run; kept as
# the starting reference. Presumably one sigma entry per parameter group --
# verify against the model definition.
starting_uncertainty = model.get_sigma_l()

# Human-readable labels for six parameter groups (weights "W" and biases "b"
# of two hidden layers and the output layer, going by the label text).
components = [
    "W: first hidden",
    "b: first hidden",
    "W: second hidden",
    "b: second hidden",
    "W: output",
    "b: output",
]

for i_episode in range(num_episodes):
    # Initialize the environment and state
    env.reset()
    state = Tensor(env.get_state()).unsqueeze(0)
    score = 0

    for t in xrange(500):
        # Select and perform an action
        if t % sample_period == 0:
            w_sample = model.sample()
        action = select_action(state)
        reward, done = env.do_action(action[0, 0])
        score += reward
        reward = Tensor([reward])

        # Observe new state
        if not done:
            next_state = Tensor(env.get_state()).unsqueeze(0)
        else: