# NOTE(review): this print looks like the tail of a definition that begins
# before this chunk (it reads `state` and `Q`, neither of which is assigned
# here); its original indentation is not recoverable -- confirm.
print "State: {}; Q: {}".format(state[0], Q)

# ---------------------------------------------------------------------------
# Training-run setup.
# ---------------------------------------------------------------------------
# Accumulator list; presumably receives one score per episode -- the append
# is not visible in this chunk, TODO confirm.
total_score_l = []
# `model.sample()` is re-invoked every `sample_period` steps within an episode.
sample_period = 5
# Number of episodes the outer loop runs.
num_episodes = 50000
# Value of model.get_sigma_l() captured before any episode has run.
starting_uncertainty = model.get_sigma_l()
# Maps a key to a list; keys are presumably the labels in `components`
# -- TODO confirm against the code that fills it.
sigma_average_dict = defaultdict(list)
# Human-readable labels for six parameter groups (weights `W` and biases `b`
# of two hidden layers and the output layer).
components = ['W: first hidden', 'b: first hidden', 'W: second hidden', 'b: second hidden', \
              'W: output','b: output']

for i_episode in range(num_episodes):
    # Initialize the environment and state
    env.reset()
    # unsqueeze(0) adds a leading dimension to the state tensor.
    state = Tensor(env.get_state()).unsqueeze(0)
    # Running sum of the rewards collected in this episode.
    score = 0
    # Each episode is capped at 500 steps.
    for t in xrange(500):
        # Select and perform an action
        if t % sample_period == 0:
            # Draw a fresh sample from the model on steps 0, 5, 10, ...
            # NOTE(review): `w_sample` is not passed to select_action() here;
            # presumably it is consumed elsewhere (e.g. as a global) -- confirm.
            w_sample = model.sample()
        action = select_action(state)
        # The element at [0, 0] of `action` is what the environment receives.
        reward, done = env.do_action(action[0, 0])
        score += reward
        # Re-wrap the reward as a one-element Tensor (after it has been added
        # to `score`).
        reward = Tensor([reward])

        # Observe new state
        if not done:
            next_state = Tensor(env.get_state()).unsqueeze(0)
        else:  # `done` is true here; the body of this branch continues past this chunk