# (body of the per-frame training loop)
state = next_state
episode_reward += reward

if done:
    # Episode finished: derive k from the final episode return and reset the env
    args_k = k_by_reward(episode_reward)
    state = env.reset()
    all_rewards.append(episode_reward)
    episode_reward = 0

if len(replay_buffer) > batch_size:
    loss = improved_td_loss(batch_size, replay_buffer, current_model,
                            target_model, gamma, args_k, optimizer)
    losses.append(loss.item())

if frame_idx % 1000 == 0:
    # Periodically sync the target network and report progress
    update_target(current_model, target_model)
    print("Frame %d of 1000000, k_end %f, reward is %f"
          % (frame_idx, k_end * 2., all_rewards[-1]))

# (after the training loop, once per run) collect this run's curves and save
# the mean/variance statistics aggregated across runs
losses_all.append(losses)
rewards_all.append(all_rewards)
save_path = "data/analysis/CartPolerl1_kend%f" % (k_end * 2.)
mean_losses, var_losses = tl.StatShrink2D(losses_all)
mean_rewards, var_rewards = tl.StatShrink2D(rewards_all)
tl.save2D4list(mean_losses, var_losses, mean_rewards, var_rewards, save_path)
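These loops only rely on three operations of the replay buffer: push, sample (inside the loss functions), and len. A minimal FIFO buffer with that interface is sketched below as an illustration; this is an assumption about the required interface, not necessarily how tl.BaseReplayBuffer is implemented.

import random
from collections import deque

import numpy as np


class SimpleReplayBuffer:
    """Minimal FIFO experience replay with the push/sample/__len__ interface
    assumed by the training loops (illustrative sketch only)."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        # Store one transition; the oldest transitions are dropped once
        # capacity is reached.
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniformly sample a mini-batch and stack each field into an array.
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.array, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)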
# (body of the per-frame training loop)
next_state, reward, done, _ = env.step(action)
replay_buffer.push(state, action, reward, next_state, done)
state = next_state
episode_reward += reward

if done:
    state = env.reset()
    all_rewards.append(episode_reward)
    episode_reward = 0

if len(replay_buffer) > batch_size:
    loss = compute_td_loss1(batch_size, replay_buffer, current_model,
                            target_model, gamma, optimizer)
    losses.append(loss.item())

if frame_idx % 1000 == 0:
    # Periodically sync the target network and report progress
    update_target(current_model, target_model)
    print("Frame %d of 1000000, range %d, reward is %f"
          % (frame_idx, 1, all_rewards[-1]))

# (after the training loop, once per run) collect this run's curves and save
# the mean/variance statistics aggregated across runs
losses_all.append(losses)
rewards_all.append(all_rewards)
mean_losses, var_losses = tl.StatShrink2D(losses_all)
mean_rewards, var_rewards = tl.StatShrink2D(rewards_all)
tl.save2D4list(mean_losses, var_losses, mean_rewards, var_rewards,
               "data/noisydqn_CartPole_five1.npz")
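update_target is called in every listing but its body is not shown; assuming the usual hard target-network synchronisation used in DQN, it would amount to copying the online network's parameters into the target network:

def update_target(current_model, target_model):
    # Hard update: overwrite the target network's weights with the online
    # network's weights (assumed implementation; only the call sites appear
    # in these listings).
    target_model.load_state_dict(current_model.state_dict())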
# Initialise the replay buffer and synchronise the target network once before training
replay_buffer = tl.BaseReplayBuffer(capacity)
update_target(current_model, target_model)

for frame_idx in range(1, num_frames + 1):
    args_k = 0.
    action = current_model.act(state)
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)
    state = next_state
    episode_reward += reward

    if done:
        # Episode finished: derive k from the final episode return and reset the env
        args_k = k_by_reward(episode_reward)
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0

    if len(replay_buffer) > replay_initial:
        loss = improved_td_loss(batch_size, replay_buffer, current_model,
                                target_model, gamma, args_k, optimizer)
        losses.append(loss.item())

    if frame_idx % updatefrc == 0:
        update_target(current_model, target_model)

    if frame_idx % 2000 == 0:
        print("Frame %d of 1000000, k_end %d, reward is %f"
              % (frame_idx, k_end, all_rewards[-1]))

# Save this run's per-update losses and per-episode rewards
save_dir = "data/analysis/rl1_kend%d" % (k_end * 2)
tl.save2D4list(losses, [], all_rewards, [], save_dir)
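For reference, a plain one-step DQN TD loss with a target network is sketched below. How improved_td_loss folds the episode-dependent args_k into the update is not shown in these listings, so this is only the standard baseline form it presumably extends, not the method itself.

import numpy as np
import torch
import torch.nn.functional as F


def plain_td_loss(batch_size, replay_buffer, current_model, target_model,
                  gamma, optimizer):
    # Baseline one-step TD loss (illustrative sketch; the improved_td_loss
    # used above additionally takes args_k, whose role is not shown here).
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state = torch.as_tensor(np.asarray(state), dtype=torch.float32)
    next_state = torch.as_tensor(np.asarray(next_state), dtype=torch.float32)
    action = torch.as_tensor(action, dtype=torch.int64)
    reward = torch.as_tensor(reward, dtype=torch.float32)
    done = torch.as_tensor(done, dtype=torch.float32)

    # Q(s, a) for the taken actions, and the bootstrapped one-step target.
    q_value = current_model(state).gather(1, action.unsqueeze(1)).squeeze(1)
    next_q_value = target_model(next_state).max(1)[0]
    target = reward + gamma * next_q_value * (1.0 - done)

    loss = F.mse_loss(q_value, target.detach())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss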
# (body of the per-frame training loop)
next_state, reward, done, _ = env.step(action)
replay_buffer.push(state, action, reward, next_state, done)
state = next_state
episode_reward += reward

if done:
    # Episode finished: derive k from the final episode return and reset the env
    args_k = k_by_reward(episode_reward)
    state = env.reset()
    all_rewards.append(episode_reward)
    episode_reward = 0

if len(replay_buffer) > replay_initial:
    loss = improved_td_loss(batch_size, replay_buffer, current_model,
                            target_model, gamma, args_k, optimizer)
    losses.append(loss.item())

if frame_idx % updatefrc == 0:
    update_target(current_model, target_model)

if frame_idx % 2000 == 0:
    print("Frame %d of 1000000, k_end %d, reward is %f"
          % (frame_idx, k_end * 2, all_rewards[-1]))

# (after the training loop, once per run) save this run's raw curves, then the
# mean/variance statistics aggregated across runs
save_dir = "data/analysis/rl05_kend%d" % (k_end * 2)
tl.save2D4list(losses, [], all_rewards, [], save_dir)
mean_losses, var_losses = tl.StatShrink2D(losses_all)
mean_rewards, var_rewards = tl.StatShrink2D(rewards_all)
# tl.save2D4list2(mean_losses, var_losses, mean_rewards, var_rewards,
#                 weight_sigmas, bais_sigmas, frame_list, save_dir)
tl.save2D4list(mean_losses, var_losses, mean_rewards, var_rewards, save_dir)
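The aggregation step apparently relies on tl.StatShrink2D collapsing a list of per-run curves into a per-step mean and variance. Under that assumption, and further assuming the runs are truncated to the shortest length so they can be stacked, a minimal sketch of such a helper could look like this; the actual tl implementation may differ.

import numpy as np


def stat_shrink_2d(curves):
    # Collapse a list of per-run curves (possibly of unequal length) into a
    # per-step mean and variance, truncating every run to the shortest one.
    # Assumed behaviour of tl.StatShrink2D; illustrative sketch only.
    min_len = min(len(c) for c in curves)
    stacked = np.array([c[:min_len] for c in curves], dtype=np.float64)
    return stacked.mean(axis=0), stacked.var(axis=0)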