def eval(cfg, saved_model_path=SAVED_MODEL_PATH):
    print('Start to eval!\n')
    env = NormalizedActions(gym.make("Pendulum-v0"))
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    agent = DDPG(n_states, n_actions, critic_lr=1e-3, actor_lr=1e-4, gamma=0.99,
                 soft_tau=1e-2, memory_capacity=100000, batch_size=128)
    agent.load_model(saved_model_path + 'checkpoint.pth')
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(os.path.abspath(__file__))[0] + "/logs/eval/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.eval_eps + 1):
        state = env.reset()  # reset the environment state
        ep_reward = 0
        for i_step in range(1, cfg.eval_steps + 1):
            action = agent.select_action(state)  # select an action for the current state
            next_state, reward, done, _ = env.step(action)  # step the environment
            ep_reward += reward
            state = next_state  # move on to the next state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward), 'n_steps:', i_step, 'done: ', done)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        # moving-average (smoothed) reward
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(0.9 * moving_average_rewards[-1] + 0.1 * ep_reward)
        writer.add_scalars('rewards', {'raw': rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
        writer.add_scalar('steps_of_each_episode', ep_steps[-1], i_episode)
    writer.close()
    '''Save rewards and related results'''
    if not os.path.exists(RESULT_PATH):  # create the result folder if it does not exist
        os.mkdir(RESULT_PATH)
    np.save(RESULT_PATH + 'rewards_eval.npy', rewards)
    np.save(RESULT_PATH + 'moving_average_rewards_eval.npy', moving_average_rewards)
    np.save(RESULT_PATH + 'steps_eval.npy', ep_steps)
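
# The eval/train scripts above wrap Pendulum-v0 in a NormalizedActions wrapper that is
# not shown here. Below is a minimal sketch of such a wrapper, assuming the common
# gym.ActionWrapper pattern of mapping agent actions in [-1, 1] to the env's true bounds;
# the project's actual wrapper may differ.
import gym
import numpy as np


class NormalizedActions(gym.ActionWrapper):
    """Rescale actions between [-1, 1] and the environment's [low, high] (assumed)."""

    def action(self, action):
        low, high = self.action_space.low, self.action_space.high
        action = low + (action + 1.0) * 0.5 * (high - low)  # [-1, 1] -> [low, high]
        return np.clip(action, low, high)

    def reverse_action(self, action):
        low, high = self.action_space.low, self.action_space.high
        action = 2.0 * (action - low) / (high - low) - 1.0  # [low, high] -> [-1, 1]
        return np.clip(action, -1.0, 1.0)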
def main(): params = { 'actor_learning_rate':1e-4, 'critic_learning_rate':1e-3, 'gamma':0.99, 'tau':0.001, 'sigma':0.2, 'num_epochs':275, 'num_episodes':20, 'replay_size':1000000, 'num_train_steps':1, 'replay_init_size':1000, 'batch_size':64, 'render_train':False, 'restore':False, 'env':'Hopper-v2_kirkiles_train1step_noise_norm_bufsize1Mi1k' } agent = DDPG(params) agent.train()
def main(): params = { 'actor_learning_rate': 1e-4, 'critic_learning_rate': 1e-3, 'gamma': 0.99, 'tau': 0.001, 'sigma': 0.2, 'num_epochs': 500, 'num_episodes': 20, 'replay_size': 1000000, 'num_train_steps': 50, 'replay_init_size': 1000, 'batch_size': 64, 'render_train': False, 'restore': False, 'env': 'HalfCheetah-v2' } agent = DDPG(params) #agent.train() agent.test()
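
# The parameter dicts above pass gamma and tau straight through to the DDPG agent.
# For reference, here is a minimal PyTorch sketch of how a typical DDPG update consumes
# them (assumed network/optimizer objects; not necessarily the exact code behind the
# DDPG class used in these scripts).
import torch
import torch.nn.functional as F


def ddpg_update(actor, actor_target, critic, critic_target,
                actor_opt, critic_opt, batch, gamma=0.99, tau=0.001):
    """One DDPG gradient step; `batch` holds float tensors (s, a, r, s', done)."""
    state, action, reward, next_state, done = batch

    # Critic: regress Q(s, a) onto r + gamma * Q'(s', pi'(s'))
    with torch.no_grad():
        target_q = critic_target(next_state, actor_target(next_state))
        target_q = reward + gamma * (1.0 - done) * target_q
    critic_loss = F.mse_loss(critic(state, action), target_q)
    critic_opt.zero_grad()
    critic_loss.backward()
    critic_opt.step()

    # Actor: maximise Q(s, pi(s))
    actor_loss = -critic(state, actor(state)).mean()
    actor_opt.zero_grad()
    actor_loss.backward()
    actor_opt.step()

    # Polyak-average the target networks with coefficient tau
    for target, source in ((actor_target, actor), (critic_target, critic)):
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)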
def main(args): env = gym.make('Walker2d-v1') reward_history = [] agent = DDPG(env) agent.construct_model(args.gpu) saver = tf.train.Saver() if args.model_path is not None: # reuse saved model saver.restore(agent.sess, args.model_path) else: # build a new model agent.sess.run(tf.global_variables_initializer()) for episode in range(args.ep): # env init state = env.reset() total_rewards = 0 for step in range(env.spec.timestep_limit): env.render() action = agent.sample_action(state[np.newaxis, :], explore=False) # act next_state, reward, done, _ = env.step(action[0]) total_rewards += reward agent.store_experience(state, action, reward, next_state, done) agent.update_model() # shift state = next_state if done: break reward_history.append(total_rewards) print('Ep%d reward:%d' % (episode+1, total_rewards)) print('Average rewards: ', np.mean(reward_history))
def __init__(self, num_agents, state_size, action_size, params, seed):
    """Initialize multiple DDPG agents.

    Params
    ======
        num_agents (int): number of DDPG agents
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        params (Params): hyperparameters
        seed (int): random seed
    """
    self.agents = [
        DDPG(state_size, action_size, params, seed) for _ in range(num_agents)
    ]
    # Replay buffer shared by all agents
    self.memory = ReplayBuffer(params.buffer_size, params.batch_size, seed)
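
# The shared ReplayBuffer(params.buffer_size, params.batch_size, seed) is defined
# elsewhere in that project. A minimal sketch matching the constructor used above,
# assuming a uniform-sampling buffer of (state, action, reward, next_state, done) tuples:
import random
from collections import deque, namedtuple

import numpy as np

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    """Fixed-size uniform replay buffer (assumed interface)."""

    def __init__(self, buffer_size, batch_size, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)
        # transpose the list of experiences into arrays of states, actions, ...
        return tuple(np.array(field) for field in zip(*batch))

    def __len__(self):
        return len(self.memory)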
def main():
    RENDER = False
    env = gym.make(ENV_NAME)
    env = env.unwrapped
    env.seed(1)
    # get environment parameters
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_high = env.action_space.high
    action_low = env.action_space.low
    ddpg = DDPG(state_dim, action_dim, action_high, MODEL)
    var = 3  # exploration noise scale
    for episode in range(EPISODES):
        ep_r = 0
        state = env.reset()
        for step in range(STEPS):
            if RENDER:
                env.render()
            action = ddpg.action_choose(state)
            action = np.clip(np.random.normal(action, var), action_low, action_high)
            state_, reward, done, info = env.step(action)
            ddpg.store_transitions(state, action, reward / 10, state_)
            if ddpg.pointer > MEMORY_CAPACITY:
                var *= 0.9995  # decay exploration noise once the buffer is full
                ddpg.learn()
            state = state_
            ep_r += reward
            if step == STEPS - 1:
                print('Episode:', episode, 'Average reward:', ep_r, "explore:", var)
                if ep_r > -300:
                    RENDER = True
                break
    if MODEL == 'train':
        torch.save(ddpg.actor_eval, 'actor_eval.pkl')
        torch.save(ddpg.actor_target, 'actor_target.pkl')
        torch.save(ddpg.critic_eval, 'critic_eval.pkl')
        torch.save(ddpg.critic_target, 'critic_target.pkl')
    # writer.add_graph(ddpg.actor_eval, state)
    # writer.close()
    env.close()
def main(args): env = gym.make('Walker2d-v1') env = wrappers.Monitor(env, './videos/', force=True) reward_history = [] agent = DDPG(env, args) agent.construct_model(args.gpu) saver = tf.train.Saver() if args.model_path is not None: # reuse saved model saver.restore(agent.sess, args.model_path) ep_base = int(args.model_path.split('_')[-1]) best_avg_rewards = float(args.model_path.split('/')[-1].split('_')[0]) else: raise ValueError('model_path required!') for ep in range(args.ep): # env init state = env.reset() ep_rewards = 0 for step in range(env.spec.timestep_limit): env.render() action = agent.sample_action(state[np.newaxis, :], noise=False) # act next_state, reward, done, _ = env.step(action[0]) ep_rewards += reward agent.store_experience(state, action, reward, next_state, done) # shift state = next_state if done: break reward_history.append(ep_rewards) print('Ep%d reward:%d' % (ep + 1, ep_rewards)) print('Average rewards: ', np.mean(reward_history))
def arp_pred_net( AR, num_his_ar=4, funname='', net_scale=1 ): num_sbs, _ = AR.shape num_ts = 10000 action_size = num_sbs ar_size = num_sbs his_ar_size = ar_size * num_his_ar print( "Size of history ar: " + str(his_ar_size) ) arp_errors = [1] # average error in the predicted arrival rate # Randomly initialize critic, actor, target critic, target actor and load prediction network and replay buffer, in the agent agent = DDPG( his_ar_size, ar_size, action_size, TAU, is_batch_norm, write_sum, net_size_scale=net_scale ) for i in range( num_his_ar, num_ts+num_his_ar ): his_ar = np.reshape( AR[:,i-num_his_ar:i], (1, his_ar_size) , order='F' ) real_ar = AR[:,i] agent.add_experience_arp( his_ar, real_ar ) # Train ar prediction network, after many num_ts, one minibatch is enough for each step arp_error = 1 arp_train_times = min(10, max(1, int(i/ARP_BATCH_SIZE)) ) #if i<1000 else 5 lr = max(ARP_LR_MIN, agent.decay(i, ARP_LR_MIN, ARP_LR_MAX, num_his_ar, 8000, 2) ) for j in range( 0, arp_train_times ): arp_errort = agent.train_arp( lr ) #/math.log(i+2) #print('arp_errort: ' + str(arp_errort)) if arp_errort !=1: arp_error = arp_errort if arp_error !=1: arp_errors.append( math.sqrt( arp_error ) ) if i%(100) == 0: print(' i: ' + str(i) + ', arp_error: ' + str(math.sqrt( arp_error ))) return arp_errors
def main(args):
    set_random_seed(args.seed)
    env = gym.make('Walker2d-v1')
    agent = DDPG(env, args)
    agent.construct_model(args.gpu)

    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        best_avg_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        best_avg_rewards = None

    reward_history, step_history = [], []
    train_steps = 0
    for ep in range(args.max_ep):
        # env init
        state = env.reset()
        ep_rewards = 0
        for step in range(env.spec.timestep_limit):
            action = agent.sample_action(state[np.newaxis, :], noise=True)
            # act
            next_state, reward, done, _ = env.step(action[0])
            train_steps += 1
            ep_rewards += reward
            agent.store_experience(state, action, reward, next_state, done)
            agent.update_model()
            # shift
            state = next_state
            if done:
                print('Ep %d global_steps: %d Reward: %.2f' %
                      (ep + 1, agent.global_steps, ep_rewards))
                # reset ou noise
                agent.ou.reset()
                break
        step_history.append(train_steps)
        # exponential moving average of episode rewards
        if not reward_history:
            reward_history.append(ep_rewards)
        else:
            reward_history.append(reward_history[-1] * 0.99 + ep_rewards * 0.01)
        # Evaluate during training
        if ep % args.log_every == 0 and ep > 0:
            ep_rewards = 0
            for ep_eval in range(args.test_ep):
                state = env.reset()
                for step_eval in range(env.spec.timestep_limit):
                    action = agent.sample_action(state[np.newaxis, :], noise=False)
                    next_state, reward, done, _ = env.step(action[0])
                    ep_rewards += reward
                    state = next_state
                    if done:
                        break
            curr_avg_rewards = ep_rewards / args.test_ep

            # logging
            print('\n')
            print('Episode: %d' % (ep + 1))
            print('Global steps: %d' % agent.global_steps)
            print('Mean reward: %.2f' % curr_avg_rewards)
            print('\n')
            if not best_avg_rewards or (curr_avg_rewards >= best_avg_rewards):
                best_avg_rewards = curr_avg_rewards
                if not os.path.isdir(args.save_path):
                    os.makedirs(args.save_path)
                save_name = args.save_path + str(round(best_avg_rewards, 2)) \
                    + '_' + str(ep_base + ep + 1)
                saver.save(agent.sess, save_name)
                print('Model saved: %s' % save_name)
    plt.plot(step_history, reward_history)
    plt.xlabel('steps')
    plt.ylabel('running reward')
    plt.show()
def train(cfg):
    print('Start to train!\n')
    env = NormalizedActions(gym.make("Pendulum-v0"))
    # add exploration noise to the actions
    ou_noise = OUNoise(env.action_space)
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = DDPG(n_states, n_actions, device="cpu", critic_lr=1e-3, actor_lr=1e-4,
                 gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128)
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.train_eps + 1):
        state = env.reset()
        ou_noise.reset()
        ep_reward = 0
        for i_step in range(1, cfg.train_steps + 1):
            action = agent.select_action(state)
            action = ou_noise.get_action(action, i_step)  # the random process from the paper
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)
            agent.update()
            state = next_state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward), 'n_steps:', i_step)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(0.9 * moving_average_rewards[-1] + 0.1 * ep_reward)
        writer.add_scalars('rewards', {'raw': rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
        writer.add_scalar('steps_of_each_episode', ep_steps[-1], i_episode)
    writer.close()
    print('Complete training!')
    '''Save the model'''
    if not os.path.exists(SAVED_MODEL_PATH):  # create the folder if it does not exist
        os.mkdir(SAVED_MODEL_PATH)
    agent.save_model(SAVED_MODEL_PATH + 'checkpoint.pth')
    '''Save rewards and related results'''
    if not os.path.exists(RESULT_PATH):  # create the folder if it does not exist
        os.mkdir(RESULT_PATH)
    np.save(RESULT_PATH + 'rewards_train.npy', rewards)
    np.save(RESULT_PATH + 'moving_average_rewards_train.npy', moving_average_rewards)
    np.save(RESULT_PATH + 'steps_train.npy', ep_steps)
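
# OUNoise(env.action_space) and ou_noise.get_action(action, i_step) in the script above
# come from an Ornstein-Uhlenbeck noise module that is not shown. A minimal sketch with
# the same interface, assuming the widely used decayed-sigma variant; the project's own
# hyperparameters may differ.
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck action noise with linearly decayed sigma (assumed)."""

    def __init__(self, action_space, mu=0.0, theta=0.15,
                 max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        self.mu = mu
        self.theta = theta
        self.sigma = max_sigma
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.n_actions = action_space.shape[0]
        self.low = action_space.low
        self.high = action_space.high
        self.reset()

    def reset(self):
        self.state = np.ones(self.n_actions) * self.mu

    def evolve_state(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(self.n_actions)
        self.state = self.state + dx
        return self.state

    def get_action(self, action, t=0):
        noise = self.evolve_state()
        self.sigma = self.max_sigma \
            - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        return np.clip(action + noise, self.low, self.high)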
from torch.utils.tensorboard import SummaryWriter from agent import DDPG from exploration import OUActionNoise from utils import get_screen epoch = 5000 env = gym.make('Pendulum-v0') # seed np.random.seed(42) env.seed(42) torch.manual_seed(42) torch.cuda.manual_seed(42) writer = SummaryWriter(log_dir='logs/') agent = DDPG(env, writer) all_timesteps = 0 for e in range(epoch): noise = OUActionNoise(env.action_space.shape[0]) env.reset() pixel = env.render(mode='rgb_array') state = deque([get_screen(pixel) for _ in range(3)], maxlen=3) cumulative_reward = 0 for timestep in range(200): action = agent.get_action(np.array(state)[np.newaxis], noise, timestep) _, reward, done, _ = env.step(action * env.action_space.high[0]) pixel = env.render(mode='rgb_array') state_ = state.copy() state_.append(get_screen(pixel))
# Initialize policy # if args.policy == "TD3": # # Target policy smoothing is scaled wrt the action scale # kwargs["policy_noise"] = args.policy_noise * max_action # kwargs["noise_clip"] = args.noise_clip * max_action # kwargs["policy_freq"] = args.policy_freq # policy = TD3.TD3(**kwargs) if args.policy == "A2C": envs = ParaEnv(args.env, args.n_processes, args.seed) policy = A2C.A2C(env.observation_space, env.action_space, args.discount, args.tau, max_episode_timesteps) x, y = policy.run(envs, file_name, args) write_result(args.env + "_A2C.json", x, y) elif args.policy == "DDPG": policy = DDPG.DDPG(**kwargs) x, y = policy.run(env, file_name, args) write_result(args.env + "_DDPG.json", x, y) elif args.policy == "REINFORCE": args.n_steps = 5 args.n_processes = 16 envs = ParaEnv(args.env, args.n_processes, args.seed) policy = REINFORCE.REINFORCE(env.observation_space, env.action_space, args.discount, args.tau, args.n_steps, args.n_processes, max_episode_timesteps) x, y = policy.run(envs, file_name, args) write_result(args.env + "_REINFORCE.json", x, y) else: x, y = None, None
def test(agent, trial_dir, test_episode, visual_flag, submit_flag): pid = os.getpid() logger, _ = prepare_for_logging("pid_{}".format(pid), False) logger.info("trial_dir={}".format(trial_dir)) if not os.path.exists(trial_dir): logger.info("trial_dir does not exist") return # create environment env = NIPS(visualize=visual_flag) # load config with open(os.path.join(trial_dir, "config.pk"), "rb") as f: config = pickle.load(f) if agent == 'DDPG': config["scale_action"] = scale_action # observation processor if "ob_processor" not in config or config["ob_processor"] == "dummy": ob_processor = ObservationProcessor() elif config["ob_processor"] == "2ndorder": ob_processor = SecondOrderAugmentor() else: ob_processor = BodySpeedAugmentor() config["ob_aug_dim"] = ob_processor.get_aug_dim() util.print_settings(logger, config, env) # create random process oup = create_rand_process(env, config) # create replay buffer memory = create_memory(env, config) # create ddpg agent agent = DDPG(env, memory, oup, ob_processor, config) agent.build_nets(actor_hiddens=config["actor_hiddens"], scale_action=config["scale_action"], critic_hiddens=config["critic_hiddens"]) # load weights paths = {} if test_episode > 0: paths["actor"] = "actor_{}.h5".format(test_episode) paths["critic"] = "critic_{}.h5".format(test_episode) paths["target"] = "target_{}.h5".format(test_episode) else: paths["actor"] = "actor.h5" paths["critic"] = "critic.h5" paths["target"] = "target.h5" paths = {k: os.path.join(trial_dir, v) for k, v in paths.iteritems()} logger.info("Paths to models: {}".format(paths)) agent.load_models(paths) elif agent == 'TRPO': def ob_processor_maker(): if config["ob_processor"] == "normal": return ObservationProcessor() elif config["ob_processor"] == "2ndorder": return SecondOrderAugmentor() elif config['ob_processor'] == 'bodyspeed': return BodySpeedAugmentor() else: raise ValueError('invalid ob processor type') config = { "agent": 'TRPO', "batch_size": 5000, "n_envs": 16, "n_iters": 5000, "ob_processor": "bodyspeed", # "hidden_nonlinearity": "relu", # "action_nonlinearity": "tanh", # "policy_hiddens": [128, 128, 64, 64], # "baseline_hiddens": [128, 128, 64, 64], "policy_hiddens": [256, 128, 64], "baseline_hiddens": [256, 128, 64], "hidden_nonlinearity": "tanh", "action_nonlinearity": None, } agent = TRPO( env, env_maker=None, logger=logger, log_dir=None, ob_processor_maker=ob_processor_maker, policy_hiddens=config['policy_hiddens'], baseline_hiddens=config['baseline_hiddens'], hidden_nonlinearity=config['hidden_nonlinearity'], action_nonlinearity=config['action_nonlinearity'], n_envs=config['n_envs'], batch_size=config['batch_size'], n_iters=config['n_iters'], ) agent.load_models(trial_dir) else: raise ValueError('invalid agent type') if submit_flag: submit(agent, logger) else: rewards = [] for i in xrange(10): steps, reward = agent.test(max_steps=1000) logger.info("episode={}, steps={}, reward={}".format( i, steps, reward)) rewards.append(reward) logger.info("avg_reward={}".format(np.mean(rewards)))
def main(args):
    env = gym.make('Walker2d-v1')
    agent = DDPG(env)
    agent.construct_model(args.gpu)

    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0

    MAX_EPISODES = 100000
    TEST = 10
    for episode in range(MAX_EPISODES):
        # env init
        state = env.reset()
        total_rewards = 0
        for step in range(env.spec.timestep_limit):
            action = agent.sample_action(state[np.newaxis, :], explore=True)
            # act
            next_state, reward, done, _ = env.step(action[0])
            total_rewards += reward
            agent.store_experience(state, action, reward, next_state, done)
            agent.update_model()
            # shift
            state = next_state
            if done:
                print('Ep %d global_steps: %d Reward: %.2f' %
                      (episode + 1, agent.global_steps, total_rewards))
                # reset ou noise
                agent.ou.reset()
                break
        # Evaluation per 100 ep
        if episode % 100 == 0 and episode > 100:
            total_rewards = 0
            for ep_eval in range(TEST):
                state = env.reset()
                for step_eval in range(env.spec.timestep_limit):
                    action = agent.sample_action(state[np.newaxis, :], explore=False)
                    next_state, reward, done, _ = env.step(action[0])
                    total_rewards += reward
                    state = next_state
                    if done:
                        break
            mean_rewards = total_rewards / TEST

            # logging
            print('\n')
            print('Episode: %d' % (episode + 1))
            print('Global steps: %d' % agent.global_steps)
            print('Mean reward: %.2f' % mean_rewards)
            print('\n')
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)
            save_name = args.save_path \
                + str(episode) + '_' + str(round(mean_rewards, 2))
            saver.save(agent.sess, save_name)
def train_agent(args, param): """ Args: """ # create CNN convert the [1,3,84,84] to [1, 200] use_gym = False # in case seed experements args.seed = param now = datetime.now() dt_string = now.strftime("%d_%m_%Y_%H:%M:%S") #args.repeat_opt = repeat_opt torch.manual_seed(args.seed) np.random.seed(args.seed) pathname = str(args.locexp) + "/" + str(args.env_name) + '-agent-' + str( args.policy) pathname += "_batch_size_" + str(args.batch_size) pathname += '_update_freq: ' + str( args.target_update_freq) + "num_q_target_" + str( args.num_q_target) + "_seed_" + str(args.seed) pathname += "_actor_300_200" text = "Star_training target_update_freq: {} num_q_target: {} use device {} ".format( args.target_update_freq, args.num_q_target, args.device) print(pathname, text) write_into_file(pathname, text) arg_text = str(args) write_into_file(pathname, arg_text) tensorboard_name = str(args.locexp) + '/runs/' + pathname writer = SummaryWriter(tensorboard_name) if use_gym: env = gym.make(args.env_name) env.seed(args.seed) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] max_action = float(env.action_space.high[0]) args.max_episode_steps = env._max_episode_steps else: size = 84 env = suite.make( args.env_name, has_renderer=False, use_camera_obs=True, ignore_done=True, has_offscreen_renderer=True, camera_height=size, camera_width=size, render_collision_mesh=False, render_visual_mesh=True, camera_name='agentview', use_object_obs=False, camera_depth=True, reward_shaping=True, ) state_dim = 200 print("State dim, ", state_dim) action_dim = env.dof print("action_dim ", action_dim) max_action = 1 args.max_episode_steps = 200 policy = DDPG(state_dim, action_dim, max_action, args) file_name = str(args.locexp) + "/pytorch_models/{}".format(args.env_name) obs_shape = (3, 84, 84) action_shape = (action_dim, ) print("obs", obs_shape) print("act", action_shape) replay_buffer = ReplayBuffer(obs_shape, action_shape, int(args.buffer_size), args.image_pad, args.device) save_env_vid = False total_timesteps = 0 timesteps_since_eval = 0 episode_num = 0 done = True t0 = time.time() scores_window = deque(maxlen=100) episode_reward = 0 evaluations = [] tb_update_counter = 0 while total_timesteps < args.max_timesteps: tb_update_counter += 1 # If the episode is done if done: episode_num += 1 #env.seed(random.randint(0, 100)) scores_window.append(episode_reward) average_mean = np.mean(scores_window) if tb_update_counter > args.tensorboard_freq: print("Write tensorboard") tb_update_counter = 0 writer.add_scalar('Reward', episode_reward, total_timesteps) writer.add_scalar('Reward mean ', average_mean, total_timesteps) writer.flush() # If we are not at the very beginning, we start the training process of the model if total_timesteps != 0: text = "Total Timesteps: {} Episode Num: {} ".format( total_timesteps, episode_num) text += "Episode steps {} ".format(episode_timesteps) text += "Reward: {:.2f} Average Re: {:.2f} Time: {}".format( episode_reward, np.mean(scores_window), time_format(time.time() - t0)) print(text) write_into_file(pathname, text) #policy.train(replay_buffer, writer, episode_timesteps) # We evaluate the episode and we save the policy if total_timesteps > args.start_timesteps: policy.train(replay_buffer, writer, 200) if timesteps_since_eval >= args.eval_freq: timesteps_since_eval %= args.eval_freq evaluations.append( evaluate_policy(policy, writer, total_timesteps, args, env)) torch.manual_seed(args.seed) np.random.seed(args.seed) save_model = file_name + '-{}reward_{:.2f}-agent{}'.format( 
episode_num, evaluations[-1], args.policy) policy.save(save_model) # When the training step is done, we reset the state of the environment if use_gym: obs = env.reset() else: state = env.reset() obs, state_buffer = stacked_frames(state, size, args, policy) # Set the Done to False done = False # Set rewards and episode timesteps to zero episode_reward = 0 episode_timesteps = 0 # Before 10000 timesteps, we play random actions if total_timesteps < args.start_timesteps: if use_gym: action = env.action_space.sample() else: action = np.random.randn(env.dof) else: # After 10000 timesteps, we switch to the model if use_gym: action = policy.select_action(np.array(obs)) # If the explore_noise parameter is not 0, we add noise to the action and we clip it if args.expl_noise != 0: action = (action + np.random.normal( 0, args.expl_noise, size=env.action_space.shape[0])).clip( env.action_space.low, env.action_space.high) else: action = (policy.select_action(np.array(obs)) + np.random.normal( 0, max_action * args.expl_noise, size=action_dim)).clip(-max_action, max_action) if total_timesteps % args.target_update_freq == 0: if args.policy == "TD3_ad": policy.hardupdate() # The agent performs the action in the environment, then reaches the next state and receives the reward new_obs, reward, done, _ = env.step(action) done = float(done) if not use_gym: new_obs, state_buffer = create_next_obs(new_obs, size, args, state_buffer, policy) # We check if the episode is done #done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done) done_bool = 0 if episode_timesteps + 1 == args.max_episode_steps else float( done) if not use_gym: if episode_timesteps + 1 == args.max_episode_steps: done = True # We increase the total reward reward = reward * args.reward_scalling episode_reward += reward # We store the new transition into the Experience Replay memory (ReplayBuffer) if args.debug: print("add to buffer next_obs ", obs.shape) print("add to bufferobs ", new_obs.shape) replay_buffer.add(obs, action, reward, new_obs, done, done_bool) # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy obs = new_obs if total_timesteps > args.start_timesteps: policy.train(replay_buffer, writer, 0) episode_timesteps += 1 total_timesteps += 1 timesteps_since_eval += 1 # We add the last policy evaluation to our list of evaluations and we save our model evaluations.append( evaluate_policy(policy, writer, total_timesteps, args, episode_num))
ep_reward += reward agent.memory.push(state, action, reward, next_state, done) agent.update() state = next_state print('Episode:{}/{}, Reward:{}'.format(i_episode + 1, cfg.train_eps, ep_reward)) ep_steps.append(i_step) rewards.append(ep_reward) if ma_rewards: ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward) else: ma_rewards.append(ep_reward) print('Complete training!') return rewards, ma_rewards if __name__ == '__main__': cfg = DDPGConfig() env = NormalizedActions(gym.make('Pendulum-v0')) env.seed(1) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] agent = DDPG(state_dim, action_dim, cfg) rewards, ma_rewards = train(cfg, env, agent) agent.save(path=SAVED_MODEL_PATH) save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH) plot_rewards(rewards, ma_rewards, tag="train", algo=cfg.algo, path=RESULT_PATH)
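
# save_results and plot_rewards above are project utilities that are not shown here.
# A plausible sketch based only on the call sites above; the function names exist in the
# script, but the bodies, file names and plot layout below are assumptions.
import os

import matplotlib.pyplot as plt
import numpy as np


def save_results(rewards, ma_rewards, tag='train', path='./results/'):
    """Persist raw and moving-average rewards as .npy files."""
    os.makedirs(path, exist_ok=True)
    np.save(os.path.join(path, f'rewards_{tag}.npy'), rewards)
    np.save(os.path.join(path, f'ma_rewards_{tag}.npy'), ma_rewards)


def plot_rewards(rewards, ma_rewards, tag='train', algo='DDPG', path='./results/'):
    """Plot raw vs moving-average rewards and save the figure next to the .npy files."""
    plt.figure()
    plt.title(f'{algo} rewards ({tag})')
    plt.plot(rewards, label='rewards')
    plt.plot(ma_rewards, label='moving average')
    plt.xlabel('episode')
    plt.legend()
    plt.savefig(os.path.join(path, f'rewards_curve_{tag}.png'))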
def train(config, trial_dir=None, visualize=False): pid = os.getpid() logger, log_dir = prepare_for_logging("pid_{}".format(pid)) # create environment env = NIPS(visualize) logger.info("pid={}, env={}".format(pid, id(env))) if trial_dir is not None and os.path.exists( trial_dir) and config['agent'] == 'DDPG': logger.info("Loading config from {} ...".format(trial_dir)) with open(os.path.join(trial_dir, "config.pk"), "rb") as f: config = pickle.load(f) # config["scale_action"] = scale_action config["title_prefix"] = "RunEnv" # observation processor if "ob_processor" not in config or config["ob_processor"] == "dummy": ob_processor = ObservationProcessor() elif config["ob_processor"] == "2ndorder": ob_processor = SecondOrderAugmentor() else: ob_processor = BodySpeedAugmentor() config["ob_aug_dim"] = ob_processor.get_aug_dim() # snapshot info if "save_snapshot_every" not in config: config["save_snapshot_every"] = 500 save_snapshot_every = config["save_snapshot_every"] # save config with open(os.path.join(log_dir, "config.pk"), "wb") as f: pickle.dump(config, f) util.print_settings(logger, config, env) # DDPG if config['agent'] == 'DDPG': # create random process oup = create_rand_process(env, config) # create replay buffer memory = create_memory(env, config) # create ddpg agent agent = DDPG(env, memory, oup, ob_processor, config) agent.build_nets(actor_hiddens=config["actor_hiddens"], scale_action=config["scale_action"], critic_hiddens=config["critic_hiddens"]) # print networks agent.actor.summary() agent.target_actor.summary() agent.critic.summary() # add callbacks def p_info(episode_info): util.print_episode_info(logger, episode_info, pid) def save_nets(episode_info): paths = {} paths["actor"] = os.path.join(log_dir, "actor.h5") paths["critic"] = os.path.join(log_dir, "critic.h5") paths["target"] = os.path.join(log_dir, "target.h5") agent = episode_info["agent"] agent.save_models(paths) def save_snapshots(episode_info): agent = episode_info["agent"] episode = episode_info["episode"] if episode % save_snapshot_every == 0: paths = {} paths["actor"] = os.path.join(log_dir, "actor_{}.h5".format(episode)) paths["critic"] = os.path.join(log_dir, "critic_{}.h5".format(episode)) paths["target"] = os.path.join(log_dir, "target_{}.h5".format(episode)) agent.save_models(paths) memory_path = os.path.join(log_dir, "replaybuffer.npz") agent.save_memory(memory_path) logger.info("Snapshots saved. 
(pid={})".format(pid)) agent.on_episode_end.append(p_info) agent.on_episode_end.append(save_nets) agent.on_episode_end.append(save_snapshots) # load existing model if trial_dir is not None and os.path.exists(trial_dir): logger.info("Loading networks from {} ...".format(trial_dir)) paths = {} paths["actor"] = "actor.h5" paths["critic"] = "critic.h5" paths["target"] = "target.h5" paths = { k: os.path.join(trial_dir, v) for k, v in paths.iteritems() } logger.info("Paths to models: {}".format(paths)) agent.load_models(paths) memory_path = os.path.join(trial_dir, "replaybuffer.npz") if os.path.exists(memory_path): agent.load_memory(memory_path) logger.info("Replay buffer loaded.") # learn util.print_sec_header(logger, "Training") reward_hist, steps_hist = agent.learn( total_episodes=config["total_episodes"], max_steps=config["max_steps"]) env.close() # send result img_file = os.path.join(log_dir, "train_stats.png") util.plot_stats(reward_hist, steps_hist, img_file) log_file = os.path.join(log_dir, "train.log") title = log_dir + "_" + config["title_prefix"] util.send_email(title, [img_file], [log_file], SMTP_SERVER) # TRPO elif config['agent'] == 'TRPO': def ob_processor_maker(): if config["ob_processor"] == "normal": return ObservationProcessor() elif config["ob_processor"] == "2ndorder": return SecondOrderAugmentor() elif config['ob_processor'] == 'bodyspeed': return BodySpeedAugmentor() else: raise ValueError('invalid ob processor type') def env_maker(visualize=False): env = NIPS(visualize=visualize) monitor_dir = os.path.join(log_dir, "gym_monitor") env = gym.wrappers.Monitor(env, directory=monitor_dir, video_callable=False, force=False, resume=True, write_upon_reset=True) return env del env env = env_maker() agent = TRPO( env, env_maker, logger, log_dir, ob_processor_maker, policy_hiddens=config['policy_hiddens'], baseline_hiddens=config['baseline_hiddens'], n_envs=config['n_envs'], batch_size=config['batch_size'], n_iters=config['n_iters'], ) if trial_dir is not None and os.path.exists(trial_dir): agent.load_models(trial_dir) agent.learn() logger.info("Finished (pid={}).".format(pid))
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# number of actions
action_size = brain.vector_action_space_size
print('Number of actions:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

# Train Agent
##################################################################
from agent import DDPG

agent = DDPG(state_size=state_size, action_size=action_size, random_seed=2)


def train(n_episodes=100, max_t=1000):
    """Deep Deterministic Policy Gradient.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
    """
    scores = []                        # scores from all episodes
    scores_window = deque(maxlen=100)  # last 100 scores
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        agent.reset()
n_data_worker=args.n_worker, batch_size=args.data_bsize, args=args, export_model=args.job == 'export', use_new_input=args.use_new_input) if args.job == 'train': # build folder and logs base_folder_name = '{}_{}_r{}_search'.format(args.model, args.dataset, args.preserve_ratio) if args.suffix is not None: base_folder_name = base_folder_name + '_' + args.suffix args.output = get_output_folder(args.output, base_folder_name) print('=> Saving logs to {}'.format(args.output)) tfwriter = SummaryWriter(logdir=args.output) text_writer = open(os.path.join(args.output, 'log.txt'), 'w') print('=> Output path: {}...'.format(args.output)) nb_states = env.layer_embedding.shape[1] nb_actions = 1 # just 1 action here args.rmsize = args.rmsize * len(env.prunable_idx) # for each layer print('** Actual replay buffer size: {}'.format(args.rmsize)) agent = DDPG(nb_states, nb_actions, args) train(args.train_episode, agent, env, args.output, args) elif args.job == 'export': export_model(env, args) else: raise RuntimeError('Undefined job {}'.format(args.job))
def train():
    runtime = 5.                                        # time limit of the episode
    init_pose = np.array([0., 0., 4.0, 0., 0., 0.0])    # initial pose
    init_velocities = np.array([0., 0., 0.0])           # initial velocities
    init_angle_velocities = np.array([0., 0., 0.])      # initial angle velocities
    file_output = 'rewards.txt'                         # file name for saved results
    num_episodes = 10
    target_pos = np.array([0., 0., 40.])
    task = Task(init_pose=init_pose, init_velocities=init_velocities,
                init_angle_velocities=init_angle_velocities, target_pos=target_pos)
    agent = DDPG(task)

    labels = ['episode', 'avg_reward', 'total_reward']
    results = {x: [] for x in labels}

    with open(file_output, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(labels)
        best_total_reward = -1000
        for i_episode in range(1, num_episodes + 1):
            state = agent.reset_episode()  # start a new episode
            total_reward = 0
            rewards = []
            while True:
                # select an action according to the learned policy and the exploration noise
                action = agent.act(state)
                # execute the action and observe the reward and the next state
                next_state, reward, done = task.step(action)
                # sample a mini-batch and learn
                agent.step(action, reward, next_state, done)
                # data tracking
                total_reward += reward
                rewards.append(reward)
                if total_reward > best_total_reward:
                    best_total_reward = total_reward
                state = next_state
                if done:
                    avg_reward = np.mean(np.array(rewards))
                    print(task.sim.pose)
                    # to_write = [task.sim.time] + list(task.sim.pose) + list(task.sim.v) \
                    #     + list(task.sim.angular_v) + list(rotor_speeds)
                    # for ii in range(len(labels)):
                    #     results[labels[ii]].append(to_write[ii])
                    # writer.writerow(to_write)
                    to_write = [i_episode] + [avg_reward] + [total_reward]
                    for ii in range(len(labels)):
                        results[labels[ii]].append(to_write[ii])
                    print("\rEpisode = {:4d}, total_reward = {:7.3f}, avg_reward = {:7.3f} (best = {:7.3f})"
                          .format(i_episode, total_reward, avg_reward, best_total_reward), end="")  # [debug]
                    break
            sys.stdout.flush()
    return agent
def runs(pattern_type = 1, noise_method = 2): patterns = ['sp','svp','fvp'] noise_para = ['0','1_0.05','2_0.05_0.03','3_0.05'] data_file = 'AR_'+patterns[pattern_type-1]+'_n_10_d_' + str(days) + '_nm_' + noise_para[noise_method] # 'Simulated_arrival_rates_no_noise_' + str(days) # samples = sio.loadmat( data_file ) arrival_rates = samples['AR'] num_sbs, steps = arrival_rates.shape steps = 20000 # The number of steps (can freely modify) if soft_update == 1 : tui = 1 # Target network update interval TAUt = TAU else: tui = 100 TAUt = 1 ar_size = num_sbs his_ar_size = ar_size * num_his_ar load_size = ar_size + 1 action_size = num_sbs state_size = num_sbs * 2 print( "Size of history ar: " + str(his_ar_size) ) print( "Size of action: " + str(action_size) ) print( "Number of timeslots: " + str(steps) ) rewards = np.zeros( (steps) ) # reward of each timeslot mean_reward = np.zeros( ( int(steps/48) + 1 ) ) actions = np.zeros( (steps, num_sbs) ) # refined action actions_o = np.zeros( (steps, num_sbs) ) # original/raw output action of the actor network prev_action = np.ones( (num_sbs) ) pred_ars = np.zeros( (steps, num_sbs) ) # predicted arrival rates of the next timeslot real_loads = np.zeros( (steps, num_sbs+1) ) pred_loads = np.zeros( (steps, num_sbs+1) ) arp_errors = [1] # average error in the predicted arrival rate lm_errors = [1] # average error in the mapped load c_errors = [1] # average error in the Q values (critic network output) a_errors = [1] # average error in the action output # Randomly initialize critic, actor, target critic, target actor and load prediction network and replay buffer, in the agent agent = DDPG( his_ar_size, ar_size, action_size, TAUt, is_batch_norm, write_sum ) exploration_noise = OUNoise( num_sbs ) for i in range( num_his_ar, steps ): #print("i: "+str(i)) his_ar = np.reshape( arrival_rates[:,i-num_his_ar:i], (1, his_ar_size) , order='F' ) pred_ar = agent.ar_pred_net.evaluate_ar_pred( his_ar ) #real_ar = np.array( arrival_rates[:,i] ) #print("his_ar: "+str(his_ar)) # Generate a state_ac of the AC network state_ac = agent.construct_state( pred_ar, prev_action ) # if eps_greedy: # epsilon-greedy based exploration if random.uniform(0, 1) < epsilon/i:#math.log(i+2): # sigmai = 0.3#/math.log(i+1) action = exploration_noise.noisei( 0.0, sigmai ) else: action = agent.evaluate_actor( state_ac ) action = action[0] else: # noise-based exploration action = agent.evaluate_actor( state_ac )[0] sigmai = agent.decay(i, 0.01, 0.5, num_his_ar, steps/2, 2) #action = [ 1-a if random.uniform(0, 1)<sigmai else a for a in action ] noise = exploration_noise.noisei( 0, sigmai ) #0.5/math.log(i+2) action = action + noise actions_o[i] = action # Refine the action, including rounding to 0 or 1, and greedy exploration if i<3000: action = agent.refine_action( state_ac, action, ac_ref1 ) # refine the action else: action = agent.refine_action( state_ac, action, ac_ref2 ) # after taking the action and the env reacts #print("action_o: "+str(actions_o[i])+", action"+str(action)) #print("pred_ar: "+str(pred_ar)) pred_load = agent.load_map_net.evaluate_load_map( pred_ar, np.reshape( action, [1, action_size] ) ) real_ar = arrival_rates[:,i] real_load = agent.env.measure_load( real_ar, action ) #print("pred_load: "+str(pred_load)) #print("real_load: "+str(real_load)) reward = agent.env.find_reward( real_load, action, prev_action ) # #print("real reward: "+str(reward)) next_his_ar = np.reshape( arrival_rates[:, i-num_his_ar+1:i+1], (1, his_ar_size) , order='F' ) next_pred_ar = 
agent.ar_pred_net.evaluate_ar_pred( next_his_ar ) next_state_ac = agent.construct_state( next_pred_ar, action ) # # Add s_t, s_t+1, action, reward to experience memory #print("real_ar: "+str(real_ar) + "action: "+str(action)) ar_action = np.concatenate([real_ar, action]) agent.add_experience_ac( state_ac, next_state_ac, action, reward ) agent.add_experience_arp( his_ar, real_ar ) agent.add_experience_lm( ar_action, real_load ) # Train critic and actor network, maybe multiple minibatches per step a_lr = max(A_LR_MIN, agent.decay(i, A_LR_MIN, A_LR_MAX, num_his_ar, 8000, 2) ) #max( AC_LR_MIN[0], AC_LR_MAX[0]/math.log2(i+1) ) c_lr = max(C_LR_MIN, agent.decay(i, C_LR_MIN, C_LR_MAX, num_his_ar, 8000, 2) ) #max( AC_LR_MIN[1], AC_LR_MAX[1]/math.log2(i+1) ) learning_rate = [ a_lr, c_lr ] cerror = 1 aerror = 1 ac_train_times = min(16, max(1, int(i/500)) ) for j in range( 0, ac_train_times ): # #between 1 and 5 cerrort, aerrort = agent.train_ac( learning_rate, soft_update ) if cerrort !=1: cerror = cerrort aerror = aerrort if ( (i%tui == 0) and (soft_update==0) ): agent.update_target_net() # Train ar prediction network, after many steps, one minibatch is enough for each step arp_error = 1 arp_train_times = min(10, max(1, int(i/ARP_BATCH_SIZE)) ) #if i<1000 else 5 lr = max(ARP_LR_MIN, agent.decay(i, ARP_LR_MIN, ARP_LR_MAX, num_his_ar, 8000, 2) ) for j in range( 0, arp_train_times ): arp_errort = agent.train_arp( lr ) #/math.log(i+2) if arp_errort !=1: arp_error = arp_errort # Train load mapping network, after many steps, one minibatch is enough for each step lm_error = 1 lm_train_times = min(10, max(1, int(i/LM_BATCH_SIZE)) ) #if i<1000 else 20 lr = max(LM_LR_MIN, agent.decay(i, LM_LR_MIN, LM_LR_MAX, num_his_ar, 8000, 2) ) for j in range( 0, lm_train_times ): lm_errort = agent.train_lm( lr ) # if lm_errort !=1: lm_error = lm_errort if arp_error !=1: arp_errors.append( math.sqrt( arp_error ) ) if lm_error !=1: lm_errors.append( math.sqrt( lm_error ) ) if cerror !=1: c_errors.append( math.sqrt( cerror ) ) if aerror !=1: a_errors.append( aerror*num_sbs ) # hamming distance error prev_action = action pred_ars[i] = pred_ar real_loads[i] = real_load pred_loads[i] = pred_load actions[i] = action rewards[i] = reward if i%(48) == 0: mean_reward[int(i/48)] = mean( rewards[i-48:i] ) print("==== i: %5d, arp error: %1.5f, lm error: %1.5f, a error: %1.5f, c error: %1.5f, mean reward: %1.5f \n" % ( i, arp_errors[-1], lm_errors[-1], a_errors[-1], c_errors[-1], mean_reward[int(i/48)] ) ) agent.close_all() # this will write network parameters into .txt files """ writetext( actions, 'actions', 1 ) writetext( actions_o, 'actions_o', 1 ) writetext( pred_ars, 'pred_ar', 1 ) writetext( rewards, 'rewards' ) writetext( mean_reward, 'mean_rewards' ) writetext( arp_errors, 'arp_errors', 1 ) writetext( lm_errors, 'lm_errors', 1 ) writetext( c_errors, 'c_errors' ) writetext( a_errors, 'a_errors' ) writetext( real_loads, 'real_loads', 1 ) writetext( pred_loads, 'pred_loads', 1 ) pre = '_bn_'+str(is_batch_norm)+'_gi_'+str(is_grad_inverter)+'_ar_'+str(ac_ref1)+'_'+str(steps) writetext( mean_reward, 'mean_rewards'+pre ) plt.plot(rewards) plt.show() """ writetext( rewards, 'ACDQN_rewards_' + data_file ) return 1
def dqn_bsa(AR, ac_ref=4, write_sum=0, net_scale=1, funname='', beta0=beta): num_sbs, num_ts = AR.shape num_mbs = 1 ts_per_day = 48 ar_size = num_sbs his_ar_size = ar_size * num_his_ar load_size = num_sbs + num_mbs action_size = num_sbs state_size = ar_size + action_size print("Size of history ar: " + str(his_ar_size)) print("Size of action: " + str(action_size)) print("Number of timeslots: " + str(num_ts)) rewards = np.zeros((num_ts)) # reward of each timeslot sum_powers = np.zeros((num_ts)) switch_powers = np.zeros((num_ts)) qos_costs = np.zeros((num_ts)) throughputs = np.zeros((num_ts)) prev_action = np.ones((num_sbs)) arp_errors = [1] # average error in the predicted arrival rate lm_errors = [1] # average error in the mapped load c_errors = [1] # average error in the Q values (critic network output) a_errors = [1] # average error in the action output # Randomly initialize critic, actor, target critic, target actor and load prediction network and replay buffer, in the agent agent = DDPG(his_ar_size, ar_size, action_size, TAU, is_batch_norm, write_sum, net_size_scale=net_scale, beta0=beta0) exploration_noise = OUNoise(num_sbs) for i in range(num_his_ar, num_ts): his_ar = np.reshape(AR[:, i - num_his_ar:i], (1, his_ar_size), order='F') pred_ar = agent.ar_pred_net.evaluate_ar_pred(his_ar) # Generate a state_ac of the AC network state_ac = agent.construct_state(pred_ar, prev_action) # if eps_greedy: # epsilon-greedy based exploration if random.uniform(0, 1) < epsilon / i: #math.log(i+2): # sigmai = 0.3 #/math.log(i+1) action = exploration_noise.noisei(0.0, sigmai) else: action = agent.evaluate_actor(state_ac) action = action[0] else: # noise-based exploration action = agent.evaluate_actor(state_ac)[0] sigmai = agent.decay(i, 0.01, 0.5, num_his_ar, num_ts / 2, 2) noise = exploration_noise.noisei(0, sigmai) #0.5/math.log(i+2) action = action + noise # Refine the action, including rounding to 0 or 1, and greedy exploration if ac_ref <= 3: action = agent.refine_action(state_ac, action, ac_ref) else: # hybrid method if random.uniform(0, 1) < agent.decay(i, 0, 3, num_his_ar, num_ts * 0.75, 2): action = agent.refine_action(state_ac, action, 3) # refine the action else: action = agent.refine_action(state_ac, action, 2) # after taking the action and the env reacts #pred_load = agent.load_map_net.evaluate_load_map( pred_ar, np.reshape( action, [1, action_size] ) ) real_ar = AR[:, i] real_load = agent.env.measure_load(real_ar, action) reward, sum_power, switch_power, qos_cost, throughput = agent.env.find_reward( real_load, action, prev_action) # next_his_ar = np.reshape(AR[:, i - num_his_ar + 1:i + 1], (1, his_ar_size), order='F') next_pred_ar = agent.ar_pred_net.evaluate_ar_pred(next_his_ar) next_state_ac = agent.construct_state(next_pred_ar, action) # # Add s_t, s_t+1, action, reward to experience memory ar_action = np.concatenate([real_ar, action]) agent.add_experience_ac(state_ac, next_state_ac, action, reward) agent.add_experience_arp(his_ar, real_ar) agent.add_experience_lm(ar_action, real_load) # Train critic and actor network, maybe multiple minibatches per step a_lr = max(A_LR_MIN, agent.decay( i, A_LR_MIN, A_LR_MAX, num_his_ar, 8000, 2)) #max( AC_LR_MIN[0], AC_LR_MAX[0]/math.log2(i+1) ) c_lr = max(C_LR_MIN, agent.decay( i, C_LR_MIN, C_LR_MAX, num_his_ar, 8000, 2)) #max( AC_LR_MIN[1], AC_LR_MAX[1]/math.log2(i+1) ) learning_rate = [a_lr, c_lr] cerror = 1 aerror = 1 ac_train_times = min(16, max(1, int(i / 500))) for j in range(0, ac_train_times): # #between 1 and 5 cerrort, aerrort = 
agent.train_ac(learning_rate, 1) if cerrort != 1: cerror = cerrort aerror = aerrort # Train ar prediction network, after many num_ts, one minibatch is enough for each step arp_error = 1 arp_train_times = min(10, max(1, int(i / ARP_BATCH_SIZE))) #if i<1000 else 5 lr = max(ARP_LR_MIN, agent.decay(i, ARP_LR_MIN, ARP_LR_MAX, num_his_ar, 8000, 2)) for j in range(0, arp_train_times): arp_errort = agent.train_arp(lr) #/math.log(i+2) if arp_errort != 1: arp_error = arp_errort # Train load mapping network, after many num_ts, one minibatch is enough for each step lm_error = 1 lm_train_times = min(10, max(1, int(i / LM_BATCH_SIZE))) #if i<1000 else 20 lr = max(LM_LR_MIN, agent.decay(i, LM_LR_MIN, LM_LR_MAX, num_his_ar, 8000, 2)) for j in range(0, lm_train_times): lm_errort = agent.train_lm(lr) # if lm_errort != 1: lm_error = lm_errort if arp_error != 1: arp_errors.append(math.sqrt(arp_error)) if lm_error != 1: lm_errors.append(math.sqrt(lm_error)) if cerror != 1: c_errors.append(math.sqrt(cerror)) if aerror != 1: a_errors.append(aerror * num_sbs) # hamming distance error prev_action = action rewards[i] = reward sum_powers[i] = sum_power throughputs[i] = throughput switch_powers[i] = switch_power qos_costs[i] = qos_cost if i % (ts_per_day) == 0: mrt = np.mean(rewards[i - ts_per_day:i]) if write_sum > 0: print( funname + " ------- i: %5d, arp-e: %1.5f, lm-e: %1.5f, a-e: %1.5f, c-e: %1.5f, d-reward: %1.5f \n" % (i, arp_errors[-1], lm_errors[-1], a_errors[-1], c_errors[-1], mrt)) else: print(funname + " ------- i: %5d, mean reward: %1.5f \n" % (i, mrt)) return rewards, sum_powers, switch_powers, qos_costs, throughputs
import torch from torch.utils.tensorboard import SummaryWriter from agent import DDPG from exploration import OUActionNoise epoch = 2000 env = gym.make('Pendulum-v0') # seed np.random.seed(42) env.seed(42) torch.manual_seed(42) torch.cuda.manual_seed(42) writer = SummaryWriter(log_dir='logs/') agent = DDPG(env, writer) all_timesteps = 0 for e in range(epoch): noise = OUActionNoise(env.action_space.shape[0]) state = env.reset() cumulative_reward = 0 for timestep in range(200): action = agent.get_action(state, noise, timestep) state_, reward, done, _ = env.step(action * env.action_space.high[0]) # env.render() agent.store_transition(state, action, state_, reward, done) state = state_ cumulative_reward += reward
def main(): """ UnboundLocalError: local variable 'RENDER' referenced before assignment If the global variable changed in a function without declare with a "global" prefix, then the variable here will be treat as a local variable For example, if "RENDER" is not been declared with global prefix, access "RENDER" variable will raise UnboundLocalError before assign value to "RENDER" """ global RENDER env = gym.make(ENV_NAME) env = env.unwrapped env.seed(1) s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] a_bound = env.action_space.high[0] # print(f"s_dim: {s_dim}, a_dim: {a_dim}, a_bound: {a_bound}") # s_dim: 3, a_dim: 1, a_bound: 2.0 ddpg = DDPG(s_dim, a_dim, a_bound) # var: add noise to action var = 3 for i in range(MAX_EPISODES): s = env.reset() # s : list # s.shape = (3,) ep_reward = 0 for j in range(MAX_EP_STEPS): if RENDER: env.render() a = ddpg.choose_action(s) a = np.clip(np.random.normal(a, var), -a_bound, a_bound) s_, r, done, info = env.step(a) # s : list # a : np.float # r : float # s_ : list ddpg.store_transition(s, a, r/10, s_) if ddpg.m_pointer > ddpg.capacity: var *= 0.9995 ddpg.learn() s = s_ ep_reward += r if done or (j+1) == MAX_EP_STEPS: print(f"Episode: {i:03d}") print(f"\tReward: {ep_reward:.3f}, Explore: {var:.2f}") if ep_reward > -150: RENDER = True break env.close()
import gym from agent import DDPG env = gym.make('Pendulum-v0') agent = DDPG(env) agent.load_model() state = env.reset() cumulative_reward = 0 for i in range(200): action = agent.get_action(state) env.render() state, reward, _, _ = env.step(action * 2) cumulative_reward += reward print('Cumulative Reward: {}'.format(cumulative_reward))
from collections import deque import gym import numpy as np from agent import DDPG from utils import get_screen env = gym.make('Pendulum-v0') agent = DDPG(env, memory=False) agent.load_model() env.reset() pixel = env.render(mode='rgb_array') state = deque([get_screen(pixel) for _ in range(3)], maxlen=3) cumulative_reward = 0 for timestep in range(200): action = agent.get_action(np.array(state)[np.newaxis]) _, reward, _, _ = env.step(action * 2) pixel = env.render(mode='rgb_array') state_ = state.copy() state_.append(get_screen(pixel)) state = state_ cumulative_reward += reward print('Cumulative Reward: {}'.format(cumulative_reward))
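
# The pixel-based scripts above build the state by stacking three frames returned by
# get_screen(pixel). A plausible sketch of such a preprocessing helper, assuming a
# grayscale conversion and resize with OpenCV; the exact transform used by this project
# is not shown.
import cv2
import numpy as np


def get_screen(pixel, size=84):
    """Convert an RGB render of shape (H, W, 3) into a normalized (size, size) float frame."""
    gray = cv2.cvtColor(pixel, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (size, size), interpolation=cv2.INTER_AREA)
    return resized.astype(np.float32) / 255.0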
# While developing the agent you also need to keep an eye on its performance.
# Using the code below as a reference, build a mechanism that stores the total reward
# of each episode. If the episode rewards keep rising, the agent is learning.

# In[25]:

## TODO: Train your agent here.
import keras
import sys
import pandas as pd
from agent import DDPG
from task import Task

num_episodes = 2000
target_pos = np.array([0., 0., 100.])
init_pose = np.array([0., 0., 0., 0., 0., 0.])
task = Task(target_pos=target_pos, init_pose=init_pose)
agent = DDPG(task)

reward_labels = ['episode', 'reward']
reward_results = {x: [] for x in reward_labels}

# In[18]:

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()  # start a new episode
    while True:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
        if done:
            print("\rEpisode = {:4d}, score = {:7.3f} (best = {:7.3f})".format(
from agent import DDPG from task import Task num_episodes = 1000 init_pose = np.array([0., 0., 0., 0., 0., 0.]) target_pos = np.array([0., 0., 10.]) init_velocities = np.array([0., 0., 0.]) # initial velocities init_angle_velocities = np.array([0., 0., 0.]) task = Task(init_pose=init_pose, target_pos=target_pos, init_angle_velocities=init_angle_velocities, init_velocities=init_velocities) best_score = -np.inf agent = DDPG(task) for i_episode in range(1, num_episodes + 1): state = agent.reset_episode() # start a new score = 0 while True: action = agent.act(state) next_state, reward, done = task.step(action) agent.step(action, reward, next_state, done) state = next_state score += reward best_score = max(best_score, score) if done: print("\rEpisode = {:4d}, score = {:7.3f} (best = {:7.3f})".format( i_episode, score, best_score), end="") # [debug]