def run(args):
    if args.machine == "Mac":
        env = UnityEnvironment(file_name='./Reacher.app', seed=1)
    else:
        env = UnityEnvironment(file_name='./Reacher_Linux_NoVis/Reacher.x86_64', seed=1)

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print("using device", device)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=False)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
    print('The state for the first agent looks like:', states[0])

    # ========================== my version =========================
    agent = Agent(a_dim=4, s_dim=33, clip_value=1, device=device)  # continuous action clip
    agent.load("./pretrained/")
    eval(env, agent, brain_name)
    env.close()
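A minimal sketch of the `eval` helper called above (the name shadows Python's built-in, matching the call site), assuming the legacy Unity ML-Agents v0.4-style API used elsewhere in this script; the agent's `choose_action` method name and the `numpy as np` import are assumptions.

def eval(env, agent, brain_name, n_episodes=1):
    for _ in range(n_episodes):
        env_info = env.reset(train_mode=False)[brain_name]   # evaluation mode
        states = env_info.vector_observations
        scores = np.zeros(len(env_info.agents))
        while True:
            actions = agent.choose_action(states)             # assumed action-selection method
            env_info = env.step(actions)[brain_name]
            scores += env_info.rewards
            states = env_info.vector_observations
            if np.any(env_info.local_done):
                break
        print('Average score over agents:', np.mean(scores))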
def __init__(self, state_size, action_size, random_seed):
    super(MADDPG, self).__init__()
    self.state_size = state_size
    self.action_size = action_size
    self.random_seed = random_seed
    self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE, self.random_seed)
    self.maddpg_agent = [
        Agent(self.state_size, self.action_size, BATCH_SIZE, self.random_seed,
              TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY, 0),
        Agent(self.state_size, self.action_size, BATCH_SIZE, self.random_seed,
              TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY, 1),
    ]
    self.iter = 0
    self.episode_counter = 0
    self.eps = 2
    self.eps_decay = 0.9999
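A hedged sketch of an `act` method that could accompany this constructor, assuming each wrapped `Agent` exposes `act(state, noise_scale=...)` and that `self.eps` scales the exploration noise; the method signature and the noise floor are assumptions.

def act(self, states, add_noise=True):
    # one action per agent; exploration noise is scaled by self.eps, which decays per call
    actions = [agent.act(state, noise_scale=self.eps if add_noise else 0.0)
               for agent, state in zip(self.maddpg_agent, states)]
    if add_noise:
        self.eps = max(self.eps * self.eps_decay, 0.05)   # illustrative lower bound on exploration
    return np.array(actions)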
def collect_trajectories(env: gym.Env, agent: Agent, n_games: int = 10) -> np.ndarray:
    for _ in range(n_games):
        state = env.reset()
        done: bool = False
        state_history: list[np.ndarray] = []   # re-created each game, so only the last game's states are returned
        while not done:
            state_history.append(state)
            action = agent.choose_action(state)
            next_state, _, done, _ = env.step(action)
            state = next_state
    return np.vstack(state_history)
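Illustrative usage only: projecting the collected states to two dimensions for inspection. The scikit-learn and matplotlib calls are an assumption, not part of the original script.

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

trajectory = collect_trajectories(env, agent, n_games=10)
xy = PCA(n_components=2).fit_transform(trajectory)      # 2-D projection of the visited states
plt.plot(xy[:, 0], xy[:, 1])
plt.title('State trajectory of the last evaluation game (PCA projection)')
plt.show()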
from DRLEnv import FedEnv
from DDPG import Agent
from tqdm import tqdm, trange
import torch
import numpy as np
import pandas as pd
from collections import deque

if __name__ == '__main__':
    print(torch.cuda.is_available())
    epoches, print_every = 200, 100

    env = FedEnv(Client=5, k=2)                                     # environment
    agent = Agent(state_size=25, action_size=25, random_seed=2)     # agent
    scores_deque = deque(maxlen=print_every)
    scores = []
    episode = []

    for i_episode in range(1, epoches + 1):
        X, Y = [], []       # x and y axis for test_data
        start_time = 0      # initialize pca ?

        # full reset on the first episode only
        if i_episode == 1:
            state = env.reset(Tag=True)
        else:
            state = env.reset(Tag=False)

        # initialize the agent's exploration noise
        agent.reset()
        score = 0
        reward_y = []
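A hedged continuation of the episode loop above; `FedEnv.step()`'s return signature and the `agent.step()` replay call are assumptions based on the usual DDPG training layout, not the original file.

        while True:
            action = agent.act(state)
            next_state, reward, done = env.step(action)             # assumed return values
            agent.step(state, action, reward, next_state, done)     # assumed: store transition and learn
            state = next_state
            score += reward
            reward_y.append(reward)
            if done:
                break

        scores_deque.append(score)
        scores.append(score)
        episode.append(i_episode)
        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))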
if __name__ == '__main__':
    # Init. Environment
    env = gym.make('LunarLanderContinuous-v2')
    env.reset()

    # Init. Datapath
    data_path = os.path.abspath('Vanilla-DDPG/data')

    # Init. Testing
    n_games = 10
    test_data: List[Dict[str, np.ndarray]] = []

    # Init. Agent
    agent = Agent(env=env, n_games=n_games, training=False)
    agent.load_models(data_path)

    score_history: List[np.float32] = []
    for _ in tqdm(range(n_games), desc='Testing', total=n_games):
        score = 0
        done = False

        # Initial Reset of Environment
        state = env.reset()
        while not done:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
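A hedged continuation that finishes the test loop; the `'Test Score'` key and the `testing_info.json` filename match the loading script shown below, while the exact logging layout is an assumption.

            score += reward
            state = next_state
        score_history.append(np.float32(score))
        test_data.append({'Test Score': float(score)})   # key matches the reader below

    with open(os.path.join(data_path, 'testing_info.json'), 'w') as f:
        json.dump(test_data, f, indent=4)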
            next_state, _, done, _ = env.step(action)
            state = next_state
    return np.vstack(state_history)


if __name__ == "__main__":
    # Init. path
    data_path = os.path.abspath('Vanilla-DDPG/data')

    # Init. Environment and agent
    env = gym.make('LunarLanderContinuous-v2')
    env.reset()
    agent = Agent(env=env, training=False)
    agent.load_models(data_path)

    # Load all the data frames
    with open(os.path.join(data_path, 'training_info.json')) as f:
        train_data = json.load(f)
    with open(os.path.join(data_path, 'testing_info.json')) as f:
        test_data = json.load(f)

    score = [data["Epidosic Summed Rewards"] for data in train_data]   # key spelling matches the saved JSON
    average = [data["Moving Mean of Episodic Rewards"] for data in train_data]
    test = [data["Test Score"] for data in test_data]
    trajectory = collect_trajectories(env, agent)
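Illustrative plotting of the loaded logs; the matplotlib layout below is an assumption, not taken from the original repository.

    import matplotlib.pyplot as plt

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    ax1.plot(score, label='Episodic reward')
    ax1.plot(average, label='Moving mean')
    ax1.set_xlabel('Episode')
    ax1.set_ylabel('Reward')
    ax1.legend()
    ax2.plot(test, label='Test score')
    ax2.set_xlabel('Test episode')
    ax2.legend()
    plt.show()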
def main():
    global render_bool
    render_bool = True
    # parl.connect('localhost:8037')
    if dummy_mode:
        render_bool = False
    if not render_bool:
        os.environ["SDL_VIDEODRIVER"] = "dummy"
    # else:
    #     pygame.display.set_mode((800, 600 + 60))

    # create the environment
    game = GameEnv()
    p = PLE(game, display_screen=render_bool, fps=30, force_fps=True)
    p.init()

    # build the agent with the PARL framework
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    width, height = p.getScreenDims()

    rpm = ReplayMemory(MEMORY_SIZE)  # DQN replay buffer
    obs_dim = get_env_obs(p).shape

    model = Model(act_dim=act_dim)
    if MODE == "DDPG":
        alg = RL_Alg(model, gamma=GAMMA, tau=0.001, actor_lr=LEARNING_RATE, critic_lr=LEARNING_RATE)
    if MODE == "DQN":
        alg = RL_Alg(model, gamma=GAMMA, lr=LEARNING_RATE, act_dim=act_dim)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)  # e_greed: probability of picking a random action for exploration

    # load a previously saved model if one exists
    best_eval_reward = -1000
    cache_fn = './model_pixelcopter_%s.ckpt' % MODE
    if os.path.exists(cache_fn):
        print("loaded model:", cache_fn)
        agent.restore(cache_fn)
        best_eval_reward = evaluate(p, agent, render=render_bool)
        # run_episode(env, agent, train_or_test='test', render=True)
        # exit()

    # pre-fill the replay memory so the first training batches have enough diverse samples
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(p, agent, rpm)

    max_episode = 200000

    # start training; test episodes are not counted towards the episode total
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 5):
            total_reward = run_episode(p, agent, rpm)
            episode += 1

        # test part
        eval_reward = evaluate(p, agent, render=render_bool)  # render=True to watch the agent
        logger.info('episode:{} e_greed:{} test_reward:{}'.format(
            episode, e_greed, eval_reward))

        # save the model checkpoint
        agent.save(cache_fn + "." + str(rate_num))
        if best_eval_reward < eval_reward:
            best_eval_reward = eval_reward
            agent.save(cache_fn)
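A possible shape for the `get_env_obs` helper referenced above, assuming the observation is PLE's grayscale screen scaled to [0, 1]; the helper is not shown in this snippet, so its exact contents are an assumption.

def get_env_obs(p):
    screen = p.getScreenGrayscale()            # uint8 grayscale frame from PLE
    return screen.astype(np.float32) / 255.0   # scale pixels to [0, 1]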
import gym
import gym_pid
from DDPG import Agent
import numpy as np
import utils

env = gym.make('pid-v0')
agent = Agent(alpha=0.00001, beta=0.0001, input_dims=[3], tau=0.0001, env=env,
              batch_size=64, layer1_size=256, layer2_size=128, n_actions=3)
# agent.load_models()
# np.random.seed(1)

score_history = []
for i in range(50):
    obs = env.reset()
    done = False
    score = 0
    while not done:
        act = agent.choose_action(obs)
        print(act)
        new_state, reward, done, info = env.step(act)
        agent.remember(obs, act, reward, new_state, int(done))
        agent.learn()
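A hedged continuation of the episode loop: accumulate the return and report per-episode statistics; the trailing-average window is an assumption, not from the original script.

        score += reward
        obs = new_state
    score_history.append(score)
    print('episode', i, 'score %.2f' % score,
          'trailing 10 games avg %.2f' % np.mean(score_history[-10:]))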
continued = True
path = "path"
NOISE_C = 1.1
first_ep = 0

with tf.device('/GPU:0'):
    env = Environment("data/u20.txt", SEED)
    # env = gym.wrappers.Monitor(e.env, 'video/', video_callable=lambda episode_id: True, force=True)
    # video = VideoRecorder(env, "video.mp4")

    state_shape = env.state_shape
    action_len = env.action_shape[0]
    action_scale = None
    NOISE = 0.6
    # np.random.seed(SEED)

    agent = Agent(state_shape, action_len, action_scale)
    if continued:
        agent.load(path)
    agent.summary()

    for episode in range(first_ep, EPISODES):
        state = env.reset()
        state = np.reshape(state, state_shape)
        score = 0
        # print(state)
        # done = False
        noise = np.random.normal(NOISE, NOISE / 2, 2) / (1 + pow(NOISE_C, episode + 10))

        for st in range(MAX_STEPS):
            # while not done:
            # env.render()
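A hedged continuation of the inner step loop; the `Environment.step()` return signature and the agent's `act`/`remember`/`train` method names are assumptions about this codebase.

            action = np.clip(agent.act(state) + noise, -1.0, 1.0)   # add decayed exploration noise
            next_state, reward, done = env.step(action)             # assumed return values
            next_state = np.reshape(next_state, state_shape)
            agent.remember(state, action, reward, next_state, done)
            agent.train()
            state = next_state
            score += reward
            if done:
                break
        print("episode:", episode, "score:", score)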
def main(env, episodes=500, max_steps=500, eps_decay=.99, actor_lr=10**-6, critic_lr=10**-3,
         gamma=.9, base_nodes=64, batch_size=128, theta=.4, sigma=.25):
    with tf.Session() as sess:
        # Initialize environment and constants
        input_dim = env.state_dim
        output_dim = env.action_dim
        action_high = env.action_high
        action_low = env.action_low

        # Create DDPG Agent
        agent = Agent(input_dim, output_dim, action_high, action_low,
                      actor_lr=actor_lr, critic_lr=critic_lr, gamma=gamma,
                      base_nodes=base_nodes, eps_decay=eps_decay, batch_size=batch_size,
                      theta=theta, sigma=sigma, sess=sess)
        sess.run(tf.global_variables_initializer())
        agent.actor.update_target_network()
        agent.critic.update_target_network()

        # Prepare for episodes
        c_losses, rewards, actions, Qs, states = [np.array([]) for i in range(5)]

        for e in tqdm(range(episodes)):
            # Reset episode
            state = env.reset()
            state = np.reshape(state, (-1, len(state)))
            agent.noise.reset()
            done = False
            step_count = 0
            total_reward = 0

            while not done and step_count < max_steps:
                # Action
                action = agent.act(state)
                next_state, reward, done = env.step(action)
                next_state = np.reshape(next_state, (-1, len(next_state)))

                # Learn
                c_loss = agent.learn(state, action, reward, done, next_state)

                # Save results
                c_losses = np.append(c_losses, c_loss)
                actions = np.append(actions, action)
                states = np.append(states, state[0])
                Qs = np.append(Qs, agent.critic.predict(state, action))

                # Loop
                state = next_state
                step_count += 1
                total_reward += reward

            # Reduce exploration
            if agent.eps > agent.min_eps:
                agent.eps *= agent.eps_decay
            rewards = np.append(rewards, total_reward)

        return rewards, c_losses, actions, Qs
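Illustrative usage only, assuming `env` is any environment object exposing `state_dim`, `action_dim`, `action_high`, `action_low`, `reset()` and `step()` as used in `main` above.

rewards, c_losses, actions, Qs = main(env, episodes=200, max_steps=200)
print('Mean reward over the last 20 episodes:', np.mean(rewards[-20:]))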
# Init. Environment
env = gym.make('LunarLanderContinuous-v2')
env.reset()

# Init. Datapath
data_path = os.path.abspath('Vanilla-DDPG/data')

# Init. Training
n_games: int = 1500
best_score = -np.inf
score_history: List[float] = []
avg_history: List[float] = []
logging_info: List[Dict[str, float]] = []

# Init. Agent
agent = Agent(env=env, n_games=n_games)

for i in range(n_games):
    score: float = 0.0
    done: bool = False

    # Initial Reset of Environment
    state = env.reset()
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.memory.add(state, action, reward, next_state, done)
        state = next_state
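A hedged continuation of the training loop; the `learn()` and `save_models()` calls are assumptions about this `Agent`, while the logged key names and `training_info.json` filename match the reader script shown earlier.

        agent.learn()        # assumed learning step on the stored transitions
        score += reward

    score_history.append(score)
    avg_score = float(np.mean(score_history[-100:]))   # illustrative 100-episode moving mean
    avg_history.append(avg_score)

    if avg_score > best_score:
        best_score = avg_score
        agent.save_models(data_path)                    # assumed counterpart of load_models

    logging_info.append({'Epidosic Summed Rewards': score,
                         'Moving Mean of Episodic Rewards': avg_score})

with open(os.path.join(data_path, 'training_info.json'), 'w') as f:
    json.dump(logging_info, f, indent=4)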