def main():
    mp.set_start_method('spawn')
    config = Config()

    # 1. Initialize the environment
    env = NormalizedEnv(gym.make('Pendulum-v0'))

    # 2. Initialize the agent
    agent = DDPGAgent(env=env,
                      seed=config.seed,
                      batch_size=config.batch_size,
                      learning_rate_actor=config.learning_rate_actor,
                      learning_rate_critic=config.learning_rate_critic,
                      weight_decay=config.weight_decay)
    agent.target_actor.share_memory()

    # 3. Initialize the replay memory
    memory = ReplayMemory(config.capacity)

    # Start the collector processes; they share the target actor's parameters
    q = mp.Queue(10)
    process_collect_list = []
    for i in range(config.agent_num):
        process_name = "collect_process_" + str(i)
        process = mp.Process(name=process_name,
                             target=collect_process,
                             args=(i, q, agent.target_actor))
        process.start()
        process_collect_list.append(process)

    steps = mp.Value('d', 0)
    test_p = mp.Process(name="test_process",
                        target=test_process,
                        args=(config, steps, agent.target_actor))
    test_p.start()
    process_collect_list.append(test_p)

    try:
        while True:
            # Drain every transition currently queued by the collector processes
            queue_len = q.qsize()
            while queue_len:
                mem = q.get()
                memory.push(mem[0], mem[1], mem[2], mem[3], mem[4])
                queue_len -= 1

            # 4.4 Learn once enough transitions have been collected
            if memory.len > config.batch_size:
                agent.learning(memory)

            # save model
            if steps.value > 1 and steps.value % config.save_steps == 0:
                agent.save_models(steps.value / config.save_steps)
            steps.value += 1
    except Exception as e:
        print(e)
    except KeyboardInterrupt:
        # On Ctrl-C, wait for the worker processes to exit and clean up
        for process in process_collect_list:
            process.join()
            print(process.name + " stop ")
        env.close()
def collect_process(agent_index, queue_mem, actor_param):
    env = NormalizedEnv(gym.make('Pendulum-v0'))
    agent = Action(state_dim=env.observation_space.shape[0],
                   action_dim=env.action_space.shape[0])
    try:
        while True:
            done = False
            state = env.reset()
            # Scale the observation into [0, 1]
            state = (state - env.observation_space.low) / (
                env.observation_space.high - env.observation_space.low)

            # Refresh the local actor with the latest shared target-actor parameters
            agent.load_param(actor_param)
            print("agent {} load param".format(agent_index))

            while not done:
                action = agent.chose_action(state, explort=True)
                next_state, reward, done, _ = env.step(action)
                # env.render()
                next_state = (next_state - env.observation_space.low) / (
                    env.observation_space.high - env.observation_space.low)
                is_done = 0 if done else 1
                queue_mem.put((state, action, next_state, reward, is_done))
                state = next_state
    except Exception as e:
        print(e)
        print("agent {} exit".format(agent_index))
        env.close()
def test_process(config, steps, target_actor):
    env = NormalizedEnv(gym.make('Pendulum-v0'))
    agent = Action(state_dim=env.observation_space.shape[0],
                   action_dim=env.action_space.shape[0])
    reward_list = []
    try:
        while True:
            # Evaluate the current target actor every `test_every_eposide` learning steps
            if steps.value != 0 and steps.value % config.test_every_eposide == 0:
                agent.load_param(target_actor)
                print("test agent load param")

                et_reward = 0
                for index in range(config.num_eposide_test):
                    eposide = 0
                    state = env.reset()
                    state = (state - env.observation_space.low) / (
                        env.observation_space.high - env.observation_space.low)
                    while True:
                        action = agent.chose_action(state, explort=False)
                        next_state, reward, done, _ = env.step(action)
                        env.render()
                        next_state = (next_state - env.observation_space.low) / (
                            env.observation_space.high - env.observation_space.low)
                        eposide += reward
                        state = next_state
                        if done:
                            break
                    et_reward += eposide

                print("\033[93m [ test ] eposide average reward : {}\033[00m".format(
                    et_reward / config.num_eposide_test))
                reward_list.append(et_reward / config.num_eposide_test)

                x = np.arange(len(reward_list))
                y = np.array(reward_list)
                plt.plot(x, y)
                plt.savefig("./eposide_reward.png")
    except Exception as e:
        print(e)
        print("test process exit")
        env.close()
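# Note: `NormalizedEnv` itself is not defined in any of these snippets and its
# exact behaviour differs between projects (one snippet further down constructs
# it with a `normalize_obs` flag, for example). A common minimal version is a
# gym.ActionWrapper that rescales the agent's [-1, 1] actions to the
# environment's action bounds. The class below is only a sketch under that
# assumption, written against the classic gym API, not the original code.
import gym


class NormalizedEnv(gym.ActionWrapper):
    """Rescale actions from [-1, 1] to [action_space.low, action_space.high]."""

    def action(self, action):
        low = self.action_space.low
        high = self.action_space.high
        # Map [-1, 1] -> [low, high]
        return low + (action + 1.0) * 0.5 * (high - low)

    def reverse_action(self, action):
        low = self.action_space.low
        high = self.action_space.high
        # Map [low, high] -> [-1, 1]
        return 2.0 * (action - low) / (high - low) - 1.0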
def main():
    env = NormalizedEnv(gym.make('Pendulum-v0'))
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    # Load a trained actor network for evaluation
    agent = Actor(state_dim, action_dim).to('cuda')
    agent.load_state_dict(torch.load('./Models/78.0_actor.pt'))

    eposide = 0
    done = False
    eposide_list = []
    while eposide < 100:
        eposide_reward = 0
        state = env.reset()
        state = (state - env.observation_space.low) / (
            env.observation_space.high - env.observation_space.low)
        state = to_tensor(state)
        while not done:
            action = agent.forward(state).detach().cpu().data.numpy()
            state_, reward, done, _ = env.step(action)
            state_ = (state_ - env.observation_space.low) / (
                env.observation_space.high - env.observation_space.low)
            env.render()
            state = to_tensor(state_)
            eposide_reward += reward
        eposide_list.append(eposide_reward)
        eposide += 1
        done = False
        print("{} : {}".format(eposide, eposide_reward))

    import matplotlib.pyplot as plt
    x = np.arange(100)
    y = np.array(eposide_list)
    plt.plot(x, y)
    plt.savefig("./test_eposide_reward.png")
    env.close()
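# `to_tensor` is used above but not shown in the snippet; it is presumably a
# small helper that turns a numpy observation into a float tensor on the same
# device as the actor. The definition below is only a sketch under that
# assumption, not the original helper.
import numpy as np
import torch


def to_tensor(ndarray, device='cuda'):
    # Convert a numpy array to a float32 torch tensor on the given device.
    return torch.from_numpy(np.asarray(ndarray, dtype=np.float32)).to(device)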
            action = trainer.select_action(observation)
            observation2, reward, done, info = env.step(action)
            observation2 = deepcopy(observation2)
            if step >= MAX_STEP_PER_EPISODE - 1:
                done = True

            # The trainer stores the transition and updates all of its networks
            trainer.observe(reward, observation2, done)
            trainer.update_all()

            episode_reward += reward
            observation = deepcopy(observation2)

        print('Training Episode {}, Episode Reward is:{}'.format(
            episode, episode_reward))

        if episode % EVALUATING_EPISODE_INTERVAL == 0:
            policy = lambda x: trainer.select_action(x, decay_epsilon=False)
            evaluator(env, policy, debug=True,
                      visualize=OPEN_VISUALIZATION_EVA, save=True)


''' Entrance of Main Program '''
env = NormalizedEnv(gym.make(ENVIRONMENT))
nb_states = env.observation_space.shape[0]
nb_actions = env.action_space.shape[0]

ddpg_trainer = DDPG_trainer(nb_states, nb_actions)
train(ddpg_trainer, env)
parser.add_argument('--log_interval', default=50, type=int)
# parser.add_argument('--load', default=False, type=bool)  # load model
parser.add_argument('--render_interval', default=100, type=int)  # after render_interval, env.render() will work
parser.add_argument('--exploration_noise', default=0.1, type=float)
parser.add_argument('--max_episode', default=10000, type=int)  # num of games
parser.add_argument('--num_episode', default=0, type=int)
parser.add_argument('--print_log', default=5, type=int)
parser.add_argument('--update_iteration', default=200, type=int)
args = parser.parse_args()

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print('env:', args.env_name)
print('seed:', args.random_seed)

script_name = os.path.basename(__file__)
eps = np.finfo(np.float32).eps
env = NormalizedEnv(gym.make(args.env_name))

if args.seed:
    env.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    np.random.seed(args.random_seed)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
min_Val = torch.tensor(1e-7).float().to(device)  # min value
directory = './exp' + script_name + 'Seed' + str(args.random_seed) + args.env_name + './'


def normal_R_V(R_, current_Q, reward):
    R_ = np.array(R_)
                    type=int, help='linear decay of exploration policy')
parser.add_argument('--seed', default=-1, type=int, help='')
parser.add_argument('--resume', default='default', type=str,
                    help='Resuming model path for testing')
# parser.add_argument('--l2norm', default=0.01, type=float, help='l2 weight decay')  # TODO
# parser.add_argument('--cuda', dest='cuda', action='store_true')  # TODO

args = parser.parse_args()
args.output = get_output_folder(args.output, args.env)
if args.resume == 'default':
    args.resume = 'output/{}-run0'.format(args.env)

env = NormalizedEnv(gym.make(args.env))
if args.seed > 0:
    np.random.seed(args.seed)
    env.seed(args.seed)

nb_states = env.observation_space.shape[0]
nb_actions = env.action_space.shape[0]

agent = DDPG(nb_states, nb_actions, args)
evaluate = Evaluator(args.validate_episodes, args.validate_steps, args.output,
                     max_episode_length=args.max_episode_length)

if args.mode == 'train':
parser.add_argument('--l2norm', default=0.01, type=float, help='l2 weight decay')  # TODO
args = parser.parse_args()

# Concatenate args.output with args.env
if args.resume is None:
    args.output = get_output_folder(args.output, args.env)
else:
    args.output = args.resume

if args.env == "KukaGym":
    env = KukaGymEnv(renders=False, isDiscrete=True)
elif args.discrete:
    env = gym.make(args.env)
    env = env.unwrapped
else:
    env = NormalizedEnv(gym.make(args.env))

# Set the random seed
if args.seed > 0:
    np.random.seed(args.seed)
    env.seed(args.seed)

# State and action dimensions
print(env.observation_space.shape, env.action_space.shape)
nb_states = env.observation_space.shape[0]
if args.discrete:
    nb_actions = env.action_space.n
else:
    nb_actions = env.action_space.shape[0]

env = fastenv(env, args.action_repeat, args.vis)
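# `fastenv` is not defined in the snippet above. Judging by its arguments it is
# presumably a frame-skip / action-repeat wrapper with optional rendering; the
# class below is only a minimal sketch under that assumption, not the original
# implementation.
class fastenv:
    def __init__(self, env, action_repeat, vis):
        self.env = env
        self.action_repeat = action_repeat
        self.vis = vis  # whether to render each step

    def reset(self):
        return self.env.reset()

    def step(self, action):
        # Repeat the same action `action_repeat` times and accumulate the reward.
        total_reward = 0.0
        for _ in range(self.action_repeat):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if self.vis:
                self.env.render()
            if done:
                break
        return obs, total_reward, done, info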
stats["Surrogate loss"] = surrafter summary = tf.Summary() for k, v in stats.iteritems(): print(k + ": " + " " * (40 - len(k)) + str(v)) if k != "Time elapsed": summary.value.add(tag=k, simple_value=float(v)) # save stats self.summary_writer.add_summary(summary, i) self.summary_writer.flush() if entropy != entropy: exit(-1) """ if exp > 0.8: self.train = False """ i += 1 if __name__ == '__main__': logging.getLogger().setLevel(logging.DEBUG) args = parser.parse_args() random.seed(args.seed) np.random.seed(args.seed) tf.set_random_seed(args.seed) env = gym.make(args.env_id) env = NormalizedEnv(env, normalize_obs=True) agent = TRPOAgent(env, args) agent.learn()
[path["rewards"].sum() for path in paths]) print "\n********** Iteration %i ************" % i if episoderewards.mean() >= self.env._spec.reward_threshold: print "Solved Env" self.solved = True stats = {} numeptotal += len(episoderewards) stats["Total number of episodes"] = numeptotal stats["Average sum of rewards per episode"] = episoderewards.mean() for k, v in stats.iteritems(): print(k + ": " + " " * (40 - len(k)) + str(v)) i += 1 if __name__ == '__main__': args = parser.parse_args() random.seed(args.seed) np.random.seed(args.seed) tf.set_random_seed(args.seed) env = gym.make(args.env_id) if args.use_pixels: env = JacoCombiEnv(env, is_rgb=True, is_depth=True) else: env = NormalizedEnv(env) agent = AsyncNGAgent(env, args) agent.deploy()
from util import get_output_folder, setup_logger
from wolp_agent import WolpertingerAgent

args.save_model_dir = get_output_folder('../output', args.env)

env = gym.make(args.env)
continuous = None
try:
    # continuous action space
    nb_states = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]
    action_high = env.action_space.high
    action_low = env.action_space.low
    continuous = True
    env = NormalizedEnv(env)
except IndexError:
    # discrete action space with one dimension
    nb_states = env.observation_space.shape[0]
    nb_actions = 1  # the dimension of the actions; usually 1, depending on the environment
    max_actions = env.action_space.n
    continuous = False

if args.seed > 0:
    np.random.seed(args.seed)
    env.seed(args.seed)

if continuous:
    agent_args = {
        'continuous': continuous,
        'max_actions': None,
args.output = get_output_folder(args.output, args.env)
if args.debug:
    print('Writing to {}'.format(args.output))

writer = SummaryWriter(args.output)
with open(os.path.join(args.output, 'cmdline.txt'), 'a') as f:
    f.write(' '.join(sys.argv) + '\n')

bullet = ("Bullet" in args.env)
if bullet:
    import pybullet
    import pybullet_envs

env = NormalizedEnv(gym.make(args.env))

# Set the random seed
if args.seed > 0:
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    env.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

# State and action dimensions
print('observation_space', env.observation_space.shape,
      'action_space', env.action_space.shape)
nb_status = env.observation_space.shape[0]
nb_actions = env.action_space.shape[0]
def main(args):
    env = make_env('simple_tag')
    env = NormalizedEnv(env)

    kwargs = dict()
    kwargs['config'] = args
    predator_model = Predators(16, 2, num_agent=3, **kwargs)
    preyer_model = Preyer(14, 2, **kwargs)

    if args.tensorboard:
        writer = SummaryWriter(log_dir='runs/' + args.log_dir)

    episode = 0
    total_step = 0
    while episode < args.max_episodes:
        state = env.reset()
        episode += 1
        step = 0
        predator_accum_reward = []
        preyer_accum_reward = 0
        while True:
            state_predator, state_prayer = split_obs(state)

            predator_model.prep_eval()
            action_predator = predator_model.choose_action(state_predator)
            action_prayer = preyer_model.random_action()
            # action_prayer = preyer_model.choose_action(state_prayer)
            action = merge_action(action_predator, action_prayer)

            next_state, reward, done, info = env.step(action)
            step += 1
            total_step += 1
            predator_accum_reward.append(np.mean(reward[:3]))
            preyer_accum_reward = reward[3]

            if step > args.episode_length:
                done = [True, True, True, True]
            if args.render and (episode % 10 == 1):
                env.render(mode='rgb_array')

            predator_model.memory(state[:3], action[:3], reward[:3],
                                  next_state[:3], done[:3])
            # preyer_model.memory(state[3], action[3], reward[3], next_state[3], done[3])

            if (len(predator_model.replay_buffer) >= args.batch_size
                    and total_step % args.steps_per_update == 0):
                predator_model.prep_train()
                predator_model.train()
                # preyer_model.train()

            if True in done:
                predator_c_loss, predator_a_loss = predator_model.getLoss()
                preyer_c_loss, preyer_a_loss = preyer_model.getLoss()
                print("[Episode %05d] reward_predator %3.1f reward_preyer %3.1f "
                      "predator_c_loss %3.1f predator_a_loss %3.1f "
                      "preyer_c_loss %3.1f preyer_a_loss %3.1f" %
                      (episode, np.mean(predator_accum_reward).item(),
                       preyer_accum_reward, predator_c_loss, predator_a_loss,
                       preyer_c_loss, preyer_a_loss))

                if args.tensorboard:
                    # writer.add_scalar(tag='debug/memory_length', global_step=episode, scalar_value=len(predator_model.replay_buffer))
                    # writer.add_scalar(tag='debug/predator_epsilon', global_step=episode, scalar_value=predator_model.epsilon)
                    # writer.add_scalar(tag='debug/preyer_epsilon', global_step=episode, scalar_value=preyer_model.epsilon)
                    writer.add_scalar(tag='agent/reward_predator',
                                      global_step=episode,
                                      scalar_value=np.mean(predator_accum_reward).item())
                    # writer.add_scalar(tag='perf/reward_preyer', global_step=episode, scalar_value=preyer_accum_reward)
                    if predator_c_loss and predator_a_loss:
                        writer.add_scalars('agent/predator_loss',
                                           global_step=episode,
                                           tag_scalar_dict={
                                               'actor': -predator_a_loss,
                                               'critic': predator_c_loss
                                           })
                    # writer.add_scalar(tag='loss/preyer_c_loss', global_step=episode, scalar_value=preyer_c_loss)
                    # writer.add_scalar(tag='loss/preyer_a_loss', global_step=episode, scalar_value=preyer_a_loss)

                predator_model.reset()
                preyer_model.reset()
                break

            state = next_state

    if args.tensorboard:
        writer.close()
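# `split_obs` and `merge_action` are not defined in the snippet above. Given
# that this `simple_tag` setup has three predators followed by one prey (the
# code slices rewards as reward[:3] and reward[3]), they are presumably thin
# helpers that split the joint observation list and reassemble the joint
# action; the definitions below are only a sketch under that assumption.
import numpy as np


def split_obs(obs):
    # First three entries belong to the predators, the last one to the prey.
    return np.array(obs[:3]), np.array(obs[3])


def merge_action(action_predator, action_prayer):
    # Rebuild a joint action list in the same agent order as the observations.
    return list(action_predator) + [action_prayer]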