def __init__(self, env, train_flag=True, model_path=None,
             actor_learning_rate=0.001, critic_learning_rate=0.002,
             num_episodes=1000, tau=0.005, gamma=0.99,
             memory_size=100000, batch_size=64):
    """DDPG agent: builds actor/critic (+targets) for training, or loads a
    saved actor for inference.

    Args:
        env: gym-style continuous-control environment.
        train_flag: True to build trainable networks; False to load `model_path`.
        model_path: path to a saved actor model (required when train_flag=False).
        actor_learning_rate / critic_learning_rate: Adam step sizes.
        num_episodes, tau, gamma, memory_size, batch_size: training hyper-parameters.
    """
    self.env = env
    self.train_flag = train_flag
    self.num_episodes = num_episodes
    self.tau = tau
    self.gamma = gamma
    self.batch_size = batch_size
    self.model_path = model_path
    self.actor_opt = Adam(lr=actor_learning_rate)
    self.critic_opt = Adam(lr=critic_learning_rate)
    # Sizes probed from the environment's spaces.
    self.states_shape = self.env.observation_space.shape[0]
    self.action_shape = self.env.action_space.shape[0]
    self.n_actions = self.env.action_space.shape[0]
    self.actions_lower_bound = env.action_space.low
    self.actions_upper_bound = env.action_space.high
    if self.train_flag:
        # Ornstein-Uhlenbeck exploration noise, one dimension per action.
        self.noise = OUActionNoise(mean=np.zeros(self.n_actions),
                                   std_deviation=0.2 * np.ones(self.n_actions))
        self.actor = get_actor(self.states_shape, self.n_actions,
                               self.actions_upper_bound)
        self.actor_target = get_actor(self.states_shape, self.n_actions,
                                      self.actions_upper_bound)
        self.critic = get_critic(self.states_shape, self.action_shape)
        self.critic_target = get_critic(self.states_shape, self.action_shape)
        # Targets start as exact copies of the online networks.
        self.actor_target.set_weights(self.actor.get_weights())
        self.critic_target.set_weights(self.critic.get_weights())
        self.memory = Memory(memory_size)
    else:
        # FIX: fail fast with a clear message instead of letting
        # load_model(None) raise an opaque error deep inside Keras.
        if self.model_path is None:
            raise ValueError("Please pass the path of a trained model!")
        self.actor = load_model(self.model_path)
def train(args, env, policy_net, value_net, running_state):
    """TRPO training loop (runs until externally interrupted).

    Each outer iteration gathers at least ``args.batch_size`` environment
    steps into a fresh Memory, then performs one TRPO parameter update and
    optionally logs progress.

    Args:
        args: namespace with batch_size, render, log_interval, ...
        env: gym-style environment (reset/step/render).
        policy_net, value_net: networks updated by trpo.update_params.
        running_state: online observation normalizer (ZFilter-style callable).
    """
    for i_episode in count(1):
        memory = Memory()
        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            state = env.reset()
            state = running_state(state)
            reward_sum = 0
            for t in range(10000):  # Don't infinite loop while learning
                action = select_action(state, policy_net)
                action = action.data[0].numpy()
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward
                next_state = running_state(next_state)
                mask = 1
                if done:
                    mask = 0
                memory.push(state, np.array([action]), mask, next_state, reward)
                if args.render:
                    env.render()
                if done:
                    break
                state = next_state
            # FIX: the episode pushed t + 1 transitions (t is the index of the
            # last executed step); the original `num_steps += (t - 1)` under-
            # counted by 2 and could even add -1 for a one-step episode.
            num_steps += (t + 1)
            num_episodes += 1
            reward_batch += reward_sum
        reward_batch /= num_episodes
        batch = memory.sample()

        #########################
        # TRPO update parameters
        #########################
        update_trpo = trpo.update_params(batch, value_net, policy_net, args,
                                         trpo_functions)
        update_trpo.execute()

        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {}\tAverage reward {:.2f}'.format(
                i_episode, reward_sum, reward_batch))
def __init__(self, n_bits, lr, memory_size, batch_size, gamma):
    """DQN agent for the n-bit flipping task (state = current bits + goal bits).

    Builds an online network, a frozen target copy, an optimizer, and an
    epsilon-greedy exploration schedule. Runs on CPU.
    """
    # Problem size and learning hyper-parameters.
    self.n_bits, self.lr, self.gamma = n_bits, lr, gamma
    self.batch_size, self.memory_size = batch_size, memory_size
    self.memory = Memory(self.memory_size)

    self.device = "cpu"

    # The network consumes the concatenation of state and goal: 2 * n_bits.
    self.model = DQN(n_inputs=2 * self.n_bits, n_outputs=n_bits).to(self.device)
    self.target_model = DQN(n_inputs=2 * self.n_bits, n_outputs=n_bits).to(self.device)
    # Target starts as an exact copy and is kept out of training mode.
    self.target_model.load_state_dict(self.model.state_dict())
    self.target_model.eval()

    self.opt = Adam(self.model.parameters(), lr=self.lr)
    self.loss_fn = MSELoss()

    # Epsilon-greedy exploration schedule.
    self.epsilon, self.epsilon_decay = 1.0, 0.001
def __init__(self, env,
             train_flag=True,
             model_path=None,
             memory_size=512,
             num_checkpoints=5,
             num_episodes=1000,
             batch_size=64,
             learning_rate=0.01,
             gamma=1.0,
             exploration_phase=0.4,
             train_start_episode=100,
             eps_start=1.0,
             eps_min=0.05,
             eps_decay=0.999,
             target_model_update_interval=20):
    """DQN agent for a discrete-action gym environment.

    Builds online/target networks and a replay memory when training, or loads
    a saved model for inference.

    Note:
        The ``eps_decay`` argument is kept only for backward compatibility;
        it is ignored and replaced by a linear schedule computed below.
    """
    self.env = env
    self.train_flag = train_flag
    self.model_path = model_path
    self.input_shape = self.env.observation_space.shape[0]
    self.num_actions = self.env.action_space.n
    self.num_episodes = num_episodes
    self.num_checkpoints = num_checkpoints
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.memory_size = memory_size
    self.gamma = gamma
    self.train_start_episode = train_start_episode
    self.eps = eps_start
    self.eps_min = eps_min
    # Linear epsilon schedule: reach eps_min after `exploration_phase` of the
    # post-warmup episodes.
    self.eps_decay = (eps_start - eps_min) / (
        (num_episodes - train_start_episode) * exploration_phase)
    self.target_model_update_interval = target_model_update_interval
    if self.train_flag:
        self.model = get_model(input_shape=self.input_shape,
                               num_actions=self.num_actions,
                               learning_rate=self.learning_rate)
        self.target_model = get_model(input_shape=self.input_shape,
                                      num_actions=self.num_actions,
                                      learning_rate=self.learning_rate)
        self.memory = Memory(self.memory_size)
    else:
        # FIX: identity comparison with None per PEP 8 (`is not None`,
        # was `!= None`).
        assert model_path is not None, "Please pass the path of a trained model!"
        self.model = load_model(model_path)
        print('Model Loaded!!')
def __init__(self, env_id, dim_latent, render=False, num_process=1,
             memory_size=1000000, lr_p=1e-3, lr_v=1e-3, gamma=0.99,
             polyak=0.995, action_noise=0.1, target_action_noise_std=0.2,
             target_action_noise_clip=0.5, explore_size=10000,
             step_per_iter=3000, batch_size=100, min_update_step=1000,
             update_step=50, policy_update_delay=2, seed=1, model_path=None):
    """TD3-style agent over a latent state space; stores hyper-parameters and
    delegates network construction to ``_init_model``."""
    # Environment / run configuration.
    self.env_id = env_id
    self.dim_latent = dim_latent
    self.render = render
    self.num_process = num_process
    self.seed = seed
    self.model_path = model_path

    # Discounting and target smoothing.
    self.gamma = gamma
    self.polyak = polyak

    # Exploration and target-policy smoothing noise.
    self.action_noise = action_noise
    self.target_action_noise_std = target_action_noise_std
    self.target_action_noise_clip = target_action_noise_clip

    # Replay buffer and sampling schedule.
    self.memory = Memory(memory_size)
    self.explore_size = explore_size
    self.step_per_iter = step_per_iter
    self.batch_size = batch_size
    self.min_update_step = min_update_step
    self.update_step = update_step
    self.policy_update_delay = policy_update_delay

    # Optimizer step sizes.
    self.lr_p = lr_p
    self.lr_v = lr_v

    self._init_model()
def __init__(self, env_name, n_states, n_actions, memory_size, batch_size,
             gamma, alpha, lr, action_bounds, reward_scale):
    """Soft Actor-Critic agent: policy, twin Q-networks, value network with a
    frozen target copy, and one Adam optimizer per network."""
    # Task description.
    self.env_name = env_name
    self.n_states = n_states
    self.n_actions = n_actions
    self.action_bounds = action_bounds

    # Training hyper-parameters.
    self.memory_size = memory_size
    self.batch_size = batch_size
    self.gamma = gamma
    self.alpha = alpha
    self.lr = lr
    self.reward_scale = reward_scale

    self.memory = Memory(memory_size=self.memory_size)
    self.device = "cuda" if torch.cuda.is_available() else "cpu"

    # Networks: stochastic policy, two Q critics, and a state-value network.
    self.policy_network = PolicyNetwork(
        n_states=self.n_states,
        n_actions=self.n_actions,
        action_bounds=self.action_bounds).to(self.device)
    self.q_value_network1 = QvalueNetwork(
        n_states=self.n_states, n_actions=self.n_actions).to(self.device)
    self.q_value_network2 = QvalueNetwork(
        n_states=self.n_states, n_actions=self.n_actions).to(self.device)
    self.value_network = ValueNetwork(n_states=self.n_states).to(self.device)

    # Target value network starts as a hard copy and is kept in eval mode.
    self.value_target_network = ValueNetwork(n_states=self.n_states).to(self.device)
    self.value_target_network.load_state_dict(self.value_network.state_dict())
    self.value_target_network.eval()

    # Losses and per-network optimizers.
    self.value_loss = torch.nn.MSELoss()
    self.q_value_loss = torch.nn.MSELoss()
    self.value_opt = Adam(self.value_network.parameters(), lr=self.lr)
    self.q_value1_opt = Adam(self.q_value_network1.parameters(), lr=self.lr)
    self.q_value2_opt = Adam(self.q_value_network2.parameters(), lr=self.lr)
    self.policy_opt = Adam(self.policy_network.parameters(), lr=self.lr)
def __init__(self, parameters):
    """DDPG wrapper: environment, replay memory, and actor-critic graphs.

    Args:
        parameters: dict of hyper-parameters ('env', 'replay_size',
            'batch_size', learning rates, 'gamma', 'tau', ...).
    """
    self.parameters = parameters
    self.env = gym.make(self.parameters['env'])

    # Probe the live environment for action/state dimensionality.
    self.nA = self.env.action_space.sample().shape[0]
    self.state_size = self.env.reset().shape[0]

    # Replay buffer sized and configured from the hyper-parameter dict.
    self.memory = Memory(replay_size=self.parameters['replay_size'],
                         action_size=self.nA,
                         state_size=self.state_size,
                         batch_size=self.parameters['batch_size'])

    # Actor and critic networks with soft-update coefficient tau.
    self.actor_critic = ActorCritic(
        actor_lr=parameters['actor_learning_rate'],
        critic_lr=parameters['critic_learning_rate'],
        gamma=parameters['gamma'],
        state_size=self.state_size,
        action_size=self.nA,
        tau=parameters['tau'])
def experiment_2():
    """Build the MountainCar LSPI setup and return (agent, env, memory)."""
    env = gym.make('MountainCar-v0')
    _initial_obs = env.reset()  # prime the environment before use

    # Discrete action set encoded as a single action dimension.
    action_dim = 1
    num_actions = env.action_space.n
    obs_dim = env.observation_space.shape[0]

    memory = Memory(MEMORY_SIZE, BATCH_SIZE, action_dim, obs_dim)
    agent = LSPI(num_actions, obs_dim)
    return agent, env, memory
def collect_samples(policy_net, min_batch_size):
    """Roll out ``policy_net`` until at least ``min_batch_size`` steps are
    collected; return the sampled batch and the mean episode reward.

    Relies on module-level ``env`` and ``render``.
    """
    memory = Memory()
    num_steps = 0
    reward_batch = 0
    num_episodes = 0
    while num_steps < min_batch_size:
        state = env.reset()
        reward_sum = 0
        for t in range(10000):  # episode step cap
            action = select_action(policy_net, state)
            action = action.data[0].numpy()
            next_state, reward, done, _ = env.step(action)
            reward_sum += reward
            mask = 0 if done else 1
            memory.push(state, np.array([action]), mask, next_state, reward)
            if render:
                env.render()
            if done:
                break
            state = next_state
        # FIX: the episode pushed t + 1 transitions (t is the last executed
        # step index); the original `num_steps += (t - 1)` under-counted by 2.
        num_steps += (t + 1)
        num_episodes += 1
        reward_batch += reward_sum
        # FIX: removed leftover debug `print(num_episodes)`.
    reward_batch = reward_batch / num_episodes
    batch = memory.sample()
    return batch, reward_batch
def experiment_1():
    """Build the Acrobot LSPI setup and return (agent, env, memory)."""
    import gym

    env = gym.make('Acrobot-v0')
    _initial_obs = env.reset()  # prime the environment before use

    # Discrete action set encoded as a single action dimension.
    num_actions = env.action_space.n
    obs_dim = env.observation_space.shape[0]
    action_dim = 1

    memory = Memory(MEMORY_SIZE, BATCH_SIZE, action_dim, obs_dim)
    print(num_actions, obs_dim)
    agent = LSPI(num_actions, obs_dim)
    return agent, env, memory
def cartpole_experiment(basis_opt="gaussian", basis_function_dim=5):
    """Build the CartPole LSPI setup.

    Returns:
        (agent, env, memory, env_name) tuple for the driver script.
    """
    print('Hello CartPole world!')
    env_name = 'CartPole-v0'
    env = gym.make(env_name)

    # Problem dimensions probed from the environment.
    num_actions = env.action_space.n
    state_dim = env.observation_space.shape[0]
    action_dim = 1

    print("state_dim : ", state_dim)
    print("action_dim : ", action_dim)
    print("num_actions : ", num_actions)
    print("basis_function_option : ", basis_opt)
    print("basis_function_dim : ", basis_function_dim)

    memory = Memory(MEMORY_SIZE, action_dim, state_dim)
    agent = LSPI(num_actions, state_dim, basis_function_dim,
                 gamma=0.99, opt=basis_opt)
    return agent, env, memory, env_name
def collect_samples(pid, queue, env, policy, encoder, render, running_state,
                    custom_reward, min_batch_size):
    """Worker that rolls out `policy` on encoder-compressed observations until
    `min_batch_size` steps are collected.

    Results (pid, memory, log) are put on `queue` when running as a spawned
    worker, or returned directly when `queue` is None (main process).
    """
    torch.set_num_threads(1)
    if pid > 0:
        # De-correlate RNG streams across worker processes.
        torch.manual_seed(torch.randint(0, 5000, (1, )) * pid)
        if hasattr(env, 'np_random'):
            env.np_random.seed(env.np_random.randint(5000) * pid)
        if hasattr(env, 'env') and hasattr(env.env, 'np_random'):
            env.env.np_random.seed(env.env.np_random.randint(5000) * pid)

    log = dict()
    memory = Memory()
    num_steps = 0
    num_episodes = 0
    min_episode_reward = float('inf')
    max_episode_reward = float('-inf')
    total_reward = 0

    while num_steps < min_batch_size:
        state = env.reset()
        episode_reward = 0
        if running_state:
            state = running_state(state)
        for t in range(10000):  # episode step cap
            if render:
                env.render()
            # Compress the raw observation through the encoder; no gradients
            # are needed during sampling.
            enco_state = FLOAT(state).unsqueeze(0)  # .to(device)
            with torch.no_grad():
                enco_state = encoder.sample_prediction(enco_state)
            enco_state = enco_state.cpu().numpy()[0]
            state_tensor = FLOAT(enco_state).unsqueeze(0)
            with torch.no_grad():
                action, log_prob = policy.get_action_log_prob(state_tensor)
            action = action.cpu().numpy()[0]
            log_prob = log_prob.cpu().numpy()[0]
            next_state, reward, done, _ = env.step(action)
            if custom_reward:
                # Replace the env reward entirely (e.g. a learned discriminator).
                reward = custom_reward(state, action)
            episode_reward += reward
            if running_state:
                next_state = running_state(next_state)
            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
            # NOTE: the *raw* (un-encoded) state is stored, not enco_state.
            memory.push(state, action, reward, next_state, mask, log_prob)
            num_steps += 1
            if done or num_steps >= min_batch_size:
                break
            state = next_state
        # num_steps += (t + 1)  -- superseded by the per-step increment above
        num_episodes += 1
        total_reward += episode_reward
        min_episode_reward = min(episode_reward, min_episode_reward)
        max_episode_reward = max(episode_reward, max_episode_reward)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_episode_reward'] = max_episode_reward
    log['min_episode_reward'] = min_episode_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
# --- FRAGMENT: the enclosing definitions are not visible in this chunk. ---
# The first statements are the tail of a nested `get_kl()` closure (the new
# policy's mean1/log_std1/std1 are in scope from the missing lines above);
# `trpo_step` belongs to the enclosing update function; everything from
# `running_state` down is the top-level TRPO sampling loop, cut off mid-episode.
mean0 = Variable(mean1.data)        # detached copy: treat old policy as constant
log_std0 = Variable(log_std1.data)
std0 = Variable(std1.data)
# KL divergence between two diagonal Gaussians, summed over action dims.
kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (
    2.0 * std1.pow(2)) - 0.5
return kl.sum(1, keepdim=True)

# One trust-region policy update using the loss/KL closures defined above.
trpo_step(policy_net, get_loss, get_kl, args.max_kl, args.damping)

# Online normalizers: observations are whitened and clipped to +/-5,
# rewards are scaled (not de-meaned) and clipped to +/-10.
running_state = ZFilter((num_inputs, ), clip=5)
running_reward = ZFilter((1, ), demean=False, clip=10)

for i_episode in count(1):
    memory = Memory()
    num_steps = 0
    reward_batch = 0
    num_episodes = 0
    while num_steps < args.batch_size:
        state = env.reset()
        state = running_state(state)
        reward_sum = 0
        for t in range(10000):  # Don't infinite loop while learning
            action = select_action(state)
            action = action.data[0].numpy()
            next_state, reward, done, _ = env.step(action)
            reward_sum += reward
            # (fragment ends here, mid-episode)
class DDPG():
    """Deep Deterministic Policy Gradient agent (TensorFlow 1.x).

    Owns a gym environment, a replay memory, and an ActorCritic graph;
    `train` runs the full training loop, `test` replays the latest
    checkpoint greedily.
    """

    def __init__(self, parameters):
        # parameters: dict of hyper-parameters ('env', 'replay_size',
        # 'batch_size', learning rates, 'gamma', 'tau', 'sigma', ...).
        self.parameters = parameters
        self.env = gym.make(self.parameters['env'])
        # Action/state sizes probed from the live environment.
        self.nA = self.env.action_space.sample().shape[0]
        self.state_size = self.env.reset().shape[0]

        # Build our replay memory
        self.memory = Memory(replay_size=self.parameters['replay_size'],
                             action_size=self.nA,
                             state_size=self.state_size,
                             batch_size=self.parameters['batch_size'])

        # Create actor and critic
        self.actor_critic = ActorCritic(
            actor_lr=parameters['actor_learning_rate'],
            critic_lr=parameters['critic_learning_rate'],
            gamma=parameters['gamma'],
            state_size=self.state_size,
            action_size=self.nA,
            tau=parameters['tau'])

    def train(self):
        """Training loop: OU-noise exploration rollouts, replay sampling,
        actor-critic updates, TensorBoard summaries and checkpointing."""
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True

        # Create global step and increment operation
        global_step_tensor = tf.Variable(0, trainable=False,
                                         name='global_step')
        increment_global_step = tf.assign_add(global_step_tensor, 1)

        # Create model saver
        saver = tf.train.Saver()

        sess = tf.Session(config=config)
        if not self.parameters['restore']:
            sess.run(tf.global_variables_initializer())
        else:
            saver.restore(sess, tf.train.latest_checkpoint('./saves'))
        # Copy online weights into the target networks.
        self.actor_critic.set_moving_to_target(sess)

        run_id = np.random.randint(10000)
        trainwriter = tf.summary.FileWriter(logdir='./logs/' + str(run_id),
                                            graph=sess.graph)

        # Get action noise
        action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.nA),
            sigma=float(self.parameters['sigma']) * np.ones(self.nA))

        # Fill Replay Memory with random-action transitions before learning.
        # NOTE(review): indentation reconstructed — transitions appear to be
        # stored only on non-terminal steps; confirm against the original.
        state = self.env.reset()
        fill_amount = 0
        while fill_amount < self.parameters['replay_init_size']:
            action = self.env.action_space.sample()
            next_state, reward, done, _ = self.env.step(action)
            if done:
                state = self.env.reset()
            else:
                fill_amount += 1
                self.memory.add(state, action, reward, done, next_state)
                state = next_state

        # Main Loop
        steps = 0  # only referenced by the commented-out action summaries below
        for i in range(self.parameters['num_epochs']):
            avg_epoch_rewards = 0
            num_epochs = 1  # running count for the incremental average below
            for e in range(self.parameters['num_episodes']):
                state = self.env.reset()
                ep_reward = 0
                # Perform rollout
                while True:
                    noise = action_noise()
                    action = self.actor_critic.pi(sess, state[None, ...])
                    action += noise
                    action = np.clip(action, self.env.action_space.low[0],
                                     self.env.action_space.high[0])
                    assert action.shape == self.env.action_space.shape
                    """
                    # UNCOMMENT TO PRINT ACTIONS
                    a0 = tf.Summary(value=[tf.Summary.Value(tag="action_0", simple_value=action[0,0])])
                    trainwriter.add_summary(a0,steps)
                    a1 = tf.Summary(value=[tf.Summary.Value(tag="action_1", simple_value=action[0,1])])
                    trainwriter.add_summary(a1,steps)
                    a2 = tf.Summary(value=[tf.Summary.Value(tag="action_2", simple_value=action[0,2])])
                    trainwriter.add_summary(a2,steps)
                    steps += 1
                    """
                    next_state, reward, done, _ = self.env.step(action)
                    self.memory.add(state, action, reward, done, next_state)
                    if self.parameters['render_train']:
                        self.env.render()
                    ep_reward += reward
                    if done:
                        # Log the finished episode's return and reset the noise.
                        reward_summary = tf.Summary(value=[
                            tf.Summary.Value(tag="ep_rewards",
                                             simple_value=ep_reward)
                        ])
                        trainwriter.add_summary(
                            reward_summary,
                            i * self.parameters['num_episodes'] + e)
                        action_noise.reset()
                        break
                    state = next_state
                # Incremental mean of episode returns within this epoch.
                avg_epoch_rewards = avg_epoch_rewards + (
                    ep_reward - avg_epoch_rewards) / num_epochs
                num_epochs += 1
                # Perform train
                for t in range(self.parameters['num_train_steps']):
                    s_state, s_action, s_reward, s_next_state, s_terminal = self.memory.sample(
                    )
                    # Train actor critic model
                    self.actor_critic.update(sess=sess,
                                             filewriter=trainwriter,
                                             state_batch=s_state,
                                             next_state_batch=s_next_state,
                                             action_batch=s_action,
                                             reward_batch=s_reward,
                                             done_batch=s_terminal)
                    sess.run(increment_global_step)
            # Print out epoch stats here
            table_data = [['Epoch', 'Average Reward'],
                          [
                              str(i) + "/" + str(self.parameters['num_epochs']),
                              str(avg_epoch_rewards)
                          ]]
            table = AsciiTable(table_data, "Training Run: " + str(run_id))
            save_path = saver.save(sess, "./saves/model.ckpt")
            os.system('clear')
            print("Model saved in path: %s" % save_path + "\n" + table.table)

    def test(self):
        """Run the latest checkpoint greedily (no exploration noise), forever."""
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True
        saver = tf.train.Saver()
        sess = tf.Session(config=config)
        saver.restore(sess, tf.train.latest_checkpoint('./saves'))
        while True:
            state = self.env.reset()
            # Perform rollout
            while True:
                action = self.actor_critic.pi(sess, state[None, ...])
                action = np.clip(action, self.env.action_space.low[0],
                                 self.env.action_space.high[0])
                assert action.shape == self.env.action_space.shape
                next_state, reward, done, _ = self.env.step(action)
                self.env.render()
                if done:
                    break
                state = next_state
# --- FRAGMENT (Python 2 syntax): enclosing definitions are not visible. ---
# The first two statements advance cursors from a loop cut off above; the
# rest is the top-level GAIL grid-world training script, cut off mid-batch.
        cur_id += cur_batch_size
        cur_id_exp += cur_batch_size_exp

#running_state = ZFilter((num_inputs,), clip=5)
#running_reward = ZFilter((1,), demean=False, clip=10)
episode_lengths = []

optim_epochs = args.optim_epochs
optim_batch_size = args.optim_batch_size

# Load expert demonstrations used by the discriminator.
expert = Expert(args.expert_path, num_inputs)
print 'Loading expert trajectories ...'
expert.push()
print 'Expert trajectories loaded.'

for i_episode in count(1):
    memory = Memory()
    num_steps = 0
    reward_batch = 0
    true_reward_batch = 0
    num_episodes = 0
    while num_steps < args.batch_size:
        c = expert.sample_c()  # read c sequence from expert trajectories
        #if np.argmax(c[0,:]) == 1: # left half
        #    set_diff = list(set(product(tuple(range(0, (width/2)-3)), tuple(range(1, height)))) - set(obstacles))
        #elif np.argmax(c[0,:]) == 3: # right half
        #    set_diff = list(set(product(tuple(range(width/2, width-2)), tuple(range(2, height)))) - set(obstacles))
        start_loc = sample_start(set_diff)
        s = State(start_loc, obstacles)
        #state = running_state(state)
        # (fragment ends here, mid-batch)
# --- FRAGMENT: top-level hyper-parameter sweep script, cut off mid-episode. ---
# Runs 5 repetitions with freshly seeded env/nets and re-initialized
# normalizers, collecting per-episode rewards.
reward_list = []
batchsize_list = []
for i_para in range(5):
    episode_list = []
    reward_tmp = []
    # Re-seed so every repetition starts from identical randomness.
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    policy_net = Policy(num_inputs, num_actions)
    value_net = Value(num_inputs)
    batchsize_list.append(args.batch_size)
    # Fresh observation/reward normalizers per repetition.
    running_state = ZFilter((num_inputs, ), clip=5)
    running_reward = ZFilter((1, ), demean=False, clip=10)
    for i_episode in tqdm(range(200)):
        memory = Memory()
        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            state = env.reset()
            state = running_state(state)
            reward_sum = 0
            for t in range(10000):  # Don't infinite loop while learning
                action = select_action(state)
                action = action.data[0].numpy()
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward
                # (fragment ends here, mid-episode)
# --- FRAGMENT: top-level training loop with per-episode memories, cut off
# mid-step. The policy is recurrent (policy_net.reset() each episode) and
# each episode's transitions go into a fresh Memory inside ep_memory. ---
optim_epochs = 5
optim_percentage = 0.05

for i_episode in count(1):
    ep_memory = Memory_Ep()
    num_steps = 0
    reward_batch = 0
    num_episodes = 0
    while num_steps < args.batch_size:
        state = env.reset()
        state = running_state(state)
        policy_net.reset()  # clear recurrent state between episodes
        reward_sum = 0
        memory = Memory()
        for t in range(10000):  # Don't infinite loop while learning
            if args.use_joint_pol_val:
                action = select_action_actor_critic(state)
            else:
                action = select_action(state)
            action = action.data[0].numpy()
            next_state, reward, done, _ = env.step(action)
            reward_sum += reward
            next_state = running_state(next_state)
            mask = 1
            if done:
                mask = 0
            # (fragment ends here, mid-step)
class DDPG():
    """Deep Deterministic Policy Gradient trainer (TensorFlow 1.x) with
    fixed-length rollouts and pickled learning curves.

    Fix vs. original: ``plots['actor_loss']`` was appending ``critic_loss``;
    it now records ``actor_loss``.
    """

    def __init__(self, parameters):
        self.parameters = parameters
        # 'env' may carry a suffix after '_' (run tag); only the part before
        # the first '_' is the gym id.
        self.env = gym.make(
            self.parameters['env'][:self.parameters['env'].find('_')])
        self.nA = self.env.action_space.sample().shape[0]
        self.state_size = self.env.reset().shape[0]

        # Build our replay memory
        self.memory = Memory(replay_size=self.parameters['replay_size'],
                             action_size=self.nA,
                             state_size=self.state_size,
                             batch_size=self.parameters['batch_size'])

        # Create actor and critic
        self.actor_critic = ActorCritic(
            actor_lr=parameters['actor_learning_rate'],
            critic_lr=parameters['critic_learning_rate'],
            gamma=parameters['gamma'],
            state_size=self.state_size,
            action_size=self.nA,
            tau=parameters['tau'])

    def train(self):
        """Training loop: 500-step OU-noise rollouts, replay updates,
        per-episode console stats, periodic checkpoints, pickled curves."""
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True

        # Create global step and increment operation
        global_step_tensor = tf.Variable(0, trainable=False,
                                         name='global_step')
        increment_global_step = tf.assign_add(global_step_tensor, 1)

        # Create model saver (keep every checkpoint)
        saver = tf.train.Saver(max_to_keep=None)

        sess = tf.Session(config=config)
        if not self.parameters['restore']:
            sess.run(tf.global_variables_initializer())
        else:
            saver.restore(sess, tf.train.latest_checkpoint('./saves'))
        self.actor_critic.set_moving_to_target(sess)

        run_id = np.random.randint(10000)
        trainwriter = tf.summary.FileWriter(logdir='./logs/' + str(run_id),
                                            graph=sess.graph)

        # Get action noise
        action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.nA),
            sigma=float(self.parameters['sigma']) * np.ones(self.nA))

        # Fill Replay Memory with random-action transitions before learning.
        state = self.env.reset()
        fill_amount = 0
        while fill_amount < self.parameters['replay_init_size']:
            action = self.env.action_space.sample()
            next_state, reward, done, _ = self.env.step(action)
            if done:
                state = self.env.reset()
            else:
                fill_amount += 1
                self.memory.add(state, action, reward, done, next_state)
                state = next_state

        # Main Loop
        plots = {'critic_loss': [], 'actor_loss': [], 'episode_reward': []}
        plots_dir = './plots/'
        weights_dir = './weights/'
        graph_dir = './graph/'
        if not os.path.exists(plots_dir):
            os.makedirs(plots_dir)
        if not os.path.exists(weights_dir):
            os.makedirs(weights_dir)
        if not os.path.exists(graph_dir):
            os.makedirs(graph_dir)
        saver.export_meta_graph(graph_dir + self.parameters['env'] +
                                '/graph.meta')

        # cumulative step counter
        cumu_step = 0
        for i in range(self.parameters['num_epochs']):
            avg_epoch_rewards = 0
            n_epochs = 1  # running count for the incremental average below
            for e in range(self.parameters['num_episodes']):
                state = self.env.reset()
                ep_reward = 0
                ep_n_action = 0
                # Perform rollout: fixed 500 steps; `done` is deliberately not
                # checked inside this loop (its final value is used below).
                for _ in range(500):
                    noise = action_noise()
                    action = self.actor_critic.pi(sess, state[None, ...])
                    action += noise
                    action = np.clip(action, self.env.action_space.low[0],
                                     self.env.action_space.high[0])
                    assert action.shape == self.env.action_space.shape

                    next_state, reward, done, _ = self.env.step(action)
                    self.memory.add(state, action, reward, done, next_state)
                    if self.parameters['render_train']:
                        self.env.render()
                    ep_reward += reward
                    ep_n_action += 1
                    cumu_step += 1
                    state = next_state

                # Perform train
                avg_critic_loss = 0.0
                avg_actor_loss = 0.0
                for t in range(self.parameters['num_train_steps']):
                    s_state, s_action, s_reward, s_next_state, s_terminal = self.memory.sample(
                    )
                    # Train actor critic model
                    _, _, critic_loss, actor_loss = self.actor_critic.update(
                        sess=sess,
                        filewriter=trainwriter,
                        state_batch=s_state,
                        next_state_batch=s_next_state,
                        action_batch=s_action,
                        reward_batch=s_reward,
                        done_batch=s_terminal)
                    avg_critic_loss += critic_loss
                    avg_actor_loss += actor_loss
                    sess.run(increment_global_step)
                avg_critic_loss /= self.parameters['num_train_steps']
                avg_actor_loss /= self.parameters['num_train_steps']

                if done:
                    # Log the episode return, reset the noise process, and end
                    # this epoch early (preserved from the original control flow).
                    reward_summary = tf.Summary(value=[
                        tf.Summary.Value(tag="ep_rewards",
                                         simple_value=ep_reward)
                    ])
                    trainwriter.add_summary(
                        reward_summary,
                        i * self.parameters['num_episodes'] + e)
                    action_noise.reset()
                    break

                # Incremental mean of episode returns within this epoch.
                avg_epoch_rewards = avg_epoch_rewards + (
                    ep_reward - avg_epoch_rewards) / n_epochs
                n_epochs += 1

                print('Epoch: {:d} | Reward: {:d} | Avg_Q_loss: {:.4f} | Avg_a_loss: {:.4f} | Episode: {:d} | Step: {:d} | Cumu Step: {:d}'\
                    .format(i+1, int(ep_reward), avg_critic_loss, avg_actor_loss, e+1, ep_n_action, cumu_step))

                if e % 19 == 0:
                    save_path = saver.save(
                        sess,
                        weights_dir + self.parameters['env'] + '/model.ckpt',
                        global_step=i * e + 1)

                plots['episode_reward'].append(ep_reward)
                plots['critic_loss'].append(critic_loss)
                # FIX: record the actor loss (original appended critic_loss).
                plots['actor_loss'].append(actor_loss)
                pickle.dump(
                    plots,
                    open(plots_dir + self.parameters['env'] + '_plot.pickle',
                         'wb'))

    def test(self):
        """Greedy (noise-free) rollouts from a specific saved checkpoint."""
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True
        saver = tf.train.Saver()
        sess = tf.Session(config=config)
        saver.restore(
            sess,
            tf.train.latest_checkpoint(
                './weights/HalfCheetah-v2_kirkiles_train50episode_noise_norm_bufsize1Mi1k'
            ))
        while True:
            state = self.env.reset()
            # Perform rollout
            while True:
                action = self.actor_critic.pi(sess, state[None, ...])
                action = np.clip(action, self.env.action_space.low[0],
                                 self.env.action_space.high[0])
                #print(action)
                assert action.shape == self.env.action_space.shape
                next_state, reward, done, _ = self.env.step(action)
                self.env.render()
                if done:
                    break
                state = next_state
class TD3:
    """Twin Delayed Deep Deterministic Policy Gradient agent.

    Owns the environment, replay memory, policy + twin value networks with
    target copies, and the full interact/update/eval/save cycle.
    """

    def __init__(self,
                 env_id,
                 render=False,
                 num_process=1,
                 memory_size=1000000,
                 lr_p=1e-3,
                 lr_v=1e-3,
                 gamma=0.99,
                 polyak=0.995,
                 action_noise=0.1,
                 target_action_noise_std=0.2,
                 target_action_noise_clip=0.5,
                 explore_size=10000,
                 step_per_iter=3000,
                 batch_size=100,
                 min_update_step=1000,
                 update_step=50,
                 policy_update_delay=2,
                 seed=1,
                 model_path=None):
        # Discounting / target smoothing / noise hyper-parameters.
        self.env_id = env_id
        self.gamma = gamma
        self.polyak = polyak
        self.action_noise = action_noise
        self.target_action_noise_std = target_action_noise_std
        self.target_action_noise_clip = target_action_noise_clip
        # Replay buffer and sampling schedule.
        self.memory = Memory(memory_size)
        self.explore_size = explore_size
        self.step_per_iter = step_per_iter
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.policy_update_delay = policy_update_delay
        self.model_path = model_path
        self.seed = seed
        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, self.num_actions = get_env_info(
            self.env_id)
        assert env_continuous, "TD3 is only applicable to continuous environment !!!!"

        self.action_low, self.action_high = self.env.action_space.low[
            0], self.env.action_space.high[0]
        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        # Policy + twin value networks, each with a target copy.
        self.policy_net = Policy(num_states, self.num_actions,
                                 self.action_high).to(device)
        self.policy_net_target = Policy(num_states, self.num_actions,
                                        self.action_high).to(device)

        self.value_net_1 = Value(num_states, self.num_actions).to(device)
        self.value_net_target_1 = Value(num_states,
                                        self.num_actions).to(device)
        self.value_net_2 = Value(num_states, self.num_actions).to(device)
        self.value_net_target_2 = Value(num_states,
                                        self.num_actions).to(device)

        self.running_state = ZFilter((num_states, ), clip=5)
        self.num_states = num_states

        if self.model_path:
            print("Loading Saved Model {}_td3.p".format(self.env_id))
            self.policy_net, self.value_net_1, self.value_net_2, self.running_state = pickle.load(
                open('{}/{}_td3.p'.format(self.model_path, self.env_id), "rb"))

        # Targets start as exact copies of the (possibly loaded) online nets.
        self.policy_net_target.load_state_dict(self.policy_net.state_dict())
        self.value_net_target_1.load_state_dict(self.value_net_1.state_dict())
        self.value_net_target_2.load_state_dict(self.value_net_2.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_v_1 = optim.Adam(self.value_net_1.parameters(),
                                        lr=self.lr_v)
        self.optimizer_v_2 = optim.Adam(self.value_net_2.parameters(),
                                        lr=self.lr_v)

    def choose_action(self, state, noise_scale):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, log_prob = self.policy_net.get_action_log_prob(state)
        action = action.cpu().numpy()[0]
        # add noise: Gaussian exploration noise, then clip into action bounds.
        noise = noise_scale * np.random.randn(self.num_actions)
        action += noise
        action = np.clip(action, -self.action_high, self.action_high)
        return action

    def eval(self, i_iter, render=False):
        """evaluate model"""
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            action = self.choose_action(state, 0)  # deterministic: zero noise
            state, reward, done, _ = self.env.step(action)
            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """interact"""
        global_steps = (i_iter - 1) * self.step_per_iter
        log = dict()
        num_steps = 0
        num_episodes = 0
        total_reward = 0
        min_episode_reward = float('inf')
        max_episode_reward = float('-inf')

        while num_steps < self.step_per_iter:
            state = self.env.reset()
            episode_reward = 0

            for t in range(10000):
                if self.render:
                    self.env.render()

                if global_steps < self.explore_size:  # explore
                    action = self.env.action_space.sample()
                else:  # action with noise
                    action = self.choose_action(state, self.action_noise)

                next_state, reward, done, _ = self.env.step(action)
                # next_state = self.running_state(next_state)
                mask = 0 if done else 1
                # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
                self.memory.push(state, action, reward, next_state, mask,
                                 None)
                episode_reward += reward
                global_steps += 1
                num_steps += 1

                # After the warm-up, run `update_step` gradient updates every
                # `update_step` environment steps.
                if global_steps >= self.min_update_step and global_steps % self.update_step == 0:
                    for k in range(self.update_step):
                        batch, permuted_batch = self.memory.sample(
                            self.batch_size)  # random sample batch
                        self.update(batch, permuted_batch, k)

                if done or num_steps >= self.step_per_iter:
                    break

                state = next_state

            num_episodes += 1
            total_reward += episode_reward
            min_episode_reward = min(episode_reward, min_episode_reward)
            max_episode_reward = max(episode_reward, max_episode_reward)

        self.env.close()

        log['num_steps'] = num_steps
        log['num_episodes'] = num_episodes
        log['total_reward'] = total_reward
        log['avg_reward'] = total_reward / num_episodes
        log['max_episode_reward'] = max_episode_reward
        log['min_episode_reward'] = min_episode_reward

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}")

        # record reward information
        writer.add_scalar("rewards/total_reward", log['total_reward'], i_iter)
        writer.add_scalar("rewards/average_reward", log['avg_reward'], i_iter)
        writer.add_scalar("rewards/min_reward", log['min_episode_reward'],
                          i_iter)
        writer.add_scalar("rewards/max_reward", log['max_episode_reward'],
                          i_iter)
        writer.add_scalar("rewards/num_steps", log['num_steps'], i_iter)

    def update(self, batch, batch2, k_iter):
        """learn model"""
        # NOTE(review): `batch2` (the permuted batch) is not used in this
        # update path — confirm whether that is intentional.
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by TD3; the actor is only updated every
        # `policy_update_delay`-th call (delayed policy updates).
        alg_step_stats = td3_step(
            self.policy_net, self.policy_net_target, self.value_net_1,
            self.value_net_target_1, self.value_net_2,
            self.value_net_target_2, self.optimizer_p, self.optimizer_v_1,
            self.optimizer_v_2, batch_state, batch_action, batch_reward,
            batch_next_state, batch_mask, self.gamma, self.polyak,
            self.target_action_noise_std, self.target_action_noise_clip,
            self.action_high, k_iter % self.policy_update_delay == 0)

    def save(self, save_path):
        """save model"""
        check_path(save_path)
        pickle.dump((self.policy_net, self.value_net_1, self.value_net_2,
                     self.running_state),
                    open('{}/{}_td3.p'.format(save_path, self.env_id), 'wb'))
def train(rank, args, shared_model, opt_ac, can_save, shared_obs_stats):
    """A3C-style worker loop for the osim-rl RunEnv (9 actions mirrored to 18).

    Repeatedly syncs `ac_net` from `shared_model`, collects at least
    `args.batch_size` steps of experience (with optional frame-skip), then
    updates the shared model via `update_params_actor_critic`. Rank-0 logs and
    checkpoints the best / every-30th model under ../models/<args.bh>.

    Args:
        rank: worker index (only rank 0 prints).
        args: namespace with seed, feature, bh, render, skip, batch_size,
            use_sep_pol_val, log_interval.
        shared_model: shared ActorCritic parameters.
        opt_ac: shared optimizer (stored raw in checkpoints -- see NOTE below).
        can_save: whether this worker may own the visualized env.
        shared_obs_stats: shared observation normalizer.

    NOTE(review): indentation in this chunk was reconstructed from a
    whitespace-mangled source; nesting of the frame-skip block should be
    verified against the original file.
    """
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = [1] * 48
    if args.render and can_save:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)
    #running_state = ZFilter((num_inputs,), clip=5)
    #running_reward = ZFilter((1,), demean=False, clip=10)
    episode_lengths = []
    PATH_TO_MODEL = '../models/' + str(args.bh)
    ac_net = ActorCritic(num_inputs, num_actions)
    start_time = time.time()

    for i_episode in count(1):
        memory = Memory()
        # Pull the latest shared weights before collecting a fresh batch.
        ac_net.load_state_dict(shared_model.state_dict())
        ac_net.zero_grad()
        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            state = env.reset(difficulty=0)
            # Observation pipeline: raw obs -> processed -> delta features ->
            # shared running normalization.
            last_state = process_observation(state)
            state = process_observation(state)
            last_state, state = transform_observation(last_state, state)
            state = numpy.array(state)
            state = Variable(torch.Tensor(state).unsqueeze(0))
            shared_obs_stats.observes(state)
            state = shared_obs_stats.normalize(state)
            state = state.data[0].numpy()
            #state = running_state(state)
            reward_sum = 0
            for t in range(10000):  # Don't infinite loop while learning
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)
                action = action.data[0].numpy()
                # Fail fast on NaN actions (diverged policy).
                if numpy.any(numpy.isnan(action)):
                    print(state)
                    print(action)
                    print('ERROR')
                    raise RuntimeError('action NaN problem')
                # The 9-dim action controls one leg; mirror it to the 18
                # muscles RunEnv expects.
                BB = numpy.append(action, action)
                reward = 0
                if args.skip:
                    # frame-skip: two extra env steps with the same action,
                    # accumulating their rewards.
                    _, A, _, _ = env.step(BB)
                    reward += A
                    _, A, _, _ = env.step(BB)
                    reward += A
                next_state, A, done, _ = env.step(BB)
                reward += A
                next_state = process_observation(next_state)
                last_state, next_state = transform_observation(
                    last_state, next_state)
                next_state = numpy.array(next_state)
                reward_sum += reward
                #next_state = running_state(next_state)
                next_state = Variable(torch.Tensor(next_state).unsqueeze(0))
                shared_obs_stats.observes(next_state)
                next_state = shared_obs_stats.normalize(next_state)
                next_state = next_state.data[0].numpy()
                mask = 1
                if done:
                    mask = 0  # terminal transition: no bootstrapping
                memory.push(state, np.array([action]), mask, next_state, reward)
                if done:
                    break
                state = next_state
            # NOTE(review): (t - 1) undercounts the t + 1 steps actually
            # taken; kept as-is since batch sizing elsewhere assumes it.
            num_steps += (t - 1)
            num_episodes += 1
            reward_batch += reward_sum
        reward_batch /= num_episodes
        batch = memory.sample()
        update_params_actor_critic(batch, args, shared_model, ac_net, opt_ac)
        epoch = i_episode
        if (i_episode % args.log_interval == 0) and (rank == 0):
            print('TrainEpisode {}\tLast reward: {}\tAverage reward {:.2f}'.
                  format(i_episode, reward_sum, reward_batch))
            # NOTE(review): 'optimizer': opt_ac stores the optimizer object
            # itself, unlike the sibling train() which stores
            # opt_ac.state_dict() -- confirm which format the loader expects.
            if reward_batch > best_result:
                best_result = reward_batch
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': ac_net.state_dict(),
                        'optimizer': opt_ac,
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, 'best')
            if epoch % 30 == 1:
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': ac_net.state_dict(),
                        'optimizer': opt_ac,
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, epoch)
def main(gamma=0.995, env_name="Walker2d-v2", tau=0.97, number_of_batches=500,
         batch_size=5000, maximum_steps=10000, render=False,
         seed=543, log_interval=1, entropy_coeff=0.0, clip_epsilon=0.2):
    """Train an A3C-style actor on a gym env with clipped-surrogate updates.

    Collects `batch_size` normalized transitions per outer iteration, calls
    `update_params` once per batch, and finally writes an average-reward
    plot to 'PPO.html' via plotly.
    """
    env = gym.make(env_name)
    # Network dimensions follow the environment's spaces.
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    env.seed(seed)
    torch.manual_seed(seed)

    actor_net = A3CActor(obs_dim, act_dim)
    actor_optimizer = optim.Adam(actor_net.parameters(), lr=0.001)

    # Running observation normalizer; reward filter kept for parity (unused).
    running_state = ZFilter((obs_dim,), clip=5)
    running_reward = ZFilter((1, ), demean=False, clip=10)
    episode_lengths = []

    batch_avg_rewards = []  # per-batch average episode return, for the plot
    for i_episode in range(number_of_batches):
        memory = Memory()
        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < batch_size:
            state = running_state(env.reset())
            reward_sum = 0
            for t in range(maximum_steps):
                action = select_action(state, actor_net).data[0].numpy()
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward
                next_state = running_state(next_state)
                memory.push(state, np.array([action]), 0 if done else 1,
                            next_state, reward)
                if render:
                    env.render()
                if done:
                    break
                state = next_state
            num_steps += (t - 1)
            num_episodes += 1
            reward_batch += reward_sum
        reward_batch /= num_episodes
        batch = memory.sample()
        batch_avg_rewards.append(reward_batch)
        update_params(batch, actor_net, actor_optimizer, gamma, tau,
                      clip_epsilon)
        if i_episode % log_interval == 0:
            print('Episode {}\t Last reward: {}\tAverage reward {:.2f}'.format(
                i_episode, reward_sum, reward_batch))

    # Plot average reward per batch index.
    trace = go.Scatter(x=list(range(number_of_batches)), y=batch_avg_rewards)
    layout = go.Layout(
        title='A2C',
        xaxis=dict(title='Episodes',
                   titlefont=dict(family='Courier New, monospace', size=18,
                                  color='#7f7f7f')),
        yaxis=dict(title='Average Reward',
                   titlefont=dict(family='Courier New, monospace', size=18,
                                  color='#7f7f7f')))
    plotly.offline.plot({"data": [trace], "layout": layout},
                        filename='PPO.html', image='jpeg')
    return
def train(rank, args, traffic_light, counter, shared_model, shared_grad_buffers,
          shared_obs_stats, opt_ac):
    """Distributed PPO worker for osim-rl RunEnv, synchronized by traffic light.

    Each iteration: read the barrier signal, sync weights from `shared_model`,
    collect >= args.batch_size steps, compute gradients locally, push them into
    `shared_grad_buffers`, bump `counter`, then spin until the master flips the
    traffic light.

    Args:
        rank: worker index (only rank 0 prints/saves).
        args: namespace with seed, feature, bh, skip, batch_size, start_epoch,
            use_sep_pol_val, log_interval.
        traffic_light / counter: shared synchronization primitives.
        shared_model: shared ActorCritic parameters (read-only here).
        shared_grad_buffers: accumulator the master applies gradients from.
        shared_obs_stats: shared running observation normalizer.
        opt_ac: optimizer whose state_dict is checkpointed.

    NOTE(review): indentation reconstructed from a whitespace-mangled source;
    verify the frame-skip nesting against the original file.
    """
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    # Rolling feature state threaded through process_observation.
    last_state = [0] * 41
    last_v = [0] * 10
    #last_state = numpy.zeros(48)
    env = RunEnv(visualize=False)
    #running_state = ZFilter((num_inputs,), clip=5)
    #running_reward = ZFilter((1,), demean=False, clip=10)
    episode_lengths = []
    PATH_TO_MODEL = '../models/' + str(args.bh)
    ac_net = ActorCritic(num_inputs, num_actions)
    start_time = time.time()

    for i_episode in range(args.start_epoch + 1, 999999):
        # Remember the barrier value; we wait for it to change at loop end.
        signal_init = traffic_light.get()
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())
        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            state = env.reset(difficulty=0)
            # raw obs -> engineered features -> shared normalization
            last_state, last_v, state = process_observation(
                last_state, last_v, state)
            state = numpy.array(state)
            state = Variable(torch.Tensor(state).unsqueeze(0))
            shared_obs_stats.observes(state)
            state = shared_obs_stats.normalize(state)
            state = state.data[0].numpy()
            reward_sum = 0
            for t in range(10000):  # Don't infinite loop while learning
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)
                action = action.data[0].numpy()
                # Fail fast on NaN actions (diverged policy), dumping context.
                if numpy.any(numpy.isnan(action)):
                    print(state)
                    print(action)
                    print(ac_net.affine1.weight)
                    print(ac_net.affine1.weight.data)
                    print('ERROR')
                    #state = state + numpy.random.rand(args.feature)*0.001
                    raise RuntimeError('action NaN problem')
                reward = 0
                if args.skip:
                    # NOTE(review): the skip steps pass the 9-dim `action`
                    # while the final step passes the mirrored 18-dim `BB`,
                    # unlike the sibling train() -- possible bug; confirm
                    # what RunEnv accepts.
                    _, A, _, _ = env.step(action)
                    reward += A
                    _, A, _, _ = env.step(action)
                    reward += A
                # Mirror the one-leg action to the 18 muscles RunEnv expects.
                BB = numpy.append(action, action)
                next_state, A, done, _ = env.step(BB)
                reward += A
                last_state, last_v, next_state = process_observation(
                    last_state, last_v, next_state)
                next_state = numpy.array(next_state)
                reward_sum += reward
                next_state = Variable(torch.Tensor(next_state).unsqueeze(0))
                shared_obs_stats.observes(next_state)
                next_state = shared_obs_stats.normalize(next_state)
                next_state = next_state.data[0].numpy()
                mask = 1
                if done:
                    mask = 0  # terminal: no bootstrapping past this step
                memory.push(state, np.array([action]), mask, next_state, reward)
                if done:
                    break
                state = next_state
            # NOTE(review): (t - 1) undercounts the t + 1 steps actually
            # taken; kept as-is since batch sizing assumes it.
            num_steps += (t - 1)
            num_episodes += 1
            reward_batch += reward_sum
        reward_batch /= num_episodes
        batch = memory.sample()
        # Local gradient computation; the master applies the shared buffers.
        update_params_actor_critic(batch, args, ac_net, opt_ac)
        shared_grad_buffers.add_gradient(ac_net)
        counter.increment()
        epoch = i_episode
        if (i_episode % args.log_interval == 0) and (rank == 0):
            print(
                'TrainEpisode {}\tTime{}\tLast reward: {}\tAverage reward {:.2f}'
                .format(
                    i_episode,
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, reward_batch))
        epoch = i_episode
        if reward_batch > best_result:
            best_result = reward_batch
            save_model(
                {
                    'epoch': epoch,
                    'bh': args.bh,
                    'state_dict': shared_model.state_dict(),
                    'optimizer': opt_ac.state_dict(),
                    'obs': shared_obs_stats,
                }, PATH_TO_MODEL, 'best')
        if epoch % 30 == 1:
            save_model(
                {
                    'epoch': epoch,
                    'bh': args.bh,
                    'state_dict': shared_model.state_dict(),
                    'optimizer': opt_ac.state_dict(),
                    'obs': shared_obs_stats,
                }, PATH_TO_MODEL, epoch)
        # wait for a new signal to continue
        while traffic_light.get() == signal_init:
            pass
def main(gamma=0.995, env_name="Walker2d-v2", tau=0.97, number_of_batches=500,\ batch_size=5000, maximum_steps=10000, render=False,\ seed=543, log_interval=1, entropy_coeff=0.0, clip_epsilon=0.2): env = gym.make(env_name) #Get number of inputs for A3CActor num_inputs = env.observation_space.shape[0] #Get number of outputs required for describing action num_actions = env.action_space.shape[0] env.seed(seed) torch.manual_seed(seed) actor_net = A3CActor(num_inputs, num_actions) actor_optimizer = optim.Adam(actor_net.parameters(), lr=0.001) running_state = ZFilter((num_inputs, ), clip=5) running_reward = ZFilter((1, ), demean=False, clip=10) episode_lengths = [] for i_episode in range(number_of_batches): memory = Memory() num_steps = 0 reward_batch = 0 num_episodes = 0 while num_steps < batch_size: state = env.reset() state = running_state(state) reward_sum = 0 for t in range(maximum_steps): action = select_action(state, actor_net) action = action.data[0].numpy() next_state, reward, done, _ = env.step(action) reward_sum += reward next_state = running_state(next_state) mask = 1 if done: mask = 0 memory.push(state, np.array([action]), mask, next_state, reward) if render: env.render() if done: break state = next_state num_steps += (t - 1) num_episodes += 1 reward_batch += reward_sum reward_batch /= num_episodes batch = memory.sample() update_params(batch, actor_net, actor_optimizer, gamma, tau, clip_epsilon) if i_episode % log_interval == 0: print('Episode {}\t Last reward: {}\tAverage reward {:.2f}'.format( i_episode, reward_sum, reward_batch)) return
std0 = Variable(std1.data) kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / ( 2.0 * std1.pow(2)) - 0.5 return kl.sum(1, keepdim=True) trpo_step(policy_net, get_loss, get_kl, args.max_kl, args.damping) running_state = ZFilter((num_inputs, ), clip=5) running_reward = ZFilter((1, ), demean=False, clip=10) plt.ion() reward_plot = [] reward_batch_0 = 1000 for i_episode in count(1): memory_pro = Memory() memory_adv = Memory() num_steps = 0 reward_batch = 0 num_episodes = 0 while num_steps < args.batch_size: state = env.reset() state = running_state(state) reward_sum = 0 for t in range(10000): # Don't infinite loop while learning action_pro = select_action(policy_net_pro, value_net_pro, state) action_pro = action_pro.data[0].numpy() action_adv = select_action(policy_net_adv, value_net_adv, state) action_adv = action_adv.data[0].numpy()
- set(obstacles)) elif np.argmax(c[0, :]) == 3: # right half set_diff = list(set(product(tuple(range(width/2, width-2)), tuple(range(2, height)))) \ - set(obstacles)) start_loc = sample_start(set_diff) s = State(start_loc, obstacles) #state = running_state(state) policy_net.reset() reward_net.reset() R.reset() reward_sum = 0 true_reward_sum = 0 memory = Memory() for t in range( args.max_ep_length): # Don't infinite loop while learning ct = c[t, :] action = select_action(np.concatenate((s.state, ct))) action = epsilon_greedy_linear_decay(action.data.cpu().numpy(), args.num_episodes * 0.5, i_episode, low=0.05, high=0.5) reward = -float( reward_net( torch.cat( (Variable(torch.from_numpy( s.state).unsqueeze(0)).type(dtype),
class SAC:
    """Soft Actor-Critic agent: stochastic policy, twin Q networks, and a
    separate state-value network with a polyak-averaged target copy."""

    def __init__(self, env_name, n_states, n_actions, memory_size, batch_size,
                 gamma, alpha, lr, action_bounds, reward_scale):
        """Build networks, optimizers and replay memory.

        Args:
            env_name: environment id, used as the checkpoint filename prefix.
            n_states / n_actions: observation and action dimensionality.
            memory_size: replay buffer capacity.
            batch_size: mini-batch size for training.
            gamma: discount factor.
            alpha: entropy temperature.
            lr: learning rate shared by all optimizers.
            action_bounds: action-space limits passed to the policy network.
            reward_scale: multiplier on rewards in the Q target.
        """
        self.env_name = env_name
        self.n_states = n_states
        self.n_actions = n_actions
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.alpha = alpha
        self.lr = lr
        self.action_bounds = action_bounds
        self.reward_scale = reward_scale
        self.memory = Memory(memory_size=self.memory_size)

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.policy_network = PolicyNetwork(
            n_states=self.n_states,
            n_actions=self.n_actions,
            action_bounds=self.action_bounds).to(self.device)
        self.q_value_network1 = QvalueNetwork(n_states=self.n_states,
                                             n_actions=self.n_actions).to(
                                                 self.device)
        self.q_value_network2 = QvalueNetwork(n_states=self.n_states,
                                              n_actions=self.n_actions).to(
                                                  self.device)
        self.value_network = ValueNetwork(n_states=self.n_states).to(
            self.device)
        # Target value network starts as an exact copy and is only ever
        # soft-updated; eval() since it is never trained directly.
        self.value_target_network = ValueNetwork(n_states=self.n_states).to(
            self.device)
        self.value_target_network.load_state_dict(
            self.value_network.state_dict())
        self.value_target_network.eval()

        self.value_loss = torch.nn.MSELoss()
        self.q_value_loss = torch.nn.MSELoss()

        self.value_opt = Adam(self.value_network.parameters(), lr=self.lr)
        self.q_value1_opt = Adam(self.q_value_network1.parameters(),
                                 lr=self.lr)
        self.q_value2_opt = Adam(self.q_value_network2.parameters(),
                                 lr=self.lr)
        self.policy_opt = Adam(self.policy_network.parameters(), lr=self.lr)

    def store(self, state, reward, done, action, next_state):
        """Convert one transition to CPU tensors and append it to memory."""
        state = from_numpy(state).float().to("cpu")
        reward = torch.Tensor([reward]).to("cpu")
        done = torch.Tensor([done]).to("cpu")
        action = torch.Tensor([action]).to("cpu")
        next_state = from_numpy(next_state).float().to("cpu")
        self.memory.add(state, reward, done, action, next_state)

    def unpack(self, batch):
        """Collate a list of transitions into batched device tensors.

        Returns:
            (states, rewards, dones, actions, next_states), shaped
            (batch_size, n_states), (batch_size, 1), (batch_size, 1),
            (-1, n_actions) and (batch_size, n_states) respectively.
        """
        batch = Transition(*zip(*batch))

        states = torch.cat(batch.state).view(self.batch_size,
                                             self.n_states).to(self.device)
        rewards = torch.cat(batch.reward).view(self.batch_size,
                                               1).to(self.device)
        dones = torch.cat(batch.done).view(self.batch_size, 1).to(self.device)
        actions = torch.cat(batch.action).view(-1, self.n_actions).to(
            self.device)
        next_states = torch.cat(batch.next_state).view(
            self.batch_size, self.n_states).to(self.device)
        return states, rewards, dones, actions, next_states

    def train(self):
        """Run one SAC update on a sampled mini-batch.

        Returns:
            (value_loss, mean_q_loss, policy_loss) as floats, or (0, 0, 0)
            while the memory holds fewer than batch_size transitions.
        """
        if len(self.memory) < self.batch_size:
            return 0, 0, 0
        else:
            batch = self.memory.sample(self.batch_size)
            states, rewards, dones, actions, next_states = self.unpack(batch)

            # Calculating the value target: V(s) <- min_i Q_i(s, a~pi) - alpha*logpi
            reparam_actions, log_probs = self.policy_network.sample_or_likelihood(
                states)
            q1 = self.q_value_network1(states, reparam_actions)
            q2 = self.q_value_network2(states, reparam_actions)
            q = torch.min(q1, q2)
            target_value = q.detach() - self.alpha * log_probs.detach()

            value = self.value_network(states)
            value_loss = self.value_loss(value, target_value)

            # Calculating the Q-Value target (scaled reward + discounted
            # target value, masked at terminals).
            with torch.no_grad():
                target_q = self.reward_scale * rewards + \
                           self.gamma * self.value_target_network(next_states) * (1 - dones)
            q1 = self.q_value_network1(states, actions)
            q2 = self.q_value_network2(states, actions)
            q1_loss = self.q_value_loss(q1, target_q)
            q2_loss = self.q_value_loss(q2, target_q)

            # Policy gradient flows through Q via the reparameterized actions.
            policy_loss = (self.alpha * log_probs - q).mean()

            self.policy_opt.zero_grad()
            policy_loss.backward()
            self.policy_opt.step()

            self.value_opt.zero_grad()
            value_loss.backward()
            self.value_opt.step()

            self.q_value1_opt.zero_grad()
            q1_loss.backward()
            self.q_value1_opt.step()

            self.q_value2_opt.zero_grad()
            q2_loss.backward()
            self.q_value2_opt.step()

            self.soft_update_target_network(self.value_network,
                                            self.value_target_network)

            return value_loss.item(), 0.5 * (
                q1_loss + q2_loss).item(), policy_loss.item()

    def choose_action(self, states):
        """Sample an action for a single observation; returns a numpy array."""
        states = np.expand_dims(states, axis=0)
        states = from_numpy(states).float().to(self.device)
        action, _ = self.policy_network.sample_or_likelihood(states)
        return action.detach().cpu().numpy()[0]

    @staticmethod
    def soft_update_target_network(local_network, target_network, tau=0.005):
        """Polyak update: target <- tau * local + (1 - tau) * target."""
        for target_param, local_param in zip(target_network.parameters(),
                                             local_network.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)

    def save_weights(self):
        """Save only the policy network (enough for evaluation)."""
        torch.save(self.policy_network.state_dict(),
                   self.env_name + "_weights.pth")

    def load_weights(self):
        """Restore the policy network saved by save_weights()."""
        self.policy_network.load_state_dict(
            torch.load(self.env_name + "_weights.pth"))

    def set_to_eval_mode(self):
        """Switch the policy network to evaluation mode."""
        self.policy_network.eval()
args.weight, 'mixture', args.prior, args.traj_size, folder=args.ofolder, fname=fname, noise=args.noise) if not args.only and not args.is_eval and args.use_cot and args.weight: labeled_traj = torch.Tensor(expert_traj[:num_label, :]).to(device) unlabeled_traj = torch.Tensor(expert_traj[num_label:, :]).to(device) label = torch.Tensor(expert_conf[:num_label, :]).to(device) batch_size = min(128, labeled_traj.shape[0]) glfw.init() print('start training') for episode in range(args.num_epochs): memory = Memory() num_steps = 0 num_episodes = 0 reward_batch = [] states = [] actions = [] mem_actions = [] mem_mask = [] mem_next = [] sup_losses = [] label = torch.Tensor(expert_conf[:num_label, :]).to(device) for i in range(args.sup_iters_per_episode): idx = np.random.choice(labeled_traj.shape[0], batch_size) labeled_state_action_batch = labeled_traj[idx, :] true_conf_batch = label[idx, :] sup_loss = sup_train_discriminator_one_step(
def main(gamma=0.995, env_name='Walker2d-v2', tau=0.97, seed=543, number_of_batches=500,
         batch_size=5000, maximum_steps=10000, render=False, log_interval=1, entropy_coeff=0.0,
         clip_epsilon=0.2, use_joint_pol_val=False):
    """Train separate policy/value networks with PPO-style updates on a gym env.

    Collects `batch_size` normalized transitions per outer iteration, runs
    `update_params` once per batch, and finally writes the average-reward
    curve to 'PPO.html' via plotly.

    Args:
        gamma / tau / clip_epsilon: PPO hyperparameters forwarded to
            update_params.
        env_name, seed, number_of_batches, batch_size, maximum_steps,
        render, log_interval: run configuration.
        entropy_coeff, use_joint_pol_val: accepted for interface
            compatibility; unused in this body.
    """
    torch.set_default_tensor_type('torch.DoubleTensor')

    env = gym.make(env_name)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    env.seed(seed)
    torch.manual_seed(seed)

    policy_net = Policy(num_inputs, num_actions)
    value_net = Value(num_inputs)
    opt_policy = optim.Adam(policy_net.parameters(), lr=0.001)
    opt_value = optim.Adam(value_net.parameters(), lr=0.001)

    # Running observation normalizer (unused PI constant, reward filter and
    # episode_lengths list from the original were dead locals and removed).
    running_state = ZFilter((num_inputs,), clip=5)

    plot_rew = []  # per-batch average episode return, for the final plot
    for i_episode in range(number_of_batches):
        memory = Memory()
        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < batch_size:
            state = env.reset()
            state = running_state(state)
            reward_sum = 0
            for t in range(maximum_steps):  # don't infinite-loop while learning
                action = select_action(state, policy_net)
                action = action.data[0].numpy()
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward
                next_state = running_state(next_state)
                mask = 0 if done else 1  # 0 marks a terminal transition
                memory.push(state, np.array([action]), mask, next_state, reward)
                if render:
                    env.render()
                if done:
                    break
                state = next_state
            # NOTE: kept from the original -- (t - 1) undercounts the t + 1
            # steps actually taken; changing it would alter batch sizing
            # relative to prior runs.
            num_steps += (t - 1)
            num_episodes += 1
            reward_batch += reward_sum
        reward_batch /= num_episodes
        batch = memory.sample()
        plot_rew.append(reward_batch)
        update_params(batch, policy_net, value_net, gamma, opt_policy, opt_value)

        # BUG FIX: the original tested `args.log_interval`, but no `args`
        # exists in this function's scope (NameError at runtime); the
        # parameter is `log_interval`.
        if i_episode % log_interval == 0:
            print('Episode {}\tLast reward: {}\tAverage reward {:.2f}'.format(
                i_episode, reward_sum, reward_batch))

    # Plot batch-average reward against batch index.
    trace = go.Scatter(x=list(range(number_of_batches)), y=plot_rew)
    layout = go.Layout(
        title='PPO',
        xaxis=dict(title='Episodes',
                   titlefont=dict(family='Courier New, monospace', size=18,
                                  color='#7f7f7f')),
        yaxis=dict(title='Average Reward',
                   titlefont=dict(family='Courier New, monospace', size=18,
                                  color='#7f7f7f')))
    plotly.offline.plot({"data": [trace], "layout": layout},
                        filename='PPO.html', image='jpeg')
def test(rank, args, shared_model, opt_ac):
    """Evaluation worker for the shared actor-critic on osim-rl RunEnv.

    Periodically copies `shared_model`'s weights into a local net, rolls out
    episodes (no gradient updates), appends reward logs to
    ../models/<args.bh>/record.txt, and checkpoints the shared model as
    'best' plus every 30th epoch. Sleeps 60s between evaluation rounds.

    Args:
        rank: worker index, only used to derive the RNG seed.
        args: namespace with seed, feature, bh, render, skip, batch_size,
            use_sep_pol_val, log_interval.
        shared_model: shared ActorCritic whose weights are evaluated/saved.
        opt_ac: optimizer whose state_dict is stored in checkpoints.
    """
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    if args.render:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)

    # Running observation normalizer (the original's unused reward filter and
    # episode_lengths list were dead locals and removed).
    running_state = ZFilter((num_inputs, ), clip=5)
    PATH_TO_MODEL = '../models/' + str(args.bh)
    ac_net = ActorCritic(num_inputs, num_actions)
    start_time = time.time()

    for i_episode in count(1):
        memory = Memory()
        # Evaluate the latest shared weights.
        ac_net.load_state_dict(shared_model.state_dict())
        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            state = env.reset(difficulty=0)
            state = numpy.array(state)
            state = running_state(state)
            reward_sum = 0
            for t in range(10000):  # don't infinite-loop while evaluating
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)
                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    # BUG FIX: the original called puts('ERROR') -- a Ruby
                    # builtin that raises NameError in Python. Use print().
                    print(action)
                    print('ERROR')
                    return
                if args.skip:
                    # Frame-skip: one extra env step with the same action,
                    # its reward still counts toward the episode return.
                    _, reward, _, _ = env.step(action)
                    reward_sum += reward
                next_state, reward, done, _ = env.step(action)
                next_state = numpy.array(next_state)
                reward_sum += reward
                next_state = running_state(next_state)
                mask = 0 if done else 1  # 0 marks a terminal transition
                memory.push(state, np.array([action]), mask, next_state, reward)
                if done:
                    break
                state = next_state
            # NOTE: kept from the original -- (t - 1) undercounts the t + 1
            # steps actually taken; kept so batch sizing matches train().
            num_steps += (t - 1)
            num_episodes += 1
            reward_batch += reward_sum
        reward_batch /= num_episodes
        batch = memory.sample()  # sampled for parity with train(); unused here

        # Throttle evaluation frequency.
        time.sleep(60)

        if i_episode % args.log_interval == 0:
            msg = "Time {}, episode reward {}, Average reward {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch)
            # Context manager guarantees the log file is closed even on error
            # (the original used manual open/close).
            with open(PATH_TO_MODEL + '/record.txt', 'a+') as record_file:
                record_file.write(msg)
            print(msg)

        epoch = i_episode
        if reward_batch > best_result:
            best_result = reward_batch
            save_model(
                {
                    'epoch': epoch,
                    'bh': args.bh,
                    'state_dict': shared_model.state_dict(),
                    'optimizer': opt_ac.state_dict(),
                }, PATH_TO_MODEL, 'best')
        if epoch % 30 == 1:
            save_model(
                {
                    'epoch': epoch,
                    'bh': args.bh,
                    'state_dict': shared_model.state_dict(),
                    'optimizer': opt_ac.state_dict(),
                }, PATH_TO_MODEL, epoch)
elif np.argmax(c[0,:]) == 3: # right half set_diff = list(set(product(tuple(range(width/2, width-2)), tuple(range(2, height)))) \ - set(obstacles)) start_loc = sample_start(set_diff) s = State(start_loc, obstacles) #state = running_state(state) policy_net.reset() reward_net.reset() posterior_net.reset() R.reset() reward_sum = 0 true_reward_sum = 0 memory = Memory() for t in range(args.max_ep_length): # Don't infinite loop while learning ct = c[t,:] action = select_action(np.concatenate((s.state, ct))) action = epsilon_greedy_linear_decay(action.data.cpu().numpy(), args.num_episodes * 0.5, i_episode, low=0.05, high=0.3) reward = -float(reward_net(torch.cat(( Variable(torch.from_numpy( s.state).unsqueeze(0)).type(dtype), Variable(torch.from_numpy( oned_to_onehot(action)).unsqueeze(0)).type(dtype), Variable(torch.from_numpy(