def main():
    env = gym.make('LunarLanderContinuous-v2')
    log_dir = 'log/lander'
    # env = gym.make('Pendulum-v0')
    # log_dir = 'log/pendulum'

    # Paper settings:
    # agent = DDPG(env, sigma=0.2, num_episodes=1000, buffer_size=1000000, batch_size=64,
    #              tau=1e-3, batch_norm=True, merge_layer=2)
    # Did not work unless the action was merged into the critic at the first layer;
    # worked better without batch norm.
    k = 4000
    """
    agent = DDPG(env, log_dir, sigma=0.2, num_episodes=k, buffer_size=1000000, batch_size=64,
                 tau=1e-3, batch_norm=False, merge_layer=0)
    print('training start')
    agent.train()
    """
    agent = DDPG(env, log_dir, sigma=0.2, num_episodes=k, buffer_size=1000000, batch_size=64,
                 tau=1e-2, batch_norm=False, merge_layer=0)
    print('training1 start')
    agent.train1()
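# --- Hedged sketch (not the author's code): an illustration of what the `merge_layer`
# --- argument above presumably controls. In DDPG the critic scores a (state, action)
# --- pair, and the note above says training only worked when the action was merged
# --- into the critic at the first layer. A minimal PyTorch critic with a configurable
# --- merge point (layer indexing convention assumed) could look like this:
import torch
import torch.nn as nn


class MergeCritic(nn.Module):
    def __init__(self, state_dim, action_dim, hidden=(400, 300), merge_layer=0):
        super().__init__()
        self.merge_layer = merge_layer
        layers, in_dim = [], state_dim
        for i, h in enumerate(hidden):
            if i == merge_layer:
                in_dim += action_dim          # the action enters the network here
            layers.append(nn.Linear(in_dim, h))
            in_dim = h
        self.hidden = nn.ModuleList(layers)
        self.out = nn.Linear(in_dim, 1)

    def forward(self, state, action):
        x = state
        for i, layer in enumerate(self.hidden):
            if i == self.merge_layer:
                x = torch.cat([x, action], dim=-1)
            x = torch.relu(layer(x))
        return self.out(x)                    # scalar Q(s, a)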
def main(args):
    env = gym.make(args['env_name'])
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    action_dim = env.action_space.shape[0]
    max_action = env.action_space.high[0]
    state_dim = env.observation_space.shape[0]

    ddpg = DDPG(args, action_dim, max_action, state_dim, device)
    trained_actor = torch.load(args['model_directory'])
    ddpg.actor.load_state_dict(trained_actor)

    timestep = 0
    for episode in range(args['max_episode']):
        episode_reward = 0
        state = env.reset()
        while True:
            action = ddpg.get_action(state)
            next_state, reward, done, info = env.step(action)
            env.render()
            episode_reward += reward
            state = next_state
            timestep += 1
            if done:
                print('episode: ', episode,
                      ' reward : %.3f' % episode_reward,
                      ' timestep :', timestep)
                break
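# --- Hedged usage sketch: an illustrative configuration dict for the evaluation
# --- entry point above. Only the keys actually referenced there are shown; the DDPG
# --- constructor may require additional keys, and the values (environment name,
# --- checkpoint path, episode count) are assumptions, not the original settings.
args = {
    'env_name': 'LunarLanderContinuous-v2',
    'model_directory': './checkpoints/ddpg_actor.pth',
    'max_episode': 10,
}
main(args)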
def Test():
    print('test')
    cmd = 'sudo python ./utils/delete_episode_data.py'
    os.system(cmd)

    env = NetworkEnv()
    agent = DDPG(env.state_dim, env.rate_dim, env.path_dim, env.action_dim, env.kPath)
    rewards = []
    workload = datamining
    state = env.reset(workload)
    agent.load()

    t1 = time.time()
    for i in range(test_max_episode):
        Perform = []
        ep_r = 0.0
        # Alternate the traffic CDF file every five episodes
        if i % 5 == 0:
            if env.CDF_file == datamining:
                env.set_CDF_file(websearch)
            else:
                env.set_CDF_file(datamining)
        for t in range(test_max_step):
            print('\nEpisode %d, step %d:' % (i, t))
            print('state', state)
            action = agent.select_action(state)
            print('action:', action)
            # Execute the action
            next_state, reward, perform = env.step(action)
            Perform.append(perform)
            ep_r += reward
            state = next_state
            print('reward', reward)
            print("Ep_i {}, the ep_r is {:0.2f}, the step is {}, the reward is {}"
                  .format(i, ep_r, t, reward))
            if t == test_max_step - 1:
                print("Ep_i {}, the ep_r is {:0.2f}".format(i, ep_r))
                df_perform = pd.DataFrame(Perform)
                df_perform.to_csv("./data/test_perform.csv", mode='a', header=False, index=False)
                break
        rewards.append(ep_r)

    file_time = open('./data/test_time.txt', mode='w')
    df = pd.DataFrame([rewards])
    df.to_csv("./data/test_rewards.csv", mode='w', header=False, index=False)
    print(max_episode, 'episodes, total running time:', time.time() - t1)
    file_time.write(str(time.time() - t1))
def main():
    ddpg = DDPG(0, 0, torch.cuda.is_available())
    env.init_state()
    if os.path.exists('models/ddpg_actor_'):
        ddpg.load_model()
    else:
        print("Please make sure the trained models exist!")
    while True:
        action = ddpg.select_action(env.state)
        env.step(action)
        print(env.last_score)
def main(): # Initialize the ANNs agent = DDPG() rospy.init_node("neuro_deep_planner", anonymous=False) ros_handler = ROSHandler() ros_handler.on_policy = False while not rospy.is_shutdown(): # If we have a new msg we might have to execute an action and need to put the new experience in the buffer if ros_handler.new_msg(): if not ros_handler.is_episode_finished: # Send back the action to execute ros_handler.publish_action(agent.get_action(ros_handler.state)) # Safe the past state and action + the reward and new state into the replay buffer agent.set_experience(ros_handler.state, ros_handler.reward, ros_handler.is_episode_finished) elif ros_handler.new_setting(): agent.noise_flag = ros_handler.noise_flag else: # Train the network! agent.train()
def main():
    experiment = 'model-builder-v0'  # specify environment here
    env = gym.make(experiment)
    # steps = env.spec.timestep_limit  # steps per episode
    steps = 20
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)

    # Saving rewards:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            # Rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, 300, 300, 2]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # Select action according to current policy and exploration noise
            print("Action at step", t, " :", action, "\n")

            observation, reward, done, info = env.step(action)

            # Add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # Train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # Check if episode ends:
            if done or (t == steps - 1):
                print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode)
                print("Printing reward to file")
                exploration_noise.reset()  # Reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
        total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
def main():
    experiment = 'InvertedPendulum-v1'  # specify environment here
    env = gym.make(experiment)
    steps = env.spec.timestep_limit  # steps per episode
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)

    # Saving rewards:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            # Rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # Select action according to current policy and exploration noise
            print("Action at step", t, " :", action, "\n")

            observation, reward, done, info = env.step(action)

            # Add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # Train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # Check if episode ends:
            if done or (t == steps - 1):
                print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode)
                print("Printing reward to file")
                exploration_noise.reset()  # Reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
        total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
from strategies import OUStrategy
from utils import SEED
import mxnet as mx

# Set environment, policy, qfunc, strategy
env = normalize(CartpoleEnv())
policy = DeterministicMLPPolicy(env.spec)
qfunc = ContinuousMLPQ(env.spec)
strategy = OUStrategy(env.spec)

# Set the training algorithm and train
algo = DDPG(
    env=env,
    policy=policy,
    qfunc=qfunc,
    strategy=strategy,
    ctx=mx.gpu(0),
    max_path_length=100,
    epoch_length=1000,
    memory_start_size=10000,
    n_epochs=1000,
    discount=0.99,
    qfunc_lr=1e-3,
    policy_lr=1e-4,
    seed=SEED)

algo.train()
def train(env, nb_epochs, nb_epoch_cycles, normalize_observations, actor_lr, critic_lr,
          action_noise, gamma, nb_train_steps, nb_rollout_steps, batch_size, memory, tau=0.01):
    max_action = env.action_space.high
    agent = DDPG(memory, env.observation_space.shape[0], env.action_space.shape[0],
                 gamma=gamma, tau=tau, normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 actor_lr=actor_lr, critic_lr=critic_lr)
    if USE_CUDA:
        agent.cuda()

    # Set up logging stuff only for a single worker.
    step = 0
    episode = 0
    episode_rewards_history = deque(maxlen=100)

    # Prepare everything.
    agent.reset()
    obs = env.reset()
    done = False
    episode_reward = 0.
    episode_step = 0
    episodes = 0
    t = 0
    epoch = 0
    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_start_time = time.time()
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0

    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                assert action.shape == env.action_space.shape

                # Execute next action.
                assert max_action.shape == action.shape
                new_obs, r, done, info = env.step(max_action * action)
                t += 1
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

                if done:
                    # Episode done.
                    epoch_episode_rewards.append(episode_reward)
                    episode_rewards_history.append(episode_reward)
                    epoch_episode_steps.append(episode_step)
                    episode_reward = 0.
                    episode_step = 0
                    epoch_episodes += 1
                    episodes += 1

                    agent.reset()
                    obs = env.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            for t_train in range(nb_train_steps):
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        combined_stats = dict()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)
        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])
        logger.dump_tabular()
        logger.info('')
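# --- Hedged usage sketch: one way the train() function above might be invoked.
# --- The Memory and OrnsteinUhlenbeckActionNoise helpers are assumptions modeled on
# --- the OpenAI baselines DDPG utilities; substitute whatever replay buffer and noise
# --- process this port actually provides. Hyperparameter values are illustrative only.
import gym
import numpy as np

env = gym.make('Pendulum-v0')
nb_actions = env.action_space.shape[0]
memory = Memory(limit=int(1e6),
                action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                            sigma=0.2 * np.ones(nb_actions))
train(env, nb_epochs=500, nb_epoch_cycles=20, normalize_observations=True,
      actor_lr=1e-4, critic_lr=1e-3, action_noise=action_noise, gamma=0.99,
      nb_train_steps=50, nb_rollout_steps=100, batch_size=64, memory=memory)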
def main(): # Initialize the ANNs agent = DDPG() rospy.init_node("neuro_deep_planner", anonymous=False) ros_handler = ROSHandler() ros_handler.on_policy = False # For plotting currently_plotting = False goal_count = 0 crash_count = 0 start_time = 0 # Make sure the directory for the plotting exists if not tf.gfile.Exists(PLOT_PATH): tf.gfile.MakeDirs(PLOT_PATH) f = open(PLOT_PATH + '/results', 'w') while not rospy.is_shutdown(): # If we are plotting results we don't want to train and need to turn of noise! if PLOTTING and not currently_plotting and agent.training_step > 0 and \ agent.training_step % PLOT_INTERVALL == 0: currently_plotting = True agent.noise_flag = False start_time = rospy.get_time() if currently_plotting and rospy.get_time() - start_time > PLOT_TIME: # Plot the results string = str(agent.training_step) + ', ' + str(goal_count) + ', ' + str(crash_count) + '\n' f.write(string) # Reset all parameters currently_plotting = False agent.noise_flag = True goal_count = 0 crash_count = 0 # If we are plotting results we need to count reached goals and crashes if currently_plotting: # Count the positive and negative rewards if ros_handler.new_msg(): if not ros_handler.is_episode_finished: # Send back the action to execute ros_handler.publish_action(agent.get_action(ros_handler.state)) elif ros_handler.reward == 1: goal_count += 1 elif ros_handler.reward == -1: crash_count += 1 # If we're not plotting results else: # If we have a new msg we might have to execute an action and need to put the new experience in the buffer if ros_handler.new_msg(): if not ros_handler.is_episode_finished: # Send back the action to execute ros_handler.publish_action(agent.get_action(ros_handler.state)) # Safe the past state and action + the reward and new state into the replay buffer agent.set_experience(ros_handler.state, ros_handler.reward, ros_handler.is_episode_finished) elif ros_handler.new_setting(): agent.noise_flag = ros_handler.noise_flag else: # Train the network! agent.train()
def main():
    experiment = 'InvertedPendulum-v1'
    env = gym.make(experiment)
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    # Saving rewards:
    reward_st = np.array([0])

    for i in range(episodes):
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            # Rendering environment (optional)
            # env.render()
            x = observation

            # Select action using actor network model
            action = agent.evaluate_actor(np.reshape(x, [num_actions, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise
            print('Agent.Action :', action)
            print('\n')

            observation, reward, done, info = env.step(action)

            # Add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # Train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # Check if episode ends:
            if done:
                print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode)
                exploration_noise.reset()
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n')
                break
        total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                    help='size of replay buffer (default: 1000000)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
parser.add_argument('--threshold', type=float, default=0.5, metavar='G',
                    help='threshold for PMV')
args = parser.parse_args()

num_inputs = 4
num_outputs = 2

torch.manual_seed(args.seed)
np.random.seed(args.seed)

action_space = np.array([0.0, 0.0])
agent = DDPG(args.gamma, args.tau, args.hidden_size, num_inputs, action_space)
memory = ReplayMemory(args.replay_size)
ounoise = OUNoise(num_outputs)

db = db_opt.DB()
if db.is_open():
    print("the connection is open")
else:
    print("the connection is closed")
    sys.exit()

step = 1
rewards = []
for i_episode in range(args.num_episodes):
    ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -