def main():
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    agent = DDPG(env)
    env = gym.wrappers.Monitor(env, 'experiments/' + ENV_NAME, force=True)

    for episode in xrange(EPISODES):
        state = env.reset()
        # print "episode:", episode
        # Train
        for step in xrange(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if episode % 100 == 0 and episode > 100:
            total_reward = 0
            for i in xrange(TEST):
                state = env.reset()
                for j in xrange(env.spec.timestep_limit):
                    # env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print 'episode: ', episode, 'Evaluation Average Reward:', ave_reward

    # env is wrapped with gym.wrappers.Monitor, so close the wrapper itself
    env.close()

def main():
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    agent = DDPG(env)
    env.monitor.start('experiments/' + ENV_NAME, force=True)

    for episode in xrange(EPISODES):
        state = env.reset()
        # print "episode:", episode
        # Train
        for step in xrange(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if episode % 100 == 0 and episode > 100:
            total_reward = 0
            for i in xrange(TEST):
                state = env.reset()
                for j in xrange(env.spec.timestep_limit):
                    # env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print 'episode: ', episode, 'Evaluation Average Reward:', ave_reward

    env.monitor.close()

def main():
    finishedTraining = EPISODES
    startTime = time.time()
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    results_file = open("ResultsNew.csv", 'a')
    agent = DDPG(env, results_file)
    env = Monitor(env, directory='experiments/' + ENV_NAME, force=True)
    results_file.write("Episodes Spent Training; " + str(TEST) + " Episode Eval Avg \n")

    for episode in range(EPISODES):
        state = env.reset()
        if episode % 20 == 0:
            print("episode:", episode)
        # Train
        for step in range(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if (episode + 1) % 100 == 0 and episode > 100:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
            results_file.write(str(episode) + "; " + str(ave_reward) + "\n")
            if ave_reward > 800 and finishedTraining > episode + 300:
                finishedTraining = episode + 300
            elif episode >= finishedTraining:
                break

    results_file.write("Time Training (" + str(EPISODES) + "episodes);" + str(time.time() - startTime) + "\n")
    results_file.write("Evaluation Episode; Reward \n")
    for episode in range(100):
        total_reward = 0
        env.reset()
        state = env.env.env.set_test(episode)
        for j in range(env.spec.timestep_limit):
            action = agent.action(state)  # direct action for test
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
        results_file.write(str(episode) + "; " + str(total_reward) + "\n")
    results_file.write("endExperiment\n\n")
    results_file.close()

def main():
    startTime = time.time()
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    results_file = open("MoreExactReward12.csv", 'a')
    agent = DDPG(env, results_file)
    env = Monitor(env, directory='experiments/' + ENV_NAME, force=True)
    results_file.write("Episodes Spent Training; " + str(TEST) + " Episode Eval Avg; Learned Reward Map \n")

    for episode in range(EPISODES):
        state = env.reset()
        if episode % 20 == 0:
            print("episode:", episode)
        # Train
        for step in range(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if episode % 100 == 0 and episode > 100:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    # env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
            results_file.write(str(episode) + "; " + str(ave_reward) + ";")
            results_file.write("%s \n" % (np.array_str(agent.actor_network.net[-1].eval())).replace("\n", " "))

    results_file.write("Time Training (" + str(EPISODES) + "episodes);" + str(time.time() - startTime) + "\n")
    results_file.write("Final Learned Reward Map; %s \n"
                       % (np.array_str(agent.actor_network.net[-1].eval())).replace("\n", " "))
    results_file.write("Evaluation Episode; Reward \n")
    for episode in range(100):
        total_reward = 0
        state = env.reset()
        for j in range(env.spec.timestep_limit):
            action = agent.action(state)  # direct action for test
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
        results_file.write(str(episode) + "; " + str(total_reward) + "\n")
    results_file.write("endExperiment\n\n")
    results_file.close()

def main():
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    # env = gym.wrappers.Monitor(env, 'experiments/' + ENV_NAME, force=True)
    state = env.reset()
    ddpg_server = GymDDPGServer(env, state)
    ddpg_server.Open()
    rospy.spin()

def run(self):
    self.t_train = 0
    self.t_test = 0

    # create filtered environment
    self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
    # self.env = gym.make(FLAGS.env)
    self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print(dimO, dimA)

    import pprint
    pprint.pprint(self.env.spec.__dict__, width=1)

    self.agent = ddpg.Agent(dimO=dimO, dimA=dimA)

    returns = []

    # main loop
    while self.t_train < FLAGS.total:

        # test
        T = self.t_test
        R = []
        self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False, resume=True)
        while self.t_test - T < FLAGS.test:
            R.append(self.run_episode(test=True, monitor=(len(R) == 0)))
        avr = np.mean(R)
        print('Average test return\t{} after {} timesteps of training'.format(avr, self.t_train))

        # save return
        returns.append((self.t_train, avr))
        np.save(FLAGS.outdir + "/returns.npy", returns)

        # evaluate required number of episodes for gym
        if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
            for i in range(self.env.spec.trials):
                self.run_episode(test=True)
        self.env.monitor.close()

        # train
        T = self.t_train
        R = []
        while self.t_train - T < FLAGS.train:
            R.append(self.run_episode(test=False))
        avr = np.mean(R)
        print('Average training return\t{} after {} timesteps of training'.format(avr, self.t_train))

    self.env.monitor.close()

    # upload results
    if FLAGS.upload:
        gym.upload(FLAGS.outdir + "/monitor", algorithm_id=GYM_ALGO_ID)

def main():
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    agent = DDPG(env)
    # monitor = wrappers.Monitor(env, 'experiments/' + ENV_NAME, force=True)

    for episode in xrange(EPISODES):
        program_order_idx = np.random.randint(1, 4)
        env.set_order(program_order_idx, order_list[program_order_idx])
        state = env.reset()
        # Train
        for step in xrange(env.spec.timestep_limit):
            action = agent.noise_action(state, order_list[program_order_idx])
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, order_list[program_order_idx], action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if episode % 100 == 0 and episode > 100:
            # evaluate on the held-out (unseen) order
            ts_reward = 0
            for i in xrange(TEST):
                program_order_idx = 0
                env.set_order(program_order_idx, order_list[program_order_idx])
                state = env.reset()
                for j in xrange(env.spec.timestep_limit):
                    # monitor.render()
                    action = agent.action(state, order_list[program_order_idx])  # direct action for test
                    state, reward, done, _ = env.step(action)
                    ts_reward += reward
                    if done:
                        break
            ave_ts_reward = ts_reward / TEST / 200

            # evaluate on the training orders
            tr_reward = 0
            for i in xrange(TEST):
                program_order_idx = np.random.randint(1, 4)
                env.set_order(program_order_idx, order_list[program_order_idx])
                state = env.reset()
                for j in xrange(env.spec.timestep_limit):
                    # monitor.render()
                    action = agent.action(state, order_list[program_order_idx])  # direct action for test
                    state, reward, done, _ = env.step(action)
                    tr_reward += reward
                    if done:
                        break
            ave_tr_reward = tr_reward / TEST / 200

            print 'episode: ', episode, 'Unseen Case Average Reward:', ave_ts_reward, 'Training Case Average Reward:', ave_tr_reward
            f = open("pa_logs", "a")
            f.writelines('episode: ' + str(episode) + 'Unseen Case Average Reward:' + str(ave_ts_reward) +
                         'Training Case Average Reward:' + str(ave_tr_reward) + "\n")
            f.close()

def run(self):
    self.t_train = 0
    self.t_test = 0

    # create filtered environment
    self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
    # self.env = gym.make(FLAGS.env)
    self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print(dimO, dimA)

    import pprint
    pprint.pprint(self.env.spec.__dict__, width=1)

    self.agent = ddpg.Agent(dimO=dimO, dimA=dimA)

    simplelog = open(FLAGS.outdir + '/log.txt', 'w')

    # main loop
    while self.t_train < FLAGS.total:

        # test
        T = self.t_test
        R = []
        while self.t_test - T < FLAGS.test:
            R.append(self.run_episode(test=True, monitor=np.random.binomial(1, FLAGS.monitor)))
            self.t_test += 1
        avr = np.mean(R)
        print('Average test return\t{} after {} episodes of training'.format(avr, self.t_train))
        print >> simplelog, "%d\t%d" % (self.t_train, avr)

        # evaluate required number of episodes for gym and end training when above threshold
        if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
            avr = np.mean([self.run_episode(test=True) for _ in range(self.env.spec.trials)])
            if avr > self.env.spec.reward_threshold:
                break

        # train
        T = self.t_train
        R = []
        while self.t_train - T < FLAGS.train:
            R.append(self.run_episode(test=False))
            self.t_train += 1
        avr = np.mean(R)
        print('Average training return\t{} after {} episodes of training'.format(avr, self.t_train))

    self.env.monitor.close()

    # upload results
    if FLAGS.upload:
        gym.upload(FLAGS.outdir + "/monitor", algorithm_id=GYM_ALGO_ID)

def run(args):
    # experiment = "InvertedPendulum-v1"
    env = filter_env.makeFilteredEnv(gym.make(args.game))
    print "reward_threshold:", env.spec.reward_threshold, ", timestep_limit:", env.spec.timestep_limit

    save_dir = './result/%s/monitor/' % args.game
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    # env.monitor.start(save_dir, video_callable=lambda _: False, force=True)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_range = (env.action_space.low, env.action_space.high)
    print "action space range:", action_range

    train_dir = "./result/%s/tf/" % args.game
    agent = DDPG(state_dim, action_dim, train_dir=train_dir, gpu_id=args.gpu, dim=args.dim)

    t_train, t_test = 0, 0
    experiment = Experiment(env, agent, args.tmax)

    while True:
        # test
        T = t_test
        R = []
        # env.monitor.start(save_dir, video_callable=lambda _: False, resume=True)
        while t_test - T < args.test:
            r, t = experiment.run_episode(test=True, monitor=(len(R) == 0))
            R.append(r)
            t_test += t
        if len(R) > 0:
            avr = sum(R) / len(R)
            logger.info('Average test return\t{} after {} timesteps of training target: ({})'
                        .format(avr, t_train, env.spec.reward_threshold))
        # env.monitor.close()

        # train
        T = t_train
        R = []
        while t_train - T < args.train:
            r, t = experiment.run_episode(test=False)
            R.append(r)
            t_train += t
        if len(R) > 0:
            avr = sum(R) / len(R)
            logger.info('Average train return\t{} after {} timesteps of training'.format(avr, t_train))

def main(args):
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    agent = DDPG(env)
    # env.monitor.start('experiments/' + ENV_NAME, force=True)
    saver = tf.train.Saver()
    if args.checkpoint:
        saver.restore(agent.sess, args.checkpoint)
        print("Resuming checkpoint from " + args.checkpoint)
    max_reward = -100000

    for episode in range(EPISODES):
        state = env.reset()
        print("episode:", episode)
        # Train
        for step in range(MAX_STEPS):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if episode % 100 == 0 and episode > 100:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(MAX_STEPS):
                    # env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            if ave_reward > max_reward:
                max_reward = ave_reward
                saver.save(agent.sess, "models/ddpg_ep" + str(episode) + "-" + str(ave_reward))
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
            with open("models/ddpg_2.csv", "a") as savefile:
                wr = csv.writer(savefile, dialect="excel")
                wr.writerow([episode, ave_reward])

    env.monitor.close()

def main(args):
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    agent = DDPG(env)
    # env.monitor.start('experiments/' + ENV_NAME, force=True)
    saver = tf.train.Saver()
    saver.restore(agent.sess, args.checkpoint)
    print("Resuming checkpoint from " + args.checkpoint)

    max_reward = -100000
    rewards = []
    for i in range(TEST):
        state = env.reset()
        for j in range(MAX_STEPS):
            action = agent.action(state)  # direct action for test
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break
    print("Average Reward: {}".format(np.mean(rewards)))

def run(self):
    self.t_log = 103
    self.t_global = 0

    # create filtered environment
    self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
    self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print('dimO: ' + str(dimO) + ' dimA: ' + str(dimA))

    self.agent = ddpg.Agent(dimO=dimO, dimA=dimA)

    returns = []
    t_last_test = 0

    # main loop
    while self.t_global < t_train:

        # test
        t_last_test = self.t_global
        R = np.mean([self.run_episode(test=True, render=render) for _ in range(n_test)])
        returns.append((self.t_global, R))
        np.save(FLAGS.outdir + "/returns.npy", returns)
        print('Average return ' + str(R) + ' after ' + str(self.t_global) + ' timesteps of training')

        # train
        while self.t_global - t_last_test < FLAGS.test:
            self.run_episode(test=False)

    self.env.monitor.close()

    # upload results
    if FLAGS.gymkey != '':
        gym.upload(FLAGS.outdir + "/monitor", FLAGS.gymkey)

def main():
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    agent = DDPG(env)
    env.monitor.start('experiments/' + ENV_NAME, force=True)

    for episode in xrange(EPISODES):
        state = env.reset()
        # Train
        for step in xrange(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if episode % 100 == 0 and episode > 100:
            total_reward = 0
            min_reward = 1000
            max_reward = 0
            for i in xrange(TEST):
                reward_one = 0
                state = env.reset()
                for j in xrange(env.spec.timestep_limit):
                    # env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    reward_one += reward
                    if done:
                        break
                min_reward = min(min_reward, reward_one)
                max_reward = max(max_reward, reward_one)
            ave_reward = total_reward / TEST
            print 'episode: ', episode, 'Evaluation Reward: Average-', ave_reward, " Min-", min_reward, " Max-", max_reward

    env.monitor.close()

def main():
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    agent = DDPG(env)
    # env.monitor.start('experiments/' + ENV_NAME, force=True)
    env = gym.wrappers.Monitor(env, PATH, force=True)
    returns = []
    rewards = []

    for episode in xrange(EPISODES):
        state = env.reset()
        reward_episode = []
        print "episode:", episode
        # Train
        for step in xrange(env.spec.timestep_limit):
            env.render()
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            # print('state={}, action={}, reward={}, next_state={}, done={}'.format(state, action, reward, next_state, done))
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            reward_episode.append(reward)
            if done:
                break

        plt.figure(3)
        plt.plot(reward_episode)
        plt.show()

        # Testing:
        # if episode % 1 == 0:
        if episode % 10 == 0 and episode > 50:
            agent.save_model(PATH, episode)
            total_return = 0
            ave_reward = 0
            for i in xrange(TEST):
                state = env.reset()
                reward_per_step = 0
                for j in xrange(env.spec.timestep_limit):
                    # env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_return += reward
                    if done:
                        break
                    reward_per_step += (reward - reward_per_step) / (j + 1)
                ave_reward += reward_per_step

            ave_return = total_return / TEST
            ave_reward = ave_reward / TEST
            returns.append(ave_return)
            rewards.append(ave_reward)

            plt.figure(1)
            plt.plot(returns)
            plt.figure(2)
            plt.plot(rewards)
            plt.show()

            print 'episode: ', episode, 'Evaluation Average Return:', ave_return, ' Evaluation Average Reward: ', ave_reward

    # env is wrapped with gym.wrappers.Monitor, so close the wrapper itself
    env.close()

import filter_env
import rospy
from rl_agent_environment_communication.srv import *
import cv2
from cv_bridge import CvBridge
import gym
import numpy as np

ENV_NAME = 'LunarLanderContinuous-v2'
# ENV_NAME = 'Pendulum-v0'
DEBUG_SERVICES_MODE = False

env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
# env = gym.wrappers.Monitor(env, 'experiments/' + ENV_NAME, force=True)
state = env.reset()


def handle_environment_reset(req):
    print 'Resetting Env...'
    state = env.reset()
    resp = ResetEnvSrvResponse()
    resp.state = state
    return resp


def handle_environment_render(req):
    print 'Rendering Env...'
    image_state = env.render(mode='rgb_array')
    print 'ENV RENDERED!'

def main():
    # tensorflow session
    sess = tf.InteractiveSession()

    # set agents per each particle
    envs = np.zeros(n_particle, dtype=object)
    agents = np.zeros(n_particle, dtype=object)
    states = np.zeros(n_particle, dtype=object)
    dones = np.zeros(n_particle, dtype=bool)
    actor_nets = np.zeros(n_particle, dtype=object)
    actor_pg_list = np.zeros(n_particle, dtype=object)
    ave_reward = np.zeros(n_particle, dtype=float)

    for i in range(n_particle):
        envs[i] = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
        agents[i] = DDPG(sess, envs[i], i)
        dones[i] = False
        actor_nets[i] = agents[i].actor_network.net
        actor_pg_list[i] = agents[i].actor_network.pg_list

    actor_nets = np.array(list(actor_nets))
    actor_pg_list = np.array(list(actor_pg_list))
    svpg = SVPG(sess, actor_nets, actor_pg_list,
                envs[0].observation_space.shape[0],
                envs[0].action_space.shape[0],
                independent_flag)

    # session initialization and target NN update
    sess.run(tf.global_variables_initializer())
    for par in range(n_particle):
        agents[par].update_target()

    for episode in xrange(EPISODES):
        for par in range(n_particle):
            states[par] = envs[par].reset()

        # Train
        for par in range(n_particle):
            dones[par] = False
        for step in xrange(envs[0].spec.timestep_limit):
            flag = 0
            for par in range(n_particle):
                if not dones[par]:
                    action = agents[par].noise_action(states[par])
                    next_state, reward, done, _ = envs[par].step(action)
                    agents[par].save_to_buffer(states[par], action, reward, next_state, done)
                    states[par] = next_state
                    if done:
                        dones[par] = True
                if agents[par].can_train():
                    flag += 1
            if flag == n_particle:
                for par in range(n_particle):
                    # train critic NN and get policy gradient
                    agents[par].train()
                # svpg
                svpg.run()
                for par in range(n_particle):
                    agents[par].update_target()

        # Testing:
        if episode % 100 == 0 and episode > 100:
            for par in range(n_particle):
                total_reward = 0
                for i in xrange(TEST):
                    state = envs[par].reset()
                    for j in xrange(envs[0].spec.timestep_limit):
                        action = agents[par].action(state)  # direct action for test
                        state, reward, done, _ = envs[par].step(action)
                        total_reward += reward
                        if done:
                            break
                ave_reward[par] = total_reward / TEST
            print 'episode: ', episode, 'Evaluation Average Reward:', np.max(ave_reward)
            if np.max(ave_reward) > 950.0:
                print 'solved'
                exit(1)

def __init__(self,
             environment='MountainCarContinuous-v0',
             # environment = 'InvertedPendulum-v1',
             ):
    self.gamma = 0.99
    lr = 1e-3  # learning rate
    self.sess = tf.InteractiveSession()

    self.l1 = 100  # neurons layer 1
    self.l2 = 100  # neurons layer 2

    self.step = 0  # number of SGD-steps already taken
    self.summaries_dir = './logging/ddpg'

    replay_memory_size = int(5e5)  # number of transitions to be stored in replay buffer
    self.warmup = 0  # 5e4
    self.train_lengths = []
    self.test_lengths = []
    self.replay_memory = deque(maxlen=replay_memory_size)

    # environment specific:
    self.env_f = filter_env.makeFilteredEnv(gym.make(environment))
    self.select_env = environment
    self.num_outputs = 1
    self.action_dim = self.env_f.action_space.shape[0]
    self.state_dim = self.env_f.observation_space.shape[0]
    print('state dim', self.state_dim)
    print('action dim', self.action_dim)

    self.batch_size = 32
    self.samples_count = 0

    self.ou_process = ornstein_uhlenbeck(ndim=1, theta=0.15, sigma=.2, delta_t=1)

    ####### Initialize the Networks: ######
    self.state = tf.placeholder(tf.float32, [None, self.state_dim], name='x-states')
    self.action = tf.placeholder(tf.float32, [None, self.action_dim], name='x-action')

    neurons_layer1 = 200
    neurons_layer2 = 200

    theta_hidden = naf_net.theta_hidden(self.state_dim, neurons_layer1, neurons_layer2)
    self.hidden_out, _ = naf_net.hidden_layers(self.state, theta_hidden, name='hidden_net')

    theta_v = naf_net.theta_fc(neurons_layer2, 1)
    self.V, _ = naf_net.fc_layer(self.hidden_out, theta_v, tf.identity, 'v_layer')

    theta_l = naf_net.theta_fc(neurons_layer2, int(self.action_dim * (self.action_dim + 1) / 2))
    l, _ = naf_net.fc_layer(self.hidden_out, theta_l, tf.identity, 'l_layer')

    theta_mu = naf_net.theta_fc(neurons_layer2, self.action_dim)
    # theta_mu = [tf.Variable(tf.random_uniform((neurons_layer2, self.action_dim), -1e-3, 1e-3), name='1w'),
    #             tf.Variable(tf.random_uniform([self.action_dim], -1e-3, 1e-3), name='1b')]
    self.mu, _ = naf_net.fc_layer(self.hidden_out, theta_mu, tf.tanh, 'mu_layer')

    # prime net:
    self.state_prime = tf.placeholder(tf.float32, [None, self.state_dim], name='x-states_prime')
    theta_hidden_prime, update_hidden = naf_net.exponential_moving_averages(theta_hidden, 0.001)
    self.hidden_out_prime, _ = naf_net.hidden_layers(self.state, theta_hidden_prime, name='hidden_net_prime')
    theta_v_prime, update_theta_v = naf_net.exponential_moving_averages(theta_v, 0.001)
    V_prime, _ = naf_net.fc_layer(self.hidden_out, theta_v_prime, tf.identity, 'v_prime_layer')

    # creating the P matrix:
    pivot = 0
    rows = []
    for idx in xrange(self.action_dim):
        count = self.action_dim - idx
        diag_elem = tf.exp(tf.slice(l, (0, pivot), (-1, 1)))
        non_diag_elems = tf.slice(l, (0, pivot + 1), (-1, count - 1))
        row = tf.pad(tf.concat(1, (diag_elem, non_diag_elems)), ((0, 0), (idx, 0)))
        rows.append(row)
        pivot += count
    L = tf.transpose(tf.pack(rows), (1, 2, 0))
    P = tf.batch_matmul(L, tf.transpose(L, (0, 2, 1)))

    tmp = tf.expand_dims(self.action - self.mu, -1)
    A = -tf.batch_matmul(tf.transpose(tmp, (0, 2, 1)), tf.batch_matmul(P, tmp)) / 2
    A = tf.reshape(A, [-1, 1])

    with tf.name_scope('Q'):
        self.Q = A + self.V

    with tf.name_scope('optimization'):
        self.rew = tf.placeholder(tf.float32, [None, self.action_dim], name='reward')
        V_prime_stopped = tf.stop_gradient(V_prime)
        q_target = self.rew + self.gamma * V_prime_stopped
        self.loss = tf.reduce_mean(tf.squared_difference(tf.squeeze(q_target), tf.squeeze(self.Q)),
                                   name='td_error_loss')
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        grads_and_vars = optimizer.compute_gradients(self.loss,
                                                     var_list=theta_hidden + theta_v + theta_mu + theta_l)
        with tf.control_dependencies([update_hidden, update_theta_v]):
            self.train_step = optimizer.apply_gradients(grads_and_vars)

    # logging
    log_obs = [] if self.state_dim > 20 else [tf.histogram_summary("obs/" + str(i), self.state[:, i])
                                              for i in range(self.state_dim)]
    log_act = [] if self.action_dim > 20 else [tf.histogram_summary("act/inf" + str(i), self.mu[:, i])
                                               for i in range(self.action_dim)]
    log_act2 = [] if self.action_dim > 20 else [tf.histogram_summary("act/train" + str(i), self.action[:, i])
                                                for i in range(self.action_dim)]
    log_grad = [plotting.grad_histograms(grads_and_vars)]
    # self.log_all = tf.merge_summary(log_obs + log_act + log_act2)
    self.log_all = tf.scalar_summary('mean squared tderror', self.loss)

    plotting.hist_summaries(*list(theta_v_prime + theta_hidden_prime))

    # Merge all the summaries and write them out to /tmp/mnist_logs (by default)
    self.merged = tf.merge_all_summaries()
    self.train_writer = tf.train.SummaryWriter(self.summaries_dir, self.sess.graph)

    tf.initialize_all_variables().run()

def train():
    # parameter server and worker information
    ps_hosts = np.zeros(FLAGS.ps_hosts_num, dtype=object)
    worker_hosts = np.zeros(FLAGS.worker_hosts_num, dtype=object)
    port_num = FLAGS.st_port_num
    for i in range(FLAGS.ps_hosts_num):
        ps_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num)
        port_num += 1
    for i in range(FLAGS.worker_hosts_num):
        worker_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num)
        port_num += 1
    ps_hosts = list(ps_hosts)
    worker_hosts = list(worker_hosts)

    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    # Create and start a server for the local task.
    server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        device = tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index, cluster=cluster)
        # tf.set_random_seed(1)

        # env and model call
        env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
        agent = DDPG(env, device)

        # prepare session
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % FLAGS.task_index, cluster=cluster)):
            global_step = tf.get_variable('global_step', [],
                                          initializer=tf.constant_initializer(0),
                                          trainable=False)
            global_step_ph = tf.placeholder(global_step.dtype, shape=global_step.get_shape())
            global_step_ops = global_step.assign(global_step_ph)
            score = tf.get_variable('score', [],
                                    initializer=tf.constant_initializer(-21),
                                    trainable=False)
            score_ph = tf.placeholder(score.dtype, shape=score.get_shape())
            score_ops = score.assign(score_ph)
            init_op = tf.global_variables_initializer()

            # summary for tensorboard
            tf.summary.scalar("score", score)
            summary_op = tf.summary.merge_all()
            saver = tf.train.Saver()

        sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                 global_step=global_step,
                                 logdir=FLAGS.log_dir,
                                 summary_op=summary_op,
                                 saver=saver,
                                 init_op=init_op)

        with sv.managed_session(server.target) as sess:
            agent.set_sess(sess)
            while True:
                if sess.run([global_step])[0] > EPISODES:
                    break
                score = 0
                for ls in range(local_step):
                    state = env.reset()
                    for step in xrange(env.spec.timestep_limit):
                        action = agent.noise_action(state)
                        next_state, reward, done, _ = env.step(action)
                        agent.perceive(state, action, reward, next_state, done)
                        state = next_state
                        if done:
                            break
                for i in xrange(TEST):
                    state = env.reset()
                    for j in xrange(env.spec.timestep_limit):
                        # env.render()
                        action = agent.action(state)  # direct action for test
                        state, reward, done, _ = env.step(action)
                        score += reward
                        if done:
                            break
                sess.run(global_step_ops,
                         {global_step_ph: sess.run([global_step])[0] + local_step})
                sess.run(score_ops, {score_ph: score / TEST / 200})
                print(str(FLAGS.task_index) + "," + str(sess.run([global_step])[0]) + "," + str(score / TEST / 200))

        sv.stop()
        print("Done")

def run(self, env):
    self.t_train = 0
    self.t_test = 0

    # create filtered environment
    # self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
    self.env = filter_env.makeFilteredEnv(gym.make(env))
    self.t_elapsed = []
    # self.env = gym.make(FLAGS.env)
    if tf.gfile.Exists(FLAGS.outdir):
        tf.gfile.DeleteRecursively(FLAGS.outdir)
    # self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
    # gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print 'observationspace action space',
    print(dimO, dimA)

    import pprint
    pprint.pprint(self.env.spec.__dict__, width=1)

    self.agent = ddpg.Agent(dimO=dimO, dimA=dimA)

    returns = []
    it = 0
    episodelengths = []
    testlengths = []

    if env == 'Reacher-v1':
        self.train_frequency = 1
        test_frequency = 3
        plot_frequency = 1
    if env == 'MountainCarContinuous-v0':
        test_frequency = 10
        plot_frequency = 1
        self.train_frequency = 16
    if env == 'InvertedPendulum-v1':
        test_frequency = 100
        plot_frequency = 300
        self.train_frequency = 1
    print 'using train frequency', self.train_frequency

    # main loop
    while self.t_train < FLAGS.total:
        it += 1
        episodelengths.append(self.run_episode(test=False))
        if it % test_frequency == 0:
            testlengths.append(self.run_episode(test=True))
        if it % plot_frequency == 0:
            print 'avg time for sim step:', np.mean(np.array(self.t_elapsed))
            plotting.plot_episode_lengths(episodelengths)
            plotting.plot_episode_lengths(testlengths)
            # plotting.plot_replay_memory_2d_state_histogramm(self.agent.rm.observations)
            # plotting.plot_learned_mu(self.agent.act_test, self.env)

    # else:
    #     # test
    #     T = self.t_test
    #     R = []
    #     while self.t_test - T < FLAGS.test:
    #         # print 'running test episode'
    #         R.append(self.run_episode(test=True, monitor=(self.t_test - T < FLAGS.monitor * FLAGS.test)))
    #     avr = np.mean(R)
    #     print('Average test return\t{} after {} timesteps of training'.format(avr, self.t_train))
    #     # save return
    #     returns.append((self.t_train, avr))
    #     np.save(FLAGS.outdir + "/returns.npy", returns)
    #
    #     # evaluate required number of episodes for gym and end training when above threshold
    #     if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
    #         avr = np.mean([self.run_episode(test=True) for _ in range(self.env.spec.trials)])
    #         if avr > self.env.spec.reward_threshold:
    #             break
    #
    #     # train
    #     T = self.t_train
    #     R = []
    #     while self.t_train - T < FLAGS.train:
    #         # print 'running train episode'
    #         R.append(self.run_episode(test=False))
    #     avr = np.mean(R)
    #     print('Average training return\t{} after {} timesteps of training'.format(avr, self.t_train))
    # self.env.monitor.close()

    # upload results
    if FLAGS.upload:
        gym.upload(FLAGS.outdir + "/monitor", algorithm_id=GYM_ALGO_ID)

def main(args):
    if VERBOSE:
        print '***The Replay Buffer currently always returns the most recent experiences (instead of random), so the batches are constant between the tf and torch nets.'

    state_dim = 3
    action_dim = 1

    net = ActorCriticNet(state_dim, action_dim)
    target_net = copy.deepcopy(net)
    memory = ReplayBuffer(REPLAY_BUFFER_SIZE)
    noise = OUNoise(action_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, weight_decay=L2)
    target_optim = optim.Optimizer(target_net.parameters(), {})  # to iterate over target params

    if VERBOSE:
        print '***Making gym env (only used to setup TF net).'

    # load tf net (restoring saved parameters)
    dtf = ddpg_tf.DDPG_TF(filter_env.makeFilteredEnv(gym.make('Pendulum-v0')),
                          loadfilename='tf_params-0', printVars=False)
    if VERBOSE:
        print '***TF net restore complete.'

    # load control data (only using every fourth sample), and tf net results
    control_states = np.load('control_states.npy')[::4]
    control_rewards = np.load('control_rewards.npy')[::4]
    tf_record = np.load('tf_control_record.npy')

    # replace torch params with tf params, and run control data, collecting torch net results
    # first optimization step will occur at i == 50, upon which extra data is recorded to compare tf and torch
    # using: no bn, REPLAY_BUFFER_SIZE=200, REPLAY_START_SIZE=50, BATCH_SIZE=50,
    # constant replay_buffer_batches (always the most recent experiences)
    replaceNetParams(dtf, net, target_net)
    if VERBOSE:
        print '***Torch net params initialized to TF net params.'

    original_net = copy.deepcopy(net)  # save original net
    original_target_net = copy.deepcopy(target_net)

    torch_record = []
    loss = -1
    first_step = True

    for i in xrange(len(control_rewards) - 1):
        state = torch.from_numpy(control_states[i].reshape(1, state_dim)).float()
        action = net.getAction(Variable(state)).data
        target_action = target_net.getAction(Variable(state)).data
        reward = torch.FloatTensor([[control_rewards[i]]]).float()
        new_state = torch.from_numpy(control_states[i + 1].reshape(1, state_dim)).float()
        memory.add(state, action, reward, new_state, True)

        if memory.count() > REPLAY_START_SIZE:
            minibatch = memory.get_batch(BATCH_SIZE)
            state_batch = torch.cat([data[0] for data in minibatch], dim=0)
            action_batch = torch.cat([data[1] for data in minibatch], dim=0)
            reward_batch = torch.cat([data[2] for data in minibatch])
            next_state_batch = torch.cat([data[3] for data in minibatch], dim=0)
            done_batch = Tensor([data[4] for data in minibatch])

            # calculate y_batch from targets
            # next_action_batch = target_net.getAction(Variable(next_state_batch))
            value_batch = target_net.getValue(Variable(next_state_batch)).data
            y_batch = reward_batch + GAMMA * value_batch * done_batch

            if first_step:
                if VERBOSE:
                    print '***First Optimization Step complete.'
                torch_ys = y_batch
                torch_batch = minibatch
                torch_outs = net.getValue(Variable(state_batch)).data

            # optimize net 1 step
            loss = criterion(net.getValue(Variable(state_batch)), Variable(y_batch))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss = loss.data[0]

            # update targets - using exponential moving averages
            for group, target_group in zip(optimizer.param_groups, target_optim.param_groups):
                for param, target_param in zip(group['params'], target_group['params']):
                    target_param.data.mul_(1 - TAU)
                    target_param.data.add_(TAU, param.data)

            if first_step:
                first_step_net = copy.deepcopy(net)
                first_step_target_net = copy.deepcopy(target_net)
                first_step = False

        torch_record.append([action.numpy()[0][0], target_action.numpy()[0][0], loss])
        loss = -1

    torch_record = np.array(torch_record)
    torch_outs = torch_outs.numpy().T[0]
    torch_ys = torch_ys.numpy().T[0]
    if VERBOSE:
        print '***Control Data run complete.'

    # compare torch and tf results
    # results for each net have 3 columns: [net action prediction, target net action prediction, loss (-1 if there was no training)]
    sel = np.arange(45, 55)
    # print calc_error(tf_record[sel,:], torch_record[sel,:])
    print 'Result comparison:'
    print 'control_data_index | tf_net_action | tf_target_net_action | tf_loss | torch_net_action | torch_target_net_action | torch_loss'
    print np.hstack([sel[:, np.newaxis], tf_record[sel, :], torch_record[sel, :]])
    print '\t(a loss of -1 means no training occurred in that step)'

    # load all tf results from before taking first optimization step
    tf_ys = np.load('tf_first_step_y_batch.npy')
    tf_rs = np.load('tf_first_step_reward_batch.npy')
    tf_ds = np.load('tf_first_step_done_batch.npy')
    tf_vs = np.load('tf_first_step_value_batch.npy')
    tf_outs = np.load('tf_first_step_output_values.npy')
    torch_wd = 1.36607  # weight decay loss of tf net at first optimization step - recorded directly from terminal output of tf net

    if VERBOSE:
        print '***Comparing first step stats'

    # compare tf and torch data from before taking first optimization step
    # including calculation of manual loss
    print '\terror in ys (between tf and torch)', calc_error(torch_ys, tf_ys)
    print '\terror in predictions (between tf and torch)', calc_error(torch_outs, tf_outs)
    print '\ttorch loss (manually calculated)', np.mean((torch_ys - torch_outs) ** 2)
    print '\ttf loss (manually calculated)', np.mean((tf_ys - tf_outs) ** 2)
    print '\ttorch loss', torch_record[50, 2], '(not including weight decay)'
    print '\ttf loss', tf_record[50, 2] - torch_wd, '(not including weight decay)'

    return 0

# Fragment: body of FMPolicy.g (truncated above), which maps a continuous action
# to a set of recommended item vectors via the factorization-machine model.
        action = np.clip(np.round(action), self._env.action_space.low, self._env.action_space.high)
        # action as integer id
        action = int("".join([str(int(a)) for a in action]), 2)
        vector_length = len(self._env.action_space.high)
        if action > 0:
            timestamp = long(time.time() * 1000) - 874724710
            observed_items = gl.SFrame({'item_id': [action],
                                        'timestamp': [timestamp],
                                        'prev_item': [state]})
            nearest_neighbors = k if k is not None else FLAGS.knn
            k_interactions = self._model.recommend_from_interactions(observed_items,
                                                                     k=nearest_neighbors,
                                                                     diversity=1)
            items = ["{0:0{1}b}".format(item[0], vector_length)
                     for item in k_interactions[["item_id"]].to_numpy()]
        else:
            items = ["00000000000"]
        item_vectors = []
        for item in items:
            item_vectors.append(np.array([float(item[i]) for i in range(vector_length)]))
        return item_vectors


if __name__ == '__main__':
    # env = filter_env.makeFilteredEnv(gym.make("InvertedDoublePendulum-v1"))
    env = filter_env.makeFilteredEnv(gym.make("CollaborativeFiltering-v3"))
    x = FMPolicy(env)
    obs = env.reset()
    cont_action = env.action_space.sample()
    print('==Action in continuous space: {}'.format(cont_action))
    result = x.g(cont_action)
    print(result)

def run(self):
    self.t_train = 0
    self.t_test = 0

    # create filtered environment
    self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env),
                                          skip_space_norm=FLAGS.skip_space_norm,
                                          wolpertinger=FLAGS.wolpertinger)
    # self.env = gym.make(FLAGS.env)
    self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
    # self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: True)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print(dimO, dimA)

    import pprint
    pprint.pprint(self.env.spec.__dict__, width=1)

    wolp = None
    if FLAGS.wolpertinger:
        wolp = wp.Wolpertinger(self.env, i=FLAGS.wp_total_actions,
                               action_set=wp.load_action_set(FLAGS.wp_action_set_file,
                                                             i=FLAGS.wp_total_actions,
                                                             action_shape=dimA[0])).g
    elif FLAGS.fmpolicy:
        wolp = fmp.FMPolicy(self.env).g

    self.agent = ddpg.Agent(dimO=dimO, dimA=dimA,
                            custom_policy=FLAGS.wolpertinger or FLAGS.fmpolicy,
                            env_dtype=str(self.env.action_space.high.dtype))

    returns = []

    # main loop
    while self.t_train < FLAGS.total:

        # test
        T = self.t_test
        R = []
        if self.t_train - T > 0 or FLAGS.train == 0:
            while self.t_test - T < FLAGS.test:
                R.append(self.run_episode(test=True,
                                          monitor=(self.t_test - T < FLAGS.monitor * FLAGS.test),
                                          custom_policy=wolp))
                self.t_test += 1
            avr = np.mean(R)
            # print('Average test return\t{} after {} timesteps of training'.format(avr, self.t_train))
            with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
                # f.write('Average test return\t{} after {} timesteps of training\n'.format(avr, self.t_train))
                f.write('Average test return\t{} after {} timesteps\n'.format(avr, self.t_train + FLAGS.test))

            # save return
            returns.append((self.t_train, avr))
            np.save(FLAGS.outdir + "/returns.npy", returns)

            s = self.agent.checkpoint_session()
            with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
                f.write('Checkpoint saved at {} \n'.format(s))

            # evaluate required number of episodes for gym and end training when above threshold
            if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
                # TODO: it is supposed that when testing the model does not have to use the full wolpertinger policy?
                # TODO: to avoid the item not found exception in environment, custom policy is being sent to the run_episode
                avr = np.mean([self.run_episode(test=True, custom_policy=wolp)
                               for _ in range(self.env.spec.trials)])  # trials???
                # print('TRIALS => Average return{}\t Reward Threshold {}'.format(avr, self.env.spec.reward_threshold))
                with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
                    f.write('TRIALS => Average return{}\t Reward Threshold {}\n'.format(avr, self.env.spec.reward_threshold))
                if avr > self.env.spec.reward_threshold:
                    s = self.agent.checkpoint_session()
                    with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
                        f.write('Final Checkpoint saved at {} \n'.format(s))
                    break

        # train
        T = self.t_train
        R = []
        start_time = time.time()
        while self.t_train - T < FLAGS.train:
            R.append(self.run_episode(test=False, custom_policy=wolp))
            self.t_train += 1
        end_time = time.time()
        avr = np.mean(R)
        # print('Average training return\t{} after {} timesteps of training'.format(avr, self.t_train))
        with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
            f.write('Average training return\t{} after {} timesteps of training. Batch time: {} sec.\n'
                    .format(avr, self.t_train, end_time - start_time))

    self.env.monitor.close()
    f.close()

    # upload results
    if FLAGS.upload:
        gym.upload(FLAGS.outdir + "/monitor", algorithm_id=GYM_ALGO_ID)
