def main():
    try:
        NUM_ITER = int(sys.argv[1])
    except:
        NUM_ITER = 1000
    try:
        NUM_EPISODES = int(sys.argv[2])
    except:
        NUM_EPISODES = 1
    try:
        UPLOAD = bool(sys.argv[3])
    except:
        UPLOAD = False
    try:
        PROBLEM = sys.argv[4]
    except:
        PROBLEM = 'Copy-v0'

    DIR = '/tmp/openai/' + PROBLEM + '/'
    env = gym.make(PROBLEM)
    env.monitor.start(DIR, force=True)

    for _ in xrange(NUM_EPISODES):
        print 'running episode', _
        env.reset()
        ai = MctsAi(env, NUM_ITER=NUM_ITER)
        ai.playAll()

    env.monitor.close()
    if UPLOAD:
        gym.upload(DIR, api_key='sk_xzHs5ZzjQviZDZ8R2luFPw')
    return 0
def run(self, epochs, steps, api_key, rollouts_per_epoch=100,
        updateTargetNetwork=defaultRunSettings['updateTargetNetwork'],
        explorationRate=defaultRunSettings['explorationRate'],
        miniBatchSize=defaultRunSettings['miniBatchSize'],
        learnStart=defaultRunSettings['learnStart'],
        renderPerXEpochs=defaultRunSettings['renderPerXEpochs'],
        shouldRender=defaultRunSettings['shouldRender'],
        experimentId=defaultRunSettings['experimentId'],
        force=defaultRunSettings['force'],
        upload=defaultRunSettings['upload']):
    last100Scores = [0] * 100
    last100ScoresIndex = 0
    last100Filled = False

    if experimentId != None:
        self.env.monitor.start('tmp/' + experimentId, force=force)

    for epoch in xrange(epochs):
        paths = []
        for rollout in xrange(rollouts_per_epoch):
            path = {}
            path["actions"] = []
            path["rewards"] = []
            path["states"] = []
            path["isDone"] = []
            observation = self.env.reset()

            # number of timesteps
            totalReward = 0
            for t in xrange(steps):
                policyValues = self.runModel(self.policyModel, observation)
                action = self.selectActionByProbability(policyValues)
                # action = self.selectActionByProbability(self.convertToProbabilities(policyValues))
                path["states"].append(observation)
                path["actions"].append(action)
                newObservation, reward, done, info = self.env.step(action)
                path["rewards"].append(reward)
                path["isDone"].append(done)
                totalReward += reward
                observation = newObservation
                if done:
                    break
            paths.append(path)
        self.learn(paths)

    self.env.monitor.close()
    if upload:
        gym.upload('tmp/' + experimentId, api_key=api_key)
def upload():
    """
    Upload the results of training (as automatically recorded by your
    env's monitor) to OpenAI Gym.

    Parameters:
        - training_dir: A directory containing the results of a training run.
        - api_key: Your OpenAI API key
        - algorithm_id (default=None): An arbitrary string indicating the
          particular version of the algorithm (including choices of
          parameters) you are running.
    """
    request_data = request.get_json()
    j = request.get_json()

    training_dir = get_required_param(j, 'training_dir')
    api_key = get_required_param(j, 'api_key')
    algorithm_id = j.get('algorithm_id', None)

    try:
        gym.upload(training_dir, algorithm_id, writeup=None, api_key=api_key,
                   ignore_open_monitors=False)
        return ('', 204)
    except gym.error.AuthenticationError:
        raise InvalidUsage('You must provide an OpenAI Gym API key')
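For reference, a minimal client-side sketch of how an endpoint like the one above might be called over HTTP. The route '/v1/upload/', the host/port, and the example training_dir are assumptions for illustration, not taken from the snippet itself:

import requests

# Hypothetical client call; the route, host, port, and training_dir below are illustrative assumptions.
resp = requests.post(
    'http://127.0.0.1:5000/v1/upload/',
    json={
        'training_dir': '/tmp/random-agent-results',  # directory written by the env monitor
        'api_key': 'YOUR_API_KEY',
        'algorithm_id': None,
    },
)
print(resp.status_code)  # the handler above returns 204 on success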
def run(self):
    for entry in os.listdir(self.base_dir):
        if entry in ['.', '..']:
            continue
        training_dir = os.path.join(self.base_dir, entry)
        if not os.path.isdir(training_dir):
            logger.info('Skipping: {}'.format(training_dir))
            continue
        gym.upload(training_dir, algorithm_id=self.algorithm_id,
                   writeup=self.writeup)
def run(self):
    self.t_train = 0
    self.t_test = 0

    # create filtered environment
    self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
    # self.env = gym.make(FLAGS.env)
    self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print(dimO, dimA)

    import pprint
    pprint.pprint(self.env.spec.__dict__, width=1)

    self.agent = ddpg.Agent(dimO=dimO, dimA=dimA)
    returns = []

    # main loop
    while self.t_train < FLAGS.total:

        # test
        T = self.t_test
        R = []
        self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False, resume=True)
        while self.t_test - T < FLAGS.test:
            R.append(self.run_episode(test=True, monitor=(len(R) == 0)))
        avr = np.mean(R)
        print('Average test return\t{} after {} timesteps of training'.format(avr, self.t_train))

        # save return
        returns.append((self.t_train, avr))
        np.save(FLAGS.outdir + "/returns.npy", returns)

        # evaluate required number of episodes for gym
        if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
            for i in range(self.env.spec.trials):
                self.run_episode(test=True)
        self.env.monitor.close()

        # train
        T = self.t_train
        R = []
        while self.t_train - T < FLAGS.train:
            R.append(self.run_episode(test=False))
        avr = np.mean(R)
        print('Average training return\t{} after {} timesteps of training'.format(avr, self.t_train))

    self.env.monitor.close()

    # upload results
    if FLAGS.upload:
        gym.upload(FLAGS.outdir + "/monitor", algorithm_id=GYM_ALGO_ID)
def run(self):
    self.t_train = 0
    self.t_test = 0

    # create filtered environment
    self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
    # self.env = gym.make(FLAGS.env)
    self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print(dimO, dimA)

    import pprint
    pprint.pprint(self.env.spec.__dict__, width=1)

    self.agent = ddpg.Agent(dimO=dimO, dimA=dimA)

    simplelog = open(FLAGS.outdir + '/log.txt', 'w')

    # main loop
    while self.t_train < FLAGS.total:

        # test
        T = self.t_test
        R = []
        while self.t_test - T < FLAGS.test:
            R.append(self.run_episode(test=True, monitor=np.random.binomial(1, FLAGS.monitor)))
            self.t_test += 1
        avr = np.mean(R)
        print('Average test return\t{} after {} episodes of training'.format(avr, self.t_train))
        print >> simplelog, "%d\t%d" % (self.t_train, avr)

        # evaluate required number of episodes for gym and end training when above threshold
        if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
            avr = np.mean([self.run_episode(test=True) for _ in range(self.env.spec.trials)])
            if avr > self.env.spec.reward_threshold:
                break

        # train
        T = self.t_train
        R = []
        while self.t_train - T < FLAGS.train:
            R.append(self.run_episode(test=False))
            self.t_train += 1
        avr = np.mean(R)
        print('Average training return\t{} after {} episodes of training'.format(avr, self.t_train))

    self.env.monitor.close()

    # upload results
    if FLAGS.upload:
        gym.upload(FLAGS.outdir + "/monitor", algorithm_id=GYM_ALGO_ID)
def upload(self, instance_id, algorithm_id, writeup, api_key, ignore_open_monitors):
    """
    Upload training information created with monitor.

    :param instance_id: Id of the environment that was trained.
    :param algorithm_id: An arbitrary string indicating the particular version of the
        algorithm (including choices of parameters) you are running.
    :param writeup: A Gist URL (of the form https://gist.github.com/<user>/<id>)
        containing your writeup for this evaluation.
    :param api_key: Your OpenAI API key. Can also be provided as an environment
        variable (OPENAI_GYM_API_KEY).
    :param ignore_open_monitors: Ignore open monitors when uploading.
    :return:
    """
    directory = self.TRAINING_DIRECTORY.format(instance_id)
    gym.upload(directory, algorithm_id, writeup, api_key, ignore_open_monitors)
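A minimal sketch of how a method like this might be used, with a stand-in class. The class name MonitorUploader, the TRAINING_DIRECTORY template, and the instance id below are assumptions for illustration only:

import gym

class MonitorUploader(object):
    # Assumed format string keyed by the environment instance id; the real class defines its own.
    TRAINING_DIRECTORY = '/tmp/openai-gym/{}'

    def upload(self, instance_id, algorithm_id, writeup, api_key, ignore_open_monitors):
        directory = self.TRAINING_DIRECTORY.format(instance_id)
        gym.upload(directory, algorithm_id, writeup, api_key, ignore_open_monitors)

# Hypothetical call: uploads whatever the monitor recorded under the assumed directory.
MonitorUploader().upload('cartpole-run-1', algorithm_id=None, writeup=None,
                         api_key='YOUR_API_KEY', ignore_open_monitors=False)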
def run(self):
    self.t_log = 103
    self.t_global = 0

    # create filtered environment
    self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
    self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print('dimO: ' + str(dimO) + ' dimA: ' + str(dimA))

    self.agent = ddpg.Agent(dimO=dimO, dimA=dimA)

    returns = []
    t_last_test = 0

    # main loop
    while self.t_global < t_train:

        # test
        t_last_test = self.t_global
        R = np.mean([self.run_episode(test=True, render=render) for _ in range(n_test)])
        returns.append((self.t_global, R))
        np.save(FLAGS.outdir + "/returns.npy", returns)
        print('Average return ' + str(R) + ' after ' + str(self.t_global) + ' timesteps of training')

        # train
        while self.t_global - t_last_test < FLAGS.test:
            self.run_episode(test=False)

    self.env.monitor.close()

    # upload results
    if FLAGS.gymkey != '':
        gym.upload(FLAGS.outdir + "/monitor", api_key=FLAGS.gymkey)
...
#%% Monitor Wrapper
# cd d:/ROBOCZY/Python/Gym

#%%
import gym
import ffmpeg
from gym import wrappers

env = gym.make('CartPole-v0')
env = wrappers.Monitor(env, './cartpole-experiment-1')

for i_episode in range(20):
    observation = env.reset()
    for t in range(100):
        env.render()
        print(observation)
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            break

env.close()

gym.upload('./cartpole-experiment-1', api_key='blah')
#! ERROR: DependencyNotInstalled:f

#%%
import gym

gym.upload('/tmp/cartpole-experiment-1', api_key='sk_5YJsWfHOQwOLiU3AAVyYeA')
#from random import *
#import tensorflow as tf
#from nets import *
#from learner import *

def h(x):
    return 0 if x < 0 else 1

if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    env = gym.wrappers.Monitor(env, './evolution/cartpole-v0', force=True)

    policy_f = lambda w, obs: int(h(np.dot(w[0:4], obs) + w[4]))
    (te, _, _) = evolve_env(env, policy_f, np.asarray([0, 0, 0, 0, 0]),
                            gaussian_perturb, normalized_avg,
                            alpha=0.1, spawn=50, stages=50,
                            print_every=1, max_steps=1000, eval_trials=100)

    print("Running:")
    for i in range(1000):
        print("Reward %f" % env_f(env, policy_f, te, max_steps=1000))

    env.close()
    gym.upload('./evolution/cartpole-v0', api_key='API_KEY')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--environment', type=str, default='CartPole-v0',
                        help='OpenAI Gym environment to run.')
    parser.add_argument('-p', '--episodes', type=int, default=1000,
                        help='Number of episodes to simulate.')
    parser.add_argument('-g', '--goal', type=int, default=195,
                        help='Goal score for the environment.')
    parser.add_argument('-t', '--time', type=int, default=200,
                        help='Time steps for each episode.')
    parser.add_argument('-a', '--agent', type=str, default='QL',
                        help='Learning agent type (QL or NAF).')
    parser.add_argument('--report', action='store_true',
                        help='Report results.')
    parser.add_argument('-d', '--debug', action='store_true',
                        help='Print max values at each time-step.')
    args = parser.parse_args()
    print(args)

    environment = args.environment
    episodes = args.episodes
    goal = args.goal
    time = args.time

    env = gym.make(environment)

    if args.agent == 'QL':
        agent = QLAgent(env)
    elif args.agent == 'NAF':
        agent = NAFAgent(env)
    elif args.agent == 'Random':
        agent = RandomAgent(env)

    scores = []

    if args.report:
        filename = 'tmp/gym-report'
        env.monitor.start(filename, force=True)

    for i_episode in range(episodes):
        # Get initial observation.
        agent.reset()
        observation = env.reset()
        score = 0
        alt_score = 0

        # Run n = time steps
        for t in range(time):
            # Save the previous state.
            prev_state = observation

            #env.render()
            #if i_episode % 500 == 0:
            #    env.render()

            report = args.debug
            next_action = agent.get_action(observation, report)
            observation, reward, done, info = env.step(next_action)
            score += reward
            reward += observation[0]
            if report:
                print(reward)
            alt_score += reward

            agent.update(prev_state, next_action, reward, observation, done)

            if done or t == time - 1:
                print(i_episode + 1, score, alt_score, t, done)
                scores.append(score)
                running_avg = np.average(scores[-100:])
                #if running_avg > goal:
                #    print '100-run average {0} on run {1}!'.format(
                #        running_avg, i_episode)
                if (i_episode + 1) % 50 == 0:
                    print('{0} average score at {1}'.format(running_avg, i_episode + 1))
                break

    if args.report:
        env.monitor.close()
        key_file = open('api.key', 'r')
        gym_key = key_file.readline()
        if args.agent == 'NAF':
            algo_id = 'alg_xjVArtUxQXqfSq5q89dRjQ'
        else:
            algo_id = 'alg_sbIxfyjIRUSBrBA1IOFg'
        gym.upload(filename, api_key=gym_key, algorithm_id=algo_id)
    return
def uploadSimulation():
    API_KEY = open('/home/dollarakshay/Documents/API Keys/Open AI Key.txt', 'r').read().rstrip()
    gym.upload('Artificial Intelligence/' + GAME, api_key=API_KEY)
env.monitor.start('./frozenlake-experiment', force=True)

for i_episode in range(n_episode):
    observation = env.reset()  # reset environment to beginning

    # run for several time-steps
    for t in xrange(max_time_steps):
        # display experiment
        #env.render()

        # sample a random action
        action = opt[observation]

        # observe next step and get reward
        observation, reward, done, info = env.step(action)

        if done:
            #env.render()
            print "Simulation finished after {0} timesteps".format(t)
            break

env.monitor.close()

gym.upload('/home/lucianodp/Documents/eua/reinforcement_learning/notebooks/frozenlake-experiment',
           api_key='sk_qkx3jhBbTRamxadtXqA3pQ')
def run(self, env):
    self.t_train = 0
    self.t_test = 0

    # create filtered environment
    # self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
    self.env = filter_env.makeFilteredEnv(gym.make(env))
    self.t_elapsed = []
    # self.env = gym.make(FLAGS.env)

    if tf.gfile.Exists(FLAGS.outdir):
        tf.gfile.DeleteRecursively(FLAGS.outdir)

    # self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
    # gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print 'observationspace action space',
    print(dimO, dimA)

    import pprint
    pprint.pprint(self.env.spec.__dict__, width=1)

    self.agent = ddpg.Agent(dimO=dimO, dimA=dimA)

    returns = []
    it = 0
    episodelengths = []
    testlengths = []

    if env == 'Reacher-v1':
        self.train_frequency = 1
        test_frequency = 3
        plot_frequency = 1
    if env == 'MountainCarContinuous-v0':
        test_frequency = 10
        plot_frequency = 1
        self.train_frequency = 16
    if env == 'InvertedPendulum-v1':
        test_frequency = 100
        plot_frequency = 300
        self.train_frequency = 1

    print 'using train frequency', self.train_frequency

    # main loop
    while self.t_train < FLAGS.total:
        it += 1

        episodelengths.append(self.run_episode(test=False))

        if it % test_frequency == 0:
            testlengths.append(self.run_episode(test=True))

        if it % plot_frequency == 0:
            print 'avg time for sim step:', np.mean(np.array(self.t_elapsed))
            plotting.plot_episode_lengths(episodelengths)
            plotting.plot_episode_lengths(testlengths)
            # plotting.plot_replay_memory_2d_state_histogramm(self.agent.rm.observations)
            # plotting.plot_learned_mu(self.agent.act_test, self.env)

        # else:
        #     # test
        #     T = self.t_test
        #     R = []
        #     while self.t_test - T < FLAGS.test:
        #         # print 'running test episode'
        #         R.append(self.run_episode(test=True, monitor=(self.t_test - T < FLAGS.monitor * FLAGS.test)))
        #     avr = np.mean(R)
        #     print('Average test return\t{} after {} timesteps of training'.format(avr, self.t_train))
        #     # save return
        #     returns.append((self.t_train, avr))
        #     np.save(FLAGS.outdir + "/returns.npy", returns)
        #
        #     # evaluate required number of episodes for gym and end training when above threshold
        #     if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
        #         avr = np.mean([self.run_episode(test=True) for _ in range(self.env.spec.trials)])
        #         if avr > self.env.spec.reward_threshold:
        #             break
        #
        #     # train
        #     T = self.t_train
        #     R = []
        #     while self.t_train - T < FLAGS.train:
        #         # print 'running train episode'
        #         R.append(self.run_episode(test=False))
        #     avr = np.mean(R)
        #     print('Average training return\t{} after {} timesteps of training'.format(avr, self.t_train))

    # self.env.monitor.close()

    # upload results
    if FLAGS.upload:
        gym.upload(FLAGS.outdir + "/monitor", algorithm_id=GYM_ALGO_ID)
        if episode_number % EPISODES_PER_PRINT_PROGRESS == 0:
            t = perf_counter() - training_start_time
            print("(%d s) Episode: %d, Average reward = %f." %
                  (t, episode_number, reward_sum / EPISODES_PER_PRINT_PROGRESS))
            reward_sum = 0

        # It is considered solved when the sum of reward is over 200
        if reward > DONE_REWARD_LEVEL:
            num_streaks += 1
            solved_episode = episode_number
        else:
            num_streaks = 0
            solved_episode = -1

        # It's considered done when it's solved over 120 times consecutively
        if num_streaks > STREAK_TO_END:
            print("Task solved in %d episodes and repeated %d times." % (episode_number, num_streaks))
            break

    agent.brain.model.save_model(
        os.path.join(TRAINED_MODEL_DIR, TRAINED_MODEL_NAME), False)

    if GYM_ENABLE_UPLOAD:
        env.monitor.close()
        gym.upload(GYM_VIDEO_PATH, api_key=GYM_API_KEY)

    # testing the model
    test(os.path.join(TRAINED_MODEL_DIR, TRAINED_MODEL_NAME), num_episodes=1000)
def run(self, epochs, steps, api_key, monitor=True,
        updateTargetNetwork=defaultRunSettings['updateTargetNetwork'],
        explorationRate=defaultRunSettings['explorationRate'],
        miniBatchSize=defaultRunSettings['miniBatchSize'],
        learnStart=defaultRunSettings['learnStart'],
        renderPerXEpochs=defaultRunSettings['renderPerXEpochs'],
        shouldRender=defaultRunSettings['shouldRender'],
        experimentId=defaultRunSettings['experimentId'],
        force=defaultRunSettings['force'],
        upload=defaultRunSettings['upload']):
    last100Scores = [0] * 100
    last100ScoresIndex = 0
    last100Filled = False
    stepCounter = 0

    if experimentId != None and monitor:
        self.env.monitor.start('tmp/' + experimentId, force=force)

    for epoch in xrange(epochs):
        observation = self.env.reset()
        qValues = self.getQValues(observation)
        action = self.selectAction(qValues, explorationRate)
        print explorationRate

        # number of timesteps
        totalReward = 0
        for t in xrange(steps):
            if epoch % renderPerXEpochs == 0 and shouldRender:
                self.env.render()

            newObservation, reward, done, info = self.env.step(action)
            qValues = self.getQValues(observation)
            newAction = self.selectAction(qValues, explorationRate)
            totalReward += reward

            self.addMemory(observation, action, reward, newObservation, newAction, done)

            if stepCounter >= learnStart:
                if stepCounter <= updateTargetNetwork:
                    self.learnOnMiniBatch(miniBatchSize, False)
                else:
                    self.learnOnMiniBatch(miniBatchSize, True)

            observation = newObservation.copy()
            action = newAction

            if done:
                last100Scores[last100ScoresIndex] = totalReward
                last100ScoresIndex += 1
                if last100ScoresIndex >= 100:
                    last100Filled = True
                    last100ScoresIndex = 0
                if not last100Filled:
                    print "Episode ", epoch, " finished after {} timesteps".format(t + 1), " with total reward", totalReward
                else:
                    print "Episode ", epoch, " finished after {} timesteps".format(t + 1), " with total reward", totalReward, " last 100 average: ", (sum(last100Scores) / len(last100Scores))
                break

            stepCounter += 1
            if stepCounter % updateTargetNetwork == 0:
                self.updateTargetNetwork()
                print "updating target network"

        explorationRate *= 0.995
        # explorationRate -= (2.0/epochs)
        explorationRate = max(0.05, explorationRate)

    self.env.monitor.close()
    if upload:
        gym.upload('tmp/' + experimentId, api_key=api_key)
import gym

gym.upload('tmp/MountainCar-v0-mc', algorithm_id=None,
           api_key='sk_iQhXCgvKRjyxitEVLVW13g')
def do_submit(output, api_key):
    gym.upload(output, api_key=api_key)
# This declaration must go *after* the monitor call, since the
# monitor's seeding creates a new action_space instance with the
# appropriate pseudorandom number generator.
agent = RandomAgent(env.action_space)

episode_count = 100
max_steps = 200
reward = 0
done = False

for i in range(episode_count):
    ob = env.reset()
    for j in range(max_steps):
        action = agent.act(ob, reward, done)
        ob, reward, done, _ = env.step(action)
        if done:
            break
    # Note there's no env.render() here. But the environment still can open window and
    # render if asked by env.monitor: it calls env.render('rgb_array') to record video.
    # Video is not recorded every episode, see capped_cubic_video_schedule for details.

# Dump result info to disk
env.monitor.close()

# Upload to the scoreboard. We could also do this from another
# process if we wanted.
logging.info("Successfully ran RandomAgent. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results.")
gym.upload(outdir, api_key='YOUR_API_KEY')
        else:
            deepQ.learnOnMiniBatch(minibatch_size, True)

        observation = newObservation

        if done:
            last100Scores[last100ScoresIndex] = t
            last100ScoresIndex += 1
            if last100ScoresIndex >= 100:
                last100Filled = True
                last100ScoresIndex = 0
            if not last100Filled:
                print "Episode ", epoch, " finished after {} timesteps".format(t + 1)
            else:
                print "Episode ", epoch, " finished after {} timesteps".format(t + 1), " last 100 average: ", (sum(last100Scores) / len(last100Scores))
            break

        stepCounter += 1
        if stepCounter % updateTargetNetwork == 0:
            deepQ.updateTargetNetwork()
            print "updating target network"

    explorationRate *= 0.995
    # explorationRate -= (2.0/epochs)
    explorationRate = max(0.05, explorationRate)

deepQ.printNetwork()

env.monitor.close()
gym.upload('/tmp/wingedsheep-cartpole-deepQLearning6', api_key='sk_GC4kfmRSQbyRvE55uTWMOw')
def run(self, epochs, steps, api_key,
        updateTargetNetwork=defaultRunSettings['updateTargetNetwork'],
        explorationRate=defaultRunSettings['explorationRate'],
        miniBatchSize=defaultRunSettings['miniBatchSize'],
        learnStart=defaultRunSettings['learnStart'],
        renderPerXEpochs=defaultRunSettings['renderPerXEpochs'],
        shouldRender=defaultRunSettings['shouldRender'],
        experimentId=defaultRunSettings['experimentId'],
        force=defaultRunSettings['force'],
        upload=defaultRunSettings['upload']):
    last100Scores = [0] * 100
    last100ScoresIndex = 0
    last100Filled = False

    if experimentId != None:
        self.env.monitor.start('tmp/' + experimentId, force=force)

    for epoch in xrange(epochs):
        path = {}
        path["actions"] = []
        path["rewards"] = []
        path["states"] = []
        path["values"] = []
        path["isDone"] = []
        observation = self.env.reset()

        # number of timesteps
        totalReward = 0
        for t in xrange(steps):
            if epoch % renderPerXEpochs == 0 and shouldRender:
                self.env.render()

            policyValues = self.runModel(self.policyModel, observation)
            # print policyValues
            action = self.selectActionByProbability(policyValues, 1e-8)
            # print "action: ", action

            path["states"].append(observation)
            path["actions"].append(action)
            path["values"].append(self.runModel(self.valueModel, observation)[0])

            newObservation, reward, done, info = self.env.step(action)
            path["rewards"].append(reward)
            path["isDone"].append(done)
            totalReward += reward
            observation = newObservation

            if done:
                last100Scores[last100ScoresIndex] = totalReward
                last100ScoresIndex += 1
                if last100ScoresIndex >= 100:
                    last100Filled = True
                    last100ScoresIndex = 0
                if not last100Filled:
                    print "Episode ", epoch, " finished after {} timesteps".format(t + 1), " with total reward", totalReward
                else:
                    print "Episode ", epoch, " finished after {} timesteps".format(t + 1), " with total reward", totalReward, " last 100 average: ", (sum(last100Scores) / len(last100Scores))
                break

        self.learn(path, observation)

    self.env.monitor.close()
    if upload:
        gym.upload('tmp/' + experimentId, api_key=api_key)
def upload_results(folder):
    gym.upload(folder, api_key=secrets.api_key)
env = gym.make(args.env_id)

# You provide the directory to write to (can be an existing
# directory, including one with existing data -- all monitor files
# will be namespaced). You can also dump to a tempdir if you'd
# like: tempfile.mkdtemp().
outdir = './tmp/random-agent-results'
env = wrappers.Monitor(env, directory=outdir, force=True)
env.seed(0)
agent = RandomAgent(env.action_space)

episode_count = 100
reward = 0
done = False

for i in range(episode_count):
    ob = env.reset()
    while True:
        action = agent.act(ob, reward, done)
        ob, reward, done, _ = env.step(action)
        if done:
            break
        # Note there's no env.render() here. But the environment still can open window and
        # render if asked by env.monitor: it calls env.render('rgb_array') to record video.
        # Video is not recorded every episode, see capped_cubic_video_schedule for details.

# Close the env and write monitor result info to disk
env.close()

gym.upload('./tmp/random-agent-results')
            break

        self.agent.experience_global(total_reward)

        if train:
            action_idx = self.agent.list_to_index(action)
            self.agent.update_model(old_seq, action_idx, reward, new_seq)
            self.agent.reduce_epsilon()

        return total_reward


if __name__ == '__main__':
    agent = DQNAgent()
    env = walkerEnvironment()
    sim = simulator(env, agent)

    best_reword = -200
    for i in range(10000):
        total_reword = sim.run(train=True)
        if best_reword < total_reword:
            best_reword = total_reword
        print(str(i) + " " + str(total_reword) + " " + str(best_reword))
        env.reset()
        if best_reword > 200:
            break

    env.monitor_close()
    gym.upload('./walker-experiment', api_key='sk_oOcEXAWRgKM6bBJjtTcTw')
        # action = env.env.action_space.sample()
        #else:
        #    action = np.argmax(Q[state])

        # If random number is less than epsilon grab the random action else grab the argument max of Q[state]
        action = env.env.action_space.sample() if np.random.random() < epsilon else np.argmax(Q[state])

        nstate, reward, done, infor = env.step(action)
        total_reward += reward

        #if nstate == 14:
        #    print "14"
        #elif nstate == 15:
        #    print "15"

        # Q Function Update
        # (not done) keeps the terminal state as 0
        Q[state][action] += alpha * (reward + gamma * Q[nstate].max() * (not done) - Q[state][action])
        # Q[state][action] = Q[state][action] + alpha * (reward + gamma * np.max(Q[nstate]) - Q[state][action])

        state = nstate
        epsilon *= epsilon_decay

        if done:
            break

    pi = np.argmax(Q, axis=1)
    return pi, Q


pi, Q = q_learning(env)
print(pi, Q)
#print(pi, V)

env.close()
gym.upload(tdir, api_key='sk_4hbReZHkQFqRUBZ1beREWg')
# You provide the directory to write to (can be an existing
# directory, but can't contain previous monitor results. You can
# also dump to a tempdir if you'd like: tempfile.mkdtemp().
outdir = '/tmp/random-agent-results'
env.monitor.start(outdir, force=True)

episode_count = 100
max_steps = 200
reward = 0
done = False

for i in xrange(episode_count):
    ob = env.reset()
    for j in xrange(max_steps):
        action = agent.act(ob, reward, done)
        ob, reward, done, _ = env.step(action)
        if done:
            break

# Dump result info to disk
env.monitor.close()

# Upload to the scoreboard. We could also do this from another
# process if we wanted.
logger.info(
    "Successfully ran RandomAgent. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results."
)
gym.upload(outdir, algorithm_id='random')
    # consecutive episodes.
    avg_reward = np.sum(stats.episode_rewards) / len(stats.episode_rewards)
    print("Average reward : {}".format(avg_reward))

    def moving_avg(x, n=100):
        return np.convolve(x, np.ones((n, )) / n, mode='valid')

    ma = moving_avg(stats.episode_rewards, interval)
    peaks = np.where(ma > target)[0]

    if len(peaks) > 0:
        print("solved after {} episodes".format(peaks[0]))
        return True
    else:
        print("did not pass the openai criteria")
        return False


if __name__ == "__main__":
    TARGET_AVG_REWARD = 0.78
    TARGET_EPISODE_INTERVAL = 100

    env = gym.make('FrozenLake-v0')
    env = wrappers.Monitor(env, '/tmp/frozenlake-experiment-0', force=True)

    Q, stats = qlearning_alpha_noise(env, best_enabled=True)
    env.close()

    if is_solved(stats, TARGET_AVG_REWARD, TARGET_EPISODE_INTERVAL):
        OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
        gym.upload('/tmp/frozenlake-experiment-0', api_key=OPENAI_API_KEY)
with open('API.json') as api:
    data = json.load(api)

env = gym.make('FrozenLake-v0')
env.monitor.start('/tmp/frozenlake-experiment-9')

observation = env.reset()
action_space = env.action_space
observation, reward, done, info = env.step(action_space.sample())

agent = Agent(observation, reward, info, action_space.sample(), action_space)

num_episodes = 5000
for i_episode in range(num_episodes):
    observation = env.reset()
    done = False
    t = 0
    while not done:
        env.render()
        action = agent.take_action()
        (observation, reward, done, info) = env.step(action)
        agent.update(observation, reward, info, action, action_space)
        t = t + 1
        if done:
            break
    print("Episode finished after {} timesteps".format(t + 1))

env.monitor.close()

gym.scoreboard.api_key = data["api_key"]
gym.upload('/tmp/frozenlake-experiment-9')
for i_episodes in range(episodes):
    State = env.reset()
    state = agent.RGBprocess(State)
    state = agent.stack(state)
    totalreward = 0
    done = False

    while not done:
        #if i_episodes % 50 == 0:
        #    env.render()
        action = agent.act(state)
        new_state, reward, done, info = env.step(action)
        new_state = agent.RGBprocess(new_state)
        new_state_dif = agent.stack(new_state)
        agent.remember(state, action, reward, new_state_dif, done)
        state = new_state_dif
        totalreward += reward
        agent.memory_replay(batch_size)
        if done:
            print("{} episode, score = {}\n".format(i_episodes + 1, totalreward))
            agent.save_model()
            score.append(totalreward)

    if i_episodes % 100 == 0:
        print("{} episode, score = {}, rolling mean 100 episodes = {}".format(
            i_episodes + 1, totalreward, np.mean(score[-100:])))
    if i_episodes % 40 == 0:
        agent.update_target_model()

agent.f.close()
env.close()
gym.upload(env_name, api_key='sk_WRCITkqmTJKYB9hvBk5tPA')
        # env.render()
        qValues = deepQ.getQValues(observation, 0)
        qValues2 = deepQ.getQValues(observation, 1)
        qValues3 = deepQ.getQValues(observation, 2)

        # action = deepQ.selectActionAdded(qValues, qValues2, explorationRate)
        action = deepQ.selectActionMostPreferred(qValues, qValues2, qValues3, explorationRate)
        # action = deepQ.selectActionByProbability(qValues, 1)

        newObservation, reward, done, info = env.step(action)

        # if done:
        #     reward = -50

        deepQ.addMemory(observation, action, reward, newObservation, done)
        deepQ.learnOnMiniBatch(minibatch_size, 0)
        deepQ.learnOnMiniBatch(minibatch_size, 1)
        deepQ.learnOnMiniBatch(minibatch_size, 2)

        observation = newObservation

        if done:
            print "Episode ", epoch, " finished after {} timesteps".format(t + 1)
            break

    explorationRate -= (2.0 / epochs)
    explorationRate = max(0.1, explorationRate)

env.monitor.close()
gym.upload('/tmp/wingedsheep-cartpole-democraticDeepQ8', api_key='sk_GC4kfmRSQbyRvE55uTWMOw')
def on_allepisodes_end(self, logs={}):
    import gym
    gym.upload(args.env_monitor_dirresults_folder, api_key=args.api_key)
np.random.seed(args.seed)

env = gym.make('CartPole-v0')
num_steps = args.num_steps

ef = None
if args.algorithm == 'cem':
    ef = cem
else:
    ef = pcem

outdir = '/tmp/' + args.outdir
env.monitor.start(outdir, force=True)

f = evaluation_func(BinaryActionLinearPolicy, env, num_steps)

# params for cem
params = dict(n_iters=args.iters, n_samples=args.samples, top_frac=args.top_frac)

u = np.random.randn(env.observation_space.shape[0] + 1)
var = np.square(np.ones_like(u) * 0.1)

for (i, data) in enumerate(ef(f, u, var, **params)):
    print("Iteration {}. Episode mean reward: {}".format(i, data['y_mean']))
    agent = BinaryActionLinearPolicy(data['theta_mean'])
    if args.render:
        do_rollout(agent, env, num_steps, render=True)

env.monitor.close()

# make sure to setup your OPENAI_GYM_API_KEY environment variable
if args.upload:
    gym.upload(outdir, algorithm_id=args.algorithm)
def simulate():
    ## Instantiating the learning related parameters
    learning_rate = get_learning_rate(0)
    explore_rate = get_explore_rate(0)
    discount_factor = 0.999  # since the world is unchanging

    num_streaks = 0

    for episode in range(NUM_EPISODES):

        # Reset the environment
        obv = env.reset()

        # the initial state
        state_0 = state_to_bucket(obv)

        for t in range(MAX_T):
            # env.render()

            # Select an action
            action = select_action(state_0, explore_rate)

            # Execute the action
            obv, reward, done, _ = env.step(action)

            # Observe the result
            state = state_to_bucket(obv)

            # Update the Q based on the result
            best_q = np.amax(q_table[state])
            q_table[state_0 + (action,)] += learning_rate * (reward + discount_factor * best_q - q_table[state_0 + (action,)])

            # Setting up for the next iteration
            state_0 = state

            # Print data
            if DEBUG_MODE:
                print("\nEpisode = %d" % episode)
                print("t = %d" % t)
                print("Action: %d" % action)
                print("State: %s" % str(state))
                print("Reward: %f" % reward)
                print("Best Q: %f" % best_q)
                print("Explore rate: %f" % explore_rate)
                print("Learning rate: %f" % learning_rate)
                print("Streaks: %d" % num_streaks)
                print("")

            if done:
                print("Episode %d finished after %f time steps" % (episode, t))
                if t >= SOLVED_T:
                    num_streaks += 1
                else:
                    num_streaks = 0
                break

            #sleep(0.25)

        # It's considered done when it's solved over 120 times consecutively
        if num_streaks > STREAK_TO_END:
            break

        # Update parameters
        explore_rate = get_explore_rate(episode)
        learning_rate = get_learning_rate(episode)

    if ENABLE_UPLOAD:
        env.monitor.close()
        gym.upload('/tmp/cart_pole_q_learning_4D', api_key='sk_93AMQvdmReWCi8pdL4m6Q')
info['env_id'] = env.spec.id

# ------------------------------------------

def noisy_evaluation(theta):
    agent = BinaryActionLinearPolicy(theta)
    rew, T = do_rollout(agent, env, num_steps)
    return rew

# Train the agent, and snapshot each stage
for (i, iterdata) in enumerate(
        cem(noisy_evaluation, np.zeros(env.observation_space.shape[0] + 1), **params)):
    print 'Iteration %2i. Episode mean reward: %7.3f' % (i, iterdata['y_mean'])
    agent = BinaryActionLinearPolicy(iterdata['theta_mean'])
    if args.display:
        do_rollout(agent, env, 200, render=True)
    writefile('agent-%.4i.pkl' % i, cPickle.dumps(agent, -1))

# Write out the env at the end so we store the parameters of this
# environment.
writefile('info.json', json.dumps(info))

env.monitor.close()

logger.info(
    "Successfully ran RandomAgent. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results."
)
gym.upload(outdir, algorithm_id='cem')
    def performMultipleEpisodes(self, numEpisodes):
        # helper for performing multiple episodes
        for episode in range(numEpisodes):
            self.performEpisode()


# main process
if __name__ == "__main__":
    #apiKey = sys.argv[1]
    #numEpisodes = sys.argv[2]

    # tests
    #stateVec = np.array([1,2,3,4])
    #action = 1
    #print firstIntWithPolynomials(stateVec,action)
    #testQ = LinearQFunc(linearBasisFunc)
    #weightVec = np.array([0,0,0,0,1,1,1,1])
    #print testQ.q(stateVec,action,weightVec)
    #print testQ.gradQ(stateVec,action,weightVec)

    epsilon = .001
    alpha = .5
    gamma = 1
    newInteraction = AgentEnvironmentInteraction(
        "CartPole-v0", alpha, gamma, epsilon, firstInteractionBasisFunc,
        "../submission/cp-e-16")
    newInteraction.performMultipleEpisodes(800)
    newInteraction.env.close()
    gym.upload("../submission/cp-e-16", api_key=sys.argv[1])
def close_and_upload(self, api_key):
    self.env.close()
    gym.upload(self.monitor_dir, api_key=api_key)
def _exec_loop(self, num_episodes: int, frame_skip: int, warmup_steps: int,
               render_every_n: int, train: bool, upload: bool):
    """Train the agent in the environment for a specified number of episodes.

    :param num_episodes: Terminate training after this many new episodes are observed
    :param steps_before_training: Individual steps to take in the environment before training begins
    :param render_every_n: Render every nth episode
    :param upload: Upload training results to OpenAI Gym site?
    """
    if frame_skip <= 0:
        raise ValueError(
            f'Invalid value of {frame_skip} for `frame_skip`. Value must be a positive integer.'
        )

    # TODO: pass metrics during training
    # TODO: wire & unwire events
    self._status = self._build_status()
    self._raise_execution_start_event()

    if upload:
        assert self.api_key, 'An API key must be specified before uploading training results.'
        monitor_path = mkdtemp()
        self.env = gym.wrappers.Monitor(self.env, monitor_path)

    self._raise_warmup_start_event()

    # If 0 or None is passed, disable rendering
    if not render_every_n:
        render_every_n = num_episodes + 1

    try:
        self._status.total_steps = 0
        for episode_count in range(1, num_episodes + 1):
            # Initial counters for the episode
            self._status.episode = episode_count
            self._status.render = episode_count % render_every_n == 0
            self._status.episode_done = False
            self._status.step = 0
            total_episode_error = 0

            self._raise_episode_start_event()

            s = self.env.reset()  # Get initial state observation

            while not self._status.episode_done:
                # End the warmup period as soon as the required number of steps have been taken
                if self._status.total_steps == warmup_steps:
                    self._raise_warmup_end_event()

                s = np.asarray(s)
                self._raise_step_start_event(s=s)

                # Action replay. We repeat the selected action for n steps.
                a = self.choose_action(s.reshape(1, -1))
                if isinstance(self.env.action_space, gym.spaces.Box):
                    a = a.reshape(self.env.action_space.shape)

                r = 0
                # Replay the selected action as necessary.
                # Accumulate the reward from each action, but don't let the agent observe the intermediate states.
                for i in range(frame_skip):
                    if self._status.render:
                        self.env.render()

                    # Take action and observe reward and new state
                    s_prime, r_new, episode_done, _ = self.env.step(a)

                    # Some environments return reward as an array, flatten into a float for consistency
                    if isinstance(r_new, np.ndarray):
                        r_new = np.sum(r_new)

                    # Discount the reward received by taking each action, after the first.
                    r += r_new * np.power(self.gamma, i)

                    if episode_done:
                        break

                self._status.action = a
                self._status.step += 1
                self._status.reward = r
                self._status.total_steps += 1

                s, a, r, s_prime, episode_done = self.preprocess_observation(s, a, r, s_prime, episode_done)
                self._status.episode_done = episode_done

                # Store the experience before setting early termination flag.
                # Just because we end early in a state on one trajectory doesn't mean that state should always
                # be treated as a terminal state.
                if self.memory is not None:
                    self.memory.append((s, a, r, s_prime, episode_done))

                # Force the episode to end if we've reached the maximum number of steps allowed
                if self.max_steps_per_episode and self._status.step >= self.max_steps_per_episode:
                    self._status.episode_done = True

                self._raise_step_end_event(s=s, s_prime=s_prime, a=a, r=r)

                # Train the agent's model(s) if necessary
                if train:
                    self._raise_train_start_event()
                    stats = self._update_weights()
                    assert isinstance(stats, dict) or stats is None, \
                        'Value of {} returned by _update_weights() is not a dictionary or None'.format(stats)
                    if stats is not None:
                        self._status.update(stats)
                    self._raise_train_end_event()

                s = s_prime

            self._raise_episode_end_event(total_error=total_episode_error)

        self.env.close()
    except KeyboardInterrupt:
        return
    finally:
        if upload:
            gym.upload(monitor_path, api_key=self.api_key)
            rmtree(monitor_path)  # Cleanup the temp dir
        self._raise_execution_end_event()
def uploadSimulation():
    API_KEY = open(keyPath, 'r').read().rstrip()
    gym.upload('OpenAI/' + GAME + "/Data", api_key=API_KEY)
    state = env.reset()
    rAll = 0
    done = False

    # The Q-Table learning algorithm
    while not done:
        # Choose an action by greedily (with noise) picking from Q table
        action = np.argmax(Q[state, :] + np.random.randn(1, action_space_n) / (i + 1))

        # Get new state and reward from environment
        new_state, reward, done, _ = env.step(action)

        # Update Q-Table with new knowledge using learning rate
        Q[state, action] = (1 - learning_rate) * Q[state, action] \
            + learning_rate * (reward + dis * np.max(Q[new_state, :]))

        rAll += reward
        state = new_state

    rList.append(rAll)

env.close()
gym.upload("gym-results", api_key="sk_VT2wPcSSOylnlPORltmQ")

print("Score over time: " + str(sum(rList) / num_episodes))
print("Final Q-Table Values")
print(Q)

plt.bar(range(len(rList)), rList, color="blue")
plt.show()
        total_reward += reward
        agent.store_experience(state, action, reward, next_state, done)
        loss += agent.train()
        state = next_state
        step += 1
        if done:
            break

    # print("episode: ", episode, "total reward: ", total_reward)
    logger_train.log_episode(total_reward, loss, episode)

    # if episode == MAX_EPISODES-1:
    #     print("Test Period")
    #     for i in range(MAX_EPISODES_TEST):
    #         total_reward_test = 0
    #         loss_test = 0
    #         step_test = 0
    #         state_test = env.reset()
    #         while True:
    #             env.render()
    #             action_test = agent.set_action(state_test)
    #             next_state_test, reward_test, done_test, info_test = env.step(action_test)
    #             total_reward_test += reward_test
    #             state_test = next_state_test
    #             step_test += 1
    #             if done_test:
    #                 break
    #         print("episode: ", i, "total reward: ", total_reward_test)

env.close()
gym.upload('upload/' + GAME, api_key='sk_ocA2j8g2QyqixgtrtVbOSA')
                                  Q[current, action])

    rewards.append(t_reward)
    iterations.append(i)

# Close environment
env.close()

# Plot results
def chunk_list(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]

size = episodes // 50
chunks = list(chunk_list(rewards, size))
averages = [sum(chunk) / len(chunk) for chunk in chunks]

plt.plot(list(range(0, len(rewards), size)), averages)
plt.xlabel('Episode')
plt.ylabel('Average Reward')
plt.show()

# Push solution
api_key = os.environ.get('GYM_API_KEY', False)
if api_key:
    print('Push solution? (y/n)')
    if input().lower() == 'y':
        gym.upload(folder, api_key=api_key)
                feed_dict={
                    state_ph: np.asarray([elem[0] for elem in minibatch]),
                    action_ph: np.asarray([elem[1] for elem in minibatch]),
                    reward_ph: np.asarray([elem[2] for elem in minibatch]),
                    next_state_ph: np.asarray([elem[3] for elem in minibatch]),
                    is_not_terminal_ph: np.asarray([elem[4] for elem in minibatch]),
                    is_training_ph: True
                })

            # update slow actor and critic targets towards current actor and critic
            _ = sess.run(update_slow_targets_op)

        observation = next_observation
        total_steps += 1
        steps_in_ep += 1

        if done:
            # Increment episode counter
            _ = sess.run(episode_inc_op)
            break

    episode_returns.append(total_reward)
    print('Episode %2i, Reward: %7.3f, Steps: %i, avg return: %.2f' %
          (ep, total_reward, steps_in_ep, np.mean(episode_returns)))

# Finalize and upload results
writefile('info.json', json.dumps(info))
env.close()
gym.upload(outdir)
stats["KL between old and new distribution"] = kloldnew stats["Surrogate loss"] = surrafter for k, v in stats.iteritems(): print(k + ": " + " " * (40 - len(k)) + str(v)) if entropy != entropy: exit(-1) if exp > 0.8: self.train = False i += 1 training_dir = tempfile.mkdtemp() logging.getLogger().setLevel(logging.DEBUG) if len(sys.argv) > 1: task = sys.argv[1] else: task = "RepeatCopy-v0" env = envs.make(task) env.monitor.start(training_dir) env = SpaceConversionEnv(env, Box, Discrete) agent = TRPOAgent(env) agent.learn() env.monitor.close() gym.upload(training_dir, algorithm_id='trpo_ff')
    trainer = Trainer(
        agent,
        gamma=0.95,
        learning_rate=0.1,
        learning_rate_decay=learning_decay,
        epsilon=1.0,
        epsilon_decay=epsilon_decay,
        max_step=250)

    if monitor:
        env.monitor.start(RECORD_PATH)

    trainer.train(env, episode_count=episodes, render=render)

    if monitor:
        env.monitor.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="train & run cartpole ")
    parser.add_argument("--episode", type=int, default=1000, help="episode to train")
    parser.add_argument("--render", action="store_true", help="render the screen")
    parser.add_argument("--monitor", action="store_true", help="monitor")
    parser.add_argument("--upload", type=str, default="",
                        help="upload key to openai gym (training is not executed)")

    args = parser.parse_args()

    if args.upload:
        if os.path.isdir(RECORD_PATH):
            gym.upload(RECORD_PATH, api_key=args.upload)
    else:
        main(args.episode, args.render, args.monitor)
    dim_theta = (env.observation_space.shape[0] + 1) * env.action_space.n
elif isinstance(env.action_space, Box):
    dim_theta = (env.observation_space.shape[0] + 1) * env.action_space.shape[0]
else:
    raise NotImplementedError

# Initialize mean and standard deviation
theta_mean = np.zeros(dim_theta)
theta_std = np.ones(dim_theta)

# Now, for the algorithm
env.monitor.start('/tmp/pendulum', force=True)
for iteration in xrange(n_iter):
    # Sample parameter vectors
    thetas = np.random.normal(theta_mean, theta_std, (batch_size, dim_theta))
    rewards = [noisy_evaluation(theta) for theta in thetas]

    # Get elite parameters
    n_elite = int(batch_size * elite_frac)
    elite_inds = np.argsort(rewards)[batch_size - n_elite:batch_size]
    elite_thetas = [thetas[i] for i in elite_inds]

    # Update theta_mean, theta_std
    theta_mean = np.mean(elite_thetas, axis=0)
    theta_std = np.std(elite_thetas, axis=0)
    print "iteration %i. mean f: %8.3g. max f: %8.3g" % (iteration, np.mean(rewards), np.max(rewards))
    do_episode(make_policy(theta_mean), env, num_steps, render=True)

env.monitor.close()
gym.upload('/tmp/pendulum', api_key='API_KEY')
n_episodes = 4000
alpha = 0.4
gamma = 0.9
epsilon_decay_rate = 0.01

# Apply SARSA algorithm with linear epsilon decay.
# Example submissions:
#   https://gym.openai.com/evaluations/eval_9UzqaZ5RgyQZrNePfe5Ww
#   https://gym.openai.com/evaluations/eval_ZXo8CfLQa6SOrPtY8e5w
for episode_idx in range(n_episodes):
    s = env.reset()
    done = False
    policy = eps_greedy_policy(Q, max(1 - episode_idx * epsilon_decay_rate, 0), actions)
    a = policy(s)

    while True:
        s_star, reward, done, _ = env.step(a)
        if done:
            Q[s, a] += alpha * (reward - Q[s, a])
            break
        a_star = policy(s_star)
        Q[s, a] += alpha * (reward + gamma * Q[s_star, a_star] - Q[s, a])
        s = s_star
        a = a_star

env.close()
gym.upload('first_visit', api_key='sk_4jIycd3IT1SyJLj3d2mFxw')
env = gym.make('CartPole-v0')
agent = RandomAgent(env.action_space)

# You provide the directory to write to (can be an existing
# directory, but can't contain previous monitor results. You can
# also dump to a tempdir if you'd like: tempfile.mkdtemp().
outdir = '/tmp/random-agent-results'
env.monitor.start(outdir, force=True)

episode_count = 100
max_steps = 200
reward = 0
done = False

for i in range(episode_count):
    ob = env.reset()
    for j in range(max_steps):
        action = agent.act(ob, reward, done)
        ob, reward, done, _ = env.step(action)
        if done:
            break

# Dump result info to disk
env.monitor.close()

# Upload to the scoreboard. We could also do this from another
# process if we wanted.
logger.info("Successfully ran RandomAgent. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results.")
gym.upload(outdir, algorithm_id='random')
def Upload():
    # Upload training record
    gym.upload(RECORD_DIR + RECORD_FILENAME, api_key=API_KEY)
    env = self.env
    ret = []
    for o, r, d in zip(observation_n, reward_n, done_n):
        o = env.observation_convert(o, env._env.observation_space, env.observation_space)
        obs = np.expand_dims(o, 0)
        action_dist_n = self.session.run(self.action_dist_n, {self.obs: obs})
        action = int(np.argmax(action_dist_n, 1)[0])
        action = env.action_convert(action, env.action_space, env._env.action_space)
        ret.append(action)
    return ret


experiment_dir = tempfile.mkdtemp()
logging.getLogger().setLevel(logging.DEBUG)

print("task = {}".format(args.task))

env = envs.make(args.task)
env.monitor.start(experiment_dir)

agent = ContinTRPOAgent(env)
agent.learn()
env.monitor.close()

gym.upload(experiment_dir, algorithm_id=algo)

print(experiment_dir)

from sys import argv
print('python {}'.format(' '.join(argv)))
def submit_result(self, algo_id, api_key):
    gym.upload(self.monitor_file, algorithm_id=algo_id, api_key=api_key,
               ignore_open_monitors=False)
def uploadSimulation():
    API_KEY = open('/home/dollarakshay/Documents/API Keys/Open AI Key.txt', 'r').read().rstrip()
    gym.upload('OpenAI/' + GAME + "/Data", api_key=API_KEY)
np.random.seed(args.seed)
tf.set_random_seed(args.seed)

env = gym.make(args.env)

outdir = '/tmp/' + args.outdir + '-' + args.env
#env.monitor.start(outdir, force=True)
print("******* WILL SAVE RESULTS TO", outdir, " *******")

sess = tf.Session()

in_dim = flatten_space(env.observation_space)
out_dim = flatten_space(env.action_space)
hidden_dim = 8

opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
policy = CategoricalPolicy(in_dim, out_dim, hidden_dim, opt, sess)
po = PolicyOptimizer(env, policy, 0, args.n_iter, args.n_episode, args.path_length)

sess.run(tf.initialize_all_variables())

# train the policy optimizer
po.train()

env.monitor.close()

# make sure to setup your OPENAI_GYM_API_KEY environment variable
if args.upload:
    gym.upload(outdir, algorithm_id=args.algorithm)
# You provide the directory to write to (can be an existing
# directory, but can't contain previous monitor results. You can
# also dump to a tempdir if you'd like: tempfile.mkdtemp().
outdir = "tmp/c2"
env.monitor.start(outdir, force=True)

episode_count = 100
max_steps = 200
reward = 0
done = False

for i in xrange(episode_count):
    ob = env.reset()
    reward = done = None
    for j in xrange(max_steps):
        action = agent.act(ob, reward, done)
        ob, reward, done, _ = env.step(action)
        env.render()
        if done:
            break

# Dump result info to disk
env.monitor.close()

# Upload to the scoreboard. We could also do this from another
# process if we wanted.
logger.info("Successfully ran CodedAgent. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results.")
gym.upload(outdir, algorithm_id='coded', api_key='YOUR API_KEY')
import gym

api = 'sk_bomf7HJCRRul6yEOGkCqdw'
gym.upload('/tmp/cartpole-experiment-1', api_key=api)
# TO DO:
for j in range(n_episodes):
    done = False
    state = env.reset()
    policy = epsilon_greedy_policy(Q, epsilon=1. / (j + 1), actions=actions)
    episode = []

    ### Generate sample episode
    while not done:
        action = policy(state)
        new_state, reward, done, _ = env.step(action)
        episode.append((state, action, reward))
        print((state, action, reward))
        state = new_state

    ### NOT RELEVANT FOR SARSA/Q-LEARNING
    sa_in_episode = set([(x[0], x[1]) for x in episode])

    # Find first visit of each s,a in the episode
    for s, a in sa_in_episode:
        first_visit = next(i for i, x in enumerate(episode)
                           if x[0] == s and x[1] == a)
        G = sum(x[2] * (gamma**i) for i, x in enumerate(episode[first_visit:]))
        R[s, a] += G
        N[s, a] += 1
        Q[s, a] += R[s, a] / N[s, a]

env.close()
gym.upload('first_visit', api_key=API_KEY)
# This declaration must go *after* the monitor call, since the
# monitor's seeding creates a new action_space instance with the
# appropriate pseudorandom number generator.
agent = RandomAgent(env.action_space)

episode_count = 100
max_steps = 200
reward = 0
done = False

for i in range(episode_count):
    ob = env.reset()
    for j in range(max_steps):
        action = agent.act(ob, reward, done)
        ob, reward, done, _ = env.step(action)
        if done:
            break
    # Note there's no env.render() here. But the environment still can open window and
    # render if asked by env.monitor: it calls env.render('rgb_array') to record video.
    # Video is not recorded every episode, see capped_cubic_video_schedule for details.

# Dump result info to disk
env.monitor.close()

# Upload to the scoreboard. We could also do this from another
# process if we wanted.
logger.info("Successfully ran RandomAgent. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results.")
gym.upload(outdir)
# This declaration must go *after* the monitor call, since the
# monitor's seeding creates a new action_space instance with the
# appropriate pseudorandom number generator.
agent = RandomAgent(env.action_space)

episode_count = 1000
max_steps = 200
reward = 0
done = False

for i in range(episode_count):
    ob = env.reset()
    for j in range(max_steps):
        action = agent.act(ob, reward, done)
        ob, reward, done, _ = env.step(action)
        if done:
            break
    # Note there's no env.render() here. But the environment still can open window and
    # render if asked by env.monitor: it calls env.render('rgb_array') to record video.
    # Video is not recorded every episode, see capped_cubic_video_schedule for details.

# Dump result info to disk
env.monitor.close()

# Upload to the scoreboard. We could also do this from another
# process if we wanted.
logger.info("Successfully ran RandomAgent. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results.")
gym.upload(outdir, api_key='sk_A1BpYdknRCFWZDAIFWSew')
import gym

gym.upload('/tmp/cartpole-experiment-1', api_key='sk_J3iun2y7RvKPIZtvgmyJXw')
from __future__ import print_function
from __future__ import absolute_import

import argparse
import os
import os.path as osp

import gym

from rllab.viskit.core import load_params

if __name__ == "__main__":
    # rl_gym.api_key = 'g8JOpnNVmcjMShBiFtyji2VWX3P2uCzc'
    if 'OPENAI_GYM_API_KEY' not in os.environ:
        raise ValueError("OpenAi Gym API key not configured. Please register an account on https://gym.openai.com and"
                         " set the OPENAI_GYM_API_KEY environment variable, and try the script again.")

    parser = argparse.ArgumentParser()
    parser.add_argument('log_dir', type=str,
                        help='path to the logging directory')
    parser.add_argument('--algorithm_id', type=str, default=None,
                        help='Algorithm ID')
    args = parser.parse_args()

    snapshot_dir = osp.abspath(osp.join(args.log_dir, ".."))
    params_file_path = osp.join(snapshot_dir, "params.json")
    params_json = load_params(params_file_path)

    gym.upload(args.log_dir, algorithm_id=args.algorithm_id)