Example #1
def main():
    try:
        NUM_ITER=int(sys.argv[1])
    except:
        NUM_ITER=1000
    try:
        NUM_EPISODES=int(sys.argv[2])
    except:
        NUM_EPISODES=1
    try:
        UPLOAD = bool(sys.argv[3])
    except:
        UPLOAD = False
    try:
        PROBLEM = sys.argv[4]
    except:
        PROBLEM = 'Copy-v0'
    DIR = '/tmp/openai/'+PROBLEM+'/'

    env = gym.make(PROBLEM)
    env.monitor.start(DIR, force=True)
    for _ in xrange(NUM_EPISODES):
        print 'running episode', _
        env.reset()
        ai = MctsAi(env, NUM_ITER=NUM_ITER)
        ai.playAll()
    env.monitor.close()

    if(UPLOAD):
        gym.upload(DIR, api_key='sk_xzHs5ZzjQviZDZ8R2luFPw')

    return 0
Example #2
File: pg.py Project: wingedsheep/rl
    def run(self, 
            epochs,  
            steps, 
            api_key,
            rollouts_per_epoch = 100,
            updateTargetNetwork = defaultRunSettings['updateTargetNetwork'], 
            explorationRate = defaultRunSettings['explorationRate'], 
            miniBatchSize = defaultRunSettings['miniBatchSize'], 
            learnStart = defaultRunSettings['learnStart'], 
            renderPerXEpochs = defaultRunSettings['renderPerXEpochs'], 
            shouldRender = defaultRunSettings['shouldRender'], 
            experimentId = defaultRunSettings['experimentId'], 
            force = defaultRunSettings['force'], 
            upload = defaultRunSettings['upload']):

        last100Scores = [0] * 100
        last100ScoresIndex = 0
        last100Filled = False

        if experimentId != None:
            self.env.monitor.start('tmp/'+experimentId, force = force)

        for epoch in xrange(epochs):
            paths = []
            for rollout in xrange(rollouts_per_epoch):
                path = {}
                path["actions"] = []
                path["rewards"] = []
                path["states"] = []
                path["isDone"] = []

                observation = self.env.reset()
                # number of timesteps
                totalReward = 0
                for t in xrange(steps):
                    policyValues = self.runModel(self.policyModel, observation)
                    action = self.selectActionByProbability(policyValues)
                    # action = self.selectActionByProbability(self.convertToProbabilities(policyValues))

                    path["states"].append(observation)
                    path["actions"].append(action)

                    newObservation, reward, done, info = self.env.step(action)

                    path["rewards"].append(reward)
                    path["isDone"].append(done)

                    totalReward += reward

                    observation = newObservation

                    if done:
                        break
                paths.append(path)

            self.learn(paths)

        self.env.monitor.close()
        if upload:
            gym.upload('tmp/'+experimentId, api_key=api_key)
Example #3
def upload():
    """
    Upload the results of training (as automatically recorded by
    your env's monitor) to OpenAI Gym.
    
    Parameters:
        - training_dir: A directory containing the results of a
        training run.
        - api_key: Your OpenAI API key
        - algorithm_id (default=None): An arbitrary string
        indicating the particular version of the algorithm
        (including choices of parameters) you are running.
        """  
    request_data = request.get_json()

    j = request.get_json()
    training_dir = get_required_param(j, 'training_dir')
    api_key      = get_required_param(j, 'api_key')
    algorithm_id = j.get('algorithm_id', None)

    try:
        gym.upload(training_dir, algorithm_id, writeup=None, api_key=api_key,
                   ignore_open_monitors=False)
        return ('', 204)
    except gym.error.AuthenticationError:
        raise InvalidUsage('You must provide an OpenAI Gym API key')
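
The snippet above is a Flask-style HTTP handler wrapped around gym.upload. As a rough illustration, here is a minimal client sketch for such an endpoint; the URL and route ('http://localhost:5000/v1/upload') are assumptions not shown in the snippet, while the JSON fields match the ones read via get_required_param().

import requests

# Hypothetical client for the Flask-style upload handler above.
# The host, port, and route '/v1/upload' are assumptions.
payload = {
    'training_dir': '/tmp/cartpole-experiment-1',
    'api_key': 'YOUR_OPENAI_GYM_API_KEY',
    'algorithm_id': None,  # optional field, defaults to None in the handler
}
resp = requests.post('http://localhost:5000/v1/upload', json=payload)
print(resp.status_code)  # 204 on success, per the handler above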
Example #4
File: uploader.py Project: gdb/trpo
 def run(self):
     for entry in os.listdir(self.base_dir):
         if entry in ['.', '..']:
             continue
         training_dir = os.path.join(self.base_dir, entry)
         if not os.path.isdir(training_dir):
             logger.info('Skipping: {}'.format(training_dir))
             continue
         gym.upload(training_dir, algorithm_id=self.algorithm_id, writeup=self.writeup)
Example #5
  def run(self):
    self.t_train = 0
    self.t_test = 0

    # create filtered environment
    self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
    # self.env = gym.make(FLAGS.env)
    
    self.env.monitor.start(FLAGS.outdir+'/monitor/',video_callable=lambda _: False)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print(dimO,dimA)

    import pprint
    pprint.pprint(self.env.spec.__dict__,width=1)

    self.agent = ddpg.Agent(dimO=dimO,dimA=dimA)

    returns = []

    # main loop
    while self.t_train < FLAGS.total:

      # test
      T = self.t_test
      R = []
      self.env.monitor.start(FLAGS.outdir+'/monitor/',video_callable=lambda _: False,resume=True)
      while self.t_test - T < FLAGS.test:
        R.append(self.run_episode(test=True,monitor=(len(R)==0)))
      avr = np.mean(R)
      print('Average test return\t{} after {} timesteps of training'.format(avr,self.t_train))
      # save return
      returns.append((self.t_train, avr))
      np.save(FLAGS.outdir+"/returns.npy",returns)

      # evaluate required number of episodes for gym
      if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
        for i in range(self.env.spec.trials):
          self.run_episode(test=True)

      self.env.monitor.close()


      # train
      T = self.t_train
      R = []
      while self.t_train - T < FLAGS.train:
        R.append(self.run_episode(test=False))
      avr = np.mean(R)
      print('Average training return\t{} after {} timesteps of training'.format(avr,self.t_train))

    self.env.monitor.close()
    # upload results
    if FLAGS.upload:
      gym.upload(FLAGS.outdir+"/monitor",algorithm_id = GYM_ALGO_ID)
Example #6
File: run.py Project: leopard1/ddpg
    def run(self):
        self.t_train = 0
        self.t_test = 0

        # create filtered environment
        self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
        # self.env = gym.make(FLAGS.env)

        self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
        gym.logger.setLevel(gym.logging.WARNING)

        dimO = self.env.observation_space.shape
        dimA = self.env.action_space.shape
        print(dimO, dimA)

        import pprint
        pprint.pprint(self.env.spec.__dict__, width=1)

        self.agent = ddpg.Agent(dimO=dimO, dimA=dimA)

        simplelog = open(FLAGS.outdir + '/log.txt', 'w')

        # main loop
        while self.t_train < FLAGS.total:

            # test
            T = self.t_test
            R = []
            while self.t_test - T < FLAGS.test:
                R.append(self.run_episode(test=True, monitor=np.random.binomial(1, FLAGS.monitor)))
                self.t_test += 1
            avr = np.mean(R)
            print('Average test return\t{} after {} episodes of training'.format(avr, self.t_train))
            print >> simplelog, "%d\t%d" % (self.t_train, avr)

            # evaluate required number of episodes for gym and end training when above threshold
            if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
                avr = np.mean([self.run_episode(test=True) for _ in range(self.env.spec.trials)])
                if avr > self.env.spec.reward_threshold:
                    break

            # train
            T = self.t_train
            R = []
            while self.t_train - T < FLAGS.train:
                R.append(self.run_episode(test=False))
                self.t_train += 1
            avr = np.mean(R)
            print('Average training return\t{} after {} episodes of training'.format(avr, self.t_train))

        self.env.monitor.close()
        # upload results
        if FLAGS.upload:
            gym.upload(FLAGS.outdir + "/monitor", algorithm_id=GYM_ALGO_ID)
    def upload(self, instance_id, algorithm_id, writeup, api_key, ignore_open_monitors):
        """
        Upload training information created with monitor.
        :param instance_id: Id of the environment that was trained.
        :param algorithm_id: An arbitrary string indicating the particular version of the algorithm
               (including choices of parameters) you are running.
        :param writeup: A Gist URL (of the form https://gist.github.com/<user>/<id>)
                        containing your writeup for this evaluation.
        :param api_key:  Your OpenAI API key. Can also be provided as an environment variable (OPENAI_GYM_API_KEY).
        :param ignore_open_monitors: Ignore open monitors when uploading.
        :return:
        """
        directory = self.TRAINING_DIRECTORY.format(instance_id)

        gym.upload(directory, algorithm_id, writeup, api_key,
                   ignore_open_monitors)
Example #8
File: run.py Project: amoliu/ddpg
  def run(self):
    self.t_log = 103
    self.t_global = 0

    # create filtered environment
    self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))

    self.env.monitor.start(FLAGS.outdir+'/monitor/',video_callable=lambda _: False)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape

    print('dimO: '+str(dimO) +'  dimA: '+str(dimA))

    self.agent = ddpg.Agent(dimO=dimO,dimA=dimA)

    returns = []
    t_last_test = 0

    # main loop
    while self.t_global < t_train:

      # test
      t_last_test = self.t_global
      R = np.mean([self.run_episode(test=True,render=render) for _ in range(n_test)])
      returns.append((self.t_global, R))
      np.save(FLAGS.outdir+"/returns.npy",returns)
      print('Average return '+str(R)+ ' after '+str(self.t_global)+' timesteps of training')

      # train
      while self.t_global-t_last_test <  FLAGS.test:
        self.run_episode(test=False)

    self.env.monitor.close()
    # upload results
    if FLAGS.gymkey != '':
      gym.upload(FLAGS.outdir+"/monitor",FLAGS.gymkey)
Example #9
...


#%%  Monitor Wrapper

cd d:/ROBOCZY/Python/Gym

#%%
import gym
import ffmpeg
from gym import wrappers
env = gym.make('CartPole-v0')
env = wrappers.Monitor(env, './cartpole-experiment-1')
for i_episode in range(20):
    observation = env.reset()
    for t in range(100):
        env.render()
        print(observation)
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break
env.close()
gym.upload('./cartpole-experiment-1', api_key='blah')

#!  ERROR: DependencyNotInstalled:f

#%%

import gym
gym.upload('/tmp/cartpole-experiment-1', api_key='sk_5YJsWfHOQwOLiU3AAVyYeA')
Example #11
#from random import *
#import tensorflow as tf
#from nets import *
#from learner import *


def h(x):
    return 0 if x < 0 else 1


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    env = gym.wrappers.Monitor(env, './evolution/cartpole-v0', force=True)
    policy_f = lambda w, obs: int(h(np.dot(w[0:4], obs) + w[4]))
    (te, _, _) = evolve_env(env,
                            policy_f,
                            np.asarray([0, 0, 0, 0, 0]),
                            gaussian_perturb,
                            normalized_avg,
                            alpha=0.1,
                            spawn=50,
                            stages=50,
                            print_every=1,
                            max_steps=1000,
                            eval_trials=100)
    print("Running:")
    for i in range(1000):
        print("Reward %f" % env_f(env, policy_f, te, max_steps=1000))
    env.close()
    gym.upload('./evolution/cartpole-v0', api_key='API_KEY')
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--environment', type=str, default='CartPole-v0',
        help='OpenAI Gym environment to run.')
    parser.add_argument('-p', '--episodes', type=int, default=1000,
        help='Number of episodes to simulate.')
    parser.add_argument('-g', '--goal', type=int, default=195,
        help='Goal score for the environment.')
    parser.add_argument('-t', '--time', type=int, default=200,
        help='Time steps for each episode.')
    parser.add_argument('-a', '--agent', type=str, default='QL',
        help='Learning agent type (QL or NAF).')
    parser.add_argument('--report', action='store_true', help='Report results.')
    parser.add_argument('-d', '--debug', action='store_true', 
        help='Print max values at each time-step.')

    args = parser.parse_args()
    print(args)

    environment = args.environment
    episodes = args.episodes
    goal = args.goal
    time = args.time

    env = gym.make(environment)

    if (args.agent == 'QL'):
        agent = QLAgent(env)
    elif (args.agent == 'NAF'):
        agent = NAFAgent(env)
    elif (args.agent == 'Random'):
        agent = RandomAgent(env)

    scores = []

    if args.report:
        filename = 'tmp/gym-report'
        env.monitor.start(filename, force=True)

    for i_episode in range(episodes):
        # Get initial observation.
        agent.reset()
        observation = env.reset()

        score = 0
        alt_score = 0

        # Run n = time steps
        for t in range(time):
            # Save the previous state.
            prev_state = observation

            #env.render()
            #if i_episode % 500 == 0:
            #    env.render()
            report = args.debug
            next_action = agent.get_action(observation, report)

            observation, reward, done, info = env.step(next_action)

            score += reward

            reward += observation[0]
            if report:
                print(reward)

            alt_score += reward

            agent.update(prev_state, next_action, reward, observation, done)

            if done or t == time - 1:
                print(i_episode + 1, score, alt_score, t, done)
                scores.append(score)

                running_avg = np.average(scores[-100:])
                #if running_avg > goal:
                #    print '100-run average {0} on run {1}!'.format(\
                #        running_avg, i_episode)

                if (i_episode + 1) % 50 == 0:
                    print('{0} average score at {1}'.format(running_avg, \
                        i_episode + 1))
                                
                break

    if args.report:
        env.monitor.close()
        key_file = open('api.key', 'r')
        gym_key = key_file.readline()
        if args.agent == 'NAF':
            algo_id = 'alg_xjVArtUxQXqfSq5q89dRjQ'
        else:
            algo_id = 'alg_sbIxfyjIRUSBrBA1IOFg'
        gym.upload(filename, api_key=gym_key, algorithm_id=algo_id)

    return
Example #13
def uploadSimulation():
    API_KEY = open('/home/dollarakshay/Documents/API Keys/Open AI Key.txt',
                   'r').read().rstrip()
    gym.upload('Artificial Intelligence/' + GAME, api_key=API_KEY)
env.monitor.start('./frozenlake-experiment', force=True)

for i_episode in range(n_episode):

    observation = env.reset() #reset environment to beginning 

    #run for several time-steps
    for t in xrange(max_time_steps): 
        #display experiment
        #env.render() 

        #sample a random action 
        action = opt[observation]

        #observe next step and get reward 
        observation, reward, done, info = env.step(action)

        if done:
            #env.render() 
            print "Simulation finished after {0} timesteps".format(t)
            break
            
env.monitor.close()

gym.upload('/home/lucianodp/Documents/eua/reinforcement_learning/notebooks/frozenlake-experiment', api_key='sk_qkx3jhBbTRamxadtXqA3pQ')





Example #15
File: run.py Project: febert/DeepRL
    def run(self,env):
        self.t_train = 0
        self.t_test = 0

        # create filtered environment
        # self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
        self.env = filter_env.makeFilteredEnv(gym.make(env))

        self.t_elapsed = []

        # self.env = gym.make(FLAGS.env)

        if tf.gfile.Exists(FLAGS.outdir):
            tf.gfile.DeleteRecursively(FLAGS.outdir)
        # self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
        # gym.logger.setLevel(gym.logging.WARNING)

        dimO = self.env.observation_space.shape
        dimA = self.env.action_space.shape
        print 'observation space, action space',
        print(dimO, dimA)

        import pprint
        pprint.pprint(self.env.spec.__dict__, width=1)

        self.agent = ddpg.Agent(dimO=dimO, dimA=dimA)

        returns = []

        it = 0
        episodelengths = []
        testlengths = []

        if env == 'Reacher-v1':
            self.train_frequency = 1
            test_frequency = 3
            plot_frequency = 1

        if env == 'MountainCarContinuous-v0':
            test_frequency = 10
            plot_frequency = 1
            self.train_frequency = 16

        if env == 'InvertedPendulum-v1':
            test_frequency = 100
            plot_frequency = 300
            self.train_frequency = 1

        print 'using train frequency', self.train_frequency

        # main loop
        while self.t_train < FLAGS.total:

            it +=1

            episodelengths.append(self.run_episode(test=False))


            if it % test_frequency== 0:
                testlengths.append(self.run_episode(test=True))

            if it % plot_frequency == 0:
                print 'avg time for sim step:', np.mean(np.array(self.t_elapsed))
                plotting.plot_episode_lengths(episodelengths)
                plotting.plot_episode_lengths(testlengths)
                # plotting.plot_replay_memory_2d_state_histogramm(self.agent.rm.observations)
                # plotting.plot_learned_mu(self.agent.act_test, self.env)

            # else:
            #     # test
            #     T = self.t_test
            #     R = []
            #
            #     while self.t_test - T < FLAGS.test:
            #         # print 'running test episode'
            #         R.append(self.run_episode(test=True, monitor=(self.t_test - T < FLAGS.monitor * FLAGS.test)))
            #     avr = np.mean(R)
            #     print('Average test return\t{} after {} timesteps of training'.format(avr, self.t_train))
            #     # save return
            #     returns.append((self.t_train, avr))
            #     np.save(FLAGS.outdir + "/returns.npy", returns)
            #
            #     # evaluate required number of episodes for gym and end training when above threshold
            #     if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
            #         avr = np.mean([self.run_episode(test=True) for _ in range(self.env.spec.trials)])
            #         if avr > self.env.spec.reward_threshold:
            #             break
            #
            #     # train
            #     T = self.t_train
            #     R = []
            #     while self.t_train - T < FLAGS.train:
            #         # print 'running train episode'
            #         R.append(self.run_episode(test=False))
            #     avr = np.mean(R)
            #     print('Average training return\t{} after {} timesteps of training'.format(avr, self.t_train))

        # self.env.monitor.close()
        # upload results
        if FLAGS.upload:
            gym.upload(FLAGS.outdir + "/monitor", algorithm_id=GYM_ALGO_ID)
            if episode_number % EPISODES_PER_PRINT_PROGRESS == 0:
                t = perf_counter() - training_start_time
                print("(%d s) Episode: %d, Average reward = %f." %
                      (t, episode_number,
                       reward_sum / EPISODES_PER_PRINT_PROGRESS))
                reward_sum = 0

            # It is considered solved when the sum of reward is over 200
            if reward > DONE_REWARD_LEVEL:
                num_streaks += 1
                solved_episode = episode_number
            else:
                num_streaks = 0
                solved_episode = -1

            # It's considered done when it's solved over 120 times consecutively
            if num_streaks > STREAK_TO_END:
                print("Task solved in %d episodes and repeated %d times." %
                      (episode_number, num_streaks))
                break

        agent.brain.model.save_model(
            os.path.join(TRAINED_MODEL_DIR, TRAINED_MODEL_NAME), False)

        if GYM_ENABLE_UPLOAD:
            env.monitor.close()
            gym.upload(GYM_VIDEO_PATH, api_key=GYM_API_KEY)

    # testing the model
    test(os.path.join(TRAINED_MODEL_DIR, TRAINED_MODEL_NAME),
         num_episodes=1000)
Example #17
    def run(self, 
            epochs, 
            steps, 
            api_key,
            monitor = True,
            updateTargetNetwork = defaultRunSettings['updateTargetNetwork'], 
            explorationRate = defaultRunSettings['explorationRate'], 
            miniBatchSize = defaultRunSettings['miniBatchSize'], 
            learnStart = defaultRunSettings['learnStart'], 
            renderPerXEpochs = defaultRunSettings['renderPerXEpochs'], 
            shouldRender = defaultRunSettings['shouldRender'], 
            experimentId = defaultRunSettings['experimentId'], 
            force = defaultRunSettings['force'], 
            upload = defaultRunSettings['upload']):
    
        last100Scores = [0] * 100
        last100ScoresIndex = 0
        last100Filled = False

        stepCounter = 0

        if experimentId != None and monitor:
            self.env.monitor.start('tmp/'+experimentId, force = force)

        for epoch in xrange(epochs):
            observation = self.env.reset()

            qValues = self.getQValues(observation)
            action = self.selectAction(qValues, explorationRate)

            print explorationRate
            # number of timesteps
            totalReward = 0
            for t in xrange(steps):
                if epoch % renderPerXEpochs == 0 and shouldRender:
                    self.env.render()

                newObservation, reward, done, info = self.env.step(action)
                
                qValues = self.getQValues(observation)
                newAction = self.selectAction(qValues, explorationRate)

                totalReward += reward

                self.addMemory(observation, action, reward, newObservation, newAction, done)

                if stepCounter >= learnStart:
                    if stepCounter <= updateTargetNetwork:
                        self.learnOnMiniBatch(miniBatchSize, False)
                    else :
                        self.learnOnMiniBatch(miniBatchSize, True)

                observation = newObservation.copy()
                action = newAction

                if done:
                    last100Scores[last100ScoresIndex] = totalReward
                    last100ScoresIndex += 1
                    if last100ScoresIndex >= 100:
                        last100Filled = True
                        last100ScoresIndex = 0
                    if not last100Filled:
                        print "Episode ",epoch," finished after {} timesteps".format(t+1)," with total reward",totalReward
                    else :
                        print "Episode ",epoch," finished after {} timesteps".format(t+1)," with total reward",totalReward," last 100 average: ",(sum(last100Scores)/len(last100Scores))
                    break

                stepCounter += 1
                if stepCounter % updateTargetNetwork == 0:
                    self.updateTargetNetwork()
                    print "updating target network"

            explorationRate *= 0.995
            # explorationRate -= (2.0/epochs)
            explorationRate = max (0.05, explorationRate)

        self.env.monitor.close()
        if upload:
            gym.upload('tmp/'+experimentId, api_key=api_key)
Example #18
import gym
gym.upload('tmp/MountainCar-v0-mc',
           algorithm_id=None,
           api_key='sk_iQhXCgvKRjyxitEVLVW13g')
Example #19
def do_submit(output, api_key):
    gym.upload(output, api_key=api_key)
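
do_submit above passes the key explicitly, but gym.upload can also read it from the OPENAI_GYM_API_KEY environment variable (mentioned in the upload() docstring earlier on this page and in the comment in Example #33). A minimal sketch of a variant that falls back to that variable; the helper name and error message are illustrative only:

import os
import gym

def do_submit_with_fallback(output, api_key=None):
    # Fall back to the OPENAI_GYM_API_KEY environment variable when no
    # explicit key is passed (gym.upload itself also reads this variable).
    key = api_key or os.environ.get('OPENAI_GYM_API_KEY')
    if not key:
        raise ValueError('No API key given and OPENAI_GYM_API_KEY is not set')
    gym.upload(output, api_key=key)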
Example #20
    # This declaration must go *after* the monitor call, since the
    # monitor's seeding creates a new action_space instance with the
    # appropriate pseudorandom number generator.
    agent = RandomAgent(env.action_space)

    episode_count = 100
    max_steps = 200
    reward = 0
    done = False

    for i in range(episode_count):
        ob = env.reset()

        for j in range(max_steps):
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            if done:
                break
            # Note there's no env.render() here. But the environment still can open window and
            # render if asked by env.monitor: it calls env.render('rgb_array') to record video.
            # Video is not recorded every episode, see capped_cubic_video_schedule for details.

    # Dump result info to disk
    env.monitor.close()

    # Upload to the scoreboard. We could also do this from another
    # process if we wanted.
    logging.info("Successfully ran RandomAgent. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results.")
    gym.upload(outdir, api_key='YOUR_API_KEY')
Example #21
            else :
                deepQ.learnOnMiniBatch(minibatch_size, True)

        observation = newObservation

        if done:
            last100Scores[last100ScoresIndex] = t
            last100ScoresIndex += 1
            if last100ScoresIndex >= 100:
                last100Filled = True
                last100ScoresIndex = 0
            if not last100Filled:
                print "Episode ",epoch," finished after {} timesteps".format(t+1)
            else :
                print "Episode ",epoch," finished after {} timesteps".format(t+1)," last 100 average: ",(sum(last100Scores)/len(last100Scores))
            break

        stepCounter += 1
        if stepCounter % updateTargetNetwork == 0:
            deepQ.updateTargetNetwork()
            print "updating target network"

    explorationRate *= 0.995
    # explorationRate -= (2.0/epochs)
    explorationRate = max (0.05, explorationRate)

deepQ.printNetwork()

env.monitor.close()
gym.upload('/tmp/wingedsheep-cartpole-deepQLearning6', api_key='sk_GC4kfmRSQbyRvE55uTWMOw')
Example #22
    def run(self, 
            epochs, 
            steps, 
            api_key,
            updateTargetNetwork = defaultRunSettings['updateTargetNetwork'], 
            explorationRate = defaultRunSettings['explorationRate'], 
            miniBatchSize = defaultRunSettings['miniBatchSize'], 
            learnStart = defaultRunSettings['learnStart'], 
            renderPerXEpochs = defaultRunSettings['renderPerXEpochs'], 
            shouldRender = defaultRunSettings['shouldRender'], 
            experimentId = defaultRunSettings['experimentId'], 
            force = defaultRunSettings['force'], 
            upload = defaultRunSettings['upload']):

        last100Scores = [0] * 100
        last100ScoresIndex = 0
        last100Filled = False

        if experimentId != None:
            self.env.monitor.start('tmp/'+experimentId, force = force)

        for epoch in xrange(epochs):
            path = {}
            path["actions"] = []
            path["rewards"] = []
            path["states"] = []
            path["values"] = []
            path["isDone"] = []

            observation = self.env.reset()
            # number of timesteps
            totalReward = 0
            for t in xrange(steps):
                if epoch % renderPerXEpochs == 0 and shouldRender:
                    self.env.render()

                policyValues = self.runModel(self.policyModel, observation)
                # print policyValues
                action = self.selectActionByProbability(policyValues, 1e-8)
                # print "action: ",action

                path["states"].append(observation)
                path["actions"].append(action)
                path["values"].append(self.runModel(self.valueModel, observation)[0])

                newObservation, reward, done, info = self.env.step(action)

                path["rewards"].append(reward)
                path["isDone"].append(done)

                totalReward += reward

                observation = newObservation

                if done:
                    last100Scores[last100ScoresIndex] = totalReward
                    last100ScoresIndex += 1
                    if last100ScoresIndex >= 100:
                        last100Filled = True
                        last100ScoresIndex = 0
                    if not last100Filled:
                        print "Episode ",epoch," finished after {} timesteps".format(t+1)," with total reward",totalReward
                    else :
                        print "Episode ",epoch," finished after {} timesteps".format(t+1)," with total reward",totalReward," last 100 average: ",(sum(last100Scores)/len(last100Scores))
                    break

            self.learn(path, observation)

        self.env.monitor.close()
        if upload:
            gym.upload('tmp/'+experimentId, api_key=api_key)
def upload_results(folder):
    gym.upload(folder, api_key=secrets.api_key)
Example #24
    env = gym.make(args.env_id)

    # You provide the directory to write to (can be an existing
    # directory, including one with existing data -- all monitor files
    # will be namespaced). You can also dump to a tempdir if you'd
    # like: tempfile.mkdtemp().
    outdir = './tmp/random-agent-results'
    env = wrappers.Monitor(env, directory=outdir, force=True)
    env.seed(0)
    agent = RandomAgent(env.action_space)

    episode_count = 100
    reward = 0
    done = False

    for i in range(episode_count):
        ob = env.reset()
        while True:
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            if done:
                break
            # Note there's no env.render() here. But the environment still can open window and
            # render if asked by env.monitor: it calls env.render('rgb_array') to record video.
            # Video is not recorded every episode, see capped_cubic_video_schedule for details.

    # Close the env and write monitor result info to disk
    env.close()

gym.upload('./tmp/random-agent-results')
Example #25
                break
        
        self.agent.experience_global(total_reward)

        if train:
            action_idx = self.agent.list_to_index(action)
            self.agent.update_model(old_seq, action_idx, reward, new_seq)
            self.agent.reduce_epsilon()

        return total_reward

if __name__ == '__main__':
    agent=DQNAgent()
    env=walkerEnvironment()
    sim=simulator(env,agent)

    best_reword = -200
    for i in range(10000):
        total_reword = sim.run(train=True)
        if best_reword < total_reword:
            best_reword = total_reword

        print(str(i) + " " + str(total_reword) + " " + str(best_reword))            
        env.reset()

        if best_reword > 200:
            break

    env.monitor_close()
    gym.upload('./walker-experiment', api_key='sk_oOcEXAWRgKM6bBJjtTcTw')
Example #26
            #    action = env.env.action_space.sample()
            #else:
            #    action = np.argmax(Q[state])
            #IF random number is less than epsilon grab the random action else grab the argument max of Q[state]
            action = env.env.action_space.sample() if np.random.random() < epsilon else np.argmax(Q[state])
            nstate, reward, done, infor = env.step(action)
            total_reward += reward
            #if nstate == 14:
            #    print "14"
            #elif nstate == 15:
            #    print "15"
            #Q Function Update
            #(not done) keeps the terminal state as 0
            Q[state][action] += alpha * (reward + gamma * Q[nstate].max() * (not done) - Q[state][action])
            #Q[state][action] = Q[state][action] + alpha * (reward + gamma * np.max(Q[nstate]) - Q[state][action])
            state = nstate
            epsilon *= epsilon_decay
            if done:
                break
    pi = np.argmax(Q, axis=1)
    return pi,Q

pi, Q = q_learning(env)
print(pi,Q)
#print (pi,V)



env.close()
gym.upload(tdir, api_key = 'sk_4hbReZHkQFqRUBZ1beREWg')
Example #27
    # You provide the directory to write to (can be an existing
    # directory, but can't contain previous monitor results. You can
    # also dump to a tempdir if you'd like: tempfile.mkdtemp().
    outdir = '/tmp/random-agent-results'
    env.monitor.start(outdir, force=True)

    episode_count = 100
    max_steps = 200
    reward = 0
    done = False

    for i in xrange(episode_count):
        ob = env.reset()

        for j in xrange(max_steps):
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            if done:
                break

    # Dump result info to disk
    env.monitor.close()

    # Upload to the scoreboard. We could also do this from another
    # process if we wanted.
    logger.info(
        "Successfully ran RandomAgent. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results."
    )
    gym.upload(outdir, algorithm_id='random')
Example #28
    # consecutive episodes.
    avg_reward = np.sum(stats.episode_rewards) / len(stats.episode_rewards)
    print("Average reward : {}".format(avg_reward))

    def moving_avg(x, n=100):
        return np.convolve(x, np.ones((n, )) / n, mode='valid')

    ma = moving_avg(stats.episode_rewards, interval)
    peaks = np.where(ma > target)[0]
    if len(peaks) > 0:
        print("solved after {} episodes".format(peaks[0]))
        return True
    else:
        print("did not pass the openai criteria")
        return False


if __name__ == "__main__":
    TARGET_AVG_REWARD = 0.78
    TARGET_EPISODE_INTERVAL = 100

    env = gym.make('FrozenLake-v0')
    env = wrappers.Monitor(env, '/tmp/frozenlake-experiment-0', force=True)
    Q, stats = qlearning_alpha_noise(env, best_enabled=True)

    env.close()

    if is_solved(stats, TARGET_AVG_REWARD, TARGET_EPISODE_INTERVAL):
        OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
        gym.upload('/tmp/frozenlake-experiment-0', api_key=OPENAI_API_KEY)
Example #29
with open('API.json') as api:
    data = json.load(api)

env = gym.make('FrozenLake-v0')
env.monitor.start('/tmp/frozenlake-experiment-9')

observation = env.reset()
action_space = env.action_space
observation, reward, done, info = env.step(action_space.sample())
agent = Agent(observation, reward, info, action_space.sample(), action_space)
num_episodes = 5000

for i_episode in range(num_episodes):
    observation = env.reset()
    done = False
    t = 0
    while not done:
        env.render()
        action = agent.take_action()
        (observation, reward, done, info) = env.step(action)
        agent.update(observation, reward, info, action, action_space)
        t = t + 1
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break

env.monitor.close()
gym.scoreboard.api_key = data["api_key"]
gym.upload('/tmp/frozenlake-experiment-9')
Example #30
for i_episodes in range(episodes):
    State = env.reset()
    state = agent.RGBprocess(State)
    state = agent.stack(state)
    totalreward = 0
    done = False
    while not done:
        #if i_episodes % 50 == 0:
        #env.render()
        action = agent.act(state)
        new_state, reward, done, info = env.step(action)
        new_state = agent.RGBprocess(new_state)
        new_state_dif = agent.stack(new_state)
        agent.remember(state, action, reward, new_state_dif, done)
        state = new_state_dif
        totalreward += reward
    agent.memory_replay(batch_size)
    if done:
        print("{} episode, score = {}\n".format(i_episodes + 1, totalreward))
        agent.save_model()
        score.append(totalreward)
        if i_episodes % 100 == 0:
            print("{} episode, score = {}, rolling mean 100 episodes = {}".
                  format(i_episodes + 1, totalreward, np.mean(score[-100:])))
        if i_episodes % 40 == 0:
            agent.update_target_model()
agent.f.close()
env.close()
gym.upload(env_name, api_key='sk_WRCITkqmTJKYB9hvBk5tPA')
Example #31
        # env.render()
        qValues = deepQ.getQValues(observation, 0)
        qValues2 = deepQ.getQValues(observation, 1)
        qValues3 = deepQ.getQValues(observation, 2)

        # action = deepQ.selectActionAdded(qValues, qValues2, explorationRate)
        action = deepQ.selectActionMostPreferred(qValues, qValues2, qValues3, explorationRate)
        # action = deepQ.selectActionByProbability(qValues, 1)

        newObservation, reward, done, info = env.step(action)

        # if done:
        #     reward = -50
        deepQ.addMemory(observation, action, reward, newObservation, done)

        deepQ.learnOnMiniBatch(minibatch_size, 0)
        deepQ.learnOnMiniBatch(minibatch_size, 1)
        deepQ.learnOnMiniBatch(minibatch_size, 2)

        observation = newObservation

        if done:
            print "Episode ",epoch," finished after {} timesteps".format(t+1)
            break

    explorationRate -= (2.0/epochs)
    explorationRate = max (0.1, explorationRate)

env.monitor.close()
gym.upload('/tmp/wingedsheep-cartpole-democraticDeepQ8', api_key='sk_GC4kfmRSQbyRvE55uTWMOw')
 def on_allepisodes_end(self, logs={}):
     import gym
     gym.upload(args.env_monitor_dirresults_folder, api_key=args.api_key)
Example #33
    np.random.seed(args.seed)
    env = gym.make('CartPole-v0')
    num_steps = args.num_steps

    ef = None
    if args.algorithm == 'cem':
        ef = cem
    else:
        ef = pcem

    outdir = '/tmp/' + args.outdir
    env.monitor.start(outdir, force=True)

    f = evaluation_func(BinaryActionLinearPolicy, env, num_steps)

    # params for cem
    params = dict(n_iters=args.iters, n_samples=args.samples, top_frac=args.top_frac)
    u = np.random.randn(env.observation_space.shape[0]+1)
    var = np.square(np.ones_like(u) * 0.1)
    for (i, data) in enumerate(ef(f, u, var, **params)):
        print("Iteration {}. Episode mean reward: {}".format(i, data['y_mean']))
        agent = BinaryActionLinearPolicy(data['theta_mean'])
        if args.render:
            do_rollout(agent, env, num_steps, render=True)

    env.monitor.close()
    # make sure to setup your OPENAI_GYM_API_KEY environment variable
    if args.upload:
        gym.upload(outdir, algorithm_id=args.algorithm)
def simulate():

    ## Instantiating the learning related parameters
    learning_rate = get_learning_rate(0)
    explore_rate = get_explore_rate(0)
    discount_factor = 0.999  # since the world is unchanging

    num_streaks = 0

    for episode in range(NUM_EPISODES):

        # Reset the environment
        obv = env.reset()

        # the initial state
        state_0 = state_to_bucket(obv)

        for t in range(MAX_T):
            # env.render()

            # Select an action
            action = select_action(state_0, explore_rate)

            # Execute the action
            obv, reward, done, _ = env.step(action)

            # Observe the result
            state = state_to_bucket(obv)

            # Update the Q based on the result
            best_q = np.amax(q_table[state])
            q_table[state_0 + (action,)] += learning_rate*(reward + discount_factor*(best_q) - q_table[state_0 + (action,)])

            # Setting up for the next iteration
            state_0 = state

            # Print data
            if (DEBUG_MODE):
                print("\nEpisode = %d" % episode)
                print("t = %d" % t)
                print("Action: %d" % action)
                print("State: %s" % str(state))
                print("Reward: %f" % reward)
                print("Best Q: %f" % best_q)
                print("Explore rate: %f" % explore_rate)
                print("Learning rate: %f" % learning_rate)
                print("Streaks: %d" % num_streaks)

                print("")

            if done:
                print("Episode %d finished after %f time steps" % (episode, t))

                if t >= SOLVED_T:
                    num_streaks += 1
                else:
                    num_streaks = 0
                break

        #sleep(0.25)

        # It's considered done when it's solved over 120 times consecutively
        if num_streaks > STREAK_TO_END:
            break

        # Update parameters
        explore_rate = get_explore_rate(episode)
        learning_rate = get_learning_rate(episode)

    if ENABLE_UPLOAD:
        env.monitor.close()
        gym.upload('/tmp/cart_pole_q_learning_4D',
                   api_key='sk_93AMQvdmReWCi8pdL4m6Q')
Example #35
    info['env_id'] = env.spec.id

    # ------------------------------------------


    def noisy_evaluation(theta):
        agent = BinaryActionLinearPolicy(theta)
        rew, T = do_rollout(agent, env, num_steps)
        return rew

    # Train the agent, and snapshot each stage
    for (i, iterdata) in enumerate(
            cem(noisy_evaluation, np.zeros(env.observation_space.shape[0] + 1),
                **params)):
        print 'Iteration %2i. Episode mean reward: %7.3f' % (
            i, iterdata['y_mean'])
        agent = BinaryActionLinearPolicy(iterdata['theta_mean'])
        if args.display: do_rollout(agent, env, 200, render=True)
        writefile('agent-%.4i.pkl' % i, cPickle.dumps(agent, -1))

    # Write out the env at the end so we store the parameters of this
    # environment.
    writefile('info.json', json.dumps(info))

    env.monitor.close()

    logger.info(
        "Successfully ran RandomAgent. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results."
    )
    gym.upload(outdir, algorithm_id='cem')
    def performMultipleEpisodes(self, numEpisodes):  #helper for performing
        #multiple episodes
        for episode in range(numEpisodes):
            self.performEpisode()


#main process

if __name__ == "__main__":
    #apiKey = sys.argv[1]
    #numEpisodes = sys.argv[2]
    #tests
    #stateVec = np.array([1,2,3,4])
    #action = 1
    #print firstIntWithPolynomials(stateVec,action)
    #testQ = LinearQFunc(linearBasisFunc)
    #weightVec = np.array([0,0,0,0,1,1,1,1])
    #print testQ.q(stateVec,action,weightVec)
    #print testQ.gradQ(stateVec,action,weightVec)
    epsilon = .001
    alpha = .5
    gamma = 1
    newInteraction = AgentEnvironmentInteraction("CartPole-v0", alpha, gamma,
                                                 epsilon,
                                                 firstInteractionBasisFunc,
                                                 "../submission/cp-e-16")
    newInteraction.performMultipleEpisodes(800)
    newInteraction.env.close()
    gym.upload("../submission/cp-e-16", api_key=sys.argv[1])
 def close_and_upload(self, api_key):
     self.env.close()
     gym.upload(self.monitor_dir, api_key=api_key)
    def _exec_loop(self, num_episodes: int, frame_skip: int, warmup_steps: int,
                   render_every_n: int, train: bool, upload: bool):
        """Train the agent in the environment for a specified number of episodes.

        :param num_episodes: Terminate training after this many new episodes are observed
        :param warmup_steps: Individual steps to take in the environment before training begins
        :param render_every_n: Render every nth episode
        :param upload: Upload training results to OpenAI Gym site?
        """

        if frame_skip <= 0:
            raise ValueError(
                f'Invalid value of {frame_skip} for `frame_skip`. Value must be a positive integer.'
            )

        # TODO: pass metrics during training
        # TODO: wire & unwire events

        self._status = self._build_status()
        self._raise_execution_start_event()

        if upload:
            assert self.api_key, 'An API key must be specified before uploading training results.'
            monitor_path = mkdtemp()
            self.env = gym.wrappers.Monitor(self.env, monitor_path)

        self._raise_warmup_start_event()

        # If 0 or None is passed, disable rendering
        if not render_every_n:
            render_every_n = num_episodes + 1

        try:
            self._status.total_steps = 0

            for episode_count in range(1, num_episodes + 1):
                # Initial counters for the episode
                self._status.episode = episode_count
                self._status.render = episode_count % render_every_n == 0
                self._status.episode_done = False
                self._status.step = 0
                total_episode_error = 0

                self._raise_episode_start_event()

                s = self.env.reset()  # Get initial state observation

                while not self._status.episode_done:
                    # End the warmup period as soon as the required number of steps have been taken
                    if self._status.total_steps == warmup_steps:
                        self._raise_warmup_end_event()

                    s = np.asarray(s)

                    self._raise_step_start_event(s=s)

                    # Action replay.  We repeat the selected action for n steps.
                    a = self.choose_action(s.reshape(1, -1))

                    if isinstance(self.env.action_space, gym.spaces.Box):
                        a = a.reshape(self.env.action_space.shape)

                    r = 0

                    # Replay the selected action as necessary
                    # Accumulate the reward from each action, but don't let the agent observe the intermediate states
                    for i in range(frame_skip):
                        if self._status.render:
                            self.env.render()

                        # Take action and observe reward and new state
                        s_prime, r_new, episode_done, _ = self.env.step(a)

                        # Some environments return reward as an array, flatten into a float for consistency
                        if isinstance(r_new, np.ndarray):
                            r_new = np.sum(r_new)

                        # Discount the reward received by taking each action, after the first.
                        r += r_new * np.power(self.gamma, i)

                        if episode_done:
                            break

                    self._status.action = a
                    self._status.step += 1
                    self._status.reward = r
                    self._status.total_steps += 1

                    s, a, r, s_prime, episode_done = self.preprocess_observation(
                        s, a, r, s_prime, episode_done)
                    self._status.episode_done = episode_done

                    # Store the experience before setting early termination flag.
                    # Just because we end early in a state on one trajectory doesn't mean that state should always
                    # be treated as a terminal state.
                    if self.memory is not None:
                        self.memory.append((s, a, r, s_prime, episode_done))

                    # Force the episode to end if we've reached the maximum number of steps allowed
                    if self.max_steps_per_episode and self._status.step >= self.max_steps_per_episode:
                        self._status.episode_done = True

                    self._raise_step_end_event(s=s, s_prime=s_prime, a=a, r=r)

                    # Train the agent's model(s) if necessary
                    if train:
                        self._raise_train_start_event()

                        stats = self._update_weights()
                        assert isinstance(stats, dict) or stats is None, \
                            'Value of {} returned by _update_weights() is not a dictionary or None'.format(stats)

                        if stats is not None:
                            self._status.update(stats)

                        self._raise_train_end_event()

                    s = s_prime

                self._raise_episode_end_event(total_error=total_episode_error)

            self.env.close()

        except KeyboardInterrupt:
            return
        finally:
            if upload:
                gym.upload(monitor_path, api_key=self.api_key)
                rmtree(monitor_path)  # Cleanup the temp dir

            self._raise_execution_end_event()
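
The action-repeat loop in the snippet above (the block commented "Action replay") repeats one chosen action for frame_skip steps, accumulates the rewards with a gamma discount after the first repeat, and only exposes the final state to the agent. A standalone sketch of that idea, with env, a, frame_skip, and gamma as placeholder inputs and a hypothetical helper name:

import numpy as np

def repeat_action(env, a, frame_skip, gamma):
    # Hypothetical helper mirroring the frame-skip loop above: repeat the
    # action, sum the (discounted) rewards, and return only the last state.
    r_total, s_prime, done = 0.0, None, False
    for i in range(frame_skip):
        s_prime, r_new, done, _ = env.step(a)
        if isinstance(r_new, np.ndarray):  # some envs return reward as an array
            r_new = np.sum(r_new)
        r_total += r_new * np.power(gamma, i)
        if done:
            break
    return s_prime, r_total, done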
Example #39
def uploadSimulation():
    API_KEY = open(keyPath, 'r').read().rstrip()
    gym.upload('OpenAI/' + GAME + "/Data", api_key=API_KEY)
    state = env.reset()
    rAll = 0
    done = False

    # The Q-Table learning algorithm
    while not done:
        # Choose an action by greedily (with noise) picking from Q table
        action = np.argmax(Q[state, :] + np.random.randn(1, action_space_n) /
                           (i + 1))

        # Get new state and reward from environment
        new_state, reward, done, _ = env.step(action)

        # Update Q-Table with new knowledge using learning rate
        Q[state, action] = (1 - learning_rate) * Q[state, action] \
            + learning_rate * (reward + dis * np.max(Q[new_state, :]))

        rAll += reward
        state = new_state

    rList.append(rAll)

env.close()
gym.upload("gym-results", api_key="sk_VT2wPcSSOylnlPORltmQ")

print("Score over time: " + str(sum(rList) / num_episodes))
print("Final Q-Table Values")
print(Q)
plt.bar(range(len(rList)), rList, color="blue")
plt.show()
Example #41
        total_reward += reward
        agent.store_experience(state, action, reward, next_state, done)
        loss += agent.train()
        state = next_state
        step += 1
        if done:
            break
#    print("episode: ", episode, "total reward: ", total_reward)
    logger_train.log_episode(total_reward, loss, episode)
#
#    if episode == MAX_EPISODES-1:
#        print("Test Period")
#        for i in range(MAX_EPISODES_TEST):
#            total_reward_test = 0
#            loss_test = 0
#            step_test = 0
#            state_test = env.reset()
#            while True:
#                env.render()
#                action_test = agent.set_action(state_test)
#                next_state_test,reward_test,done_test,info_test = env.step(action_test)
#                total_reward_test += reward_test
#                state_test = next_state_test
#                step_test += 1
#                if done_test:
#                    break
#            print("episode: ", i, "total reward: ", total_reward_test)

env.close()
gym.upload('upload/' + GAME, api_key='sk_ocA2j8g2QyqixgtrtVbOSA')
Example #42
                              Q[current, action])

    rewards.append(t_reward)
    iterations.append(i)

# Close environment
env.close()


# Plot results
def chunk_list(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]


size = episodes // 50
chunks = list(chunk_list(rewards, size))
averages = [sum(chunk) / len(chunk) for chunk in chunks]

plt.plot(list(range(0, len(rewards), size)), averages)
plt.xlabel('Episode')
plt.ylabel('Average Reward')
plt.show()

# Push solution
api_key = os.environ.get('GYM_API_KEY', False)
if api_key:
    print('Push solution? (y/n)')
    if input().lower() == 'y':
        gym.upload(folder, api_key=api_key)
Example #43
                feed_dict={
                    state_ph: np.asarray([elem[0] for elem in minibatch]),
                    action_ph: np.asarray([elem[1] for elem in minibatch]),
                    reward_ph: np.asarray([elem[2] for elem in minibatch]),
                    next_state_ph: np.asarray([elem[3] for elem in minibatch]),
                    is_not_terminal_ph:
                    np.asarray([elem[4] for elem in minibatch]),
                    is_training_ph: True
                })

            # update slow actor and critic targets towards current actor and critic
            _ = sess.run(update_slow_targets_op)

        observation = next_observation
        total_steps += 1
        steps_in_ep += 1

        if done:
            # Increment episode counter
            _ = sess.run(episode_inc_op)
            break

    episode_returns.append(total_reward)
    print('Episode %2i, Reward: %7.3f, Steps: %i, avg return: %.2f' %
          (ep, total_reward, steps_in_ep, np.mean(episode_returns)))

# Finalize and upload results
writefile('info.json', json.dumps(info))
env.close()
gym.upload(outdir)
Example #44
                stats["KL between old and new distribution"] = kloldnew
                stats["Surrogate loss"] = surrafter
                for k, v in stats.iteritems():
                    print(k + ": " + " " * (40 - len(k)) + str(v))
                if entropy != entropy:
                    exit(-1)
                if exp > 0.8:
                    self.train = False
            i += 1

training_dir = tempfile.mkdtemp()
logging.getLogger().setLevel(logging.DEBUG)

if len(sys.argv) > 1:
    task = sys.argv[1]
else:
    task = "RepeatCopy-v0"

env = envs.make(task)
env.monitor.start(training_dir)

env = SpaceConversionEnv(env, Box, Discrete)

agent = TRPOAgent(env)
agent.learn()
env.monitor.close()
gym.upload(training_dir,
           algorithm_id='trpo_ff')


Example #45
    trainer = Trainer(
        agent, 
        gamma=0.95,
        learning_rate=0.1, learning_rate_decay=learning_decay, 
        epsilon=1.0, epsilon_decay=epsilon_decay,
        max_step=250)

    if monitor:
        env.monitor.start(RECORD_PATH)

    trainer.train(env, episode_count=episodes, render=render)

    if monitor:
        env.monitor.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="train & run cartpole ")
    parser.add_argument("--episode", type=int, default=1000, help="episode to train")
    parser.add_argument("--render", action="store_true", help="render the screen")
    parser.add_argument("--monitor", action="store_true", help="monitor")
    parser.add_argument("--upload", type=str, default="", help="upload key to openai gym (training is not executed)")

    args = parser.parse_args()

    if args.upload:
        if os.path.isdir(RECORD_PATH):
            gym.upload(RECORD_PATH, api_key=args.upload)
    else:
        main(args.episode, args.render, args.monitor)
Example #46
File: main.py Project: xtmachine/rl
    dim_theta = (env.observation_space.shape[0]+1) * env.action_space.n
elif isinstance(env.action_space, Box):
    dim_theta = (env.observation_space.shape[0]+1) * env.action_space.shape[0]
else:
    raise NotImplementedError

# Initialize mean and standard deviation
theta_mean = np.zeros(dim_theta)
theta_std = np.ones(dim_theta)

# Now, for the algorithm
env.monitor.start('/tmp/pendulum', force = True)		

for iteration in xrange(n_iter):
    # Sample parameter vectors
    thetas = np.random.normal(theta_mean, theta_std, (batch_size, dim_theta))
    rewards = [noisy_evaluation(theta) for theta in thetas]
    # Get elite parameters
    n_elite = int(batch_size * elite_frac)
    elite_inds = np.argsort(rewards)[batch_size - n_elite:batch_size]
    elite_thetas = [thetas[i] for i in elite_inds]
    # Update theta_mean, theta_std
    theta_mean = np.mean(elite_thetas, axis = 0)
    theta_std = np.std(elite_thetas, axis = 0)
    print "iteration %i. mean f: %8.3g. max f: %8.3g"%(iteration, np.mean(rewards), np.max(rewards))
    do_episode(make_policy(theta_mean), env, num_steps, render=True)

env.monitor.close()

gym.upload('/tmp/pendulum', api_key='API_KEY ')
Example #47
n_episodes = 4000
alpha = 0.4
gamma = 0.9
epsilon_decay_rate = 0.01

# Apply SARSA algorithm with linear epsilon decay.
# Example submissions:
# https://gym.openai.com/evaluations/eval_9UzqaZ5RgyQZrNePfe5Ww
# https://gym.openai.com/evaluations/eval_ZXo8CfLQa6SOrPtY8e5w

for episode_idx in range(n_episodes):
    s = env.reset()
    done = False
    policy = eps_greedy_policy(Q, max(1 - episode_idx * epsilon_decay_rate, 0),
                               actions)
    a = policy(s)

    while True:
        s_star, reward, done, _ = env.step(a)
        if done:
            Q[s, a] += alpha * (reward - Q[s, a])
            break
        a_star = policy(s_star)
        Q[s, a] += alpha * (reward + gamma * Q[s_star, a_star] - Q[s, a])
        s = s_star
        a = a_star

env.close()
gym.upload('first_visit', api_key='sk_4jIycd3IT1SyJLj3d2mFxw')
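
eps_greedy_policy is defined elsewhere in the original script. A minimal sketch consistent with how it is called above, assuming Q is a defaultdict keyed by (state, action) pairs and actions is a list of valid actions; the body below is an illustration, not the original implementation:

import random

def eps_greedy_policy(Q, epsilon, actions):
    def policy(state):
        if random.random() < epsilon:
            return random.choice(actions)               # explore
        return max(actions, key=lambda a: Q[state, a])  # exploit the current Q estimates
    return policy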
Example no. 48
    env = gym.make('CartPole-v0')
    agent = RandomAgent(env.action_space)

    # You provide the directory to write to (can be an existing
    # directory, but can't contain previous monitor results. You can
    # also dump to a tempdir if you'd like: tempfile.mkdtemp()).
    outdir = '/tmp/random-agent-results'
    env.monitor.start(outdir, force=True)

    episode_count = 100
    max_steps = 200
    reward = 0
    done = False

    for i in range(episode_count):
        ob = env.reset()

        for j in range(max_steps):
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            if done:
                break

    # Dump result info to disk
    env.monitor.close()

    # Upload to the scoreboard. We could also do this from another
    # process if we wanted.
    logger.info("Successfully ran RandomAgent. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results.")
    gym.upload(outdir, algorithm_id='random')
Example no. 49
def Upload():

    # Upload training record
    gym.upload(RECORD_DIR + RECORD_FILENAME, api_key=API_KEY)
Example no. 50
        env = self.env
        ret = []
        for o, r, d in zip(observation_n, reward_n, done_n):
            o = env.observation_convert(o, env._env.observation_space, env.observation_space)  
            obs = np.expand_dims(o, 0)
            action_dist_n = self.session.run(self.action_dist_n, {self.obs: obs})
            action = int(np.argmax(action_dist_n, 1)[0])
            action = env.action_convert(action, env.action_space, env._env.action_space)
            ret.append(action)
        return ret


experiment_dir = tempfile.mkdtemp()
logging.getLogger().setLevel(logging.DEBUG)
print ("taks = {}".format(args.task))
env = envs.make(args.task)


env.monitor.start(experiment_dir)

agent = ContinTRPOAgent(env)
agent.learn()
env.monitor.close()
gym.upload(experiment_dir, algorithm_id=algo)


print (experiment_dir)

from sys import argv
print ('python {}'.format(' '.join(argv)))
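
Note that the act method at the start of this example always takes the argmax of action_dist_n, i.e. it acts greedily with respect to the policy. During training one would usually sample from the distribution instead; a small illustrative helper for that variant (an addition, not the project's code):

import numpy as np

def sample_action(action_dist_n):
    # action_dist_n is assumed to have shape (1, n_actions) with rows summing to 1.
    probs = np.asarray(action_dist_n)[0]
    return int(np.random.choice(len(probs), p=probs))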
Example no. 51
    def submit_result(self, algo_id, api_key):
        gym.upload(self.monitor_file,
                   algorithm_id=algo_id,
                   api_key=api_key,
                   ignore_open_monitors=False)
Example no. 52
def uploadSimulation():
    API_KEY = open('/home/dollarakshay/Documents/API Keys/Open AI Key.txt', 'r').read().rstrip()
    gym.upload('OpenAI/'+GAME+"/Data", api_key=API_KEY)
Example no. 53
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    env = gym.make(args.env)
    outdir = '/tmp/' + args.outdir + '-' + args.env
    #env.monitor.start(outdir, force=True)

    print("******* WILL SAVE RESULTS TO", outdir, " *******")

    sess = tf.Session()

    in_dim = flatten_space(env.observation_space)
    out_dim = flatten_space(env.action_space)
    hidden_dim = 8
    opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    policy = CategoricalPolicy(in_dim, out_dim, hidden_dim, opt, sess)
    po = PolicyOptimizer(env, policy, 0, args.n_iter, args.n_episode,
                         args.path_length)

    sess.run(tf.initialize_all_variables())

    # train the policy optimizer
    po.train()

    env.monitor.close()

    # make sure to setup your OPENAI_GYM_API_KEY environment variable
    if args.upload:
        gym.upload(outdir, algorithm_id=args.algorithm)
Example no. 54
    # You provide the directory to write to (can be an existing
    # directory, but can't contain previous monitor results. You can
    # also dump to a tempdir if you'd like: tempfile.mkdtemp()).
    outdir = "tmp/c2"
    env.monitor.start(outdir, force=True)

    episode_count = 100
    max_steps = 200
    reward = 0
    done = False

    for i in xrange(episode_count):
        ob = env.reset()
        reward = done = None

        for j in xrange(max_steps):
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            env.render()
            if done:
                break

    # Dump result info to disk
    env.monitor.close()

    # Upload to the scoreboard. We could also do this from another
    # process if we wanted.
    logger.info("Successfully ran CodedAgent. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results.")
    gym.upload(outdir, algorithm_id='coded', api_key='YOUR API_KEY')
Example no. 55
import gym
api='sk_bomf7HJCRRul6yEOGkCqdw'
gym.upload('/tmp/cartpole-experiment-1', api_key=api)
Example no. 56
# TO DO:
for j in range(n_episodes):
    done = False
    state = env.reset()
    policy = epsilon_greedy_policy(Q, epsilon=1. / (j + 1), actions=actions)
    episode = []
    ### Generate sample episode
    while not done:
        action = policy(state)
        new_state, reward, done, _ = env.step(action)
        episode.append((state, action, reward))
        print((state, action, reward))
        state = new_state

    ### NOT RELEVANT FOR SARSA/Q-LEARNING
    sa_in_episode = set([(x[0], x[1]) for x in episode])

    # Find first visit of each s,a in the episode
    for s, a in sa_in_episode:
        first_visit = next(i for i, x in enumerate(episode)
                           if x[0] == s and x[1] == a)

        G = sum(x[2] * (gamma**i) for i, x in enumerate(episode[first_visit:]))
        R[s, a] += G
        N[s, a] += 1
        Q[s, a] = R[s, a] / N[s, a]  # Q is the average first-visit return for (s, a)

env.close()

gym.upload('first_visit', api_key=API_KEY)
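
The R and N tables above keep, per (state, action) pair, the running sum and count of first-visit returns, so Q ends up as their ratio. The same average can be kept incrementally without a separate R table; a small equivalent sketch, shown only for comparison:

def first_visit_mc_update(Q, N, s, a, G):
    # Incremental mean: after n updates Q[s, a] equals the average of the n returns seen.
    N[s, a] += 1
    Q[s, a] += (G - Q[s, a]) / N[s, a]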
Example no. 57
    # This declaration must go *after* the monitor call, since the
    # monitor's seeding creates a new action_space instance with the
    # appropriate pseudorandom number generator.
    agent = RandomAgent(env.action_space)

    episode_count = 100
    max_steps = 200
    reward = 0
    done = False

    for i in range(episode_count):
        ob = env.reset()

        for j in range(max_steps):
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            if done:
                break
            # Note there's no env.render() here. But the environment still can open window and
            # render if asked by env.monitor: it calls env.render('rgb_array') to record video.
            # Video is not recorded every episode, see capped_cubic_video_schedule for details.

    # Dump result info to disk
    env.monitor.close()

    # Upload to the scoreboard. We could also do this from another
    # process if we wanted.
    logger.info("Successfully ran RandomAgent. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results.")
    gym.upload(outdir)
Example no. 58
    # This declaration must go *after* the monitor call, since the
    # monitor's seeding creates a new action_space instance with the
    # appropriate pseudorandom number generator.
    agent = RandomAgent(env.action_space)

    episode_count = 1000
    max_steps = 200
    reward = 0
    done = False

    for i in range(episode_count):
        ob = env.reset()

        for j in range(max_steps):
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            if done:
                break
            # Note there's no env.render() here. But the environment still can open window and
            # render if asked by env.monitor: it calls env.render('rgb_array') to record video.
            # Video is not recorded every episode, see capped_cubic_video_schedule for details.

    # Dump result info to disk
    env.monitor.close()

    # Upload to the scoreboard. We could also do this from another
    # process if we wanted.
    logger.info("Successfully ran RandomAgent. Now trying to upload results to the scoreboard. If it breaks, you can always just try re-uploading the same results.")
    gym.upload(outdir,api_key='sk_A1BpYdknRCFWZDAIFWSew')
Example no. 59
import gym
gym.upload('/tmp/cartpole-experiment-1', api_key='sk_J3iun2y7RvKPIZtvgmyJXw')
Example no. 60
from __future__ import print_function
from __future__ import absolute_import
import argparse
import os
import os.path as osp
import gym
from rllab.viskit.core import load_params

if __name__ == "__main__":
    # rl_gym.api_key = 'g8JOpnNVmcjMShBiFtyji2VWX3P2uCzc'
    if 'OPENAI_GYM_API_KEY' not in os.environ:
        raise ValueError("OpenAi Gym API key not configured. Please register an account on https://gym.openai.com and"
                         " set the OPENAI_GYM_API_KEY environment variable, and try the script again.")

    parser = argparse.ArgumentParser()
    parser.add_argument('log_dir', type=str,
                        help='path to the logging directory')
    parser.add_argument('--algorithm_id', type=str, default=None, help='Algorithm ID')
    args = parser.parse_args()
    snapshot_dir = osp.abspath(osp.join(args.log_dir, ".."))
    params_file_path = osp.join(snapshot_dir, "params.json")
    params_json = load_params(params_file_path)
    gym.upload(args.log_dir, algorithm_id=args.algorithm_id)
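
params_json is loaded here but not otherwise used in this excerpt, so presumably the original script reads experiment metadata from it. One hedged way to fall back to a value from params.json when --algorithm_id is not supplied; the 'exp_name' key is an assumption, not something this file is known to contain:

    algorithm_id = args.algorithm_id
    if algorithm_id is None and isinstance(params_json, dict):
        algorithm_id = params_json.get("exp_name")  # hypothetical metadata key
    gym.upload(args.log_dir, algorithm_id=algorithm_id)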