Example #1
    def playonce(self, noise_level, _env):
        t = time.time()

        skip = ENV_SKIP
        env = fastenv(_env, skip)

        noise_source = one_fsq_noise()
        for j in range(200):
            noise_source.one((DIM_ACTION * 2,), noise_level)

        state = env.reset()

        n_steps = 0
        ep_reward = 0
        warmup = BATCH_SIZE * 32

        noise_phase = int(np.random.uniform() * 999999)

        while True:
            action = self.agent.select_action(state)

            phased_noise_anneal_duration = 100
            phased_noise_amplitude = ((-noise_phase-n_steps) % phased_noise_anneal_duration) / phased_noise_anneal_duration
            phased_noise_amplitude = max(0, phased_noise_amplitude * 2 - 1)
            phased_noise_amplitude = max(0.01, phased_noise_amplitude ** 2)

            exploration_noise = noise_source.one((DIM_ACTION * 2,), noise_level) * phased_noise_amplitude
            action += exploration_noise * 0.5
            action = np.clip(action, 0, 1)

            next_state, reward, done, info = env.step(action.tolist())
            self.agent.memory.push(deepcopy_all(state, action, [reward], next_state, [float(done)]))

            if len(self.agent.memory) >= warmup:
                with self.lock:
                    self.agent.learn()

            state = next_state
            ep_reward += reward
            n_steps += 1

            if done:
                break

        with self.lock:
            t = time.time() - t
            print('reward: {}, n_steps: {}, explore: {:.5f}, n_mem: {}, time: {:.2f}' \
                  .format(ep_reward, n_steps, noise_level, len(self.agent.memory), t))

            global t0
            self.plotter.pushys([ep_reward, noise_level, (time.time() - t0) % 3600 / 3600 - 2])

        _env.rel()
        del env
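The phased exploration-noise amplitude computed inside the loop above follows a sawtooth-like schedule. As a rough standalone sketch (plain NumPy, constants copied from the example rather than imported from the original module), the schedule can be inspected like this:

import numpy as np

# Sketch of the phased noise-amplitude schedule used in playonce()
# (constants copied from the example above; noise_phase is re-drawn per episode).
phased_noise_anneal_duration = 100
noise_phase = int(np.random.uniform() * 999999)

for n_steps in range(0, 200, 10):
    amp = ((-noise_phase - n_steps) % phased_noise_anneal_duration) / phased_noise_anneal_duration
    amp = max(0, amp * 2 - 1)    # nonzero only for roughly half of each 100-step window
    amp = max(0.01, amp ** 2)    # quadratic decay, with a small 0.01 exploration floor
    print(n_steps, round(amp, 3))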
Example #2
    def __init__(self, args):
        self.rpm = rpm(1000000)
        self.render = True
        self.training = True
        self.noise_source = one_fsq_noise()

        self.train_multiplier = args.train_multiplier
        self.inputdims = args.observation_space_dims

        low = 0.0
        high = 1.0
        num_of_actions = args.action_space
        self.action_bias = high / 2.0 + low / 2.0
        self.action_multiplier = high - self.action_bias

        def clamper(actions):
            return np.clip(actions, a_max=high, a_min=low)

        self.clamper = clamper

        self.outputdims = args.action_space
        self.discount_factor = args.gamma
        ids, ods = self.inputdims, self.outputdims
        print('inputdims:{}, outputdims:{}'.format(ids, ods))

        self.actor = models.create_actor_network(ids, ods).cuda()
        self.critic = models.create_critic_network(ids, ods).cuda()
        self.actor_target = models.create_actor_network(ids, ods).cuda()
        self.critic_target = models.create_critic_network(ids, ods).cuda()
        self.critic_criterion = nn.MSELoss().cuda()

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=3e-4)

        self.plot_epoch = [0]
        self.plot_reward = [0]
        import threading as th
        self.lock = th.Lock()
Example #3
    def playonce(self, noise_level, _env):
        t = time.time()

        skip = ENV_SKIP
        env = fastenv(_env, skip)

        noise_source = one_fsq_noise()
        for j in range(200):
            noise_source.one((DIM_ACTION, ), noise_level)

        state = env.reset()
        info = {'step': 0}

        n_steps = 0
        ep_reward = 0
        warmup = BATCH_SIZE * 32

        noise_phase = int(np.random.uniform() * 999999)

        while True:
            action = self.agent.select_action(state)

            phased_noise_anneal_duration = 100
            phased_noise_amplitude = (
                (-noise_phase - n_steps) %
                phased_noise_anneal_duration) / phased_noise_anneal_duration
            phased_noise_amplitude = max(0, phased_noise_amplitude * 2 - 1)
            phased_noise_amplitude = max(0.01, phased_noise_amplitude**2)

            exploration_noise = noise_source.one(
                (DIM_ACTION, ), noise_level) * phased_noise_amplitude
            action += exploration_noise * 0.5
            action = np.clip(action, 0, 1)

            next_state, reward, done, info = env.step(action.tolist())
            # a time-limit cutoff is stored as non-terminal so the critic keeps bootstrapping
            done1 = False if info['step'] == MAX_EP_STEPS else done

            # each step stores two transitions: the original, plus either a mirrored
            # copy (via mirror_s / mirror_a, after the first 100 / ENV_SKIP agent steps)
            # or a second copy of the original
            self.agent.memory.push(
                deepcopy_all(state, action, [reward], next_state, [done1]))
            if n_steps >= 100 / ENV_SKIP:
                self.agent.memory.push(
                    deepcopy_all(mirror_s(state), mirror_a(action), [reward],
                                 mirror_s(next_state), [done1]))
            else:
                self.agent.memory.push(
                    deepcopy_all(state, action, [reward], next_state, [done1]))

            if len(self.agent.memory) >= warmup:
                with self.lock:
                    self.agent.learn()

            state = next_state
            ep_reward += reward
            n_steps += 1

            if done:
                break

        with self.lock:
            t = time.time() - t
            print('reward: {}, n_steps: {}, explore: {:.5f}, n_mem: {}, time: {:.2f}' \
                  .format(ep_reward, info['step'], noise_level, len(self.agent.memory), t))

        _env.rel()
        del env
Example #4
    def __init__(
        self,
        observation_space_dims,
        action_space,
        stack_factor=1,
        discount_factor=.99,  # gamma
        # train_skip_every=1,
        train_multiplier=1,
    ):
        self.rpm = rpm(1000000)  # 1M history
        self.plotter = plotter(num_lines=3)
        self.render = True
        self.training = True
        self.noise_source = one_fsq_noise()
        self.train_counter = 0
        # self.train_skip_every = train_skip_every
        self.train_multiplier = train_multiplier
        self.observation_stack_factor = stack_factor

        self.inputdims = observation_space_dims * self.observation_stack_factor
        # assume observation_space is continuous

        self.is_continuous = True if isinstance(action_space, Box) else False

        if self.is_continuous:  # if action space is continuous

            low = action_space.low
            high = action_space.high

            num_of_actions = action_space.shape[0]

            self.action_bias = high / 2. + low / 2.
            self.action_multiplier = high - self.action_bias

            # say high,low -> [2,7], then bias -> 4.5
            # mult = 2.5. then [-1,1] multiplies 2.5 + bias 4.5 -> [2,7]

            def clamper(actions):
                return np.clip(actions,
                               a_max=action_space.high,
                               a_min=action_space.low)

            self.clamper = clamper
        else:
            num_of_actions = action_space.n

            self.action_bias = .5
            self.action_multiplier = .5  # map (-1,1) into (0,1)

            def clamper(actions):
                return np.clip(actions, a_max=1., a_min=0.)

            self.clamper = clamper

        self.outputdims = num_of_actions
        self.discount_factor = discount_factor
        ids, ods = self.inputdims, self.outputdims
        print('inputdims:{}, outputdims:{}'.format(ids, ods))

        self.actor = self.create_actor_network(ids, ods)
        self.critic = self.create_critic_network(ids, ods)
        self.actor_target = self.create_actor_network(ids, ods)
        self.critic_target = self.create_critic_network(ids, ods)

        # print(self.actor.get_weights())
        # print(self.critic.get_weights())

        self.feed, self.joint_inference, sync_target = self.train_step_gen()

        sess = ct.get_session()
        sess.run(tf.global_variables_initializer())

        sync_target()

        import threading as th
        self.lock = th.Lock()

        if not hasattr(self, 'wavegraph'):
            num_waves = self.outputdims * 2 + 1

            def rn():
                r = np.random.uniform()
                return 0.2 + r * 0.4

            colors = []
            for i in range(num_waves - 1):
                color = [rn(), rn(), rn()]
                colors.append(color)
            colors.append([0.2, 0.5, 0.9])
            self.wavegraph = wavegraph(num_waves, 'actions/noises/Q',
                                       np.array(colors))
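The bias/multiplier comment above describes how an actor output in [-1, 1] is rescaled into the environment's action range; the rescaling itself happens in agent code not shown here. A quick standalone check with the hypothetical low/high values from that comment:

import numpy as np

# Hypothetical action bounds taken from the comment above (low=2, high=7).
low, high = np.array([2.]), np.array([7.])
action_bias = high / 2. + low / 2.        # -> [4.5]
action_multiplier = high - action_bias    # -> [2.5]

raw_actions = np.array([-1., 0., 1.])     # actor outputs assumed to lie in [-1, 1]
env_actions = raw_actions * action_multiplier + action_bias
print(env_actions)                        # [2.  4.5 7. ]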
Example #5
    def play(self,
             env,
             max_steps=-1,
             realtime=False,
             noise_level=0.):  # play 1 episode
        timer = time.time()
        noise_source = one_fsq_noise()

        for j in range(10):
            noise_source.one((self.outputdims, ), noise_level)

        max_steps = max_steps if max_steps > 0 else 50000
        steps = 0
        total_reward = 0
        episode_memory = []

        # removed: state stacking
        # moved: observation processing

        try:
            observation = env.reset()
        except Exception as e:
            print('(agent) something wrong on reset(). episode terminates now')
            traceback.print_exc()
            print(e)
            return

        while steps <= max_steps:
            steps += 1

            observation_before_action = observation  # s1

            exploration_noise = noise_source.one((self.outputdims, ),
                                                 noise_level)
            # exploration_noise -= noise_level * 1

            # self.lock.acquire() # please do not disrupt.
            action = self.act(observation_before_action,
                              exploration_noise)  # a1
            # self.lock.release()

            if self.is_continuous:
                # add noise to our actions, since our policy by nature is deterministic
                exploration_noise *= self.action_multiplier
                # print(exploration_noise,exploration_noise.shape)
                action += exploration_noise
                action = self.clamper(action)
                action_out = action
            else:
                raise RuntimeError(
                    'this version of ddpg is for continuous only.')

            # o2, r1,
            try:
                observation, reward, done, _info = env.step(
                    action_out)  # take long time
            except Exception as e:
                print(
                    '(agent) something wrong on step(). episode terminates now')
                traceback.print_exc()
                print(e)
                return

            # d1
            isdone = 1 if done else 0
            total_reward += reward

            # feed into replay memory
            if self.training == True:
                episode_memory.append((observation_before_action, action,
                                       reward, isdone, observation))

                # don't feed here since you never know whether the episode will complete without error.
                # self.feed_one((
                #     observation_before_action,action,reward,isdone,observation
                # )) # s1,a1,r1,isdone,s2
                # self.lock.acquire()
                self.train(verbose=2 if steps == 1 else 0)
                # self.lock.release()

            # if self.render==True and (steps%30==0 or realtime==True):
            #     env.render()
            if done:
                break

        # print('episode done in',steps,'steps',time.time()-timer,'second total reward',total_reward)
        totaltime = time.time() - timer
        print(
            'episode done in {} steps in {:.2f} sec, {:.4f} sec/step, got reward :{:.2f}'
            .format(steps, totaltime, totaltime / steps, total_reward))
        self.lock.acquire()

        for t in episode_memory:
            self.feed_one(t)

        self.plotter.pushys(
            [total_reward, noise_level, (time.time() % 3600) / 3600 - 2])
        # self.noiseplotter.pushy(noise_level)
        self.lock.release()

        return
Example #6
    def play(self,env,max_steps=-1,realtime=False,noise_level=0.): # play 1 episode
        timer = time.time()
        noise_source = one_fsq_noise()
        noise_source.skip = 1 # freq adj

        for j in range(200):
            noise_source.one((self.outputdims,),noise_level)

        max_steps = max_steps if max_steps > 0 else 50000
        steps = 0
        total_reward = 0
        episode_memory = []

        # removed: state stacking
        # moved: observation processing

        noise_phase = int(np.random.uniform()*999999)

        try:
            observation = env.reset()
        except Exception as e:
            print('(agent) something wrong on reset(). episode terminates now')
            traceback.print_exc()
            print(e)
            return

        while steps <= max_steps:
            steps +=1

            observation_before_action = observation # s1

            phased_noise_anneal_duration = 100
            # phased_noise_amplitude = ((-noise_phase-steps)%phased_noise_anneal_duration)/phased_noise_anneal_duration*2*np.pi
            # phased_noise_amplitude = max(0.1,np.sin(phased_noise_amplitude))

            phased_noise_amplitude = ((-noise_phase-steps)%phased_noise_anneal_duration)/phased_noise_anneal_duration
            phased_noise_amplitude = max(0,phased_noise_amplitude*2-1)
            phased_noise_amplitude = max(0.01,phased_noise_amplitude**2)

            exploration_noise = noise_source.one((self.outputdims,),noise_level)*phased_noise_amplitude
            # exploration_noise = np.random.normal(size=(self.outputdims,))*noise_level*phased_noise_amplitude
            # exploration_noise -= noise_level * 1

            # exploration_noise = np.random.normal(size=(self.outputdims,))*0.
            #
            # # we want to add some shot noise
            # shot_noise_prob = min(1, noise_level/5) # 0.05 => 1% shot noise
            # shot_noise_replace = (np.random.uniform(size=exploration_noise.shape)<shot_noise_prob).astype('float32') # 0 entries passes thru, 1 entries shot noise.
            #
            # shot_noise_amplitude = np.random.uniform(size=exploration_noise.shape)*2-1
            # # [-1, 1]
            # # add shot noise!
            # exploration_noise = exploration_noise*(1-shot_noise_replace) + shot_noise_amplitude*shot_noise_replace

            # self.lock.acquire() # please do not disrupt.
            action = self.act(observation_before_action, exploration_noise) # a1
            # self.lock.release()

            if self.is_continuous:
                # add noise to our actions, since our policy by nature is deterministic
                exploration_noise *= self.action_multiplier
                # print(exploration_noise,exploration_noise.shape)
                action += exploration_noise
                action = self.clamper(action) # don't clamp, see what happens.
                action_out = action
            else:
                raise RuntimeError('this version of ddpg is for continuous only.')

            # o2, r1,
            try:
                observation, reward, done, _info = env.step(action_out) # take long time
            except Exception as e:
                print('(agent) something wrong on step(). episode terminates now')
                traceback.print_exc()
                print(e)
                return

            # d1
            isdone = 1 if done else 0
            total_reward += reward

            # feed into replay memory
            if self.training == True:
                # episode_memory.append((
                #     observation_before_action,action,reward,isdone,observation
                # ))

                # don't feed here since you never know whether the episode will complete without error.
                # changed mind: let's feed here since this way the training dynamic is not disturbed
                self.feed_one((
                    observation_before_action,action,reward,isdone,observation
                )) # s1,a1,r1,isdone,s2
                # self.lock.acquire()
                self.train(verbose=2 if steps==1 else 0)
                # self.lock.release()

            # if self.render==True and (steps%30==0 or realtime==True):
            #     env.render()
            if done:
                break

        # print('episode done in',steps,'steps',time.time()-timer,'second total reward',total_reward)
        totaltime = time.time()-timer
        print('episode done in {} steps in {:.2f} sec, {:.4f} sec/step, got reward :{:.2f}'.format(
            steps, totaltime, totaltime / steps, total_reward))
        self.lock.acquire()

        # for t in episode_memory:
        #     if np.random.uniform()>0.5:
        #         self.feed_one(t)

        self.plotter.pushys([total_reward,noise_level,(time.time()%3600)/3600-2,steps/1000-1])
        # self.noiseplotter.pushy(noise_level)
        self.lock.release()

        return
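The disabled "shot noise" block in the loop above mixes occasional uniform spikes into the smooth exploration noise. A standalone illustration of what that commented-out code would do (with placeholder noise_level and shape; the original takes these from the agent):

import numpy as np

# Placeholder values; the original code draws these from the agent and noise source.
noise_level = 0.05
exploration_noise = np.zeros((4,))

shot_noise_prob = min(1, noise_level / 5)  # e.g. 0.05 -> 1% of entries get a spike
shot_noise_replace = (np.random.uniform(size=exploration_noise.shape)
                      < shot_noise_prob).astype('float32')  # 1 where a spike replaces the value
shot_noise_amplitude = np.random.uniform(size=exploration_noise.shape) * 2 - 1  # in [-1, 1]

exploration_noise = (exploration_noise * (1 - shot_noise_replace)
                     + shot_noise_amplitude * shot_noise_replace)
print(exploration_noise)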
Example #7
    def __init__(
        self,
        observation_space,
        action_space,
        stack_factor=1,
        discount_factor=.99,  # gamma
        train_skip_every=1,
    ):
        self.rpm = rpm(1000000)  # 1M history
        self.render = True
        self.noise_source = one_fsq_noise()
        self.train_counter = 0
        self.train_skip_every = train_skip_every
        self.observation_stack_factor = stack_factor

        self.inputdims = observation_space.shape[0] * self.observation_stack_factor
        # assume observation_space is continuous

        self.is_continuous = True if isinstance(action_space, Box) else False

        if self.is_continuous:  # if action space is continuous

            low = action_space.low
            high = action_space.high

            num_of_actions = action_space.shape[0]

            self.action_bias = high / 2. + low / 2.
            self.action_multiplier = high - self.action_bias

            # say high,low -> [2,7], then bias -> 4.5
            # mult = 2.5. then [-1,1] multiplies 2.5 + bias 4.5 -> [2,7]

            def clamper(actions):
                return np.clip(actions,
                               a_max=action_space.high,
                               a_min=action_space.low)

            self.clamper = clamper
        else:
            num_of_actions = action_space.n

            self.action_bias = .5
            self.action_multiplier = .5  # map (-1,1) into (0,1)

            def clamper(actions):
                return np.clip(actions, a_max=1., a_min=0.)

            self.clamper = clamper

        self.outputdims = num_of_actions
        self.discount_factor = discount_factor
        ids, ods = self.inputdims, self.outputdims
        print('inputdims:{}, outputdims:{}'.format(ids, ods))

        self.actor = self.create_actor_network(ids, ods)
        self.critic = self.create_critic_network(ids, ods)
        self.actor_target = self.create_actor_network(ids, ods)
        self.critic_target = self.create_critic_network(ids, ods)

        # print(self.actor.get_weights())
        # print(self.critic.get_weights())

        self.feed, self.joint_inference, sync_target = self.train_step_gen()

        sess = ct.get_session()
        sess.run(tf.global_variables_initializer())

        sync_target()
Example #8
    def run_episode(self,
                    fenv,
                    max_steps=-1,
                    training=False,
                    render=False,
                    noise_level=0.,
                    ac_id=0):
        time_start = time.time()

        noise_source = None
        if noise_level > 0.0:
            noise_source = one_fsq_noise()
            # warm up noise source
            for _ in range(2000):
                noise_source.one((self.outputdims, ), noise_level)

        max_steps = max_steps if max_steps > 0 else 50000
        steps = 0
        total_reward = 0

        try:
            # this might be a remote env
            observation = np.array(fenv.reset())
        except Exception as e:
            print('Bad things during reset. Episode terminated.', e)
            traceback.print_exc()
            return

        while steps <= max_steps:
            steps += 1

            observation_before_action = observation  # s1

            exploration_noise = 0.0
            if noise_level > 0.0:
                exploration_noise = noise_source.one((self.outputdims, ),
                                                     noise_level)

            # get action
            action = None
            with self.lock_swap:
                if training:
                    action = self.get_action(observation_before_action, ac_id)
                else:
                    action = self.get_max_action(observation_before_action)

            # add noise to our actions, since our policy is deterministic
            if noise_level > 0.0:
                exploration_noise *= self.action_multiplier
                action += exploration_noise
            action = self.clamp_action(action)

            # step
            try:
                # can't send receive np arrays over pyro
                action_out = [float(action[i]) for i in range(len(action))]
                observation, reward, done, _info = fenv.step(action_out)
                observation = np.array(observation)
            except Exception as e:
                print('Bad things during step. Episode terminated.', e)
                traceback.print_exc()
                return

            # d1
            isdone = 1 if done else 0
            total_reward += reward

            # train
            if training == True:
                # The code works without this lock, but depending on training speed there is too much noise on updates.
                # The model always trains and is more stable with lock here
                with self.lock:
                    self.append_memory(observation_before_action, action,
                                       reward, isdone,
                                       observation)  # s1,a1,r1,isdone,s2
                    for i in range(self.nr_networks):
                        self.train_batch(i)
            else:
                if render:
                    fenv.render()

            if done:
                break

        totaltime = time.time() - time_start

        if training == True:
            self.global_step += 1
            print(
                self.global_step,
                ': Episode done in {} steps in {:.2f} sec, {:.4f} sec/step, got reward :{:.2f}'
                .format(steps, totaltime, totaltime / steps, total_reward))
            self.history.append_train(total_reward, noise_level, steps)
        else:
            print(
                'Test done in {} steps in {:.2f} sec, {:.4f} sec/step, got reward :{:.2f}'
                .format(steps, totaltime, totaltime / steps, total_reward))
            self.history.append_test(total_reward, self.global_step, steps)
            if render == False:
                # background test
                if total_reward > self.max_reward:
                    self.max_reward = total_reward
                    self.save_weights("max_model")
                    print("Saved new max model with score: ", total_reward)

        return total_reward
Example #9
    def play(self, env, max_steps=-1, realtime=False, noise_Level=0.):
        timer = time.time()
        noise_source = one_fsq_noise()

        for j in range(200):
            noise_source.one((self.outputdims, ), noise_Level)

        max_steps = max_steps if max_steps > 0 else 50000
        steps = 0
        total_reward = 0
        episode_memory = []

        observation = env.reset()
        while steps <= max_steps:
            steps += 1

            observation_before_action = observation

            exploration_noise = noise_source.one((self.outputdims, ),
                                                 noise_Level)

            action = self.act(observation_before_action, exploration_noise)

            # scale the exploration noise and add it to the deterministic action
            exploration_noise *= self.action_multiplier
            action += exploration_noise
            action = self.clamper(action)
            action_out = action

            observation, reward, done, _info = env.step(action_out)

            isdone = 1 if done else 0
            total_reward += reward

            if self.training == True:
                episode_memory.append((observation_before_action, action,
                                       reward, isdone, observation))

                self.train()

            if done:
                break

        totaltime = time.time() - timer
        print(
            'episode done in {} steps in {:.2f} sec, {:.4f} sec/step, got reward :{:.2f}'
            .format(steps, totaltime, totaltime / steps, total_reward))

        self.plot_epoch.append(self.plot_epoch[-1] + 1)
        self.plot_reward.append(total_reward)
        #epoch = range(0,3000)
        #rewards = range(0,6000,2)

        self.lock.acquire()

        for t in episode_memory:
            self.feed_one(t)

        #self.plotter.pushys([total_reward,noise_level,(time.time()%3600)/3600-2])
        # self.noiseplotter.pushy(noise_level)
        self.lock.release()

        return