Example #1
import numpy as np
from tensorforce.agents import PPOAgent


class TensorforceAgent:
    def __init__(self, actions):
        preprocessing_config = [
            {
                "type": "grayscale"
            }
        ]
        exploration_config = dict(
            type="epsilon_anneal",
            initial_epsilon=0.25,
            final_epsilon=0.01,
            timesteps=1000000
        )

        network_spec = [
            dict(type='conv2d', size=16, window=8, stride=4, activation='lrelu'),
            dict(type='conv2d', size=32, window=4, stride=2, activation='lrelu'),
            dict(type='flatten'),
            dict(type='dense', size=256, activation='lrelu')
        ]
        self.network_path = "network/"
        self.agent = PPOAgent(
            actions = dict(type='int', num_actions=len(actions)),
            states = dict(type='float', shape=(35, 150, 3)),
            network = network_spec,
            actions_exploration = exploration_config,
            states_preprocessing = preprocessing_config
        )

    def act(self, obs):
        # Crop the observation down to the 35x150x3 region the network expects
        partly = np.delete(obs, np.s_[96:], 0)
        partly = np.delete(partly, np.s_[0:26], 0)
        partly = np.delete(partly, np.s_[35:45], 0)
        partly = np.delete(partly, np.s_[38:53], 0)
        partly = np.delete(partly, np.s_[31:35], 0)
        partly = np.delete(partly, np.s_[10:16], 0)
        frame = np.delete(partly, np.s_[150:], 1)

        #scipy.misc.imsave('outfile.jpg', frame)

        return self.agent.act(frame)

    def load(self):
        import os
        if os.path.isdir(self.network_path):
            try:
                self.agent.restore_model(self.network_path)
            except Exception:
                print("Failed to load model")

    def observe(self, terminal=False, reward=0):
        return self.agent.observe(terminal, reward)

    def save_model(self):
        import os
        if not os.path.isdir(self.network_path):
            os.makedirs(self.network_path)
        self.agent.save_model(self.network_path)
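# Hypothetical driver loop for the TensorforceAgent wrapper above (my addition,
# not part of the original snippet). It assumes a gym-style environment whose
# raw frames are compatible with the cropping done in act(); the environment
# object, the actions list and the episode count are placeholders.
def run_tensorforce_agent(env, actions, episodes=100):
    agent = TensorforceAgent(actions)
    agent.load()
    for episode in range(episodes):
        obs = env.reset()
        done = False
        while not done:
            action = agent.act(obs)
            obs, reward, done, _ = env.step(action)
            agent.observe(terminal=done, reward=reward)
    agent.save_model()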
import os

import gym
from tensorforce.agents import PPOAgent


def main():
    env = gym.make('CartPole-v0')

    # (4,)
    print(env.observation_space.shape)
    # [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
    print(env.observation_space.high)
    # [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
    print(env.observation_space.low)
    # 2
    print(env.action_space.n)

    agent = PPOAgent(
        states=dict(type='float', shape=env.observation_space.shape),
        network=[
            dict(type='dense', size=32, activation='relu'),
            dict(type='dense', size=32, activation='relu'),
        ],
        actions=dict(type='int', num_actions=env.action_space.n),
        step_optimizer=dict(type='adam', learning_rate=1e-4)
    )

    model_dir = 'models/cartpole'

    if os.path.exists(f'{model_dir}/checkpoint'):
        agent.restore_model(directory=model_dir)

    try:
        for ep in range(2000):
            observation = env.reset()
            done = False
            ep_reward = 0
            while not done:
                # env.render()

                states = observation / 4

                action = agent.act(states=states)

                observation, reward, done, info = env.step(action)

                agent.observe(reward=reward, terminal=done)

                ep_reward += reward

                if done:
                    print(f'ep = {ep}, ep_reward = {ep_reward}')
    except Exception as e:
        raise e
    finally:
        agent.save_model(directory=f'{model_dir}/agent')
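# Module entry point sketch (my addition); runs the training loop above when
# this snippet is executed as a script.
if __name__ == '__main__':
    main()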
Example #3
import numpy as np
from tensorforce.agents import PPOAgent


class ForwardActor:
    def __init__(self):

        actions = {}
        for i in range(12):
            actions[str(i)] = {'type': 'float'}  # 'num_actions': 10

        network_spec = [
            dict(type='dense', size=100, activation='relu'),
            dict(type='dense', size=100, activation='relu')
        ]

        self.agent = PPOAgent(
            states=dict(type='float', shape=(12, )),
            actions=actions,
            batching_capacity=2000,
            network=network_spec,
            step_optimizer=dict(type='adam', learning_rate=1e-4),
        )

    def act(self, state):
        jp = np.expand_dims(np.nan_to_num(np.array(state["JointPosition"])),
                            axis=0)
        jv = np.expand_dims(np.array(state["JointVelocity"]), axis=0)

        #actiondict = self.agent.act( np.concatenate([jp,jv],axis=1))
        actiondict = self.agent.act(jp)

        action = np.zeros(12)
        for i in range(12):
            action[i] = actiondict[str(i)][0]
        action = np.nan_to_num(action)
        #print(action)
        return np.clip(action, -1.0, 1.0)

    def observe(self, reward, terminal):
        self.agent.observe(reward=reward, terminal=terminal)

    def save(self, directory):
        self.agent.save_model(directory=directory)

    def restore(self, directory):
        self.agent.restore_model(directory=directory)
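# Hypothetical smoke test for the ForwardActor above (my addition): the state
# dict keys mirror what act() reads; the values are placeholder zeros.
def smoke_test_forward_actor():
    actor = ForwardActor()
    fake_state = {"JointPosition": [0.0] * 12, "JointVelocity": [0.0] * 12}
    for _ in range(3):
        action = actor.act(fake_state)
        assert action.shape == (12,)
        actor.observe(reward=0.0, terminal=False)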
    def test_readme(self):
        environment = UnittestEnvironment(states=dict(type='float',
                                                      shape=(10, )),
                                          actions=dict(type='int',
                                                       num_values=5))

        def get_current_state():
            return environment.reset()

        def execute_decision(x):
            return environment.execute(actions=x)[2]

        # Instantiate a Tensorforce agent
        agent = PPOAgent(states=dict(type='float', shape=(10, )),
                         actions=dict(type='int', num_values=5),
                         memory=10000,
                         network='auto',
                         update_mode=dict(unit='episodes', batch_size=10),
                         step_optimizer=dict(type='adam', learning_rate=1e-4))

        # Initialize the agent
        agent.initialize()

        # Retrieve the latest (observable) environment state
        state = get_current_state()  # (float array of shape [10])

        # Query the agent for its action decision
        action = agent.act(states=state)  # (scalar between 0 and 4)

        # Execute the decision and retrieve the current performance score
        reward = execute_decision(action)  # (any scalar float)

        # Pass feedback about performance (and termination) to the agent
        agent.observe(reward=reward, terminal=False)

        agent.close()
        environment.close()
        self.assertTrue(expr=True)
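# A small episode-loop sketch extending the single act/observe step shown in
# test_readme above (my addition, not part of the original test). It reuses the
# same agent/environment pattern and Tensorforce's (states, terminal, reward)
# execute() convention.
def run_episode(agent, environment, max_timesteps=100):
    states = environment.reset()
    for _ in range(max_timesteps):
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(reward=reward, terminal=terminal)
        if terminal:
            break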
Example #5
class PPOAgent(Agent):
    def __init__(self,
                 name,
                 game_inputs=None,
                 callbacks=None,
                 input_shape=None,
                 input_type=None,
                 use_tensorboard=True,
                 tensorforce_kwargs=None):
        super().__init__(name, game_inputs=game_inputs, callbacks=callbacks)

        if input_shape is None or not isinstance(input_shape, tuple):
            raise SerpentError("'input_shape' should be a tuple...")

        if input_type is None or input_type not in ["bool", "int", "float"]:
            raise SerpentError(
                "'input_type' should be one of bool|int|float...")

        states_spec = {"type": input_type, "shape": input_shape}

        # TODO: Support multiple actions
        # TODO: Support continuous action spaces
        actions_spec = {"type": "int", "num_actions": len(self.game_inputs)}

        summary_spec = None

        if use_tensorboard:
            summary_spec = {
                "directory":
                "./tensorboard/",
                "steps":
                50,
                "labels": [
                    "configuration", "gradients_scalar", "regularization",
                    "inputs", "losses", "variables"
                ]
            }

        default_network_spec = [{
            "type": "conv2d",
            "size": 32,
            "window": 8,
            "stride": 4
        }, {
            "type": "conv2d",
            "size": 64,
            "window": 4,
            "stride": 2
        }, {
            "type": "conv2d",
            "size": 64,
            "window": 3,
            "stride": 1
        }, {
            "type": "flatten"
        }, {
            "type": "dense",
            "size": 1024
        }]

        agent_kwargs = dict(batch_size=1024,
                            batched_observe=1024,
                            network_spec=default_network_spec,
                            device=None,
                            session_config=None,
                            saver_spec=None,
                            distributed_spec=None,
                            discount=0.99,
                            variable_noise=None,
                            states_preprocessing_spec=None,
                            explorations_spec=None,
                            reward_preprocessing_spec=None,
                            distributions_spec=None,
                            entropy_regularization=0.01,
                            keep_last_timestep=True,
                            baseline_mode=None,
                            baseline=None,
                            baseline_optimizer=None,
                            gae_lambda=None,
                            likelihood_ratio_clipping=None,
                            step_optimizer=None,
                            optimization_steps=10)

        if isinstance(tensorforce_kwargs, dict):
            for key, value in tensorforce_kwargs.items():
                if key in agent_kwargs:
                    agent_kwargs[key] = value

        self.agent = TFPPOAgent(states_spec=states_spec,
                                actions_spec=actions_spec,
                                summary_spec=summary_spec,
                                scope="ppo",
                                **agent_kwargs)

        try:
            self.restore_model()
        except Exception:
            pass

    def generate_action(self, state, **kwargs):
        if isinstance(state, GameFrame):
            self.current_state = state.frame
        elif isinstance(state, GameFrameBuffer):
            self.current_state = np.stack(
                [game_frame.frame for game_frame in state.frames], axis=2)
        else:
            self.current_state = state

        action = self.agent.act(self.current_state)
        label = self.game_inputs_mapping[action]

        return label, self.game_inputs[label]

    def observe(self, reward=0, terminal=False, **kwargs):
        if self.current_state is None:
            return None

        if self.callbacks.get("before_observe") is not None:
            self.callbacks["before_observe"]()

        will_update = self.agent.batch_count == self.agent.batch_size - 1

        if will_update:
            if self.callbacks.get("before_update") is not None:
                self.callbacks["before_update"]()

            self.agent.observe(reward=reward, terminal=terminal)
            self.save_model()

            if self.callbacks.get("after_update") is not None:
                self.callbacks["after_update"]()
        else:
            self.agent.observe(reward=reward, terminal=terminal)

        self.current_state = None

        self.current_reward = reward
        self.cumulative_reward += reward

        if self.callbacks.get("after_observe") is not None:
            self.callbacks["after_observe"]()

    def save_model(self):
        self.agent.save_model(directory=os.path.join(os.getcwd(), "datasets",
                                                     self.name, self.name),
                              append_timestep=False)

    def restore_model(self):
        self.agent.restore_model(
            directory=os.path.join(os.getcwd(), "datasets", self.name))
        print("Finished episode {ep} after {ts} timesteps".format(ep=r.episode + 1, ts=r.timestep + 1))
        print("Episode reward: {}".format(r.episode_rewards[-1]))
        print("Average of last 10 rewards: {}".format(np.mean(r.episode_rewards[-10:])))
    return True

runner = Runner(agent, environment)

runner.run(num_timesteps=3600, num_episodes=3, episode_finished=episode_finished)

# Poll new state from client
#for outsideTemperature in loganOutsideTemperatures:
for i in range(2):
	outsideTemperature = 1.1
	# iterate through one hour with the same temperature
	for second in range(3600):
		state = hvacBuilding.get_state(outsideTemperature)
		action = agent.act(state, True)
		reward = hvacBuilding.Act(action)
		agent.observe(reward=reward, terminal=False)
		hvacBuilding.step(outsideTemperature)
	
		#currently the only state is to turn on cooling or turn off
		# if not hvac.HeatingIsShuttingDown and hvac.HeatingIsOn and hvacBuilding.current_temperature > 18.8889:#21:
		# 	#print("Turning the Heater Off")
		# 	hvac.TurnHeatingOff()

		# if hvac.HeatingIsOn == False and hvacBuilding.current_temperature < 17.7778:#17:
		# 	#print("Turning the Heater On")
		# 	numberOfHeatingOn = numberOfHeatingOn + 1
		# 	hvac.TurnHeatingOn()
Example #7
print("agents made")

monkey = []
rl_ppo = []
rl_dqn = []
rl_vpg = []

#training
for i in tqdm(range(5000)):
    infrastructure.initializeGraph()
    while infrastructure.attempts < len(infrastructure.peers):
        #agent_ppo actions
        state = infrastructure.get_state()

        action = agent_ppo.act(state)
        action = action.values()

        #print("ai", action)
        reward = infrastructure.shutdown(action)

        if infrastructure.attempts < len(infrastructure.peers):
            agent_ppo.observe(reward=reward, terminal=False)
        else:
            agent_ppo.observe(reward=reward, terminal=True)

        rl_ppo.append(reward)

        #dqn agent
        action = agent_dqn.act(state)
        action = action.values()
Example #8
    success = False

    while True:
        latent_vector = vae.get_vector(observation.reshape(1, 48, 64, 3))
        latent_vector = list(itertools.chain(*latent_vector))  # [[ ]]  ->  [ ]
        relative_pos = GazeboMaze.p
        previous_act = GazeboMaze.vel_cmd
        print(previous_act)
        # state = latent_vector + relative_pos + previous_act
        state = dict(latent_vector=latent_vector,
                     previous_act=previous_act,
                     relative_pos=relative_pos)
        # print(state)

        # Query the agent for its action decision
        action = agent.act(state, deterministic=deterministic)
        # Execute the decision and retrieve the current information
        observation, terminal, reward = GazeboMaze.execute(action)
        observation = observation / 255.0  # normalize
        # print(reward)
        # Pass feedback about performance (and termination) to the agent
        agent.observe(terminal=terminal, reward=reward)
        timestep += 1
        episode_reward += reward
        if terminal or timestep == max_timesteps:
            success = GazeboMaze.success
            break

    episode += 1
    total_timestep += timestep
    # avg_reward = float(episode_reward)/timestep
import os

import gym
from tensorforce.agents import PPOAgent


def main():
    env = gym.make('Breakout-v0')

    # (210, 160, 3)
    print(env.observation_space.shape)
    # [[[255...]]]
    print(env.observation_space.high)
    # [[[0...]]]
    print(env.observation_space.low)
    # 4
    print(env.action_space.n)

    agent = PPOAgent(
        # (210, 160, 3)
        states=dict(type='float', shape=env.observation_space.shape),
        network=[
            # (51, 29, 32)
            dict(type='conv2d', size=32, window=8, stride=4,
                 activation='relu'),
            # (24, 18, 64)
            dict(type='conv2d', size=64, window=4, stride=2,
                 activation='relu'),
            # (22, 16, 64)
            dict(type='conv2d', size=64, window=3, stride=1,
                 activation='relu'),
            # 22528
            dict(type='flatten'),
            dict(type='dense', size=512, activation='relu'),
            dict(type='dense', size=32, activation='relu'),
        ],
        # batching_capacity=10,
        memory=dict(
            type='latest',
            include_next_states=False,
            capacity=1000,
        ),
        # update=dict(unit='timesteps', batch_size=64),
        actions=dict(type='int', num_actions=env.action_space.n),
        step_optimizer=dict(type='adam', learning_rate=1e-4))

    model_dir = 'models/breakout'

    # load model
    if os.path.exists(f'{model_dir}/checkpoint'):
        agent.restore_model(directory=model_dir)

    try:
        for step in range(100000):
            observation = env.reset()

            done = False
            step_reward = 0
            while not done:
                # env.render()

                # from PIL import Image
                # pil_img = Image.fromarray(observation)
                # pil_img.save('./observation.png')

                states = observation / 256

                action = agent.act(states=states)

                observation, reward, done, info = env.step(action)

                reward = reward / 10

                agent.observe(reward=reward, terminal=done)

                step_reward += reward

                if done:
                    print(f'step = {step}, reward = {step_reward}')
    except Exception as e:
        raise e
    finally:
        agent.save_model(directory=f'{model_dir}/agent')
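# Optional preprocessing sketch (my addition, not in the original): grayscale
# and downscale the Atari frame before feeding it to the agent. If used, the
# agent's states spec would need to change to shape (105, 80, 1) to match.
def preprocess(observation):
    gray = observation.mean(axis=2, keepdims=True)  # (210, 160, 1)
    small = gray[::2, ::2, :]                       # (105, 80, 1)
    return small / 255.0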
Example #10
    success = False

    if GazeboMaze.goal not in config.test_space[maze_id]:  # train
        while True:
            relative_pos = GazeboMaze.p
            previous_act = GazeboMaze.vel_cmd
            previous_reward = GazeboMaze.reward
            print(previous_act)
            state = dict(image=observation,
                         relative_pos=relative_pos,
                         previous_act=previous_act,
                         previous_reward=[previous_reward])
            # state = dict(image=observation, previous_act=GazeboMaze.vel_cmd, relative_pos=GazeboMaze.p)

            # Query the agent for its action decision
            action = agent.act(state)
            # Execute the decision and retrieve the current information
            observation, terminal, reward = GazeboMaze.execute(action)
            observation = observation / 255.0  # normalize
            # print(reward)
            # Pass feedback about performance (and termination) to the agent
            agent.observe(terminal=terminal, reward=reward)
            timestep += 1
            episode_reward += reward
            if terminal or timestep == max_timesteps:
                success = GazeboMaze.success
                break

        episode += 1
        total_timestep += timestep
        # avg_reward = float(episode_reward)/timestep
 done = False
 agent.reset()
 while not simulation.gameOver() and turns < 100 and bad_move_count < 150:
     # print(player)
     counter += 1
     # if player done, continue
     state = simulation.get_state()
     state.append(float(turns))
     state = [state]
     moved = False
     # print(len(state))
     # exit()
     # print(simulation.players)
     # print(simulation.item_watch())
     action = agent.act(np.asarray(state))
     # print(action)
     if simulation.move_check(player, action):
         #print('GOOD MOVE')
         turns += 1
         moved = True
         # print(player)
         old_pos, new_pos = simulation.movePlayer(player, action)
         #print(old_reward)
         reward = simulation.reward_2(old_pos, new_pos, player, saved_pos)
         #print('factor streak in player' + str(factor_streak))
         #print(map_num)
         print(str(old_pos) + " " + str(new_pos) + " " + str(reward))
         # Update Items
         simulation.item_update(new_pos)
         # Update Statuses
Example #12
	    states={"type":'float', "shape": infrastructure.graph.shape },
	    actions={
	    	str(i): dict(type="int", num_actions=infrastructure.servers) for i in range(infrastructure.clients)
	    },
	    network=[
		    dict(type='flatten'),
		    dict(type="dense", size=32),
		   	dict(type="dense", size=32),
		   	dict(type="dense", size=32)
	    ],
	)

for i in tqdm(range(100000)):
	state = infrastructure.graph

	action_monkey = monkey.act(state).values()
	action_manager = manager.act(state)
	action_manager_matrix = np.full((infrastructure.servers, infrastructure.clients), 0)
	for item in action_manager.items():
		clientID = int(item[0])
		serverID = item[1]
		action_manager_matrix[serverID][clientID] = 1

	for x in range(infrastructure.servers):
		for y in range(infrastructure.clients):
			if x == y:
				infrastructure.graph[x][y] = 1

	reward = infrastructure.reward(action_monkey, action_manager_matrix)

	monkey.observe(reward=reward, terminal=False)
Example #13
            latent_vector = vae.get_vector(observation.reshape(1, 48, 64, 3))
            latent_vector = list(
                itertools.chain(*latent_vector))  # [[ ]]  ->  [ ]
            relative_pos = GazeboMaze.p
            previous_act = GazeboMaze.vel_cmd
            previous_reward = GazeboMaze.reward
            print(previous_act)
            # state = latent_vector + relative_pos + previous_act + [previous_reward]
            state = dict(latent_vector=latent_vector,
                         relative_pos=relative_pos,
                         previous_act=previous_act,
                         previous_reward=[previous_reward])
            # print(state)

            # Query the agent for its action decision
            action = agent.act(state, deterministic=deterministic)
            # Execute the decision and retrieve the current information
            observation, terminal, reward = GazeboMaze.execute(action)
            observation = observation / 255.0  # normalize
            # print(reward)
            # Pass feedback about performance (and termination) to the agent
            agent.observe(terminal=terminal, reward=reward)
            timestep += 1
            episode_reward += reward
            if terminal or timestep == max_timesteps:
                success = GazeboMaze.success
                break

        episode += 1
        total_timestep += timestep
        # avg_reward = float(episode_reward)/timestep
Example #14
class Controller:
    def __init__(self, apikey, agent_id, frames_per_state=1, host=None):

        # PPO agent seems to learn that it needs to speed around the environment to collect rewards
        self._agent = PPOAgent(
            states_spec=dict(type='float', shape=(frames_per_state * 25, )),
            actions_spec=dict(type='float',
                              shape=(3, ),
                              min_value=np.float32(-1.0),
                              max_value=np.float32(1.0)),
            network_spec=[
                dict(type='dense', activation='relu', size=128),
                dict(type='dense', activation='relu', size=128),
            ],
            optimization_steps=5,
            # Model
            scope='ppo',
            discount=0.99,
            # DistributionModel
            distributions_spec=None,
            entropy_regularization=0.01,
            # PGModel
            baseline_mode=None,
            baseline=None,
            baseline_optimizer=None,
            gae_lambda=None,
            # PGLRModel
            likelihood_ratio_clipping=0.2,
            summary_spec=None,
            distributed_spec=None,
            batch_size=2048,
            step_optimizer=dict(type='adam', learning_rate=1e-4))

        self._logger = setup_custom_logger("Controller")

        self._frame_count_per_episode = 0
        self._total_frames = 1
        self._frames_per_state = frames_per_state

        self._client = AsyncClient(apikey, agent_id,
                                   self._train_state_callback, host)

        self._state_stack = StateStack(self._frames_per_state)

    async def _train_state_callback(self, state, reward, error):

        terminal = False

        # We force the episode to be terminal if either:
        # 1. the agent gets a reward in the environment, or
        # 2. the agent has not received a reward for _max_frame_count_per_episode states from the environment
        if reward != 0.0:
            reward = reward * 20.0
            terminal = True
            self._frame_count_per_episode = 0
            print("terminal, got reward - %.2f" % reward)
        elif self._frame_count_per_episode == self._max_frame_count_per_episode:
            reward = -100.0
            terminal = True
            self._frame_count_per_episode = 0
            print("terminal, killing")

        self._state_stack.add_state(state[11:])

        if self._total_frames > self._frames_per_state:

            combined_state = self._state_stack.get_combined_state()

            # Currently ignoring the first 11 state values as they are sensor readings for other agents in the environment
            action = self._agent.act(combined_state)

            self._agent.observe(reward=reward, terminal=terminal)

            # Only let the mbot travel forwards
            action[0] = (action[0] + 1.0) / 3.0

            self.total_rewards[self._total_frames] = reward

            await self._client.send_agent_action(action)

            if self._total_frames % 100 == 0:

                self._logger.info(
                    "%d iterations: Running AVG reward per last %d states: %.2f"
                    %
                    (self._total_frames, self._max_frame_count_per_episode,
                     self.total_rewards[max(0, self._total_frames -
                                            10000):self._total_frames].mean()))
        self._total_frames += 1
        self._frame_count_per_episode += 1

        if self._total_frames >= self.max_iterations:
            self._client.stop()

    def train(self, max_iterations, max_frame_count_per_episode=1000):
        """
        :param max_iterations: the maximum iterations across all episodes
        :param max_frame_count_per_episode: number of reward-less states after which an episode is forcibly terminated
        :return:
        """
        self._max_frame_count_per_episode = max_frame_count_per_episode

        self.max_iterations = max_iterations
        self.total_rewards = np.zeros(max_iterations)
        self.total_costs = np.zeros(max_iterations)

        self._client.start()
Example #15
class SerpentPPO:
    def __init__(self, frame_shape=None, game_inputs=None):

        if frame_shape is None:
            raise SerpentError("A 'frame_shape' tuple kwarg is required...")

        states_spec = {"type": "float", "shape": frame_shape}

        if game_inputs is None:
            raise SerpentError("A 'game_inputs' dict kwarg is required...")

        self.game_inputs = game_inputs
        self.game_inputs_mapping = self._generate_game_inputs_mapping()

        print('game inputs mapping:')
        print(self.game_inputs_mapping)
        actions_spec = {"type": "int", "num_values": len(self.game_inputs)}

        summary_spec = {
            "directory":
            "./board/",
            "steps":
            50,
            "labels": [
                "configuration", "gradients_scalar", "regularization",
                "inputs", "losses", "variables"
            ]
        }

        network_spec = [{
            "type": "conv2d",
            "size": 16,
            "window": 8,
            "stride": 4
        }, {
            "type": "conv2d",
            "size": 32,
            "window": 4,
            "stride": 2
        }, {
            "type": "conv2d",
            "size": 32,
            "window": 3,
            "stride": 1
        }, {
            "type": "flatten"
        }, {
            "type": "dense",
            "size": 64
        }]

        baseline_spec = {
            "type": "cnn",
            "conv_sizes": [32, 32],
            "dense_sizes": [32]
        }

        saver_spec = {
            "directory": os.path.join(os.getcwd(), "datasets",
                                      "t4androidmodel"),
            "seconds": 120
        }
        #         memory_spec = {'type':'latest', 'include_next_states':False, 'capacity':1000*1000}

        self.agent = PPOAgent(
            states=states_spec,
            actions=actions_spec,
            network=network_spec,
            #             baseline_mode='states',
            #             baseline=baseline_spec,
            summarizer=summary_spec,
            memory=10,
            update_mode=dict(unit='timesteps', batch_size=2),
            discount=0.97,
            saver=saver_spec)

        self.agent.initialize()
#
#             batched_observe=2560,
#             scope="ppo",
#             summarizer=summary_spec,
#             network=network_spec,
#             device=None,
#             session_config=None,
#             saver_spec=None,
#             distributed_spec=None,
#             discount=0.97,
#             variable_noise=None,
#             states_preprocessing_spec=None,
#             explorations_spec=None,
#             reward_preprocessing_spec=None,
#             distributions_spec=None,
#             entropy_regularization=0.01,
#             batch_size=2560,
#             keep_last_timestep=True,
#             baseline_mode=None,
#             baseline=None,
#             baseline_optimizer=None,
#             gae_lambda=None,
#             likelihood_ratio_clipping=None,
#             step_optimizer=None,
#             optimization_steps=10
#
#         )

    def generate_action(self, game_frame_buffer):
        states = np.stack(game_frame_buffer, axis=2)

        action = self.agent.act(states)
        label = self.game_inputs_mapping[action]

        return action, label, self.game_inputs[label]

    def observe(self, reward=0, terminal=False):
        self.agent.observe(reward=reward, terminal=terminal)

    def _generate_game_inputs_mapping(self):
        mapping = dict()

        for index, key in enumerate(self.game_inputs):
            mapping[index] = key

        return mapping
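# Hypothetical usage of the SerpentPPO wrapper above (my addition, not from the
# original plugin). It assumes numpy is imported as np, and the frame shape,
# the four dummy frames and the game_inputs dict are placeholder values chosen
# to be mutually consistent with the (height, width, stack) states spec.
def smoke_test_serpent_ppo():
    game_inputs = {"NOOP": [], "LEFT": ["A"], "RIGHT": ["D"]}
    ppo = SerpentPPO(frame_shape=(64, 64, 4), game_inputs=game_inputs)
    frame_buffer = [np.zeros((64, 64)) for _ in range(4)]
    action, label, inputs = ppo.generate_action(frame_buffer)
    ppo.observe(reward=0.0, terminal=False)
    return label, inputs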
Example #16
    pbar = tqdm.tqdm(total=nprocs * batch_allocation)
# Run this single worker (episode loop) as long as the episode threshold has not been reached.
while not should_stop:
    state = env.reset()
    #print('Calling reset')
    agent.reset()
    #print('Reset resolved')
    episode_reward = 0

    # Time step (within episode) loop
    time_step = 0
    time_start = time.time()
    while True:
        #print('Calling act')
        action, internals, states = agent.act(states=state,
                                              deterministic=deterministic,
                                              buffered=False,
                                              independent=True)
        #print('Act resolved')
        reward = 0
        for repeat in range(repeat_actions):
            state, terminal, step_reward = env.execute(action=action)
            reward += step_reward
            if terminal:
                break

        time_step += 1
        episode_reward += reward
        data_buffer.append((state, action, internals, reward, terminal))

        if terminal or time_step == max_episode_timesteps:
            break
Example #17
def main(xml_name):
    with open(xml_name) as xmlf:
        xml_str = xmlf.read()

    gen = DummyGen()
    gen.override_from_xml(xml_str)

    _DEFAULT_TIME_LIMIT = 10
    _CONTROL_TIMESTEP = .04
    display_stride = 1 / .04 // 24

    genesis_physics = Physics.from_xml_string(common.read_model(os.path.join(os.getcwd(), xml_name)), 
                                              common.ASSETS)

    genesis_physics.set_genesis(gen)
    genesis_task = FindTarget()
    genesis_env = control.Environment(genesis_physics, 
                                     genesis_task,
                                     control_timestep=_CONTROL_TIMESTEP,
                                     time_limit=_DEFAULT_TIME_LIMIT)
    action_spec = genesis_env.action_spec()
    observation_spec = genesis_env.observation_spec()
    observation_shape = np.array([0])

    for (name, row) in observation_spec.items():
        print (name, observation_shape, row.shape)
        if(row.shape == ()):
            observation_shape[0] += 1
            continue
        print(row.shape)
        observation_shape[0] += row.shape[0]
    observation_shape = (observation_shape[0],)
    print(action_spec)
    print(action_spec.minimum)
    agent = PPOAgent(
        states=dict(type='float', min_value=action_spec.minimum, max_value=action_spec.maximum, shape=observation_shape),
        actions=dict(type='float', min_value=action_spec.minimum, max_value=action_spec.maximum, shape=action_spec.shape),
        network=[
            dict(type='dense', size=128, activation='relu'),
            dict(type='dense', size=64, activation='relu'),
            dict(type='dense', size=16, activation='tanh')
        ],
        step_optimizer={
            "type": "adam",
            "learning_rate": 1e-4
        },
        entropy_regularization=0.01,
        batching_capacity=64,
        subsampling_fraction=0.1,
        optimization_steps=50,
        discount=0.99,
        likelihood_ratio_clipping=0.2,
        baseline_mode="states",
        baseline={
            "type":"mlp",
            "sizes": [32, 32]
        },
        baseline_optimizer={
            "type":"multi_step",
            "optimizer": {
                "type": "adam",
                "learning_rate": 1e-4
            },
            "num_steps": 5
        },
        update_mode={
            "unit": "episodes",
            "batch_size": 128,
            "frequency": 10
        },
        memory={
            "type": "latest",
            "include_next_states": False,
            "capacity": 2000
        }
    )

    time_step = genesis_env.reset()
    curtime = 0.0
    top_view = genesis_env.physics.render(480, 480, camera_id='tracking_top')
    side_view = genesis_env.physics.render(480, 480, camera_id='arm_eye')
    did_except = False
    
    NUM_EPISODES = 10000
    N_INPROG_VIDS = 4
    VID_EVERY = NUM_EPISODES // N_INPROG_VIDS

    for i in tqdm.tqdm(range(NUM_EPISODES)):
        time_step = genesis_env.reset()
        j = 0
        tot = 0
        reward = []
        while not time_step.last():
            state = observation2state(time_step.observation)
            action = agent.act(state)
            time_step = genesis_env.step(action)
            tot += time_step.reward
            reward.append(time_step.reward)
            agent.observe(reward=time_step.reward, terminal=time_step.last())
            if(j % 50 == 0 and i % 25 == 1):
                pass
                #clear_output()
                #img = plt.imshow(np.array(env.physics.render(480, 640)).reshape(480, 640, 3))
                #plt.pause(0.5)
                
            j += 1

        if i % 100 == 0:
            # tot /= j
            tqdm.tqdm.write("for episode " + str(i) +  " : " + str(tot))
            

        if (i % VID_EVERY) == 0 or i == NUM_EPISODES - 1:
            
            agent.save_model('./models/starfish_model_target')

            time_step = genesis_env.reset()
            
            vid_suffix = str(i)
            if i == NUM_EPISODES - 1:
                vid_suffix = 'final'
            vid_name = 'videos/starfish_{}.mp4'.format(vid_suffix)

            imnames = set()
            picidx = 0
            curtime = 0.0

            while not time_step.last():
                try:
                    state = observation2state(time_step.observation)
                    action = agent.act(state)
                    time_step = genesis_env.step(action)
                    savename = "/tmp/starfish_{0:04}.jpg".format(picidx)
                    picidx += 1
                    imnames.add(savename)
                    curtime += _CONTROL_TIMESTEP
                    top_view = genesis_env.physics.render(480, 480, camera_id='tracking_top')
                    side_view = genesis_env.physics.render(480, 480, camera_id='arm_eye')
                    #plt.imshow(np.concatenate((top_view, side_view), axis=1))
                    #plt.pause(0.5)
                    io.imsave(savename, np.concatenate((top_view, side_view), axis=1))
                except PhysicsError:
                    print('except')
                    did_except = True
                    break
            if os.path.isfile(vid_name):
                os.remove(vid_name)
            if not did_except:
                os.system('ffmpeg -nostats -loglevel 0 -f image2 -pattern_type sequence -i "/tmp/starfish_%4d.jpg" -qscale:v 0 {}'.format(vid_name))
            for name in imnames:
                os.remove(name)
            print("recorded video")
        agent.restore(directory="saved/" + args.agent + "/" + args.contrarian)
        print("restored")
    except:
        lastEpoch = 0

    epochs = 100000
    cluster_vals = []
    for epoch in tqdm(range(lastEpoch, epochs)):
        G = Audience(20, 15)

        # 20 recommendations for every user
        training_size = G.graph.shape[0] * 20
        changes = []
        for step in range(training_size):
            action = agent.act(G.graph)

            reward = G.recommendation(action["user"], action["item"])

            #reward = weight * reward + weight * change

            #if contrarian get this
            if args.contrarian == "on":
                cluster_val = G.clustering() + 0.01
                cluster_vals.append(cluster_val)
                # print(reward, cluster_val, reward / cluster_val)
                reward = reward / cluster_val

                #change
                if (len(cluster_vals) % 10) == 0:
                    if len(cluster_vals) > 0:
Example #19
def main():
    '''
    Train an agent. Note that I've created a custom OpenAI Gym environment
    to allow quick plug-and-play comparison of performance across
    different RL models.
    '''
    env = gym.make(
        'Trade-v0',
        window=50,
        datadir='stocks/s_coinbaseUSD_1_min_data_2014-12-01_to_2018-11-11.csv',
        preprocesses=['MinMax'])

    network_spec = [
        dict(type='flatten'),
        dict(type='dense', size=32, activation='tanh'),
        dict(type='dense', size=32, activation='tanh')
    ]

    agent = PPOAgent(
        states=env.observation_space,
        actions=env.action_space,
        network=network_spec,
        step_optimizer=dict(type='adam', learning_rate=1e-3),
        optimization_steps=10,
        scope='ppo',
        discount=0.99,
        entropy_regularization=0.01,
        baseline_mode=None,
        baseline=None,
        baseline_optimizer=None,
        gae_lambda=None,
        likelihood_ratio_clipping=0.2,
    )

    runner = Runner(agent=agent, environment=env)

    def episode_finished(r):
        print("Finished episode {ep} after {ts} timesteps (reward: {reward})".
              format(ep=r.episode,
                     ts=r.episode_timestep,
                     reward=r.episode_rewards[-1]))
        return True

    runner.run(episodes=10, episode_finished=episode_finished)

    print(
        "Learning finished. Total episodes: {ep}. Average reward of last 10 episodes (of 10): {ar}."
        .format(ep=runner.episode, ar=np.mean(runner.episode_rewards[-5:])))

    print('Testing for an episode...')

    s = env.reset()

    collectables = []
    while True:
        action = agent.act(s)
        s, r, d, i = env.step(action)
        agent.observe(reward=r, terminal=d)
        collectables.append(
            (s[0][0],
             action))  # to be replaced by env.render() when i get it fixed
        if d:
            break

    plot(collectables, 0.001)  # plot only .1% of one episode
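# Hypothetical stand-in for the plot() helper called above (my addition; the
# real project helper is not shown in this listing and may differ). It charts
# the given fraction of the collected (value, action) pairs with matplotlib.
import matplotlib.pyplot as plt


def plot(collectables, fraction):
    n = max(1, int(len(collectables) * fraction))
    values = [c[0] for c in collectables[:n]]
    actions = [c[1] for c in collectables[:n]]
    plt.plot(values, label='state value')
    plt.step(range(len(actions)), actions, label='action')
    plt.legend()
    plt.show()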
Example #20
class ForwardActorSimple:
    def __init__(self):

        actions = {}
        actions_exp = {}
        for i in range(12):
            actions[str(i)] = {'type': 'float'}  # 'num_actions': 10
            actions_exp[str(i)] = dict(type='ornstein_uhlenbeck',
                                       sigma=0.1,
                                       mu=0.0,
                                       theta=0.1)

        preprocessing_config = [{"type": "standardize"}]

        preprocessing_config = None

        customnet = dict(type=CustomNetwork)
        layerSize = 300
        network_spec = [
            dict(type='dense', size=100),
            dict(type='lstm', size=100)
        ]
        '''
        network_spec = [
                            dict(type='dense', size=100),
                           dict(type='internal_lstm', size=100)
                       ]
       
        '''

        network_spec = [
            dict(type='dense', size=layerSize, activation='selu'),
            dict(type='dense', size=layerSize, activation='selu'),
            dict(type='dense', size=layerSize, activation='selu')
        ]

        self.agent = PPOAgent(
            states=dict(type='float', shape=(12 + 9, )),
            actions=actions,
            batching_capacity=1000,
            network=network_spec,
            states_preprocessing=preprocessing_config,
            actions_exploration=actions_exp,
            step_optimizer=dict(type='adam', learning_rate=1e-5),
        )

    def act(self, state):
        jp = np.expand_dims(np.nan_to_num(np.array(state["JointPosition"])),
                            axis=0)
        #jv = np.expand_dims(np.array(state["JointVelocity"]), axis=0)
        orient = np.expand_dims(np.array(state["bodyRot"]), axis=0)
        actiondict = self.agent.act(
            np.nan_to_num(np.concatenate([jp, orient], axis=1)) / 5.0)
        #actiondict = self.agent.act(jp)

        action = np.zeros(12)
        for i in range(12):
            action[i] = actiondict[str(i)][0]
        action = np.nan_to_num(action)
        #print(action)
        return np.clip(action, -1.0, 1.0)

    def observe(self, reward, terminal):
        self.agent.observe(reward=reward, terminal=terminal)

    def save(self, directory):
        self.agent.save_model(directory=directory)

    def restore(self, directory):
        self.agent.restore_model(directory=directory)
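# Hypothetical smoke test for ForwardActorSimple above (my addition). It
# assumes CustomNetwork from the original project is importable (the class
# references it when building `customnet`), and feeds placeholder zeros for
# the 12 joint positions and the 9-element body rotation that act() expects.
def smoke_test_forward_actor_simple():
    actor = ForwardActorSimple()
    fake_state = {"JointPosition": [0.0] * 12, "bodyRot": [0.0] * 9}
    action = actor.act(fake_state)
    actor.observe(reward=0.0, terminal=False)
    return action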
Example #21
class SerpentPPO:

    def __init__(self, frame_shape=None, game_inputs=None):

        if frame_shape is None:
            raise SerpentError("A 'frame_shape' tuple kwarg is required...")

        states_spec = {"type": "float", "shape": frame_shape}

        if game_inputs is None:
            raise SerpentError("A 'game_inputs' dict kwarg is required...")

        self.game_inputs = game_inputs
        self.game_inputs_mapping = self._generate_game_inputs_mapping()

        actions_spec = {"type": "int", "num_actions": len(self.game_inputs)}

        network_spec = [
            {"type": "conv2d", "size": 1, "window": 2, "stride": 1},
            {"type": "flatten"},
            # {"type": "dense", "size": 64},
            {"type": "dense", "size": 6}
        ]

        self.agent = PPOAgent(
            states=states_spec,
            actions=actions_spec,
            network=network_spec,

            batched_observe=256,
            batching_capacity=1000,
            # BatchAgent
            #keep_last_timestep=True,
            # PPOAgent
            step_optimizer=dict(
                type='adam',
                learning_rate=1e-4
            ),
            optimization_steps=10,
            # Model
            scope='ppo'
                #discount=0.97,
            # DistributionModel
                #distributions=None,
                #entropy_regularization=0.01,
            # PGModel
                #baseline_mode=None,
                #baseline=None,
                #baseline_optimizer=None,
                #gae_lambda=None,
            # PGLRModel
                #likelihood_ratio_clipping=None,
            #summary_spec=summary_spec,
            #distributed_spec=None,
            # More info
                #device=None,
            #session_config=None,
                #saver=None,
                #variable_noise=None,
            #states_preprocessing_spec=None,
            #explorations_spec=None,
            #reward_preprocessing_spec=None,
                #execution=None,
                #actions_exploration=None,
                #update_mode=None,
                #memory=None,
                #subsampling_fraction=0.1
        )

    def generate_action(self, game_frame_buffer):
        states = np.stack(
            game_frame_buffer,
            axis=2
        )

        # Get prediction from agent, execute
        action = self.agent.act(states)
        label = self.game_inputs_mapping[action]

        return action, label, self.game_inputs[label]

    def observe(self, reward=0, terminal=False):
        self.agent.observe(reward=reward, terminal=terminal)

    def _generate_game_inputs_mapping(self):
        mapping = dict()

        for index, key in enumerate(self.game_inputs):
            mapping[index] = key

        return mapping

    def save_model(self):
        self.agent.save_model(directory=os.path.join(os.getcwd(), "datasets", "bomberman", "ppo_model"), append_timestep=False)

    def restore_model(self):
        self.agent.restore_model(directory=os.path.join(os.getcwd(), "datasets", "bomberman"))
Example #22
                 optimization_steps=20)
'''
    batching_capacity=200,
    step_optimizer=dict(
        type='adadelta',
        learning_rate=1e-3)
'''

allRewards = np.zeros(shape=(1, 1))

for game in range(NUM_GAMES_TO_PLAY):
    obs = env.reset()
    gameTotalReward = 0
    for step in range(1000):
        env.render()
        a = agent.act(obs)
        #print("ACTION ->",a)
        if CLIP_ACTION:
            for i in range(len(a)):
                if a[i] < -1: a[i] = -0.99999999999
                if a[i] > 1: a[i] = 0.99999999999
        obs, reward, done, info = env.step(a)
        #reward = reward/100
        gameTotalReward = gameTotalReward + reward
        allRewards = np.vstack((allRewards, np.array([reward])))
        if done:
            agent.observe(reward=reward, terminal=True)
        else:
            agent.observe(reward=reward, terminal=False)

        #print("Action: {} Observations Size:{} score: {}".format(a,obs.shape,reward))
Example #23
)

reward_list = []

args_episodes = 1000
args_episode_max_steps = 200
episode = 0
agent.reset()
while True:
    agent.reset()
    state = env.reset()
    episode += 1
    episode_step = 0
    episode_reward = 0
    while True:
        action = agent.act(state)
        state, terminal, reward = env.execute(action)
        reward = np.abs(state[1]) - 0.05
        episode_reward += reward
        episode_step += 1
        if args_episode_max_steps is not None and episode_step >= args_episode_max_steps:
            terminal = True
        agent.observe(terminal, reward)

        if terminal:
            break
    print('episode {0} steps {1} reward {2}'.format(episode, episode_step, episode_reward))
    reward_list.append(episode_reward)
    if episode >= args_episodes:
        break
    # if len(reward_list) > 100 and np.mean(reward_list[-100:]) > 199:
Example #24
        if state_downscaled is not None:
            ax.imshow(state_downscaled)

    anim = animation.FuncAnimation(fig, animate, interval=100)
    plt.show()


threading.Thread(target=anim_thread).start()

for step in range(500000):
    if done:
        state = env.reset()
    # state.shape = 240, 256, 3
    state_cutted = state[:, 85:215]
    state_downscaled = state_cutted[6::12, 6::12]
    action = agent.act(state_downscaled)
    state, reward, done, info = env.step(action)

    # Train the agent model
    agent.observe(reward=reward, terminal=False)

    if step % 100 == 0:
        log.debug('state {}: %s'.format(type(state)), state.shape)
        log.debug('reward {}: %s'.format(type(reward)), reward)
        log.debug('done {}: %s'.format(type(done)), done)
        log.debug('info {}: %s'.format(type(info)), info)
        log.debug('_y_pos {}: %s'.format(type(_env._y_position)),
                  _env._y_position)

    env.render()
Example #25
class SerpentPPO:
    def __init__(self, frame_shape=None, game_inputs=None):

        if frame_shape is None:
            raise SerpentError("A 'frame_shape' tuple kwarg is required...")

        states_spec = {"type": "float", "shape": frame_shape}

        if game_inputs is None:
            raise SerpentError("A 'game_inputs' dict kwarg is required...")

        self.game_inputs = game_inputs
        self.game_inputs_mapping = self._generate_game_inputs_mapping()

        actions_spec = {"type": "int", "num_actions": len(self.game_inputs)}

        network_spec = [{
            "type": "conv2d",
            "size": 32,
            "window": 8,
            "stride": 4
        }, {
            "type": "conv2d",
            "size": 64,
            "window": 4,
            "stride": 2
        }, {
            "type": "conv2d",
            "size": 64,
            "window": 3,
            "stride": 1
        }, {
            "type": "flatten"
        }, {
            "type": "dense",
            "size": 512
        }]

        self.agent = PPOAgent(
            states_spec=states_spec,
            actions_spec=actions_spec,
            batched_observe=128,
            scope="ppo",
            summary_spec=None,
            network_spec=network_spec,
            device=None,
            session_config=None,
            saver_spec=None,
            distributed_spec=None,
            discount=0.99,
            variable_noise=None,
            states_preprocessing_spec=None,
            explorations_spec=None,
            reward_preprocessing_spec=None,
            distributions_spec=None,
            entropy_regularization=1e-2,
            batch_size=128,
            keep_last_timestep=True,
            baseline_mode=None,
            baseline=None,
            baseline_optimizer=None,
            gae_lambda=None,
            likelihood_ratio_clipping=None,
            step_optimizer=None,
            #optimization_steps=10
        )

    def generate_action(self, game_frame_buffer):
        states = np.stack(
            [game_frame.frame for game_frame in game_frame_buffer.frames],
            axis=2)

        action = self.agent.act(states)
        label = self.game_inputs_mapping[action]

        return action, label, self.game_inputs[label]

    def observe(self, reward=0, terminal=False):
        self.agent.observe(reward=reward, terminal=terminal)

    def _generate_game_inputs_mapping(self):
        mapping = dict()

        for index, key in enumerate(self.game_inputs):
            mapping[index] = key

        return mapping
Example #26
class Product:
    def __init__(self, name, light, price, quantity, avg_cost_estimate):
        # initialize product
        self.name = name

        # initialize state
        self.light = light
        self.quantity = quantity
        self.avg_cost_estimate = avg_cost_estimate  # the approximated cost of each item sold

        self.price = price  # what the price is being set at

        self.history_log = []  # history of product over time

        # initialize agent
        self.agent = PPOAgent(
            states=dict(type='float', shape=(4,)),
            actions=dict(type='int', num_actions=len(PRICE_CHANGES)),
            network=[dict(type='dense', size=4),
                     dict(type='dense', size=4)],
            step_optimizer=dict(type='adam', learning_rate=0.01))
        self.agent.initialize_model()

    def get_history_log(self):
        return self.history_log

    def get_quantity(self):
        return self.quantity

    def get_avg_cost_estimate(self):
        return self.avg_cost_estimate

    def get_light(self):
        return self.light

    def get_price(self):
        return self.price

    def get_recommended_price(self):
        return max(
            0, self.price + PRICE_CHANGES[self.agent.act(states=(
                self.light, self.price, self.quantity, self.avg_cost_estimate),
                                                         deterministic=True,
                                                         independent=True)])

    def set_light(self, new_light):
        self.history_log.append("light=" + str(new_light))
        self.light = new_light

    def set_price(self, new_price):
        self.history_log.append("price=" + str(new_price))
        self.price = new_price

    def update_price(self):
        action = self.agent.act(states=(self.light, self.price, self.quantity,
                                        self.avg_cost_estimate),
                                deterministic=True,
                                independent=True)
        new_price = max(0, self.price + PRICE_CHANGES[action])
        self.history_log.append("price=" + str(new_price))
        self.price = new_price

    def record_delivery(self, delivery_quantity, delivery_cost_per_item):
        self.history_log.append("delivery," + str(delivery_quantity) +
                                str(delivery_cost_per_item))

        # increase quantity as per size of delivery
        self.quantity += delivery_quantity

        # update cost
        # TODO improve algorithm
        self.cost = delivery_cost_per_item

    def record_sale(self, sale_quantity):
        self.history_log.append("sale," + str(sale_quantity))

        # decrease quantity as per size of sale
        self.quantity -= sale_quantity

        # calculate the approximate profit from this sale
        avg_profit_estimate = sale_quantity * (self.price -
                                               self.avg_cost_estimate)
        self.agent.act(states=(self.light, self.price, self.quantity,
                               self.avg_cost_estimate),
                       deterministic=False,
                       independent=False)
        self.agent.observe(reward=avg_profit_estimate, terminal=False)
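# Hypothetical exercise of the Product class above (my addition). PRICE_CHANGES
# is a placeholder here -- the original module defines it elsewhere -- and the
# sale/delivery numbers are arbitrary.
PRICE_CHANGES = [-1.0, -0.5, 0.0, 0.5, 1.0]


def demo_product():
    product = Product(name="widget", light=1.0, price=10.0,
                      quantity=100, avg_cost_estimate=6.0)
    for day in range(30):
        product.record_sale(sale_quantity=3)
        if day % 7 == 0:
            product.record_delivery(delivery_quantity=20,
                                    delivery_cost_per_item=5.5)
        product.update_price()
    return product.get_history_log()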