Exemple #1
def main():
    """Train one shared DDPG agent on the multi-agent flocking environment.

    The same actor/critic agent is queried once per flock member with that
    member's own observation, and every member's transition is pushed into
    the agent's replay buffer.  At the end of each episode one
    ``episode,total_reward`` row is appended to ``reward_log.csv``.

    Relies on module-level names defined elsewhere in the file: ``fe``,
    ``DDPGAgent``, ``OUNoise``, ``size1``, ``dim``, ``episodes_num``,
    ``steps_limit``, ``is_movie_on`` and ``MINI_BATCH_SIZE``.
    """
    # Instantiate the flocking environment.
    env = fe.FlockingEnv(size1, dynamic="first")

    # Environment specs.  num_states is the flattened length of ONE member's
    # observation; num_actions is only reported, since the actor itself
    # emits `dim` values per member.
    num_states = (size1 + 1) * dim * 2
    num_actions = size1 * dim

    # Print specs
    print("Number of states: %d" % num_states)
    print("Number of actions: %d" % num_actions)

    # A single actor/critic agent shared by all flock members.
    agent = DDPGAgent(ob_shape=num_states, ac_shape=dim)
    # Exploration noise generator (Ornstein-Uhlenbeck process); one sample
    # is drawn per action component.
    noise = OUNoise(1)

    for i in range(episodes_num):
        print("--------Episode %d--------" % i)
        reward_per_episode = 0
        observation = env.reset_mul()
        # Exploration noise is applied on even episodes only; odd episodes
        # run the bare policy output.
        # NOTE(review): this even/odd split follows the original code's
        # `i % 2 == 0` guard -- confirm that odd episodes are meant to be
        # noise-free evaluation runs.
        explore = i % 2 == 0

        for j in range(steps_limit):
            if is_movie_on:
                env.render()

            # Select one action per member from that member's observation.
            state = observation
            action = np.zeros((size1, dim), dtype=np.float32)
            for k in range(size1):
                ac = agent.feed_forward_actor(
                    np.reshape(state[k], [1, num_states]))
                for d in range(dim):
                    action[k][d] = ac[0][d]
                    if explore:
                        action[k][d] += noise.generate()

            # Throw the joint action to the environment.
            observation, reward, done, info = env.step_mul(action)

            # Store one transition (s_t, a_t, s_t+1, r, done) per member.
            # NOTE(review): the head of this call was missing from the
            # source text and was rebuilt from the arguments that remained.
            for k in range(size1):
                agent.add_experience(
                    np.reshape(state[k], [num_states]), action[k],
                    np.reshape(observation[k], [num_states]),
                    reward[k], done)

            # Train actor/critic network once enough samples are buffered.
            if len(agent.replay_buffer) > MINI_BATCH_SIZE:
                agent.train()

            # Accumulate the summed per-member reward over the episode so
            # that "Total reward" and the CSV row really are episode totals.
            reward_per_episode += reward.sum()

            if j % 100 == 0:
                print(j, "step finished. reward=", reward_per_episode, "info=",
                      info)

            if done or j == steps_limit - 1:
                print("Steps count: %d" % j)
                print("Total reward: %d" % reward_per_episode)

                with open("reward_log.csv", "a") as f:
                    f.write("%d,%f\n" % (i, reward_per_episode))
                # Leave the step loop so a finished episode is neither
                # stepped further nor logged more than once.
                break

Exemple #2
def main():
    """Train a DDPG agent on the Gym environment named by ``ENV_NAME``.

    Runs ``episodes_num`` episodes.  At every step the actor's output plus
    Ornstein-Uhlenbeck noise is sent to the environment, the transition is
    stored in the agent's replay buffer, and the agent is trained once the
    buffer holds more than ``MINI_BATCH_SIZE`` samples.  When an episode
    ends, one ``episode,total_reward`` row is appended to
    ``reward_log.csv``.

    Relies on module-level names defined elsewhere in the file: ``gym``,
    ``DDPGAgent``, ``OUNoise``, ``ENV_NAME``, ``episodes_num``,
    ``is_movie_on`` and ``MINI_BATCH_SIZE``.
    """
    # Instantiate specified environment.
    env = gym.make(ENV_NAME)

    # Confirm that state and action spaces are continuous
    assert isinstance(env.observation_space,
                      gym.spaces.Box), "State space must be continuous!"
    assert isinstance(env.action_space,
                      gym.spaces.Box), "Action space must be continuous!"

    # Get environment specs
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    # Prefer `max_episode_steps`; `timestep_limit` is the older spelling and
    # is only consulted when the newer attribute is missing or unset.
    steps_limit = getattr(env.spec, "max_episode_steps", None)
    if steps_limit is None:
        steps_limit = env.spec.timestep_limit

    # Print specs
    print("-----------Env specs (%s)------------" % ENV_NAME)
    print("Number of states: %d" % num_states)
    print("Number of actions: %d" % num_actions)
    print("Limit of steps per episode: %d" % steps_limit)

    # Instantiate reinforcement learning agent which contains Actor/Critic DNN.
    agent = DDPGAgent(env)
    # Exploration noise generator which uses Ornstein-Uhlenbeck process.
    noise = OUNoise(num_actions)

    for i in range(episodes_num):
        print("--------Episode %d--------" % i)
        reward_per_episode = 0
        observation = env.reset()

        for j in range(steps_limit):
            if is_movie_on:
                env.render()

            # Select action off-policy: actor output plus exploration noise.
            state = observation
            action = agent.feed_forward_actor(
                np.reshape(state, [1, num_states]))
            action = action[0] + noise.generate()

            # Throw action to environment
            observation, reward, done, info = env.step(action)

            # For replay buffer. (s_t, a_t, s_t+1, r)
            agent.add_experience(state, action, observation, reward, done)

            # Train actor/critic network once enough samples are buffered.
            if len(agent.replay_buffer) > MINI_BATCH_SIZE:
                agent.train()

            reward_per_episode += reward

            if done or j == steps_limit - 1:
                print("Steps count: %d" % j)
                print("Total reward: %d" % reward_per_episode)

                with open("reward_log.csv", "a") as f:
                    f.write("%d,%f\n" % (i, reward_per_episode))
                # Leave the step loop so a finished episode is neither
                # stepped further nor logged more than once.
                break
