Example 1
def main():
    action_high = 2
    action_low = -2
    action_high = np.array([action_high])
    action_low = np.array([action_low])
    buffer_size = 100000
    minibatch_size = 256
    num_episode = 500

    env = gym.make("Pendulum-v0")
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    agent = Agent(state_size, action_size, buffer_size, minibatch_size,
                  action_high, action_low)
    reward_list = []
    for i_episode in range(num_episode):
        print("episode: %d" % i_episode)
        state = env.reset()
        total_reward = 0
        for t_timesteps in range(env.spec.timestep_limit):
            env.render()
            action = agent.choose_action(state)
            next_state, reward, done, info = env.step(action)
            total_reward += reward
            transition = [state, action, next_state, reward, done]
            agent.train(transition)
            state = next_state
            if (done or t_timesteps == env.spec.timestep_limit - 1):
                print("Episode finish---time steps: %d" % t_timesteps)
                print("total reward: %d" % total_reward)
                reward_list.append(total_reward)
                break
    np.save('reward', reward_list)
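A small follow-up sketch (not part of the original script) showing how the rewards saved above could be loaded and plotted afterwards; np.save('reward', ...) writes 'reward.npy', and matplotlib is assumed to be available.

import numpy as np
import matplotlib.pyplot as plt

rewards = np.load('reward.npy')  # np.save adds the .npy suffix automatically
plt.plot(np.arange(1, len(rewards) + 1), rewards)
plt.xlabel('Episode #')
plt.ylabel('Total reward')
plt.show()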
    def __init__(self, state_size, action_size):
        super(MADDPG, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE,
                                   1)

        self.maddpg_agent = [
            Agent(self.state_size, self.action_size, 0),
            Agent(self.state_size, self.action_size, 13)
        ]
    def act(self, states):
        # Collect one action per agent for the given per-agent states
        actions = []
        for agent, state in zip(self.maddpg_agent, states):
            action = agent.act(state)
            actions.append(action)
        return actions
    def __init__(self, state_size, action_size, random_seed):
        super(maddpg, self).__init__()

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        # Since the number of agents is known to be two, both are
        # hard-coded into the class
        self.maddpg_agent = [
            Agent(state_size, action_size, random_seed),
            Agent(state_size, action_size, random_seed)
        ]

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
    def __init__(self, n_agents, state_size, action_size, seed):
        """
        Initializes a MultiAgent object

        PARAMS
        =====
        n_agents: Number of agents
        state_size: The dimension of the state space
        action_size: The dimension of the action space
        seed: The seed to use
        """
        self.n_agents = n_agents
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        self.agents = [
            Agent(self.state_size, self.action_size, self.seed)
            for i in range(n_agents)
        ]

        # Single Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Gamma
        self.Gamma = GAMMA
        self.t_step = 0
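A hedged sketch of how a step() method for this MultiAgent wrapper might use the shared replay buffer, GAMMA, and t_step counter; UPDATE_EVERY, the ReplayBuffer add()/sample() calls, and the per-agent learn() signature are assumptions, not taken from the original code.

    def step(self, states, actions, rewards, next_states, dones):
        # Store one experience per agent in the shared replay buffer
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)  # assumed API

        # Learn every UPDATE_EVERY steps once enough samples are available
        self.t_step = (self.t_step + 1) % UPDATE_EVERY  # UPDATE_EVERY is assumed
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            for agent in self.agents:
                experiences = self.memory.sample()
                agent.learn(experiences, self.Gamma)  # signature assumed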
Example 6
def main():

    env = UnityEnvironment(file_name="./Tennis_Linux/Tennis.x86_64",
                           no_graphics=True)

    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size

    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = Agent(state_size, action_size)

    scores = train(env, agent, n_episodes=1000)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(1, len(scores) + 1), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.savefig('scores.png')

    env.close()
Example 7
def main():
    agent = Agent()
    agent.load()

    total_reward = 0
    obs = env.reset()
    env.render()
    for _ in range(10000):
        act = agent.predict(obs)
        obs, reward, done, _ = env.step(act)
        total_reward += reward
        env.render()
        if done:
            print(f'total_reward: {total_reward}')
            env.close()
            break
    def __init__(self, num_agents, state_size, action_size, random_seed):

        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size

        self.agents = [
            Agent(state_size, action_size, random_seed, i)
            for i in range(num_agents)
        ]
        self.memory = ReplayBuffer(state_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
Example 9
    def __init__(self, num_agents, state_size, action_size, random_seed):
        """ Initialize multiple Agents each with a Actor-Critic network
            but they share the replay buffer to learn from experience
        """
        self.num_agents = num_agents
        self.agents = []
        for _ in range(num_agents):
            agent = Agent(state_size, action_size, random_seed)
            self.agents.append(agent)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
Example 10
    def __init__(self, state_size, action_size, n_agents, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents
        self.seed = random.seed(seed)

        # Actor-Critic agents
        self.ActorCriticAgents = [
            Agent(state_size, action_size, n_agents, seed)
            for _ in range(n_agents)
        ]

        # Replay memory
        self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE,
                                   seed)
Example 11
def main():
    with tf.Session() as sess:
        while True:
            try:
                env = CarlaEnv()
                break
            except Exception as e:
                print(e)

        agent = Agent(sess=sess,
                      state_size=env.observation_space.shape[0],
                      action_size=env.action_space.shape[0])
        max_episodes = 1000
        max_steps = 1800

        for i in range(int(max_episodes)):

            state = env.reset()
            print(state.shape)
            ep_reward = 0
            ep_ave_max_q = 0
            # plt.clf()
            # if i:
            #     with open("ddpg_memory.pkl","wb") as hand:
            #         pickle.dump(replay_buffer,hand)
            #     actor.save_model()
            #     critic.save_model()
            #     print("Agent saved")

            for j in range(int(max_steps)):

                print("epoch: {}, step: {}".format(i, j))
                # env.render()

                # Added exploration noise
                # a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
                action = agent.get_action(state)
                # a = controller(s[0],s[1],s[3])
                # a = [a]
                next_state, reward, done, info = env.step(action)
                print("reward: {}".format(reward))

                agent.remember(state, action, reward, done, next_state)

                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                agent.train()
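The comment above describes the usual guard inside such a train() method: learning is skipped until the replay memory holds at least one minibatch. A minimal, hypothetical sketch of that guard (replay_buffer, minibatch_size, and _update_actor_critic are illustrative names, not the original Agent's API):

def train(self):
    # Skip learning until enough experience has been collected
    if len(self.replay_buffer) < self.minibatch_size:
        return
    batch = self.replay_buffer.sample(self.minibatch_size)
    self._update_actor_critic(batch)  # assumed internal DDPG update step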
Example 12
def main():
    env = UnityEnvironment(file_name="./Tennis_Linux/Tennis.x86_64")

    # get action_size and state_size
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size

    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = Agent(state_size, action_size)
    agent.actor_local.load_state_dict(torch.load('files/checkpoint_actor.pth'))
    agent.critic_local.load_state_dict(
        torch.load('files/checkpoint_critic.pth'))

    play(env, agent)

    env.close()
Example 13
    def __init__(self, num_agents, state_size, action_size, random_seed):
        super(MADDPG, self).__init__()

        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.random_seed = random_seed

        self.maddpg_agent = [
            Agent(self.state_size, self.action_size,
                  self.num_agents * self.state_size,
                  self.num_agents * self.action_size, self.random_seed)
            for i in range(self.num_agents)
        ]

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        self.noise_amplitud = 1
        self.noise_reduction = 0.9995
        self.t_step = 0
Example 14
def main():
    world = World()
    agent = Agent(state_size=world.state_size, action_size=world.action_size)
    while True:
        loop(agent, world)
Example 15
from env.pid import PidEnv
import numpy as np
from ddpg import Agent
from OUNoise import Noise
import matplotlib.pyplot as plt

env = PidEnv(setpoint=20)
batch_size = 128
rewards = []
agent = Agent(num_states=5, num_actions=3)
noise = Noise(num_actions=3)

for episode in range(30):
    state = env.reset()
    noise.reset()
    eps_reward = 0
    for step in range(500):
        action = agent.get_action(state)
        action = noise.get_action(action, step)

        new_state, reward = env.step(action)

        agent.mem.push(state, action, reward, new_state)

        agent.learn(batch_size)

        state = new_state

        eps_reward += reward
    rewards.append(eps_reward)
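matplotlib is imported above but unused in this excerpt; a short sketch of how the per-episode rewards could be plotted afterwards (not part of the original script):

plt.plot(rewards)
plt.xlabel('Episode')
plt.ylabel('Episode reward')
plt.title('DDPG on the PID environment')
plt.show()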
Example 16
import gym
import os
import numpy as np
from ddpg import Agent
from utils import plot_learning_curve

if __name__ == '__main__':
    env = gym.make('Pendulum-v0')
    agent = Agent(input_dims=env.observation_space.shape,
                  env=env,
                  n_actions=env.action_space.shape[0])
    n_episodes: int = 250

    base_dir: str = os.path.dirname(__file__)

    figure_file = os.path.abspath(os.path.join(base_dir, 'plots/pendulum.png'))

    best_score = env.reward_range[0]
    score_history = []
    load_checkpoint = False

    if load_checkpoint:
        n_steps = 0
        while n_steps <= agent.batch_size:
            observation = env.reset()
            action = env.action_space.sample()
            observation_, reward, done, info = env.step(action)
            agent.remember(observation, action, reward, observation_, done)
            n_steps += 1
        agent.learn()
        agent.load_models()
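The excerpt stops before the main training loop; a sketch of how it might continue, using only the names already defined above (agent.choose_action, agent.save_models, and the plot_learning_curve signature are assumptions):

    for i in range(n_episodes):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation)  # method name assumed
            observation_, reward, done, info = env.step(action)
            agent.remember(observation, action, reward, observation_, done)
            if not load_checkpoint:
                agent.learn()
            score += reward
            observation = observation_
        score_history.append(score)

        avg_score = np.mean(score_history[-100:])
        if avg_score > best_score:
            best_score = avg_score
            if not load_checkpoint:
                agent.save_models()  # method name assumed

    x = [i + 1 for i in range(n_episodes)]
    plot_learning_curve(x, score_history, figure_file)  # signature assumed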
Example 17
    env = gym.make('Pendulum-v0')
    env.reset()
    env.render()

    params = {
        'env': env,
        'gamma': 0.99,
        'actor_lr': 0.001,
        'critic_lr': 0.001,
        'tau': 0.02,
        'capacity': 10000,
        'batch_size': 32,
    }

    agent = Agent(**params)

    for episode in range(100):
        s0 = env.reset()
        episode_reward = 0

        for step in range(500):
            env.render()
            a0 = agent.act(s0)
            s1, r1, done, _ = env.step(a0)
            agent.put(s0, a0, r1, s1)

            episode_reward += r1
            s0 = s1

            agent.learn()
Example 18
import gym
import numpy as np
from collections import deque

from ddpg import Agent

env_name = 'Pendulum-v0'
env = gym.make(env_name)
env = env.unwrapped
env.seed(1)

state_shape = env.observation_space.shape
num_actions = env.action_space.shape[0]

MAX_EPISODES = 10000
MAX_STEPS = 500
n_iter = 0

action_scale = env.action_space.high[0]
learner = Agent(state_shape, num_actions, action_scale)


def exploration(mu, scale, size=None):
    return np.random.normal(mu, scale, size)


episode_history = deque(maxlen=100)
for i in range(MAX_EPISODES):

    # initialize
    state = env.reset()
    total_rewards = 0

    noise = exploration(0.0, 0.2, MAX_STEPS)
Example 19
from utils import fetch_protein
from protein import ProteinState
from ddpg import Agent, ReplayBuffer


EPISODES = 10000
STEPS = 500

if __name__ == "__main__":
    goal_state = fetch_protein("2jof")
    state_dim = goal_state.n_residues() * 2
    action_dim = goal_state.n_residues() * 2
    buffer = ReplayBuffer(10000)
    agent = Agent(state_dim, action_dim, (0, 360))

    for _ in range(EPISODES):
        data = {"state": ProteinState(n_residues=goal_state.n_residues())}
        for _ in range(STEPS):
            action = agent.get_action(data["state"])
            next_state = data["state"].do_action(action)
            reward = data["state"].eval_state() - next_state.eval_state()

            buffer.append(data["state"], action, reward, next_state)

            agent.update(buffer)

            print(data["state"].l2_norm(goal_state))
            data["state"] = ProteinState(angles=next_state.angles())
Example 20
def execute_ddpg(ddpg_agent: Agent, num_episodes: int = 3000, max_episode_t: int = 2000, learn_each: int = 5,
                 consec_learn_iter: int = 10) -> list:
    """
    DDPG - Execution Algorithm Implementation
    :param ddpg_agent: agent in charge of controlling the behaviour of both the Actor and Critic neural networks
    :param num_episodes: number of episodes the algorithm will train
    :param max_episode_t: maximum number of time steps to play at each episode
    :param learn_each: number of steps in a game before triggering the learning procedure
    :param consec_learn_iter: number of consecutive learning iterations
    :return: results obtained during the training procedure
    """
    # 1| Initialization
    global_score = []
    global_score_deque = deque(maxlen=100)

    # 2| Episode run
    for i_episode in range(1, num_episodes + 1):

        # 2.0| Initialization of episode
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        ddpg_agent.reset()

        # 2.1| Episode Run
        for t_step in range(max_episode_t):
            # 2.1.1| Agent decision and interaction
            actions = ddpg_agent.act(states)
            env_info = env.step(actions)[brain_name]

            # 2.1.2| Feedback on action
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            # 2.1.3| Experience saving
            for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                ddpg_agent.memorize(state, action, reward, next_state, done)

            # 2.1.4| Update values
            scores += rewards
            states = next_states

            # 2.1.5| Agent learning
            if t_step % learn_each == 0:
                for _ in range(consec_learn_iter):
                    ddpg_agent.trigger_learning()

            # 2.1.6| Episode ending
            if np.any(dones):
                break

        # 2.2| Episode post-processing
        # 2.2.1| Scoring
        global_score_deque.append(np.max(scores))
        global_score.append(np.max(scores))

        if i_episode % 10 == 0:
            print('Episode {}\tTotal Average Score: {:.2f}\tMean: {:.2f}'.format(
                    i_episode, np.mean(global_score_deque), np.mean(scores)))

        if i_episode % 50 == 0:
            torch.save(ddpg_agent.actor_local.state_dict(),
                       model_dir + 'checkpoint__actor_local__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.actor_target.state_dict(),
                       model_dir + 'checkpoint__actor_target__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.critic_local.state_dict(),
                       model_dir + 'checkpoint__critic_local__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.critic_target.state_dict(),
                       model_dir + 'checkpoint__critic_target__episode_' + str(i_episode) + '.pth')

        if np.mean(global_score_deque) >= 0.5 and i_episode >= 100:
            print('\rEpisodes required to complete the challenge: {}'.format(i_episode))

            torch.save(ddpg_agent.actor_local.state_dict(),
                       model_dir + 'checkpoint__actor_local__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.actor_target.state_dict(),
                       model_dir + 'checkpoint__actor_target__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.critic_local.state_dict(),
                       model_dir + 'checkpoint__critic_local__episode_' + str(i_episode) + '.pth')
            torch.save(ddpg_agent.critic_target.state_dict(),
                       model_dir + 'checkpoint__critic_target__episode_' + str(i_episode) + '.pth')
            break

    return global_score
Example 21
env = CityLearn(data_path,
                building_attributes,
                weather_file,
                solar_profile,
                building_ids,
                buildings_states_actions=building_state_actions,
                cost_function=objective_function)
observations_spaces, actions_spaces = env.get_state_action_spaces()

# Provides information on building type, climate zone, annual DHW demand, annual cooling demand, annual electricity demand, solar capacity, and correlations among buildings
building_info = env.get_building_information()

# RL CONTROLLER
# Instantiating the control agent(s)
agents = Agent(env, building_info, observations_spaces, actions_spaces)

# Select the number of episodes for training. In the final run this value will be set to 1 (the buildings run for one year)
episodes = 10

k, c = 0, 0
cost, cum_reward = {}, {}
start = time.time()
# The number of episodes can be replaced by a stopping criterion (e.g. convergence of the average reward)
for e in range(episodes):
    cum_reward[e] = 0
    rewards = []
    state = env.reset()
    done = False
    while not done:
        if k % (1000) == 0:
if __name__ == "__main__":

    scores_mat = {}
    average_scores_CV = []
    parameters = {}
    env = gym.make('LunarLanderContinuous-v2')

    for j in range(10):
        score_j = []
        param_dist = {'alpha': float(np.random.uniform(0.00009, 0.000009, 1)),
                      'beta': float(np.random.uniform(0.0009, 0.00009, 1)),
                      'tau': float(np.random.uniform(0.009, 0.0007, 1)),
                      'gamma': float(np.random.uniform(1, 0.95, 1))}
        agent = Agent(alpha=param_dist.get('alpha'), beta=param_dist.get('beta'), input_dims=[8],
                      tau=param_dist.get('tau'),
                      env=env, gamma=param_dist.get('gamma'), batch_size=64, layer1_size=400, layer2_size=300,
                      n_actions=2)
        print('Iteration {} runs on the following values: alpha={}, beta={}, tau={}, gamma={}'.format(
            j, param_dist.get('alpha'), param_dist.get('beta'),
            param_dist.get('tau'), param_dist.get('gamma')))

        parameters[j] = [param_dist.get('alpha'), param_dist.get('beta'),
                         param_dist.get('tau'), param_dist.get('gamma')]

        # agent.load_models()

        n_episodes = 1000
Example 23
num_agents = len(env_info.agents)
print('number of agents: ', num_agents)

action_size = brain.vector_action_space_size
print('action size: ', action_size)

states = env_info.vector_observations
state_size = states.shape[1]
print('state size: ', state_size)

#--------------------------------------------------------------------------------------

#buffer_type = 'standard'
buffer_type = 'prioritized'

agent = Agent(state_size, action_size, buffer_type)

Nepisodes = 1500
Nsteps = 1000

#--------------------------------------------------------------------------------------


def train_agent():

    mean_ep_rewards = []

    for ep in range(Nepisodes):

        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
    def reset(self):
        for agent in self.maddpg_agent:
            agent.reset()
import gym
import numpy as np
from ddpg import Agent
from utils import plotLearning

env = gym.make('Pendulum-v0')

agent = Agent(alpha=0.0001,
              beta=0.001,
              input_dims=[3],
              tau=0.001,
              env=env,
              n_actions=1)

np.random.seed(0)
score_history = []

for episode in range(1000):

    state = env.reset()
    done = False
    score = 0

    while not done:
        action = agent.choose_action(state)

        next_state, reward, done, info = env.step(action)
        agent.remember(state, action, reward, next_state, int(done))

        agent.learn()
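The excerpt ends inside the episode loop, and plotLearning is imported above but never called. A sketch of the usual bookkeeping that could follow (the plotLearning signature is an assumption):

        score += reward
        state = next_state

    score_history.append(score)
    print('episode {} score {:.2f} trailing 100-episode average {:.2f}'.format(
        episode, score, np.mean(score_history[-100:])))

# plotLearning's exact signature is assumed here
plotLearning(score_history, 'pendulum.png', window=100)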
Example 26
env.seed(1)
start = time.time()

# Hyperparameters
params = {
    'env': env,
    'gamma': 0.99,  # discount factor
    'actor_lr': 0.001,  # learning rate
    'critic_lr': 0.001,
    'tau': 0.02,  # soft-update coefficient
    'capacity': 10000,  # replay buffer capacity
    'batch_size': 32,  # minibatch size for SGD over replayed experience
    'train_with_render': True,  # whether to render during training
    'save_reward': -800,  # episode reward at which training stops and the model is saved
    'actor_model_path': 'model/DDPG_actor.pt',  # model save paths
    'critic_model_path': 'model/DDPG_critic.pt',
    'Reset_parameters': False,   # whether to start training from scratch
}
agent = Agent(**params)
agent.train_model(200)  # train for 200 episodes
# agent.test_model(3)   # test for 3 episodes


train_time = time.time() - start
env.close()
print("Time: %.4f" % train_time)


# To inspect the loss curves, run in a terminal:
# tensorboard --logdir=./log --port=6007
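The note above assumes the agent writes its losses to ./log; a minimal sketch of how such logging is commonly wired up with torch.utils.tensorboard (this hook is illustrative, not the original Agent's code):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir='./log')

def log_losses(actor_loss, critic_loss, global_step):
    # Hypothetical hook called from the agent's update step
    writer.add_scalar('loss/actor', actor_loss, global_step)
    writer.add_scalar('loss/critic', critic_loss, global_step)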
Example 27
from env.pid_env import PidEnvSingle
import torch
import numpy as np
from ddpg import Agent
from OUNoise import Noise
import matplotlib.pyplot as plt

batch_size = 128
rewards = []
avg_rewards = []
env = PidEnvSingle()
agent = Agent(num_states=2, num_actions=3, gamma=0.99)
agent2 = Agent(num_states=2, num_actions=3, gamma=0.99)
noise = Noise(num_actions=3)
zeros = [0]
normalized = []
all_steps = [-1]*10
inlook = []
metalearn = False
random = False

setpoints = []
total_steps = 600

if metalearn:
    for i in range(20):
        curr = 20 if not random else np.random.random() * 100
        setpoints.append(curr)
    agent.metalearn(setpoints)

Example 28
env_info = env.reset(train_mode=True)[brain_name]

num_agents = len(env_info.agents)
print('number of agents: ', num_agents)

action_size = brain.vector_action_space_size
print('action size: ', action_size)

states = env_info.vector_observations
state_size = states.shape[1]
print('state size: ', state_size)

#--------------------------------------------------------------------------------------

agent = Agent(state_size, action_size)

Nepisodes = 5000
Nsteps = 5000

#--------------------------------------------------------------------------------------


def train_agent():

    mean_ep_rewards = []

    ibackup = 0
    thresh = 0.05

    for ep in range(Nepisodes):
Example 29
from ddpg import Agent
import gym
import numpy as np

env = gym.make('LunarLanderContinuous-v2')

agent = Agent(alpha=0.000025, beta=0.00025, input_dims=[8], tau=0.001, env=env,
              batch_size=64, layer1_size=400, layer2_size=300, n_actions=2)

np.random.seed(42)
score_history = []

for i in range(1000):
    done = False
    score = 0
    obs = env.reset()
    while not done:
        act = agent.choose_action(obs)
        new_state, reward, done, info = env.step(act)
        agent.remember(obs, act, reward, new_state, int(done))
        agent.learn()
        score += reward
        obs = new_state

    score_history.append(score)
    print("Episode - {} Score - {} 100 game average {}".format(
        i, score, np.mean(score_history[-100:])))

    if i % 25 == 0:
        agent.save_models()

filename = l
Example 30

#%% Load Tennis environment
env = UnityEnvironment(file_name="Tennis_Linux_NoVis/Tennis.x86_64")

# Get brain information
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
action_size = brain.vector_action_space_size

# Environment information
env_info = env.reset(train_mode=False)[brain_name]
state_size = env_info.vector_observations.shape[1]
num_agents = len(env_info.agents)


#%% DDPG - Agent Training
agent = Agent(state_size=state_size, action_size=action_size, random_seed=random_seed)
score = execute_ddpg(ddpg_agent=agent)

# Plot results
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(score)+1), score)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

#%% Environment- Close
env.close()