Code Example #1
File: Core.py  Project: LoveDoveDog/DDPG-Wolpertinger
    def __init__(self, state_dim, action_dim, message_num, channel_num, args):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.message_num = message_num
        self.channel_num = channel_num
        self.actor_node1 = args.actor_node1
        self.actor_node2 = args.actor_node2
        self.critic_node1 = args.critic_node1
        self.critic_node2 = args.critic_node2
        self.lr_actor = args.lr_actor
        self.lr_critic = args.lr_critic
        self.tau = args.tau
        self.gamma = args.gamma
        self.epsilon = args.epsilon
        self.buffer_size = args.buffer_size
        self.batch_size = args.batch_size
        self.folder = args.folder
        self.name = args.name

        self.device = args.device

        self.actor = Actor(self.state_dim, self.action_dim, self.actor_node1, self.actor_node2)
        self.actor_target = Actor(self.state_dim, self.action_dim, self.actor_node1, self.actor_node2)
        self.actor_opt = opt.Adam(self.actor.parameters(), lr=self.lr_actor)
        self.critic = Critic(self.state_dim, self.action_dim, self.critic_node1, self.critic_node2)
        self.critic_target = Critic(self.state_dim, self.action_dim, self.critic_node1, self.critic_node2)
        self.critic_opt = opt.Adam(self.critic.parameters(), lr=self.lr_critic)

        self.begin_cuda()

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
Code Example #2
File: ddpg.py  Project: kaushikb258/DRLND
    def __init__(self, state_size, action_size, buffer_type='standard'):
        self.state_size = state_size
        self.action_size = action_size
        self.epsilon = EPSILON
        self.buffer_type = buffer_type

        # actor
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)

        # critic
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)

        # optimizers
        self.actor_optim = optim.Adam(self.actor_local.parameters(), lr=LR_A)
        self.critic_optim = optim.Adam(self.critic_local.parameters(), lr=LR_C)

        # OU noise
        self.noise = OUNoise(action_size)

        # replay buffer
        if (self.buffer_type == 'standard'):
            self.memory = ReplayBuffer(BUFFER_SIZE, device)
        else:
            self.memory = PrioritizedReplayBuffer(BUFFER_SIZE)

        self.params_copy(self.actor_target, self.actor_local)
        self.params_copy(self.critic_target, self.critic_local)
Code Example #3
def main():
    # initialize the OpenAI Gym env and the actor-critic agent
    sess = tf.InteractiveSession()
    env = gym.make(ENV_NAME)
    actor = Actor(env, sess, actor_lr)
    critic = Critic(env, sess, critic_lr, critic_gamma)

    for episode in range(EPISODE):
        state = env.reset()
        # Train
        for step in range(STEP):
            action = actor.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            td_error = critic.learn(state, reward, next_state)
            actor.learn(state, action, td_error)
            state = next_state
            if done:
                break

        # Test every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    action = actor.choose_action(
                        state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:',
                  ave_reward)
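
In the loop above, the critic evaluates each transition and hands the resulting TD error to the actor as its learning signal. The Actor and Critic classes are not part of this excerpt; the following is a minimal sketch of the quantity critic.learn is assumed to return, not the project's actual implementation.

# Hypothetical sketch of the one-step TD error assumed to be returned by critic.learn;
# V stands for the critic's state-value estimate.
def td_error(V, state, reward, next_state, gamma):
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    return reward + gamma * V(next_state) - V(state)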
Code Example #4
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size

        # actor
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)

        # critic
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)

        # optimizers
        self.actor_optim = optim.Adam(self.actor_local.parameters(), lr=LR_A)
        self.critic_optim = optim.Adam(self.critic_local.parameters(), lr=LR_C)

        # OU noise
        self.noise = OUNoise(action_size, theta=0.15, sigma=0.1)

        # replay buffer
        self.memory = ReplayBuffer(BUFFER_SIZE, device)

        self.params_copy(self.actor_target, self.actor_local)
        self.params_copy(self.critic_target, self.critic_local)
Code Example #5
File: Core.py  Project: LoveDoveDog/DDPG-Wolpertinger
class Core(object):
    def __init__(self, state_dim, action_dim, message_num, channel_num, args):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.message_num = message_num
        self.channel_num = channel_num
        self.actor_node1 = args.actor_node1
        self.actor_node2 = args.actor_node2
        self.critic_node1 = args.critic_node1
        self.critic_node2 = args.critic_node2
        self.lr_actor = args.lr_actor
        self.lr_critic = args.lr_critic
        self.tau = args.tau
        self.gamma = args.gamma
        self.epsilon = args.epsilon
        self.buffer_size = args.buffer_size
        self.batch_size = args.batch_size
        self.folder = args.folder
        self.name = args.name

        self.device = args.device

        self.actor = Actor(self.state_dim, self.action_dim, self.actor_node1, self.actor_node2)
        self.actor_target = Actor(self.state_dim, self.action_dim, self.actor_node1, self.actor_node2)
        self.actor_opt = opt.Adam(self.actor.parameters(), lr=self.lr_actor)
        self.critic = Critic(self.state_dim, self.action_dim, self.critic_node1, self.critic_node2)
        self.critic_target = Critic(self.state_dim, self.action_dim, self.critic_node1, self.critic_node2)
        self.critic_opt = opt.Adam(self.critic.parameters(), lr=self.lr_critic)

        self.begin_cuda()

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    def update_policy(self, experiences):
        states, channel_occupys, actions, rewards, next_states, next_channel_occupys = experiences
        states = states.to(self.device)
        actions = actions.to(self.device)
        rewards = rewards.to(self.device)
        next_states = next_states.to(self.device)
        # Wolpertinger target action: propose with the target actor, refine with the target critic.
        next_target_actions = self.wolpertinger_action(next_states, next_channel_occupys, status='target')
        next_target_actions = torch.from_numpy(next_target_actions).to(self.device)
        next_target_q_values = self.critic_target(next_states, next_target_actions)
        # Critic update: regress the current critic estimate onto the Bellman target.
        evaluated_q_values = rewards + self.gamma * next_target_q_values
        target_q_values = self.critic(states, actions)
        self.critic.zero_grad()
        critic_loss = fun.mse_loss(evaluated_q_values, target_q_values)
        critic_loss.backward()
        self.critic_opt.step()

        # Actor update: deterministic policy gradient, maximise Q(s, actor(s)).
        self.actor.zero_grad()
        actor_loss = -self.critic(states, self.actor(states))
        actor_loss = actor_loss.mean()
        actor_loss.backward()
        self.actor_opt.step()

        # Polyak-average the target networks towards the online networks.
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def random_action(self, message_num, channel_occupy):
        action = np.zeros(len(channel_occupy), dtype=int)
        for index, i in enumerate(channel_occupy):
            if i == 0:
                action[index] = random.randint(0, message_num)
        return action

    def wolpertinger_action(self, states, channel_occupys, status='execute'):
        # Wolpertinger policy: the actor proposes a continuous proto-action, the nearest
        # feasible discrete actions are generated, and the (target) critic picks the best one.
        if status == 'execute':
            states_tensor = states.to(self.device)
            proto_actions_tensor = self.actor(states_tensor)
            proto_actions = proto_actions_tensor.cpu()
            next_actions = np.empty(proto_actions.size())
            for state, channel_occupy, proto_action, index in zip(states, channel_occupys, proto_actions,
                                                                  np.arange(next_actions.shape[0])):
                state = state.data.numpy()
                channel_occupy = channel_occupy.data.numpy()
                proto_action = proto_action.data.numpy()
                near_actions = nearest(proto_action, channel_occupy, self.message_num)
                state_list = np.vstack([state for i in np.arange(near_actions.shape[0])])
                state_list_tensor = torch.from_numpy(state_list).to(self.device)
                near_actions_tensor = torch.from_numpy(near_actions).to(self.device)
                near_values = self.critic(state_list_tensor, near_actions_tensor)
                action_index = torch.argmax(near_values)
                next_actions[index, :] = near_actions[action_index, :]
            next_actions = next_actions[0, :]
            next_actions = noise(next_actions, channel_occupys, self.message_num, self.epsilon)

        elif status == 'target':
            proto_actions_tensor = self.actor_target(states)
            states = states.cpu()
            proto_actions = proto_actions_tensor.cpu()
            next_actions = np.empty(proto_actions.size())
            for state, channel_occupy, proto_action, index in zip(states, channel_occupys, proto_actions,
                                                                  np.arange(next_actions.shape[0])):
                state = state.data.numpy()
                channel_occupy = channel_occupy.data.numpy()
                proto_action = proto_action.data.numpy()
                near_actions = nearest(proto_action, channel_occupy, self.message_num)
                state_list = np.vstack([state for i in np.arange(near_actions.shape[0])])
                state_list_tensor = torch.from_numpy(state_list).to(self.device)
                near_actions_tensor = torch.from_numpy(near_actions).to(self.device)
                near_values = self.critic_target(state_list_tensor, near_actions_tensor)
                action_index = torch.argmax(near_values)
                next_actions[index, :] = near_actions[action_index, :]

        return next_actions

    def begin_eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def begin_cuda(self):
        self.actor.to(self.device)
        self.actor_target.to(self.device)
        self.critic.to(self.device)
        self.critic_target.to(self.device)

    def to_device(self, tensor):
        if self.device is not None:
            tensor = tensor.to(self.device)
        return tensor

    def save_model(self):
        torch.save(self.actor.state_dict(), self.folder+'/'+self.name+'actor.pt')
        torch.save(self.critic.state_dict(), self.folder+'/'+self.name+'critic.pt')

    def load_model(self):
        self.actor.load_state_dict(torch.load(self.folder+'/'+self.name+'actor.pt'))
        self.critic.load_state_dict(torch.load(self.folder+'/'+self.name+'critic.pt'))
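
Core.py calls hard_update and soft_update helpers that are not included in this excerpt. Below is a minimal sketch of the conventional DDPG versions of these helpers, assuming the signatures hard_update(target, source) and soft_update(target, source, tau) implied by the calls above.

# Assumed helpers (not shown in the excerpt above).
def hard_update(target, source):
    # Copy the source network's parameters into the target network verbatim.
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)


def soft_update(target, source, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target.
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(tau * s.data + (1.0 - tau) * t.data)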
Code Example #6
File: runMain.py  Project: ypy516478793/newAL
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import h5py
import os

# set env
env = DataEnv()
s_dim = env.state_dim
a_dim = env.action_dim
clf = Classifier()

sess = tf.Session()

# set RL method
actor = Actor(sess, n_features=s_dim, n_actions=a_dim, lr=LR_A)
critic = Critic(sess, n_features=s_dim, lr=LR_C)

sess.run(tf.global_variables_initializer())

accHist = np.zeros([MAX_EPISODES, env.budgets + 1])
lossHist = np.zeros([MAX_EPISODES, env.budgets + 1])
rewardHist = np.zeros([MAX_EPISODES])
labelData = np.zeros([env.budgets, s_dim])

if OUTPUT_GRAPH:
    tf.summary.FileWriter("./logs/" + LOG + "/RL/", sess.graph)


def save_results(acc, probs):
    if not os.path.exists("./results/"):
Code Example #7
File: ddpg.py  Project: kaushikb258/DRLND
class Agent():
    def __init__(self, state_size, action_size, buffer_type='standard'):
        self.state_size = state_size
        self.action_size = action_size
        self.epsilon = EPSILON
        self.buffer_type = buffer_type

        # actor
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)

        # critic
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)

        # optimizers
        self.actor_optim = optim.Adam(self.actor_local.parameters(), lr=LR_A)
        self.critic_optim = optim.Adam(self.critic_local.parameters(), lr=LR_C)

        # OU noise
        self.noise = OUNoise(action_size)

        # replay buffer
        if (self.buffer_type == 'standard'):
            self.memory = ReplayBuffer(BUFFER_SIZE, device)
        else:
            self.memory = PrioritizedReplayBuffer(BUFFER_SIZE)

        self.params_copy(self.actor_target, self.actor_local)
        self.params_copy(self.critic_target, self.critic_local)

    def step(self, states, actions, rewards, next_states, dones, tstep):

        if (self.buffer_type == 'standard'):

            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                self.memory.add(state, action, reward, next_state, done)

            if len(self.memory) > BATCH_SIZE and tstep % UPDATE_EVERY == 0:
                for _ in range(NUM_UPDATES):
                    experiences = self.memory.sample(BATCH_SIZE)
                    self.train_model_s(experiences)

        elif (self.buffer_type == 'prioritized'):

            s = torch.FloatTensor(states).cuda()
            a = torch.FloatTensor(actions).cuda()
            r = torch.FloatTensor(rewards).cuda()
            s2 = torch.FloatTensor(next_states).cuda()
            d = torch.FloatTensor(dones).cuda()

            actions_next = self.actor_target(s2)
            Q_tp1 = self.critic_target(s2, actions_next).squeeze(1)
            y_t = r + (GAMMA * Q_tp1 * (1.0 - d))
            Q_t = self.critic_local(s, a).squeeze(1)
            TD_error = torch.abs(Q_t - y_t).cpu().data.numpy()

            for i, (state, action, reward, next_state, done) in enumerate(
                    zip(states, actions, rewards, next_states, dones)):
                self.memory.add(TD_error[i],
                                (state, action, reward, next_state, done))

            if self.memory.size() > BATCH_SIZE and tstep % UPDATE_EVERY == 0:
                for _ in range(NUM_UPDATES):
                    experiences, idxs, is_weight = self.memory.sample(
                        BATCH_SIZE)
                    experiences = self.convert_tuple_format(experiences)
                    self.train_model_p(experiences, GAMMA, idxs, is_weight)

    def convert_tuple_format(self, mini_batch):
        s, a, r, s2, d = [], [], [], [], []
        for b in mini_batch:
            s_, a_, r_, s2_, d_ = b
            s.append(s_)
            a.append(a_)
            r.append(r_)
            s2.append(s2_)
            d.append(d_)
        return (s, a, r, s2, d)

    def act(self, state, add_noise=True):

        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.epsilon * self.noise.sample()
        return action

    def reset(self):
        self.noise.reset()

    # standard exp replay
    def train_model_s(self, experiences):

        states, actions, rewards, next_states, dones = experiences

        # Q(s_t+1, a_t+1)
        actions_next = self.actor_target(next_states)
        Q_tp1 = self.critic_target(next_states, actions_next)

        # Q targets, y_t; Bellman equation
        y_t = rewards + (GAMMA * Q_tp1 * (1 - dones))

        Q_t = self.critic_local(states, actions)
        self.critic_loss = F.mse_loss(Q_t, y_t)

        self.critic_optim.zero_grad()
        self.critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optim.step()

        actions_pred = self.actor_local(states)
        self.actor_loss = -self.critic_local(states, actions_pred).mean()

        self.update_networks()

    # prioritized exp replay
    def train_model_p(self, experiences, gamma, idxs=None, is_weight=None):

        states, actions, rewards, next_states, dones = experiences

        s = torch.FloatTensor(states).cuda()
        a = torch.FloatTensor(actions).cuda()
        r = torch.FloatTensor(rewards).cuda()
        s2 = torch.FloatTensor(next_states).cuda()
        d = torch.FloatTensor(dones).cuda()

        # Q(s_t+1, a_t+1)
        actions_next = self.actor_target(s2)
        Q_tp1 = self.critic_target(s2, actions_next).squeeze(1)

        # Q targets, y_t; Bellman equation
        y_t = r + (GAMMA * Q_tp1 * (1.0 - d))
        Q_t = self.critic_local(s, a).squeeze(1)

        is_w = torch.FloatTensor(is_weight).cuda()
        self.critic_loss = torch.mean(is_w * (Q_t - y_t)**2)

        # update priority
        TD_errors = torch.abs(Q_t - y_t).cpu().data.numpy()
        for i in range(BATCH_SIZE):
            idx = idxs[i]
            self.memory.update(idx, TD_errors[i])

        # critic update; clip grad norm for stability
        self.critic_optim.zero_grad()
        self.critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 2.0)
        self.critic_optim.step()

        # actor loss; gradient of this term is the "policy gradient"
        actions_pred = self.actor_local(s)
        self.actor_loss = -self.critic_local(s, actions_pred).mean()

        self.update_networks()

    def update_networks(self):

        # actor update
        self.actor_optim.zero_grad()
        self.actor_loss.backward()
        self.actor_optim.step()

        # update target networks using TAU
        self.weighted_update(self.critic_local, self.critic_target, TAU)
        self.weighted_update(self.actor_local, self.actor_target, TAU)

        # linearly anneal epsilon; noise factor
        self.epsilon -= DELTA_EPSILON
        self.noise.reset()

    def weighted_update(self, source, target, tau):
        for t, s in zip(target.parameters(), source.parameters()):
            t.data.copy_(tau * s.data + (1.0 - tau) * t.data)

    def params_copy(self, target, source):
        for t, s in zip(target.parameters(), source.parameters()):
            t.data.copy_(s.data)
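
The Agent above expects ReplayBuffer(BUFFER_SIZE, device) to provide add(...), sample(BATCH_SIZE) and len(), with sample returning batched tensors already on the target device. The buffer class is not part of this excerpt; the sketch below is an assumed implementation matching that interface, not the project's actual code.

import random
from collections import deque

import numpy as np
import torch


class ReplayBuffer:
    # Minimal uniform replay buffer matching the interface used by Agent above.
    def __init__(self, buffer_size, device):
        self.buffer = deque(maxlen=buffer_size)
        self.device = device

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.asarray, zip(*batch))
        # Rewards and dones become column vectors so they broadcast against Q-values.
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).unsqueeze(1).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones.astype(np.float32)).unsqueeze(1).to(self.device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)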
Code Example #8
class Agent():
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size

        # actor
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)

        # critic
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)

        # optimizers
        self.actor_optim = optim.Adam(self.actor_local.parameters(), lr=LR_A)
        self.critic_optim = optim.Adam(self.critic_local.parameters(), lr=LR_C)

        # OU noise
        self.noise = OUNoise(action_size, theta=0.15, sigma=0.1)

        # replay buffer
        self.memory = ReplayBuffer(BUFFER_SIZE, device)

        self.params_copy(self.actor_target, self.actor_local)
        self.params_copy(self.critic_target, self.critic_local)

    def step(self, states, actions, rewards, next_states, dones, tstep):
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > BATCH_SIZE and tstep % UPDATE_EVERY == 0:
            for _ in range(NUM_UPDATES):
                experiences = self.memory.sample(BATCH_SIZE)
                self.train_model(experiences)

    def act(self, state, epsilon, add_noise=True):

        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += epsilon * self.noise.sample()
        return action

    def reset(self):
        self.noise.reset()

    def train_model(self, experiences):

        states, actions, rewards, next_states, dones = experiences

        # Q(s_t+1, a_t+1)
        actions_next = self.actor_target(next_states)
        Q_tp1 = self.critic_target(next_states, actions_next)

        # Q targets, y_t; Bellman equation
        y_t = rewards + (GAMMA * Q_tp1 * (1 - dones))

        Q_t = self.critic_local(states, actions)
        self.critic_loss = F.mse_loss(Q_t, y_t)

        self.critic_optim.zero_grad()
        self.critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optim.step()

        actions_pred = self.actor_local(states)
        self.actor_loss = -self.critic_local(states, actions_pred).mean()

        self.update_networks()

    def update_networks(self):

        # actor update
        self.actor_optim.zero_grad()
        self.actor_loss.backward()
        self.actor_optim.step()

        # update target networks using TAU
        self.weighted_update(self.critic_local, self.critic_target, TAU)
        self.weighted_update(self.actor_local, self.actor_target, TAU)

        self.noise.reset()

    def weighted_update(self, source, target, tau):
        for t, s in zip(target.parameters(), source.parameters()):
            t.data.copy_(tau * s.data + (1.0 - tau) * t.data)

    def params_copy(self, target, source):
        for t, s in zip(target.parameters(), source.parameters()):
            t.data.copy_(s.data)
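
OUNoise(action_size, theta=0.15, sigma=0.1) with reset() and sample() drives exploration above but is also not included in the excerpt. The sketch below is an assumed Ornstein-Uhlenbeck noise implementation matching that interface.

import numpy as np


class OUNoise:
    # Temporally correlated exploration noise (Ornstein-Uhlenbeck process).
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.1):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Restart the process at its mean.
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) dt + sigma * dW, with dt = 1.
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state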
Code Example #9
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
universe.configure_logging()
#--------------------------------------------------------------#
import os
if not os.path.exists('_data'): os.makedirs('_data')
#--------------------------------------------------------------#


env = createEnv(env_id='internet.SlitherIO-v0', remotes=1)


#--------------------------------------------------------------#

sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
actor = Actor(sess, n_features=[150, 250, 1], n_actions=8)
critic = Critic(sess, n_features=[150, 250, 1])
init = tf.global_variables_initializer()
sess.run(init)
saver = tf.train.Saver()
saver.restore(sess, '_data/model.ckpt') # load model

#--------------------------------------------------------------#

MAX_EPISODE = 500
PRINT = bool(0)
RENDER = bool(1)

#--------------------------------------------------------------#

Code Example #10
import os
if not os.path.exists('_data'): os.makedirs('_data')
#--------------------------------------------------------------#

MAX_EPISODE = 500
VISUALIZE = bool(0)
PRINT = bool(1)
RENDER = bool(0)
LOAD = bool(1)

env = createEnv(env_id='internet.SlitherIO-v0', remotes=1)

#--------------------------------------------------------------#

sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
actor = Actor(sess, n_features=[150, 250, 1], n_actions=8)
critic = Critic(sess, n_features=[150, 250, 1])
init = tf.global_variables_initializer()
sess.run(init)
saver = tf.train.Saver()

if LOAD:
    saver.restore(sess, '_data/model.ckpt')  # load model
    reward_history = np.loadtxt(
        '_data/_reward_history.txt').tolist()  # load history
    timestep_history = np.loadtxt(
        '_data/_timestep_history.txt').tolist()  # load history
    system_message = 'Loading and continue training on existing model.'
else:
    reward_history = []
    timestep_history = []
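
The LOAD branch above restores the checkpoint and the two history files, so the training loop in the full script presumably writes them back out at some point; a sketch of that save step, assuming the same paths as above, is shown below.

# Hypothetical save counterpart to the LOAD branch above (assumed paths).
saver.save(sess, '_data/model.ckpt')                          # save model
np.savetxt('_data/_reward_history.txt', reward_history)       # save history
np.savetxt('_data/_timestep_history.txt', timestep_history)   # save history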