def __init__(self,
                 base_model_paths,
                 switch_path,
                 device,
                 soft_choice=False):
        """Load the base actors and the switching network from disk.

        Args:
            base_model_paths: Iterable of checkpoint paths, one per base
                actor; the resulting models keep this order.
            switch_path: Checkpoint path of the switching DQN.
            device: Device every network is moved to and every checkpoint is
                mapped onto while loading.
            soft_choice: Flag stored unchanged on the instance as
                ``self.soft_choice``.
        """
        super(SwitchController, self).__init__()
        self.soft_choice = soft_choice

        def _restore(network, checkpoint_path):
            # Load the saved weights onto `device` and freeze the network in
            # evaluation mode.
            weights = torch.load(checkpoint_path, map_location=device)
            network.load_state_dict(weights)
            network.eval()
            return network

        # One fixed-architecture actor per checkpoint path.
        self.base_models = [
            _restore(
                Actor(state_size=2, action_size=1, seed=0,
                      fc1_units=25).to(device),
                checkpoint_path)
            for checkpoint_path in base_model_paths
        ]
        self.switch_model = _restore(DQN(2, 2).to(device), switch_path)
Beispiel #2
0
# This script records the NN controller's parameters in a text file that the
# ReachNN tool uses for Bernstein polynomial approximation.
from Model import IndividualModel, Actor
import torch
import numpy as np


# NAME = 'direct_distill'
# trained_model = IndividualModel(state_size=3, action_size=1, seed=0, fc1_units=25)
# trained_model.load_state_dict(torch.load('./'+ NAME +'.pth'))
# trained_model.eval()
# Rebuild the actor and restore its trained weights. The model is never moved
# off the CPU in this script, so the checkpoint is mapped onto the CPU as
# well; this keeps the load working when the weights were saved from a GPU.
trained_model = Actor(state_size=3, action_size=1, seed=0, fc1_units=25)
trained_model.load_state_dict(
    torch.load("./actors/actor_0.43600.pth", map_location="cpu"))
trained_model.eval()

# Collect the bias and weight tensors as NumPy arrays, in the order in which
# named_parameters() yields them.
bias_list = []
weight_list = []
for name, param in trained_model.named_parameters():
    if 'bias' in name:
        bias_list.append(param.detach().cpu().numpy())
    if 'weight' in name:
        weight_list.append(param.detach().cpu().numpy())
print(len(weight_list), np.linalg.norm(weight_list[0]),
      np.linalg.norm(weight_list[1]))

# Flatten the parameters: for every output unit of every layer, emit that
# unit's incoming weights followed by its bias.
all_param = []
for layer_weights, layer_biases in zip(weight_list, bias_list):
    for unit_weights, unit_bias in zip(layer_weights, layer_biases):
        all_param.extend(
            unit_weights[k] for k in range(layer_weights.shape[1]))
        all_param.append(unit_bias)
Beispiel #3
0
USE_CUDA = torch.cuda.is_available()


def Variable(*args, **kwargs):
    """Build an ``autograd.Variable`` and move it to the GPU when CUDA is available."""
    variable = autograd.Variable(*args, **kwargs)
    return variable.cuda() if USE_CUDA else variable


# Training hyper-parameters.
batch_size = 128
gamma = 0.99
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 3000
replay_buffer = ReplayBuffer(int(5e3))


def epsilon_by_frame(frame_idx):
    """Return the exploration rate for ``frame_idx``.

    The rate decays exponentially from ``epsilon_start`` towards
    ``epsilon_final`` with time constant ``epsilon_decay``.
    """
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(
        -1. * frame_idx / epsilon_decay)


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Map the checkpoint onto the selected device so that weights saved from a
# GPU can still be loaded when the script falls back to the CPU.
model_1 = Actor(state_size=3, action_size=1, seed=0, fc1_units=25).to(device)
model_1.load_state_dict(
    torch.load("./actors/actor_0.43600.pth", map_location=device))
model_1.eval()

# model_2 = IndividualModel(state_size=3, action_size=1, seed=0, fc1_units=50).to(device)
# model_2.load_state_dict(torch.load("./actors/actor_1.0_2800.pth"))
# model_2.eval()


def MController(state):
    """Evaluate the fixed polynomial controller on ``state``.

    The action is linear in ``state[0]``, ``state[1]`` and ``state[2]``, plus
    a quadratic and a cubic term in ``state[0]``.

    Args:
        state: Indexable holding at least three numeric entries.

    Returns:
        The scalar control action.
    """
    x, y, z = state[0], state[1], state[2]
    return 0.634 * x - 0.296 * y - 0.153 * z + 0.053 * x**2 - 1.215 * x**3


# Freshly constructed IndividualModel (no checkpoint is loaded here), placed
# on the selected device.
Individual = IndividualModel(
    state_size=3,
    action_size=1,
    seed=0,
    fc1_units=25,
)
Individual = Individual.to(device)
		return len(self.buffer)

USE_CUDA = torch.cuda.is_available()


def Variable(*args, **kwargs):
    """Build an ``autograd.Variable`` and move it to the GPU when CUDA is available."""
    variable = autograd.Variable(*args, **kwargs)
    return variable.cuda() if USE_CUDA else variable


# Training hyper-parameters.
batch_size = 128
gamma = 0.99
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 3000
replay_buffer = ReplayBuffer(int(5e3))


def epsilon_by_frame(frame_idx):
    """Return the exploration rate for ``frame_idx``.

    The rate decays exponentially from ``epsilon_start`` towards
    ``epsilon_final`` with time constant ``epsilon_decay``.
    """
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(
        -1. * frame_idx / epsilon_decay)


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Both pre-trained actors are mapped onto the selected device while loading,
# so checkpoints saved from a GPU still load when the script runs on the CPU.
model_1 = Actor(state_size=2, action_size=1, seed=0, fc1_units=25,
                fc2_units=None).to(device)
model_1.load_state_dict(
    torch.load("./models/actor_2800.pth", map_location=device))
model_1.eval()

model_2 = Actor(state_size=2, action_size=1, seed=0, fc1_units=25).to(device)
model_2.load_state_dict(
    torch.load("./0731actors/actor_2400.pth", map_location=device))
model_2.eval()

Individual = Individualtanh(state_size=2, action_size=1, seed=0,
                            fc1_units=25).to(device)

agent = Agent(state_size=2, action_size=2, random_seed=0, fc1_units=None,
              fc2_units=None, weighted=True)

ppo = PPO(2, 2, method='clip')
ppo.load_model(3000, 1)

def mkdir(path):
	# Record whether `path` already exists on disk.
	# NOTE(review): only this existence check is visible here; `folder` is
	# never used and no directory is created in the visible body, so the rest
	# of the function appears to be missing — confirm against the full file.
	folder = os.path.exists(path)
USE_CUDA = torch.cuda.is_available()


def Variable(*args, **kwargs):
    """Build an ``autograd.Variable`` and move it to the GPU when CUDA is available."""
    variable = autograd.Variable(*args, **kwargs)
    return variable.cuda() if USE_CUDA else variable


# Training hyper-parameters.
batch_size = 128
gamma = 0.99
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 3000
replay_buffer = ReplayBuffer(int(5e3))


def epsilon_by_frame(frame_idx):
    """Return the exploration rate for ``frame_idx``.

    The rate decays exponentially from ``epsilon_start`` towards
    ``epsilon_final`` with time constant ``epsilon_decay``.
    """
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(
        -1. * frame_idx / epsilon_decay)


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Both pre-trained actors are mapped onto the selected device while loading,
# so checkpoints saved from a GPU still load when the script runs on the CPU.
model_1 = Actor(state_size=4, action_size=1, seed=0).to(device)
model_1.load_state_dict(torch.load("./actor5000_1.pth", map_location=device))
model_1.eval()

model_2 = Actor(state_size=4, action_size=1, seed=0).to(device)
model_2.load_state_dict(torch.load("./actor4850_1.pth", map_location=device))
model_2.eval()

Individual = Individualtanh(state_size=4, action_size=1, seed=0,
                            fc1_units=50).to(device)

agent = Agent(state_size=4, action_size=2, random_seed=0)

ppo = PPO(4, 2, method='penalty')
ppo.load_model(5499, 1)

Beispiel #6
0
class DDPG:
    """Deep Deterministic Policy Gradient agent with target networks.

    Holds an actor (``policy``) and a critic, a target copy of each, their
    Adam optimizers, and a linearly decaying exploration-noise scale
    (``epsilon``).
    """

    def __init__(self,
                 env,
                 tau=1e-3,
                 gamma=0.99,
                 batch_size=64,
                 depsilon=50000,
                 device=None):
        """Build the networks, their target copies and the optimizers.

        Args:
            env: Environment exposing ``observation_space.shape`` and
                ``action_space.shape``; their first entries size the networks.
            tau: Interpolation factor used by :meth:`soft_update`.
            gamma: Discount factor applied to the bootstrapped target value.
            batch_size: Number of transitions sampled per :meth:`train` call.
            depsilon: Number of :meth:`select_action` calls after which the
                exploration-noise scale has decayed from 1.0 to 0.0.
            device: Device the networks live on. Defaults to CUDA when it is
                available and to the CPU otherwise.
        """
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)

        self.policy = Actor(self.num_states, self.num_actions).train()
        self.policy_target = Actor(self.num_states, self.num_actions).eval()
        self.critic = Critic(self.num_states, self.num_actions).train()
        self.critic_target = Critic(self.num_states, self.num_actions).eval()

        # Move the networks before the optimizers are constructed.
        for network in (self.policy, self.policy_target, self.critic,
                        self.critic_target):
            network.to(self.device)

        # Target networks start as exact copies of the online networks.
        self.hard_update(self.policy, self.policy_target)
        self.hard_update(self.critic, self.critic_target)

        self.critic_loss = nn.MSELoss()

        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.epsilon = 1.0
        self.depsilon = 1.0 / float(depsilon)

        self.opt_critic = torch.optim.Adam(self.critic.parameters(), lr=1e-3)
        self.opt_policy = torch.optim.Adam(self.policy.parameters(), lr=1e-4)

    def train(self, buffer):
        """Run one critic update and one actor update on a sampled batch.

        Args:
            buffer: Object whose ``sample(batch_size)`` returns the tuple
                ``(state, action, reward, next_state, terminal)``.

        Returns:
            ``(critic_loss, actor_loss)`` as Python floats.
        """
        b_state, b_action, b_reward, b_state_next, b_term = buffer.sample(
            self.batch_size)

        # The bootstrapped target is computed from the target networks only
        # and must not receive gradients.
        with torch.no_grad():
            action_target = self.policy_target(b_state_next)
            Q_prime = self.critic_target(b_state_next, action_target)

        self.opt_critic.zero_grad()
        Q = self.critic(b_state, b_action)
        L_critic = self.critic_loss(
            Q, b_reward + self.gamma * Q_prime * (1.0 - b_term))
        L_critic.backward()
        self.opt_critic.step()

        # The actor ascends the critic's value estimate of its own actions.
        self.opt_policy.zero_grad()
        action = self.policy(b_state)
        L_Q = -1.0 * self.critic(b_state, action).mean()
        L_Q.backward()
        self.opt_policy.step()

        self.soft_update(self.critic, self.critic_target)
        self.soft_update(self.policy, self.policy_target)

        return L_critic.item(), L_Q.item()

    def get_entropy(self, buffer, m=5, n=100):
        """Estimate the entropy of the policy's actions from sorted spacings.

        ``n`` random states ``(cos(a), sin(a), speed)`` are drawn with ``a``
        uniform in ``[0, 2*pi)`` and ``speed`` uniform in ``[-8, 8)``. The
        policy's actions are sorted and the mean log of scaled ``m``-spacings
        is returned.
        NOTE(review): this looks like a Vasicek-type m-spacing estimator —
        confirm against the intended reference.

        Args:
            buffer: Unused; kept so existing callers keep working.
            m: Spacing width. The indexing requires ``n >= 2 * m``.
            n: Number of random states evaluated.

        Returns:
            The entropy estimate as a Python float.
        """
        b_angle = torch.rand(n) * np.pi * 2.0
        b_speed = 2.0 * (torch.rand(n) - 0.5) * 8.0
        b_state = torch.stack(
            [torch.cos(b_angle), torch.sin(b_angle), b_speed],
            dim=1).to(device=self.device, dtype=torch.float32)
        coef = torch.zeros(n, dtype=b_state.dtype, device=b_state.device)
        with torch.no_grad():
            action = self.policy(b_state)
            X, _ = torch.sort(action, dim=0)
            for i in range(n):
                if i < m:
                    # Lower boundary: one-sided spacing clipped at X[0].
                    c = 1
                    a = X[i + m]
                    b = X[0]
                elif i < n - m:
                    # Interior: symmetric spacing of width 2m.
                    c = 2
                    a = X[i + m]
                    b = X[i - m]
                else:
                    # Upper boundary: one-sided spacing clipped at X[n - 1].
                    c = 1
                    a = X[n - 1]
                    b = X[i - m]
                # The 1E-5 offset keeps the logarithm finite when a == b.
                coef[i] = float(n) * float(c) / float(m) * (a - b + 1E-5)

            S = torch.log(coef).mean()

        return S.item()

    def get_value(self, state, action):
        """Return the critic's value for ``(state, action)`` as a float."""
        with torch.no_grad():
            return self.critic(state, action).item()

    def select_action(self, state, random_process):
        """Return the policy's action for ``state`` plus exploration noise.

        The noise drawn from ``random_process.sample()`` is scaled by the
        current ``epsilon`` (floored at 0), ``epsilon`` is then decreased by
        ``depsilon``, and the noisy action is clamped to ``[-1, 1]``.
        """
        with torch.no_grad():
            action = self.policy(state)
        noise = max(self.epsilon, 0.0) * random_process.sample()
        self.epsilon -= self.depsilon

        action += torch.from_numpy(noise).to(device=action.device,
                                             dtype=action.dtype)
        action = torch.clamp(action, -1, 1)
        return action

    def random_action(self):
        """Sample an action uniformly from ``[-1, 1)`` in every dimension."""
        low = torch.full((self.num_actions,), -1.0)
        high = torch.full((self.num_actions,), 1.0)
        return Uniform(low, high).sample()

    def soft_update(self, src, dst):
        """Blend ``src`` into ``dst``: ``dst = tau * src + (1 - tau) * dst``."""
        with torch.no_grad():
            for src_param, dst_param in zip(src.parameters(),
                                            dst.parameters()):
                dst_param.copy_(self.tau * src_param +
                                (1.0 - self.tau) * dst_param)

    def hard_update(self, src, dst):
        """Copy every parameter of ``src`` into ``dst``."""
        with torch.no_grad():
            for src_param, dst_param in zip(src.parameters(),
                                            dst.parameters()):
                dst_param.copy_(src_param)

    def load_weights(self, path):
        """Load ``policy.pkl`` and ``critic.pkl`` from the directory ``path``.

        Only the online policy and critic are restored; the target networks
        are left untouched.
        """
        self.policy.load_state_dict(
            torch.load('{}/policy.pkl'.format(path),
                       map_location=self.device))
        self.critic.load_state_dict(
            torch.load('{}/critic.pkl'.format(path),
                       map_location=self.device))

    @staticmethod
    def _cpu_state_dict(module):
        """Return ``module``'s state dict with every tensor on the CPU.

        The module itself stays on its current device.
        """
        state = module.state_dict()
        for key in list(state):
            state[key] = state[key].cpu()
        return state

    def save_model(self, path):
        """Save the policy and critic weights into the directory ``path``.

        The weights are written as CPU tensors. The live networks are not
        moved: calling ``module.to('cpu')`` here would relocate them in place
        and leave them on a different device from their target networks.
        """
        torch.save(self._cpu_state_dict(self.policy),
                   '{}/policy.pkl'.format(path))
        torch.save(self._cpu_state_dict(self.critic),
                   '{}/critic.pkl'.format(path))
Beispiel #7
0
    num_train = 200000
    num_eval = 0
    buffer_length = 600000

    # env = NormalizedEnv(gym.make('Pendulum-v0'))
    GODOT_BIN_PATH = "InvPendulum/InvPendulum.x86_64"
    env_abs_path = "InvPendulum/InvPendulum.pck"
    env = NormalizedEnv(
        InvPendulumEnv(exec_path=GODOT_BIN_PATH,
                       env_path=env_abs_path,
                       render=True))

    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    policy = Actor(num_states, num_actions)
    policy.load_state_dict(torch.load('./policy.pkl'))

    state = env.reset()
    state = state.to(dtype=torch.float32)

    traced_policy = torch.jit.trace(policy, state)
    print(traced_policy.graph)
    print(traced_policy.code)
    traced_policy.save('ddpg_policy.jit')

    for step in range(1000):

        action = policy(state)
        #			torch.tensor([1.0 for i in range(num_actions)])).sample().to(device='cuda')
        time.sleep(0.02)
        # state_next, reward, term, _ = env.step(action.cpu().numpy())