Example #1
import numpy as np
import torch
import torch.nn as nn

# MLP is the pretrained human model, defined elsewhere in this project.

class I_MLP(nn.Module):
    def __init__(self):
        super(I_MLP, self).__init__()

        self.name = "models/inf_model.pt"
        self.n_steps = 10

        self.fc_1 = nn.Linear(4, 4)
        self.fc_2 = nn.Linear(4, 2)
        # model_dict = torch.load("models/h_model.pt", map_location='cpu')
        # self.load_state_dict(model_dict)

        self.rc_1 = nn.Linear(4, 8)
        self.rc_2 = nn.Linear(8, 8)
        self.rc_3 = nn.Linear(8, 2)

        # pretrained human model (frozen, used as a reference during rollouts)
        self.human = MLP()
        model_dict = torch.load("models/h_model.pt", map_location='cpu')
        self.human.load_state_dict(model_dict)
        self.human.eval()

    def prediction(self, x):
        # predicted human action given [state, goal]
        h1 = self.fc_1(x)
        return torch.tanh(self.fc_2(h1))

    def policy(self, x):
        # robot action given [state, human action]
        h1 = torch.tanh(self.rc_1(x))
        h2 = torch.tanh(self.rc_2(h1))
        return self.rc_3(h2)

    def rollout(self, s_star, s_0):
        # Roll the human and robot models forward from start state s_0 toward
        # goal s_star for n_steps, accumulating the rollout cost.
        error = 0.0
        s = torch.FloatTensor(s_0)
        for t in range(self.n_steps):
            x = torch.cat((s, s_star), 0)
            ah_hat = self.human.prediction(x).detach()  # pretrained human's action (no gradient)
            ah = self.prediction(x)                     # this model's predicted human action
            context = torch.cat((s, ah), 0)
            ar = self.policy(context)                   # robot action given state and human action
            s = s + 0.1 * ar
            error += torch.norm(s - s_star)**2          # distance to goal
            error += torch.tanh(ah[0]) + torch.tanh(ah[1])
            error += torch.norm(ah)                     # penalize large human actions
            error += 1.0 * torch.norm(ah - ah_hat)      # stay close to the pretrained human model
        return error

    def loss(self):
        # Total rollout cost over random start states and the two goals g1, g2.
        Q = 0.0
        g1 = torch.FloatTensor([1.0, 0.0])
        g2 = torch.FloatTensor([0.0, 1.0])
        for round in range(10):
            s_0 = np.random.random(2)
            for s_star in [g1, g2]:
                error = self.rollout(s_star, s_0)
                Q += error
        return Q
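
The loss() method above defines the full training objective, but no training loop is shown. The snippet below is a minimal sketch of how I_MLP might be trained; the optimizer choice, learning rate, and iteration count are assumptions rather than values from the original code.

# Minimal training sketch (optimizer and hyperparameters are assumed, not from the source).
model = I_MLP()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(1000):
    optimizer.zero_grad()
    Q = model.loss()
    Q.backward()
    optimizer.step()

# the class stores its intended save path in self.name ("models/inf_model.pt")
torch.save(model.state_dict(), model.name)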
Example #2
import gym
import numpy as np
import torch

# QNetwork, MLP, and the LanderCustom-v0 environment are defined elsewhere in this project.

def main():

    env = gym.make("LanderCustom-v0")

    qnetwork = QNetwork(state_size=8, action_size=4, seed=0)
    qnetwork.load_state_dict(torch.load('basic_lander.pth'))
    qnetwork.eval()

    human = MLP()
    human.load_state_dict(torch.load('expert_bc.pt'))
    human.eval()
    softmax = torch.nn.Softmax(dim=0)

    episodes = 30
    scores = []
    Q_threshold = 1e-2

    for episode in range(episodes):

        # disturbance schedule: no lateral force for the first 10 episodes,
        # then +500, then -500
        if episode < 10:
            force_x = 0.0
        elif episode < 20:
            force_x = +500.0
        else:
            force_x = -500.0

        env.start_state(force_x, 0.0)
        state = env.reset()
        score = 0

        while True:

            with torch.no_grad():
                state = torch.from_numpy(state).float()
                Q_values = qnetwork(state).data.numpy()
                action_pred_dist = softmax(human(state).data).numpy()
            action_star = np.argmax(Q_values)                             # optimal action under the Q-network
            action = np.random.choice(np.arange(4), p=action_pred_dist)   # sample from the human model's action distribution

            # Q-value gap between the optimal and sampled actions; uncommenting the
            # block below overrides the human whenever the gap exceeds Q_threshold.
            loss = Q_values[action_star] - Q_values[action]
            # if loss > Q_threshold:
            #     action = action_star

            # env.render()
            state, reward, done, _ = env.step(action)
            score += reward
            if done:
                print("episode: ", episode, "score: ", score)
                break

        scores.append(score)

    env.close()
    print("The average score is: ", np.mean(np.array(scores)))
Example #3
import torch
import torch.nn as nn

# MLP is the pretrained behavior-cloned human model, defined elsewhere in this project.

class I_MLP_MLP(nn.Module):
    def __init__(self):
        super(I_MLP_MLP, self).__init__()

        # pretrained behavior-cloned human model (frozen)
        self.bc_human = MLP()
        model_dict = torch.load(self.bc_human.name, map_location='cpu')
        self.bc_human.load_state_dict(model_dict)
        self.bc_human.eval()

        self.name = "models/inf_robot.pt"

        self.fc_1 = nn.Linear(4, 4)
        self.fc_2 = nn.Linear(4, 2)

        self.rc_1 = nn.Linear(4, 4)
        self.rc_2 = nn.Linear(4, 2)

    def human(self, x):
        # predicted human action given [state, target]
        h1 = torch.tanh(self.fc_1(x))
        return self.fc_2(h1)

    def robot(self, x):
        # robot action given [state, human action]
        h1 = torch.tanh(self.rc_1(x))
        return self.rc_2(h1)

    def loss(self, n_samples):
        loss = 0.0
        regulate = 1.0   # 1.0 = match the behavior-cloned human, 0.0 = match the analytic action
        for iteration in range(n_samples):
            state = torch.rand(2)
            target = torch.rand(2)
            error = target - state
            ah_star = torch.FloatTensor([-error[1], error[0]])   # error vector rotated by 90 degrees
            ah_hat = self.bc_human.human(torch.cat((state, target), 0)).detach()
            ah_hat *= torch.norm(ah_star) / torch.norm(ah_hat)   # rescale to the magnitude of ah_star
            ah = self.human(torch.cat((state, target), 0))
            ar = self.robot(torch.cat((state, ah), 0))
            state += 1.0 * ar
            loss += torch.norm(target - state)**2                # distance to target after the robot acts
            loss += (1.0 - regulate) * torch.norm(ah_star - ah)**2
            loss += regulate * torch.norm(ah_hat - ah)**2
            loss += abs(torch.norm(ah) - torch.norm(ar))         # keep human and robot action magnitudes similar
        return loss
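
A training loop in the same spirit as the sketch after Example #1 would apply here: instantiate I_MLP_MLP, backpropagate through loss(n_samples) with an optimizer each step, and save the state dict to self.name ("models/inf_robot.pt"). The value of n_samples per step is not specified in the original code.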
Example #4
import sys
import time
import pickle

import gym
import numpy as np
import torch

# Joystick, QNetwork, MLP, and the LanderCustom-v0 environment are defined elsewhere in this project.

def main():

    env = gym.make("LanderCustom-v0")
    fx_init = float(sys.argv[1])       # initial lateral force applied to the lander
    Q_threshold = float(sys.argv[2])   # Q-value gap threshold for overriding the joystick action
    savename = 'test1.pkl'

    joystick = Joystick()
    qnetwork = QNetwork(state_size=8, action_size=4, seed=0)
    qnetwork.load_state_dict(torch.load('basic_lander.pth'))
    qnetwork.eval()

    human = MLP()
    human.load_state_dict(torch.load('mlp_model.pt'))
    human.eval()

    episodes = 10
    scores = []
    data = []
    env.start_state(fx_init, 0.0)

    for episode in range(episodes):

        state = env.reset()
        env.render()
        score = 0

        # wait for the operator to press the start button
        while True:

            action, start, stop = joystick.input()
            if start:
                break

        while True:

            action, start, stop = joystick.input()
            data.append(list(state) + [action])   # log (state, joystick action) pairs

            with torch.no_grad():
                state = torch.from_numpy(state).float()
                Q_values = qnetwork(state).data.numpy()
                action_pred_dist = human(state).data.numpy()
            action_star = np.argmax(Q_values)             # optimal action under the Q-network
            action_pred = np.argmax(action_pred_dist)     # human model's predicted action

            # action = action_pred

            # Override the joystick action with the optimal one when its Q-value
            # falls more than Q_threshold below the best action's.
            loss = Q_values[action_star] - Q_values[action]
            if loss > Q_threshold:
                action = action_star

            env.render()
            state, reward, done, _ = env.step(action)
            score += reward

            if done or stop:
                print(episode, score)
                # pickle.dump(data, open(savename, "wb" ))
                break
            time.sleep(0.025)

        scores.append(score)

    env.close()
    print(scores)
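
Example #4 reads both of its parameters from the command line. Assuming the script were saved as, say, lander_teleop.py (a hypothetical name), it would be launched roughly as:

    python lander_teleop.py 500.0 0.1

where the first argument is the initial lateral force applied to the lander and the second is the Q-value gap threshold used to override the joystick action.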