Example #1
def main():
    parser = argparse.ArgumentParser(
        'Behaviour cloning using pre-trained expert rollouts.')
    parser.add_argument('--rollout_file', type=str,
                        default='expert_data/Humanoid-v2.pkl')
    parser.add_argument('--envname', type=str, default='Humanoid-v2')
    parser.add_argument('--max_timesteps', type=int, default=1000)
    parser.add_argument('--training_epochs', type=int, default=2000)
    parser.add_argument('--save_model', type=str, default='./DAgger_Humanoid_lstm-v2.pth')
    parser.add_argument('--render', type=bool, default=True)
    # referenced below but missing from the original parser; defaults are assumed
    parser.add_argument('--expert_policy_file', type=str,
                        default='experts/Humanoid-v2.pkl')
    parser.add_argument('--dagger_epochs', type=int, default=10)

    args = parser.parse_args()

    # load expert rollout and model
    rollout = load_rollout(args.rollout_file)
    train = torch.tensor(rollout['observations'], dtype=torch.double)
    target = torch.tensor(rollout['actions'], dtype=torch.double)

    policy_net = load_policy.load_policy(args.expert_policy_file)

    train = train.to(dev)
    target = target.to(dev)

    db.printTensor(train)
    db.printTensor(target)

    # make the environment
    env = gym.make(args.envname)

    # build model
    model = Model(input_dim=env.observation_space.shape[0],
                  output_dim=env.action_space.shape[0])
    model.double()
    model.to(torch.device(dev))
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    # train
    for i in range(args.dagger_epochs):
        for epoch in range(args.training_epochs):
            data_generator = recurrent_generator(train, target, batch_size=20)
            t_start = time.time()
            for train_sample, target_sample in data_generator:
                out = model(train_sample)
                loss = criterion(out, target_sample)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            
            if epoch % 10 == 0:
                db.printInfo('Epoch: {} Loss: {} Time: {}'.format(epoch, loss, time.time()-t_start))

        # roll out the current policy, relabel with the expert and aggregate
        # the data here (see the sketch after this example)
        obs = env.reset()
        done = False

    save(epoch, model, optimizer, loss, args.save_model)
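The DAgger-specific step of rolling out the current policy, querying the expert on the visited states, and appending the relabelled data to the training set is not shown in the snippet above. Below is a minimal sketch of that step, assuming the expert policy returned by load_policy.load_policy maps a batch of observations to a batch of actions and that env, model, max_timesteps and dev have the meanings used above; collect_dagger_data is a hypothetical helper, not part of the original code.

import numpy as np
import torch


def collect_dagger_data(env, model, expert_policy, max_timesteps, device):
    """Roll out the learned policy and relabel the visited states with the
    expert's actions (assumed DAgger aggregation step)."""
    observations, expert_actions = [], []
    obs = env.reset()
    for _ in range(max_timesteps):
        observations.append(obs)
        # the expert labels the state that the learned policy visited
        expert_actions.append(expert_policy(obs[None, :]).squeeze())
        with torch.no_grad():
            action = model(torch.tensor(obs, dtype=torch.double,
                                        device=device).unsqueeze(0))
        obs, _, done, _ = env.step(action.cpu().numpy())
        if done:
            break
    new_obs = torch.tensor(np.array(observations), dtype=torch.double)
    new_act = torch.tensor(np.array(expert_actions), dtype=torch.double)
    return new_obs.to(device), new_act.to(device)

The returned tensors would then be concatenated onto train and target (torch.cat along dimension 0, after matching the action tensor's shape to the expert data) before the next round of supervised training.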
Example #2
    def testSampleAction(self):
        ob_dim, ac_dim, n_layers, hidden_size = 5, 3, 20, 10
        batch = 1
        neural_network_args = {
            'n_layers': n_layers,
            'ob_dim': ob_dim,
            'ac_dim': ac_dim,
            'discrete': False,
            'size': hidden_size
        }
        network = PolicyNet(neural_network_args)

        inputs, outputs = [batch, ob_dim], [batch, ac_dim]
        # dist = torch.distributions.Categorical(action_probs)
        obs = torch.randn(inputs)
        if neural_network_args['discrete']:
            out = network(obs)
            action_probs = torch.nn.functional.softmax(out, dim=-1)
            dist = torch.distributions.Categorical(action_probs)
            sampled_action = dist.sample()
            dist.log_prob(sampled_action)

            dist1 = torch.distributions.Categorical(probs=action_probs)
            dist2 = torch.distributions.Categorical(logits=out)
            db.printInfo(dist1)
            db.printInfo(dist2)
        else:

            ts_mean, ts_logstd = network(obs)

            ts_mean = torch.randn(2000, 6)
            ts_logstd = torch.randn(6)

            dist = torch.distributions.Normal(loc=ts_mean, scale=ts_logstd.exp())

            # YOUR_CODE_HERE
            ts_logstd_na = []
            for _ in range(list(ts_mean.shape)[0]):
                ts_logstd_na.append(ts_logstd)
            ts_logstd_na = torch.stack(ts_logstd_na)
            # db.printInfo(ts_logstd_na)
            # db.printInfo(ts_logstd_na.exp())

            sampled_action = torch.normal(mean=ts_mean, std=ts_logstd_na.exp())

            # sampled_action = torch.distributions.Normal(mean, logstd.exp()).sample()

            # sampled_action = torch.normal(mean, logstd.exp())

            # sampled_action = torch.normal(mean=torch.tensor([0,0]),
            #                               std=1)
        db.printInfo(sampled_action)
        db.printInfo(obs)
        db.printInfo()
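The discrete branch above constructs the same categorical distribution twice, once from softmax probabilities (dist1) and once directly from the raw logits (dist2). A small self-contained check, using nothing beyond standard PyTorch, that the two parameterisations agree on sampling and log-probabilities:

import torch

logits = torch.randn(1, 3)
probs = torch.nn.functional.softmax(logits, dim=-1)

# Categorical normalises logits internally (log-softmax), so both
# parameterisations describe the same distribution
dist_probs = torch.distributions.Categorical(probs=probs)
dist_logits = torch.distributions.Categorical(logits=logits)

action = dist_probs.sample()
assert torch.allclose(dist_probs.log_prob(action), dist_logits.log_prob(action))
print(action, dist_probs.log_prob(action))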
Example #3
def main():
    parser = argparse.ArgumentParser(
        'Behaviour cloning using pre-trained expert rollouts.')
    parser.add_argument('--rollout_file',
                        type=str,
                        default='expert_data/Ant-v2.pkl')
    parser.add_argument('--envname', type=str, default='Ant-v2')
    parser.add_argument('--max_timesteps', type=int, default=1000)
    parser.add_argument('--training_epochs', type=int, default=2000)
    parser.add_argument('--save_model', type=str, default='./BC_Ant-v2.pth')
    parser.add_argument('--render', type=bool, default=True)
    parser.add_argument('--recurrent', type=bool, default=False)
    parser.add_argument('--hidden_size', type=int, default=512)

    args = parser.parse_args()

    # load expert rollout
    rollout = load_rollout(args.rollout_file)
    train = torch.tensor(rollout['observations'], dtype=torch.double)
    target = torch.tensor(rollout['actions'], dtype=torch.double)

    train = train.to(dev)
    target = target.to(dev)

    db.printTensor(train)
    db.printTensor(target)

    # make the environment
    env = gym.make(args.envname)

    # build model
    model = Model(input_dim=env.observation_space.shape[0],
                  output_dim=env.action_space.shape[0])
    model.double()
    model.to(torch.device(dev))
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    for epoch in range(args.training_epochs):
        data_generator = feed_forward_generator(train, target, batch_size=20)
        for train_sample, target_sample in data_generator:
            # db.printTensor(train_sample)  # debug only: printing and pausing
            # input()                       # on every batch stalls training
            out = model(train_sample)
            loss = criterion(out, target_sample)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if epoch % 10 == 0:
            db.printInfo('Epoch: {} Loss: {:.4f}'.format(epoch, loss))
    save(epoch, model, optimizer, loss, args.save_model)
Example #4
def save(epoch, model, optimizer, loss, path, overwrite=False):

    # unless overwriting is allowed, append an incrementing revision suffix
    # so an existing checkpoint is not clobbered
    if not overwrite:
        base, ext = os.path.splitext(path)
        rev = 1
        while os.path.exists(path):
            path = '{}_{}{}'.format(base, rev, ext)
            rev += 1

    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss
    }, path)

    db.printInfo('Model saved to {}'.format(path))
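Example #7 below restores weights with model.load_state_dict(load_model(args.save_file)), but load_model itself is not part of this listing. A minimal sketch of such a helper, consistent with the checkpoint dictionary written by save() above (the map_location argument is an assumption):

import torch


def load_model(path, device='cpu'):
    """Hypothetical counterpart to save(): return the model weights stored
    in the checkpoint dictionary."""
    checkpoint = torch.load(path, map_location=device)
    return checkpoint['model_state_dict']

The same dictionary also carries 'epoch', 'optimizer_state_dict' and 'loss', so a variant that resumes training could return those entries as well.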
Example #5
def rtg_solution(re_n, gamma, reward_to_go=True):
    db.printTensor(re_n)
    # YOUR_CODE_HERE
    if reward_to_go:
        q_n = [
            scipy.signal.lfilter(b=[1], a=[1, -gamma], x=re[::-1])[::-1]
            for re in re_n
        ]
    else:
        q_n = [
            np.full_like(
                re,
                scipy.signal.lfilter(b=[1], a=[1, -gamma], x=re[::-1])[-1])
            for re in re_n
        ]
    db.printInfo(q_n)
    q_n = np.concatenate(q_n).astype(np.float32)
    db.printInfo(q_n)
    return q_n
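The scipy.signal.lfilter call implements the reward-to-go recurrence q[t] = r[t] + gamma * q[t+1] by filtering the reversed reward sequence. A short self-contained check with a made-up reward vector (only numpy and scipy required) that the filter matches the explicit definition:

import numpy as np
import scipy.signal

gamma = 0.5
rewards = np.arange(1.0, 5.0)  # [1, 2, 3, 4]

# filter trick: y[t] = r[t] + gamma * y[t+1], evaluated back-to-front
rtg_filter = scipy.signal.lfilter([1], [1, -gamma], rewards[::-1])[::-1]

# explicit definition: q[t] = sum_{t' >= t} gamma**(t' - t) * r[t']
rtg_naive = np.array([
    sum(gamma**(k - t) * rewards[k] for k in range(t, len(rewards)))
    for t in range(len(rewards))
])

assert np.allclose(rtg_filter, rtg_naive)
print(rtg_filter)  # [3.25, 4.5, 5.0, 4.0]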
Example #6
def rtg(re_n, gamma, reward_to_go=True):
    db.printTensor(re_n)
    q_n = []
    if reward_to_go:
        for traj in re_n:
            q_path = []
            for reward in traj[::-1]:
                try:
                    q_path.append(q_path[-1] * gamma + reward)
                except IndexError:
                    q_path.append(reward)
            q_n.append(q_path[::-1])
    else:
        # for traj in re_n:
        #     q_path = 0
        #     for t, reward in enumerate(traj[::-1]):
        #         q_path = q_path*gamma + reward
        #         db.printInfo(q_path)
        #         # db.printInfo(q_path)
        #         # db.printInfo(t)
        #         # db.printInfo(gamma**t)

        #         # input()
        #     # do this to have the same return for each time step
        #     q_n.append([q_path for _ in range(len(traj))])
        for traj in re_n:
            # reset the accumulator for every trajectory
            q_path = 0.0
            for t, reward in enumerate(traj):
                q_path += reward * gamma**t
            # do this to have the same return for each time step
            q_n.append([q_path for _ in range(len(traj))])
    db.printInfo(q_n)
    q_n = np.concatenate(q_n).astype(np.float32)

    db.printInfo(q_n)
    return q_n
Example #7
def main():
    parser = argparse.ArgumentParser(
        'Behaviour cloning using pre-trained expert rollouts.')
    parser.add_argument('--save_file', type=str, default='./BC_Ant-v2.pth')
    parser.add_argument('--envname', type=str, default='Ant-v2')
    parser.add_argument('--iter', type=int, default=20)
    parser.add_argument('--max_timesteps', type=int, default=1000)
    parser.add_argument('--render', type=bool, default=False)

    args = parser.parse_args()

    env = gym.make(args.envname)

    # build model
    model = Model(input_dim=env.observation_space.shape[0],
                  output_dim=env.action_space.shape[0])
    model.double()
    model.load_state_dict(load_model(args.save_file))

    returns = []
    for i in range(args.iter):
        obs = env.reset()
        done = False
        totalr = 0.
        steps = 0
        while not done:
            with torch.no_grad():
                action = model(
                    torch.tensor(obs, dtype=torch.double).unsqueeze(0))
                action = action.numpy()
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps >= args.max_timesteps:
                    break
        db.printInfo("Iter {} {}/{} Reward: {:.2f}".format(
            i, steps, args.max_timesteps, totalr))
        returns.append(totalr)

    db.printInfo("Mean return {}".format(np.mean(returns)))
    db.printInfo("Std of return {}".format(np.std(returns)))
Example #8
import torch
import torch.nn as nn
import torch.optim as optim

import load_policy
import print_custom as db

if torch.cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"

# dev = 'cpu'

db.printInfo(dev)


class Model(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_size=64):
        super(Model, self).__init__()
        self.hidden_size = hidden_size
        self.fc1 = nn.Linear(input_dim, 200)
        self.fc2 = nn.Linear(200, 100)
        self.fc3 = nn.Linear(100, input_dim)
        self.lstm = nn.LSTM(input_dim, hidden_size, batch_first=True)
        self.fc4 = nn.Linear(hidden_size, output_dim)

    def forward(self, inputs):
        # initial hidden and cell state for the single-layer LSTM
        hidden = (torch.zeros(1, len(inputs), self.hidden_size, dtype=torch.double).to(dev),
                  torch.zeros(1, len(inputs), self.hidden_size, dtype=torch.double).to(dev))
        # the original listing is cut off here; the rest is an assumed
        # completion (ReLU activations, LSTM over the sequence axis)
        x = torch.relu(self.fc1(inputs))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        single_step = x.dim() == 2
        if single_step:                # (batch, obs) -> (batch, 1, obs)
            x = x.unsqueeze(1)
        x, hidden = self.lstm(x, hidden)
        out = self.fc4(x)
        return out.squeeze(1) if single_step else out
Example #9

def mlp(input_size, output_size, n_layers, hidden_size, activation=nn.Tanh):
    layers = []
    layers.append(nn.Linear(input_size, hidden_size))
    layers.append(activation())
    for _ in range(n_layers):
        layers.append(nn.Linear(hidden_size, hidden_size))
        layers.append(activation())
    layers.append(nn.Linear(hidden_size, output_size))
    print(layers)
    return nn.Sequential(*layers)


if __name__ == "__main__":
    # unittest.main()

    re_n = np.arange(1, 21).reshape(2, 10)
    re_n = np.ones(5).reshape(1, 5)
    re_n = np.arange(1, 5).reshape(1, 4)

    db.printInfo(re_n)
    # re_n = np.ones(10).reshape(1,10)
    q_n = rtg(re_n, 0.5, False)
    q_n_sol = rtg_solution(re_n, 0.5, False)
    # q_n_sol = rtq_v2(re_n,0.5, False)

    # db.printInfo('Equal {}'.format(False not in (q_n == q_n_sol)))
    # mlp_sol(5,3,2,64)
    # mlp(5,3,2,64)
    # module = build_mlp(5,3,3,64)
    # print(module)
    def run(self):
        print("Agent {} started, Process ID {}".format(self.name, os.getpid()))
        actions = []
        rewards = []
        states = []
        logprobs = []
        is_terminal = []
        timestep = 0
        # lists to collect agent experience
        # variables for logging
        running_reward = 0

        for i_episodes in range(1, self.max_episode + 2):
            state = self.env.reset()

            if i_episodes == self.max_episode + 1:
                db.printInfo("Max episodes reached")
                msg = MsgMaxReached(self.proc_id, True)
                self.pipe.send(msg)
                break

            for i in range(self.max_timestep):

                timestep += 1

                states.append(state)

                with torch.no_grad():
                    action, logprob = self.memory.agent_policy.act(
                        state, False)
                state, reward, done, _ = self.env.step(action)

                actions.append(action)
                logprobs.append(logprob)
                rewards.append(reward)
                is_terminal.append(done)

                running_reward += reward

                if timestep % self.update_timestep == 0:
                    stateT, actionT, logprobT, disReturn = \
                        self.experience_to_tensor(
                            states, actions, rewards, logprobs, is_terminal)

                    self.add_experience_to_pool(stateT, actionT, logprobT,
                                                disReturn)

                    msg = MsgUpdateRequest(int(self.proc_id), True)
                    self.pipe.send(msg)
                    msg = self.pipe.recv()
                    if msg == "RENDER":
                        self.render = True
                    timestep = 0
                    actions = []
                    rewards = []
                    states = []
                    logprobs = []
                    is_terminal = []

                if done:
                    break

                if self.render:
                    time.sleep(0.005)
                    self.env.render()

            if i_episodes % self.log_interval == 0:
                running_reward = running_reward / self.log_interval
                # db.printInfo("sending reward msg")
                msg = MsgRewardInfo(self.proc_id, i_episodes, running_reward)
                self.pipe.send(msg)
                running_reward = 0