def train(net, optimizer, discount_f):
    # Collect num_episode trajectories per iteration, then update the policy
    # with pg(); per-episode returns are accumulated in reward_list.
    env = gym.make(env_name)
    reward_list = []

    # Iteration start
    for _ in range(num_iter):
        buff = []

        # episode start
        for i_episode in range(num_episode):
            obs = env.reset()

            next_obs = None
            reward = None
            total_reward = 0
            done = False
            traj = Trajectory()

            while not done:
                if next_obs is not None:
                    obs = next_obs
                obs = torch.tensor(obs).float()
                action = action_decide(net, obs)
                next_obs, reward, done, info = env.step(action)
                traj.add(obs, action, reward)
                total_reward += reward
                # if _ % 5 == 0 and i_episode == 0:
                #     env.render()
                if done:
                    buff.append(traj)
                    reward_list.append(total_reward)

        pg(net, optimizer, buff, discount_f)

    env.close()
    return reward_list
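
# A2C on an Atari environment (ConvNet_LSTM policy): one episode is collected
# per iteration, a linearly decayed exploration epsilon is passed to
# action_decide, and the episode return is logged to TensorBoard.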
def main():
    epsilon = 0.5
    epsilon_end = 0.01
    epsilon_div = 1e4
    epsilon_step = ((epsilon - epsilon_end) / epsilon_div)

    env = atari_env(env_name)
    l_obs = env.observation_space.shape[0]
    n_action = env.action_space.n

    date = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    log_dir = 'runs/Breakout_a2c_v2_experiment_epoch5000/' + date
    writer = SummaryWriter(log_dir)

    net = ConvNet_LSTM(l_obs, n_action).to(DEVICE)
    net.apply(model.weight_init)
    optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)

    results_reward = []

    for i_iteration in range(NUM_ITER):
        buff = []

        net.reset_lstm()

        obs = env.reset()
        obs = torch.Tensor(obs).unsqueeze(0)
        # obs = torch.Tensor(obs)

        total_reward = 0
        done = False
        traj = Trajectory()

        while not done:
            action = action_decide(net, obs, epsilon)
            next_obs, reward, done, _ = env.step(action)
            traj.add(obs, action, reward)
            total_reward += reward

            obs = next_obs
            obs = torch.Tensor(obs).unsqueeze(0)
            # if i_episode == 0:
            #     env.render()
            #     time.sleep(0.03)
            if done:
                results_reward.append(total_reward)
                writer.add_scalar("Reward/epoch", total_reward,
                                  i_iteration + 1)
        print(f'iteration: {i_iteration + 1} / {NUM_ITER}  reward: {total_reward}')
        A2C(net, optimizer, traj)
        if epsilon > epsilon_end:
            epsilon -= epsilon_step
        else:
            epsilon = epsilon_end
    env.close()
    writer.flush()
    writer.close()
    return results_reward
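
# A2C variant that preprocesses raw frames with process_frame and collects a
# batch of NUM_EPISODE trajectories per iteration before each A2C() update.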
def main():
    env = gym.make(env_name)
    l_obs = 1
    n_action = env.action_space.n

    date = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    log_dir = 'runs/Breakout_a2c_v2_experiment_epoch5000/' + date
    writer = SummaryWriter(log_dir)

    net = ConvNet_LSTM(l_obs, n_action, device=DEVICE).to(DEVICE)
    net.apply(model.weight_init)
    optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)

    results_reward = []

    for i_iteration in range(NUM_ITER):
        buff = []
        avg_reward = 0

        for i_episode in range(NUM_EPISODE):
            net.reset_lstm()

            obs = env.reset()
            obs = process_frame(obs)
            obs = torch.Tensor(obs).unsqueeze(0)

            total_reward = 0
            done = False
            traj = Trajectory()

            while not done:
                action = action_decide(net, obs)
                next_obs, reward, done, _ = env.step(action)
                traj.add(obs, action, reward)
                total_reward += reward

                obs = next_obs
                obs = process_frame(obs)
                obs = torch.Tensor(obs).unsqueeze(0)
                # if i_episode == 0:
                #     env.render()
                #     time.sleep(0.03)
                if done:
                    buff.append(traj)
                    results_reward.append(total_reward)
                    avg_reward += total_reward
                    writer.add_scalar(
                        "Reward/epoch", total_reward,
                        i_iteration * NUM_EPISODE + (i_episode + 1))
        print(f'iteration: {i_iteration + 1} / {NUM_ITER}  '
              f'average reward: {avg_reward / NUM_EPISODE}')
        A2C(net, optimizer, buff)
    env.close()
    writer.flush()
    writer.close()
    return results_reward
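
# Policy-gradient training (train()) on env_name with one observation
# dimension dropped via np.delete; the policy is either a plain feed-forward
# Net or a Net_LSTM, selected by use_lstm.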
def main():
    env = gym.make(env_name)
    l_obs = env.observation_space.shape[0] - 1
    n_action = env.action_space.n

    date = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    log_dir = 'runs/cartpole_pg_v6_experiment_epoch5000/' + date
    writer = SummaryWriter(log_dir)

    if use_lstm:
        net = Net_LSTM(64, 32, l_obs, n_action)
    else:
        net = Net(64, 32, l_obs, n_action)
    net.apply(model.weight_init)
    optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)

    results_reward = []

    for i_iteration in range(NUM_ITER):
        buff = []

        for i_episode in range(NUM_EPISODE):
            net.reset_lstm()

            obs = env.reset()
            obs = np.delete(obs, 1)
            obs = torch.Tensor(obs).unsqueeze(0).unsqueeze(0)

            next_obs = None
            reward = None
            total_reward = 0
            done = False
            traj = Trajectory()

            while not done:
                action = action_decide(net, obs)
                next_obs, reward, done, _ = env.step(action)
                traj.add(obs, action, reward)
                total_reward += reward

                next_obs = np.delete(next_obs, 1)
                obs = next_obs
                obs = torch.Tensor(obs).unsqueeze(0).unsqueeze(0)
                # if _ % 5 == 0 and i_episode == 0:
                #     env.render()
                if done:
                    print(f'iteration: {i_iteration}  episode: {i_episode}  '
                          f'reward: {total_reward}')
                    buff.append(traj)
                    results_reward.append(total_reward)
                    writer.add_scalar("Reward/epoch", total_reward, i_iteration * NUM_EPISODE + (i_episode + 1))
        train(net, optimizer, buff)
    env.close()
    writer.flush()
    writer.close()
    return results_reward
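
# Single-episode training loop on CartPole-v0: train() is called with the
# trajectory as soon as each episode ends, and the environment is rendered
# every 100 episodes.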
def main():
    # Init
    # Init env
    env = gym.make('CartPole-v0')

    # Init net model
    l_obs = env.observation_space.shape[0]  # 4 for CartPole-v0
    n_action = env.action_space.n
    net = Net(128, 128, l_obs, n_action)
    net.apply(model.weight_init)

    # Init optim
    optimizer = torch.optim.Adam(net.parameters(), lr=0.01)

    reward_list = []

    # episode start
    for i_episode in range(num_episode):
        obs = env.reset()

        next_obs = None
        reward = 0
        total_reward = 0
        done = False
        traj = Trajectory()

        while not done:
            if next_obs is not None:
                obs = next_obs
            obs = torch.tensor(obs).float()
            action = action_decide(net, obs)
            next_obs, reward, done, info = env.step(action)
            traj.add(obs, action, reward)
            total_reward += reward
            if i_episode % 100 == 0:
                env.render()
            if done:
                train(net, optimizer, traj)

        reward_list.append(total_reward)
    env.close()
    return reward_list
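
# A2C on env_name with a plain feed-forward Net: one trajectory is collected
# per iteration and A2C() is applied at the end of each episode.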
def main():
    # Init env
    env = gym.make(env_name)

    l_obs = env.observation_space.shape[0]
    n_action = env.action_space.n

    # Init net model
    net = Net(num_fc_a, num_fc_a, l_obs, n_action)
    net.apply(model.weight_init)

    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    reward_list = []

    # Iteration start
    for iteration in range(num_iter):
        obs = env.reset()

        next_obs = None
        total_reward = 0
        done = False
        traj = Trajectory()

        while not done:
            if next_obs is not None:
                obs = next_obs
            obs = torch.Tensor(obs).float()
            action = action_decide(net, obs)
            next_obs, reward, done, info = env.step(action)
            next_obs = torch.tensor(next_obs).float()
            traj.add(obs, action, reward)
            total_reward += reward
            # if iteration == 90:
            #     env.render()
            if done:
                reward_list.append(total_reward)
        A2C(net, optimizer, traj)
    env.close()
    return reward_list
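
# Policy-gradient training on LunarLander-v2 with MetaNet and a hand-written
# Adam_Optim from utils; per-episode returns are logged to TensorBoard.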
def main(lr):

    # Init
    # Init env
    env = gym.make('LunarLander-v2')

    # Init net model
    l_obs = env.observation_space.shape[0]
    n_action = env.action_space.n
    date = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    log_dir = 'runs/cartpole_pg_v2_experiment_episode1000/' + date
    writer = SummaryWriter(log_dir)
    net = MetaNet(128, 128, l_obs, n_action)
    net.apply(model.weight_init)

    # # Inspect the net architecture
    # dummy_input = torch.rand(4)
    # writer.add_graph(net, dummy_input)

    # Init optim
    from utils import Adam_Optim
    optimizer = Adam_Optim(net)

    reward_list = []

    # Iteration start
    for i_iter in range(num_iter):
        buff = []

        # episode start
        for i_episode in range(num_episode):
            obs = env.reset()

            next_obs = None
            reward = 0
            total_reward = 0
            done = False
            traj = Trajectory()

            while not done:
                if next_obs is not None:
                    obs = next_obs
                obs = torch.tensor(obs).float()
                action = action_decide(net, obs)
                next_obs, reward, done, info = env.step(action)
                traj.add(obs, action, reward)
                total_reward += reward
                if i_iter % 5 == 0 and i_episode == 0:
                    env.render()
                if done:
                    buff.append(traj)
                    reward_list.append(total_reward)
                    writer.add_scalar("Reward/epoch", total_reward,
                                      _ * num_episode + (i_episode + 1))

        train(net, optimizer, buff)
        # if epsilon > epsilon_end:
        #     epsilon -= epsilon_step
    env.close()
    writer.flush()
    writer.close()
    return reward_list
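
# NOTE: The examples above rely on a Trajectory container and an
# action_decide helper that are defined elsewhere in the project.  The sketch
# below is NOT the original implementation; it is only a minimal guess at the
# interface the calls above appear to expect: Trajectory(),
# traj.add(obs, action, reward), and action_decide(net, obs[, epsilon])
# returning an integer action index.
import random

import torch


class Trajectory:
    """One episode stored as parallel lists of observations, actions, rewards."""

    def __init__(self):
        self.observations = []
        self.actions = []
        self.rewards = []

    def add(self, obs, action, reward):
        self.observations.append(obs)
        self.actions.append(action)
        self.rewards.append(reward)


def action_decide(net, obs, epsilon=0.0):
    """Sample an action from the policy output of `net`.

    With probability `epsilon` a uniformly random action is taken instead,
    matching how the epsilon-decay example above calls this helper.
    """
    with torch.no_grad():
        out = net(obs)
        # Actor-critic nets may return (policy_logits, value); take the policy.
        logits = out[0] if isinstance(out, tuple) else out
        n_actions = logits.shape[-1]
        if random.random() < epsilon:
            return random.randrange(n_actions)
        probs = torch.softmax(logits.reshape(-1), dim=-1)
        return torch.multinomial(probs, 1).item()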