Example #1
def main(args):

    myLander = LunarLander()
    myLander.set_discount(args.discount)

    myrl, trainRewards, totalLoss = train_QL(myLander,
                                             numTrials=args.num_train_trials,
                                             numEpochs=args.num_epochs,
                                             memsize=args.memsize)

    print("Training completed. Switching to testing.")

    plt.plot(trainRewards)
    plt.ylabel('trainingReward')
    plt.xlabel('Trial No.')
    plt.savefig('plots/trainprogress_memSz' + str(args.memsize) + '_epochs' +
                str(args.num_epochs) + '.png')
    #plt.show()

    plt.clf()
    plt.plot(totalLoss)
    plt.savefig('plots/loss_v_time_memSz_' + str(args.memsize) + '_epochs' +
                str(args.num_epochs) + '.png')
    #plt.show()

    # Now test the trained model:
    myrl.explorationProb = 0
    # Simulation can run from here:
    simulate(myLander,
             myrl,
             memD=None,
             numTrials=args.num_test_trials,
             do_training=False,
             verbose=False)
Example #2
def main():
    myLander = LunarLander()
    myrl, trainRewards = train_QL(myLander,
                                  improvedSmallFeatureExtractor,
                                  numTrials=1000000)
    #myrl, trainRewards = train_QL( myLander, improvedFeatureExtractor, numTrials=500000 )
    # myrl, trainRewards = train_QL( myLander, roundedFeatureExtractor, numTrials=500 )

    print("Training completed. Switching to testing.")

    plt.plot(trainRewards)
    plt.ylabel('trainingReward')
    plt.xlabel('Trial No.')
    plt.savefig("output/trainprogress" + time.strftime("%m%d_%H%M"))
    plt.show()

    # Now test the trained model:
    myrl.explorationProb = 0
    # Simulation can run from here:
    testRewards = simulate(myLander, myrl, numTrials=100, do_training=False)

    # Plot progress from testing
    plt.clf()
    plt.plot(testRewards)
    plt.ylabel('testReward')
    plt.xlabel('Trial No.')
    plt.savefig("output/testprogress" + time.strftime("%m%d_%H%M"))
    plt.show()
Example #3
def test_lander(weight_dict, featureExtractor):
    newLander = LunarLander()
    myrl = QLearningAlgorithm(newLander.actions, newLander.discount,
                              featureExtractor)
    myrl.weights = weight_dict
    myrl.explorationProb = 0.0

    simulate(newLander, myrl, numTrials=100, do_training=False, do_render=True)
Example #4
def main():
    """
    Train and evaluate agent.

    This function basically does the same as the checker that evaluates your agent.
    You can use it for debugging your agent and visualizing what it does.
    """
    import numpy as np
    import torch

    from lunar_lander import LunarLander
    from gym.wrappers.monitoring.video_recorder import VideoRecorder

    env = LunarLander()

    agent = Agent(env)
    agent.train()

    rec = VideoRecorder(env, "policy.mp4")
    episode_length = 300
    n_eval = 100
    returns = []
    print("Evaluating agent...")

    for i in range(n_eval):
        print(f"Testing policy: episode {i+1}/{n_eval}")
        state = env.reset()
        cumulative_return = 0
        # The environment will set terminal to True if an episode is done.
        terminal = False
        for t in range(episode_length):
            # if i <= 10:
            #     rec.capture_frame()
            # Taking an action in the environment
            action = agent.get_action(
                torch.as_tensor(state, dtype=torch.float32))
            state, reward, terminal = env.transition(action)
            cumulative_return += reward
            if terminal:
                break
        returns.append(cumulative_return)
        print(f"Achieved {cumulative_return:.2f} return.")
        # if i == 10:
        #     rec.close()
        #     print("Saved video of 10 episodes to 'policy.mp4'.")
    env.close()
    print(f"Average return: {np.mean(returns):.2f}")
Example #5
    def __init__(self,
                 env=LunarLander(),
                 QNet=QNetwork,
                 exploration_type=0,
                 epsilon=0.9,
                 discount=0.99,
                 max_episodes=1000,
                 max_episode_length=1000,
                 batch_size=32,
                 discount_decay_episodes=300,
                 plot_point=25,
                 num_policy_exe=10,
                 continue_learning=False,
                 execute_policy=0,
                 filepath=os.path.abspath(os.path.dirname(sys.argv[0])) +
                 '\\Weights\\'):

        self.env = env
        self.exploration_type = exploration_type
        self.epsilon = epsilon if exploration_type != 2 else 2
        self.phi = 100 if exploration_type == 2 else 1
        self.discount = discount
        self.max_episodes = max_episodes
        self.max_episode_length = max_episode_length
        self.batch_size = batch_size
        self.continue_learning = continue_learning
        self.execute_policy = execute_policy
        self.buffer = ReplayBuffer(100000)
        self.rew_plotter = GraphCollector()
        self.loss_plotter = GraphCollector()
        self.num_actions = self.env.action_space.n
        self.NUM_COMPONENT = self.env.num_reward_components
        self.discount_decay_episodes = discount_decay_episodes
        self.plot_point = plot_point
        self.main_nn = []
        self.target_nn = []
        self.optimizer = []
        self.mse = tf.keras.losses.MeanSquaredError()
        self.num_policy_exe = num_policy_exe
        self.filepath = filepath
        self.explainer = Explainer(self)

        for c in range(self.NUM_COMPONENT):
            self.main_nn.append(QNet(64, self.num_actions))
            self.target_nn.append(QNet(64, self.num_actions))
            self.optimizer.append(tf.optimizers.Adam(0.01))  # alternative learning rate: 5e-4
Example #6
        help="Collect the data in a pickle file.",
    )

    args = parser.parse_args()

    samples = {
        "state": [],
        "state_img": [],
        "next_state": [],
        "next_state_img": [],
        "reward": [],
        "action": [],
        "terminal": [],
    }

    env = LunarLander()
    env.render()
    env.viewer.window.on_key_press = key_press
    env.viewer.window.on_key_release = key_release

    a = np.array([0])

    episode_rewards = []
    steps = 0
    while True:
        episode_reward = 0
        state = env.reset()
        state_img = env.render(
            mode="rgb_array")[::4, ::4, :]  # downsampling (every 4th pixel).

        while True:
Example #7
        if done or step > max_timesteps:
            break

    return episode_reward


if __name__ == "__main__":

    # Important: evaluation will likely not work if rendering is set to False
    rendering = True

    conf = Config()
    agent = BCAgent(conf)
    model_name = 'agent_2020-03-07--19-42.pt'
    agent.load(f"models/{model_name}", to_cpu=True)
    env = LunarLander()

    episode_rewards = []
    for i in range(conf.n_test_episodes):
        episode_reward = run_episode(env, agent, conf, rendering=rendering)
        episode_rewards.append(episode_reward)

    # save results in a dictionary and write them into a .json file
    results = dict()
    results["episode_rewards"] = episode_rewards
    results["mean"] = np.array(episode_rewards).mean()
    results["std"] = np.array(episode_rewards).std()

    timestamp = model_name.split(sep='_')[1][0:-3]
    fname = f"results/results_bc_agent-{timestamp}.json"
    fh = open(fname, "w")
Example #8
    EPS_END = opt.min_epsilon
    EPS_DECAY = opt.epsilon_decay
    EPS_OFFSET = opt.initial_memory_size
    TARGET_SYNC = opt.sync_freq
    LOG_FREQ = opt.log_freq
    RENDER = opt.render
    MAX_FRAMES = opt.frames
    LR = opt.lr
    INITIAL_MEMORY = opt.initial_memory_size
    MEMORY_SIZE = opt.memory_size
    PLAY_STEPS = opt.play_steps
    HUMAN = opt.human

    # create environment
    if 'lunar' in opt.env:
        env = LunarLander()
        RAM = True
    else:
        env = gym.make(env_id)
        if not opt.evaluate:
            env = ptan.common.wrappers.wrap_dqn(env)
        else:
            env = ptan.common.wrappers.wrap_dqn(env, episodic_life=False, reward_clipping=False)
        RAM = False

    N_ACTIONS = env.action_space.n

    # human control mode and saliency rendering
    if HUMAN:
        RENDER = True
Example #9
def plot_io_bounds(x, y, vx, vy, theta, omega, a, steps, discrete=True):
    import matplotlib.pyplot as plt

    statebox = [x, y, vx, vy, theta, omega]
    centerstate = [box[0] + .5 * (box[1] - box[0]) for box in statebox]
    envstate = [i for i in centerstate]

    # Zero order hold on actions if needed
    if discrete and isinstance(a, int):
        a = a * np.ones(steps, dtype=np.int32)
    elif not discrete:
        a = [np.array(a) for i in range(steps)]

    # System IDed model trajectory
    centerstatehist = [centerstate]
    for i in range(steps):
        centerstate = lander_dynamics(*centerstate, a=a[i], discrete=discrete)
        centerstatehist.append(centerstate)

    # Actual openai gym model trajectory
    envstatehist = [envstate]
    if discrete:
        from lunar_lander import LunarLander
        env = LunarLander()
    else:
        from lunar_lander import LunarLanderContinuous
        env = LunarLanderContinuous()
    s = env.reset(envstate)
    for i in range(steps):
        s, _, _, _ = env.step(a[i])
        envstatehist.append(s[0:6])

    # Overapproximated trajectory
    stateboxhist = [statebox]
    for i in range(steps):
        statebox = lander_box_dynamics(*statebox,
                                       a=a[i],
                                       steps=1,
                                       discrete=discrete)
        stateboxhist.append(statebox)

    centerstatehist = np.array(centerstatehist)
    envstatehist = np.array(envstatehist)
    stateboxhist = np.array(stateboxhist)

    t = np.linspace(0, steps, steps + 1)
    fig, axs = plt.subplots(6, 1, figsize=(4, 9))

    # fig.set_size_inches(5,7,forward=True)

    limits = [[-1, 1], [0, 1], [-1, 1], [-1, 1], [-np.pi / 3, np.pi / 3],
              [-.5, .5]]
    for i in range(6):
        axs[i].fill_between(t,
                            stateboxhist[:, i, 0],
                            stateboxhist[:, i, 1],
                            alpha=0.3)
        axs[i].plot(centerstatehist[:, i], 'r')
        axs[i].plot(envstatehist[:, i], 'b.')
        axs[i].set_ylim(bottom=limits[i][0], top=limits[i][1])
        axs[i].set_yticks(np.linspace(limits[i][0], limits[i][1], 17),
                          minor=True)
        axs[i].grid(which='minor', alpha=.4)

    axs[0].set_title('Action {0}'.format(a))
    plt.show()
Example #10
from cartpole import TimedCartPoleEnv
from lunar_lander import LunarLander
from simple_lander import SimpleLander
from model import QVModel
from kerlas import ReplayMemory
from kerlas.policies import BoltzmannQPolicy, GreedyEpsPolicy
import numpy as np, random
from time_limit import TimeLimit

np.set_printoptions(precision=4, suppress=True)

Gamma = 0.99
#game_env = SimpleEnv(200)

# lunar lander
env = LunarLander()
game_env = TimeLimit(env, time_limit=300, timeout_reward=-1.0)

# simple lander
game_env = SimpleLander()

state_dim = game_env.observation_space.shape[-1]
nactions = game_env.action_space.n

print("state_dim:", state_dim)

qvmodel = QVModel(state_dim, nactions, Gamma)
memory = ReplayMemory(10000)

NGames = 100000
NextTrain = TrainInterval = 5  # train after 5 games
Example #11
from lunar_lander import LunarLander, FPS
import random, time, getopt
import numpy as np

np.set_printoptions(precision=3, suppress=True)

if __name__ == "__main__":

    env = LunarLander()
    dt = 1.0 / FPS

    obs = env.reset()
    env.render()
    done = False
    t = 0
    t0 = time.time()
    while not done and t < 500:
        a = random.randint(0, 3)
        s1, r, done, info = env.step(a)
        print(s1, r, info)
        #time.sleep(dt*10)
        env.render()
        t += 1
    print("rate:", t / (time.time() - t0))
Example #12
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)
        self.epsilon = 0.3
        #self.model = load_model(name)
        self.model.summary()

    def save(self, name):
        self.model.save_weights(name)


if __name__ == "__main__":
    #    env = gym.make('LunarLander-v3')
    env = LunarLander()

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    #agent = DQNAgent(state_size, action_size)
    done = False
    batch_size = 1
    if 0:
        agent.load("model.dat")
    for e in range(EPISODES):
        #agent.load("../../Downloads/model_900.h5")
        #agent.epsilon = 0.0
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        tot_rew = 0
        for time in range(300):
Example #13
from lunar_lander import demo_heuristic_lander, LunarLander

total_reward_array = []

myLunarLander = LunarLander()

dorender = True
num_iters = 100
isdumb = True

for i in range(0, num_iters):
    end_reward = demo_heuristic_lander(myLunarLander,
                                       render=dorender,
                                       dumb=isdumb)
    total_reward_array.append(end_reward)
    myLunarLander.reset()
    print("Iteration: " + str(i))

print("Average Rewards Over " + str(num_iters) + " trials ")
average_reward = sum(total_reward_array) / len(total_reward_array)
print(average_reward)