import copy

import matplotlib.pyplot as plt

# Agent, AgentManager, EnvManager, the models (TabularModel, DefaultModel,
# WeightedLinearModel), the action FAs (DiscreteMaxFA, ClipFA), the optimisers
# (TemporalDifference, HillClimbing, SimulatedAnnealing, GeneticAlgorithm,
# CrossEntropy), the exploration strategies (RandomExploration, EpsilonGreedy,
# Softmax), FixedDecay and load_results are assumed to be imported from the
# rlagents package; the exact module paths are not shown in this snippet.


def main():
    # All agents use a tabular model with initial values of 0.
    # Updates are done via TD learning with a fixed learning rate.
    # An action_fa of discrete max means the agent chooses the action with the
    # highest utility from a discrete array.
    base_agent = Agent(model=TabularModel(mean=0, std=0),
                       action_fa=DiscreteMaxFA(),
                       optimiser=TemporalDifference(learning_rate=FixedDecay(0.2)))

    # Randomly select the next action
    random_agent = copy.deepcopy(base_agent)
    random_agent.exploration = RandomExploration()

    # Always select the best action seen so far (the default behaviour for agents)
    greedy_agent = copy.deepcopy(base_agent)

    # Always select the best action seen so far, with optimistic starting values
    optimistic_greedy_agent = copy.deepcopy(base_agent)
    optimistic_greedy_agent.model = TabularModel(mean=1, std=0)

    # Select a random action with decaying likelihood
    egreedy_agent = copy.deepcopy(base_agent)
    egreedy_agent.exploration = EpsilonGreedy(FixedDecay(1, 0.995, 0.01))

    # Select a random action with fixed likelihood
    fixed_egreedy_agent = copy.deepcopy(base_agent)
    fixed_egreedy_agent.exploration = EpsilonGreedy(FixedDecay(0.2))

    # Explore using softmax (Boltzmann) action selection
    boltzmann_agent = copy.deepcopy(base_agent)
    boltzmann_agent.exploration = Softmax(FixedDecay(2, 0.995, 0.1))

    agents = [random_agent, greedy_agent, optimistic_greedy_agent,
              egreedy_agent, fixed_egreedy_agent, boltzmann_agent]
    labels = ['Random', 'Greedy', 'Optimistic Greedy',
              'E-Greedy Decay', 'E-Greedy Fixed', 'Boltzmann']

    agent_reward = []
    max_reward = []
    episodes = 100

    for agent in agents:
        path = "/tmp/rlagents/"
        am = AgentManager(agent=agent)
        em = EnvManager('BanditTenArmedUniformDistributedReward-v0', am)
        em.run(n_episodes=episodes, print_stats=False, path=path, video_callable=False)

        max_reward.append(max(em.env.r_dist))

        results = load_results(path)
        agent_reward.append(results['episode_rewards'])

    for i, ar in enumerate(agent_reward):
        percent_correct = [agent_reward[i][:j].count(max_reward[i]) / float(j)
                           for j in range(1, episodes)]
        plt.plot(range(1, episodes), percent_correct, label=labels[i])

    plt.xlabel('Steps')
    plt.ylabel('% Optimal Arm Pulls')
    plt.ylim(-0.2, 1.5)
    plt.legend(loc=2)
    plt.show()


def hillclimbing_continuouslinear():
    # Hill climbing over a pool of 2 weighted-linear agents with clipped continuous actions
    agent = Agent(model=WeightedLinearModel(), action_fa=ClipFA())
    am = AgentManager(HillClimbing())
    am.add(agent, number=2)
    return am


def random_discrete():
    # A single randomly acting agent on a discrete action space
    agent = Agent(model=DefaultModel(), exploration=RandomExploration(), action_fa=DiscreteMaxFA())
    am = AgentManager(agent=agent)
    return am


def geneticalgorithm_continuouslinear():
    # Genetic algorithm over a pool of 2 weighted-linear agents with clipped continuous actions
    agent = Agent(model=WeightedLinearModel(), action_fa=ClipFA())
    am = AgentManager(GeneticAlgorithm())
    am.add(agent, number=2)
    return am


def simulatedannealing_continuouslinear():
    # Simulated annealing on a single weighted-linear agent with clipped continuous actions
    agent = Agent(model=WeightedLinearModel(), action_fa=ClipFA())
    am = AgentManager(SimulatedAnnealing())
    am.add(agent, number=1)
    return am


def hillclimbing_discretelinear():
    # Hill climbing over a pool of 2 weighted-linear agents choosing discrete max actions
    agent = Agent(model=WeightedLinearModel(), action_fa=DiscreteMaxFA())
    am = AgentManager(HillClimbing())
    am.add(agent, number=2)
    return am


def crossentropy_continuouslinear():
    # Cross-entropy method over a pool of 2 weighted-linear agents with clipped continuous actions
    agent = Agent(model=WeightedLinearModel(), action_fa=ClipFA())
    am = AgentManager(CrossEntropy())
    am.add(agent, number=2)
    return am


def simulatedannealing_discretelinear():
    # Simulated annealing on a single weighted-linear agent choosing discrete max actions
    agent = Agent(model=WeightedLinearModel(), action_fa=DiscreteMaxFA())
    am = AgentManager(SimulatedAnnealing())
    am.add(agent, number=1)
    return am


def geneticalgorithm_discretelinear():
    # Genetic algorithm over a pool of 2 weighted-linear agents choosing discrete max actions
    agent = Agent(model=WeightedLinearModel(), action_fa=DiscreteMaxFA())
    am = AgentManager(GeneticAlgorithm())
    am.add(agent, number=2)
    return am


def crossentropy_discretelinear():
    # Cross-entropy method over a pool of 2 weighted-linear agents choosing discrete max actions
    agent = Agent(model=WeightedLinearModel(), action_fa=DiscreteMaxFA())
    am = AgentManager(CrossEntropy())
    am.add(agent, number=2)
    return am


def random_default():
    # A single randomly acting agent with the default model and action FA
    agent = Agent(model=DefaultModel(), exploration=RandomExploration())
    am = AgentManager(agent)
    return am


def random_continuous():
    # A single randomly acting agent with clipped continuous actions
    agent = Agent(model=DefaultModel(), exploration=RandomExploration(), action_fa=ClipFA())
    am = AgentManager(agent=agent)
    return am
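

# A minimal usage sketch: the factory functions above only build AgentManagers,
# so one way to exercise them is to mirror the EnvManager pattern used in main().
# The function name run_factory_example, the 'CartPole-v0' environment id and the
# episode count are illustrative assumptions, not taken from this file; the run()
# keyword arguments copy the single call shown in main().
def run_factory_example():
    am = hillclimbing_discretelinear()
    em = EnvManager('CartPole-v0', am)
    em.run(n_episodes=50, print_stats=True, path="/tmp/rlagents/", video_callable=False)


if __name__ == '__main__':
    # Run the bandit comparison defined in main() when the script is executed directly
    main()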